In [None]:
pip install pandas scikit-learn transformers torch nltk


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from wordcloud import WordCloud

# Step 1: Read the feedback CSV into a DataFrame
df = pd.read_csv("/content/intern_feedback_balanced.csv")

# Step 2: Bar chart with one bar per distinct value of the 'sentiment' column
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='sentiment', palette='Set2', ax=ax)
ax.set_title("Sentiment Distribution")
ax.set_xlabel("Sentiment")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()

# Step 3: WordClouds
def generate_wordcloud(text, title):
    """Render a word cloud for a collection of feedback entries.

    Args:
        text: Iterable of feedback entries (for example a pandas Series).
            Missing values (None/NaN) are skipped; everything else is
            converted with ``str`` before being joined into one document.
        title: Title displayed above the figure.
    """
    # Drop missing entries so str.join does not fail on NaN/None values.
    joined_text = " ".join(str(entry) for entry in text if pd.notna(entry))

    # stopwords=None makes WordCloud fall back to its built-in STOPWORDS set.
    # The previous value, the string 'english', was treated as an iterable of
    # characters, so only the single letters e/n/g/l/i/s/h were filtered out.
    wc = WordCloud(width=800, height=400, background_color='white', stopwords=None).generate(joined_text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.show()

# One word cloud per sentiment class, built only from that class's feedback text
for sentiment_value, cloud_title in (
    ('positive', "WordCloud - Positive Feedback"),
    ('negative', "WordCloud - Negative Feedback"),
):
    generate_wordcloud(df.loc[df['sentiment'] == sentiment_value, 'feedback'], cloud_title)

# Step 4: Encode sentiment (positive -> 1, negative -> 0)
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})

# Series.map leaves NaN for any sentiment value that is not a key of the
# mapping. Stop here and report the offending values instead of handing NaN
# labels to the split and the classifier.
unmapped_rows = df['label'].isna()
if unmapped_rows.any():
    unexpected = sorted(df.loc[unmapped_rows, 'sentiment'].astype(str).unique())
    raise ValueError(f"Unexpected sentiment values (expected 'positive'/'negative'): {unexpected}")

# Step 5: Hold out 20% of the rows for testing; stratifying on the label keeps
# the class proportions the same in both splits
X_train, X_test, y_train, y_test = train_test_split(
    df['feedback'],
    df['label'],
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)

# Step 6: TF-IDF features. The vectorizer is fitted on the training text only
# and then applied unchanged to the test text.
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Step 7: Logistic regression with class_weight='balanced'
model = LogisticRegression(class_weight='balanced', random_state=42)
model.fit(X_train_vec, y_train)
y_pred = model.predict(X_test_vec)

# Step 8: Classification report on the held-out set
report = classification_report(y_test, y_pred, target_names=["Negative", "Positive"])
print("\nClassification Report:")
print(report)

# Step 9: Confusion matrix heatmap (y axis labelled "Actual", x axis "Predicted")
cm = confusion_matrix(y_test, y_pred)
class_names = ['Negative', 'Positive']

fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
fig.tight_layout()
plt.show()

# Step 10: Most Influential Words
# Pair each vocabulary term with the coefficient the fitted model holds for it.
feature_names = vectorizer.get_feature_names_out()
coefficients = model.coef_[0]

# Sort once in ascending order: the first ten positions are the smallest
# coefficients, the last ten (reversed) are the largest.
ascending_order = coefficients.argsort()
top_pos_indices = ascending_order[::-1][:10]
top_neg_indices = ascending_order[:10]

top_pos_words = list(zip(feature_names[top_pos_indices], coefficients[top_pos_indices]))
top_neg_words = list(zip(feature_names[top_neg_indices], coefficients[top_neg_indices]))

# Plot Function
def plot_words(words, title, color):
    """Draw a bar chart from (label, value) pairs.

    Args:
        words: Sequence of (label, value) tuples; values go on the x axis and
            labels on the y axis.
        title: Figure title.
        color: Palette name handed to seaborn's ``barplot``.
    """
    bar_labels, bar_values = zip(*words)
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.barplot(x=bar_values, y=bar_labels, palette=color, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("TF-IDF Weight (Model Coefficient)")
    fig.tight_layout()
    plt.show()

# One chart for the words with the largest coefficients, one for the smallest
for word_weights, chart_title, palette_name in (
    (top_pos_words, "Top Words Predicting Positive Sentiment", "Greens"),
    (top_neg_words, "Top Words Predicting Negative Sentiment", "Reds"),
):
    plot_words(word_weights, chart_title, palette_name)


**LDA topic modeling**

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from gensim import corpora, models

# Download the NLTK resources used below. Recent NLTK releases load the Punkt
# tokenizer data from 'punkt_tab' while older ones use 'punkt', so fetch both.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Step 1: Load dataset and filter negative feedback
df = pd.read_csv("/content/intern_feedback_balanced.csv")
# .copy() yields an independent frame, so the 'tokens' column assigned further
# down is not written into a slice of the full dataset
# (avoids pandas' SettingWithCopyWarning).
df = df[df['sentiment'] == 'negative'].copy()  # Only negative feedback

# Step 2: Preprocess and tokenize
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Lower-case and tokenize one feedback entry.

    Args:
        text: Raw feedback string. Non-string values (for example NaN from an
            empty CSV cell, or None) produce an empty token list instead of
            raising on ``.lower()``.

    Returns:
        list: Tokens that are purely alphabetic and not in ``stop_words``.
    """
    if not isinstance(text, str):
        return []
    tokens = nltk.word_tokenize(text.lower())
    return [word for word in tokens if word.isalpha() and word not in stop_words]

# Tokenize every feedback entry kept in df
df['tokens'] = df['feedback'].apply(preprocess)

# Step 3: Topic Modeling using LDA
# Build the token -> id dictionary, then one bag-of-words vector per document
token_lists = df['tokens']
dictionary = corpora.Dictionary(token_lists)
corpus = list(map(dictionary.doc2bow, token_lists))

lda_model = models.LdaModel(
    corpus,
    num_topics=3,
    id2word=dictionary,
    passes=10,
    random_state=42,
)

# Step 4: Show topics (the printed topic number is the returned id plus one)
print("\n🔍 Top complaint themes (topics):")
for i, topic in lda_model.print_topics():
    print(f"Topic {i+1}: {topic}")


**Transformer-based Model (BERT)**

In [None]:
pip install -U transformers datasets


In [None]:
pip install transformers datasets


In [None]:
!pip install -U transformers


In [None]:
from transformers import pipeline

# Load a pretrained sentiment pipeline. The checkpoint is named explicitly so
# the result does not depend on whichever default model the library selects
# when none is supplied.
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

# Example usage
sample_feedback = [
    "The team was very supportive and helped me learn a lot.",
    "I felt ignored and didn't get enough guidance."
]

# The pipeline returns one dict per input, read below via its 'label' and 'score' keys
results = sentiment_analyzer(sample_feedback)

for fb, res in zip(sample_feedback, results):
    print(f"\nFeedback: {fb}\nSentiment: {res['label']} (Score: {res['score']:.2f})")


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load dataset and derive integer labels (negative -> 0, positive -> 1)
df = pd.read_csv("/content/intern_feedback_balanced.csv")
label_by_sentiment = {'negative': 0, 'positive': 1}
df['label'] = df['sentiment'].map(label_by_sentiment)

# Step 2: Stratified train-test split (20% test) on plain Python lists
all_texts = df['feedback'].tolist()
all_labels = df['label'].tolist()
train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)

# Step 3: Tokenization
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize(batch):
    """Tokenize the "text" entry of a batch with the module-level ``tokenizer``.

    Sequences are truncated and padded to a fixed length of 128 tokens.

    Args:
        batch: Mapping whose "text" entry holds the raw strings.

    Returns:
        The tokenizer's output for the batch.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True, max_length=128)
    return encoded

# Wrap each split in a DataFrame, convert it to a Dataset, and tokenize it
train_df = pd.DataFrame({"text": train_texts, "label": train_labels})
test_df = pd.DataFrame({"text": test_texts, "label": test_labels})

train_dataset = Dataset.from_pandas(train_df).map(tokenize, batched=True)
test_dataset = Dataset.from_pandas(test_df).map(tokenize, batched=True)

# Expose only the model inputs and the label, formatted as torch tensors
tensor_columns = ["input_ids", "attention_mask", "label"]
for split_dataset in (train_dataset, test_dataset):
    split_dataset.set_format("torch", columns=tensor_columns)

# Step 4: Load pre-trained BERT with a two-class classification head
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Step 5: Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    do_train=True,
    do_eval=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="no",  # avoids trying to save checkpoints
    # Disable external experiment trackers. Without this, an installed
    # integration such as Weights & Biases can stop a Run All with an
    # interactive login prompt.
    report_to="none",
)

# Step 6: Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Step 7: Train the model
trainer.train()

# Step 8: Predict on the test split; the predicted class is the index of the
# largest score in each row of the prediction array
predictions = trainer.predict(test_dataset)
y_pred = predictions.predictions.argmax(axis=1)

report = classification_report(test_labels, y_pred, target_names=["Negative", "Positive"])
print("\nClassification Report:")
print(report)

# Step 9: Confusion matrix heatmap (y axis labelled "Actual", x axis "Predicted")
cm = confusion_matrix(test_labels, y_pred)
class_names = ['Negative', 'Positive']

fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("BERT Confusion Matrix")
fig.tight_layout()
plt.show()


In [None]:
pip install gensim