In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import warnings
# Silence only library deprecation chatter. A blanket 'ignore' would also hide
# warnings that point at real problems further down (e.g. solver convergence
# or pandas assignment warnings), so every other category stays visible.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Plot styling shared by every figure in the notebook.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
print("Libraries imported successfully")

Libraries imported successfully


In [2]:
# --- Load the labelled feedback and carve out a stratified hold-out set -----
# `train_test_split` is already imported in the imports cell at the top.
DATA_PATH = 'training_data_complete.csv'
TEST_FRACTION = 0.2
SPLIT_SEED = 42


def print_sentiment_distribution(frame, label):
    """Print the count and percentage of each ``Sentiment`` value in ``frame``.

    Labels are listed in order of first appearance in ``frame``; percentages
    are relative to the frame's row count and rounded to one decimal place.

    Args:
        frame: DataFrame with a ``Sentiment`` column.
        label: Heading printed above the per-label lines.
    """
    print(f"{label}:")
    n_rows = len(frame)
    for sentiment in frame['Sentiment'].unique():
        count = (frame['Sentiment'] == sentiment).sum()
        pct = round(count / n_rows * 100, 1)
        print(f"  {sentiment}: {count} ({pct}%)")


full_df = pd.read_csv(DATA_PATH)
train_df, test_df = train_test_split(
    full_df,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
    stratify=full_df['Sentiment'],  # keep label proportions equal in both splits
)
# Independent copies, so columns added to the splits later never touch full_df.
train_df = train_df.copy()
test_df = test_df.copy()
assert len(train_df) + len(test_df) == len(full_df)

print(f"Train-Test Split ({round((1 - TEST_FRACTION) * 100)}-{round(TEST_FRACTION * 100)}):")
print(f"  Total records: {len(full_df)}")
print(f"  Training set: {len(train_df)} ({len(train_df)/len(full_df):.1%})")
print(f"  Test set: {len(test_df)} ({len(test_df)/len(full_df):.1%})")
print(f"\nColumns: {list(train_df.columns)}")

print("\nSentiment Distribution:")
print_sentiment_distribution(train_df, "Training set")
print_sentiment_distribution(test_df, "Test set")

Train-Test Split (80-20):
  Total records: 500
  Training set: 400 (80.0%)
  Test set: 100 (20.0%)

Columns: ['Employee_ID', 'Associate_Name', 'Department', 'Evaluation_Result', 'Skill_Feedback_1', 'Skill_Feedback_2', 'Skill_Feedback_3', 'Overall_Feedback', 'Sentiment']

Sentiment Distribution:
Training set:
  negative: 85 (21.2%)
  neutral: 137 (34.2%)
  positive: 178 (44.5%)
Test set:
  positive: 45 (45.0%)
  neutral: 34 (34.0%)
  negative: 21 (21.0%)


In [3]:
# --- Feature engineering: TF-IDF over the feedback text + one-hot result ----
FEEDBACK_TEXT_COLUMNS = ['Skill_Feedback_1', 'Skill_Feedback_2', 'Skill_Feedback_3', 'Overall_Feedback']


def combine_feedback(frame):
    """Join the four feedback text columns of ``frame`` into one string per row.

    Missing values are treated as empty strings and the pieces are separated
    by single spaces. Returns a Series aligned with ``frame``'s index.
    """
    pieces = [frame[column].fillna('') for column in FEEDBACK_TEXT_COLUMNS]
    combined = pieces[0]
    for piece in pieces[1:]:
        combined = combined + ' ' + piece
    return combined


# `assign` builds new frames instead of writing into the objects returned by
# train_test_split, and makes this cell safe to re-run.
train_df = train_df.assign(combined_feedback=combine_feedback(train_df))
test_df = test_df.assign(combined_feedback=combine_feedback(test_df))

X_train_text = train_df['combined_feedback']
X_train_cat = train_df[['Evaluation_Result']]
y_train = train_df['Sentiment']

X_test_text = test_df['combined_feedback']
X_test_cat = test_df[['Evaluation_Result']]
y_test = test_df['Sentiment']

# The vocabulary and IDF weights are learned from the training text only.
tfidf = TfidfVectorizer(max_features=500, min_df=2, max_df=0.8, ngram_range=(1, 2), lowercase=True, stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_test_tfidf = tfidf.transform(X_test_text)

# NOTE(review): Evaluation_Result is used as a model input. If it is derived
# from the same judgement as Sentiment, the perfect test scores reported by
# the training cell may be label leakage rather than learning — TODO confirm.
# dtype=float keeps the dummy block numeric, so stacking it with the TF-IDF
# matrix can never yield an object array.
X_train_cat_encoded = pd.get_dummies(X_train_cat, drop_first=False, dtype=float)
# The training split defines the categorical schema: test columns are aligned
# to it, categories absent from the test split become all-zero columns, and
# categories seen only in the test split are dropped.
X_test_cat_encoded = (
    pd.get_dummies(X_test_cat, drop_first=False, dtype=float)
    .reindex(columns=X_train_cat_encoded.columns, fill_value=0.0)
)

X_train_tfidf_dense = X_train_tfidf.toarray()
X_test_tfidf_dense = X_test_tfidf.toarray()
X_train_combined = np.hstack([X_train_tfidf_dense, X_train_cat_encoded.values])
X_test_combined = np.hstack([X_test_tfidf_dense, X_test_cat_encoded.values])
assert X_train_combined.shape[1] == X_test_combined.shape[1], "train/test feature width mismatch"

le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

print("Feature Engineering Complete:")
print(f"  TF-IDF features: {X_train_tfidf_dense.shape[1]}")
print(f"  Categorical features: {X_train_cat_encoded.shape[1]}")
print(f"  Total features: {X_train_combined.shape[1]}")
print(f"  Target classes: {list(le.classes_)}")
print(f"  Training samples: {X_train_combined.shape[0]}")
print(f"  Test samples: {X_test_combined.shape[0]}")

Feature Engineering Complete:
  TF-IDF features: 498
  Categorical features: 3
  Total features: 501
  Target classes: ['negative', 'neutral', 'positive']
  Training samples: 400
  Test samples: 100


In [4]:
# --- Train four classifiers on the same features and compare accuracy -------
print("TRAINING MODELS WITH BALANCED CLASS WEIGHTS\n")

lr = LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1, class_weight='balanced')
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, class_weight='balanced')
gb = GradientBoostingClassifier(n_estimators=100, random_state=42, learning_rate=0.1)
svm = SVC(kernel='rbf', random_state=42, probability=True, class_weight='balanced')

# GradientBoostingClassifier takes no class_weight argument, so the same
# "balanced" weighting (n_samples / (n_classes * class_count)) is supplied
# per sample at fit time; otherwise it would be the only unbalanced model.
class_counts = np.bincount(y_train_encoded)
balanced_sample_weight = (len(y_train_encoded) / (len(class_counts) * class_counts))[y_train_encoded]

# (display name, estimator, extra keyword arguments for .fit)
candidates = [
    ('Logistic Regression', lr, {}),
    ('Random Forest', rf, {}),
    ('Gradient Boosting', gb, {'sample_weight': balanced_sample_weight}),
    ('SVM', svm, {}),
]

models = {}
results = {}
for position, (model_name, estimator, fit_kwargs) in enumerate(candidates, start=1):
    print(f"{position}. {model_name}...", end=" ")
    estimator.fit(X_train_combined, y_train_encoded, **fit_kwargs)
    predictions = estimator.predict(X_test_combined)
    accuracy = accuracy_score(y_test_encoded, predictions)
    results[model_name] = {'accuracy': accuracy, 'predictions': predictions}
    models[model_name] = estimator
    print(f"{accuracy:.4f}")

# Per-model aliases, kept in case other cells refer to individual results.
y_pred_lr, acc_lr = results['Logistic Regression']['predictions'], results['Logistic Regression']['accuracy']
y_pred_rf, acc_rf = results['Random Forest']['predictions'], results['Random Forest']['accuracy']
y_pred_gb, acc_gb = results['Gradient Boosting']['predictions'], results['Gradient Boosting']['accuracy']
y_pred_svm, acc_svm = results['SVM']['predictions'], results['SVM']['accuracy']

print("\nMODEL PERFORMANCE SUMMARY:")
n_test = len(y_test_encoded)
for model_name in sorted(results, key=lambda name: results[name]['accuracy'], reverse=True):
    acc = results[model_name]['accuracy']
    # Count hits directly: int(acc * n_test) can truncate to one below the
    # true count when the product is not exactly representable as a float.
    n_correct = int((results[model_name]['predictions'] == y_test_encoded).sum())
    print(f"  {model_name}: {acc:.4f} ({n_correct}/{n_test})")

# NOTE(review): the winner is chosen on the same test set that is reported as
# its accuracy, so that figure is an optimistic estimate.
# Ties resolve to the model trained first (dict insertion order).
best_model_name = max(results, key=lambda name: results[name]['accuracy'])
best_model = models[best_model_name]
best_predictions = results[best_model_name]['predictions']
print(f"\nBEST MODEL: {best_model_name} ({results[best_model_name]['accuracy']:.4f})")

TRAINING MODELS WITH BALANCED CLASS WEIGHTS

1. Logistic Regression... 1.0000
2. Random Forest... 1.0000
3. Gradient Boosting... 1.0000
4. SVM... 1.0000

MODEL PERFORMANCE SUMMARY:
  Logistic Regression: 1.0000 (100/100)
  Random Forest: 1.0000 (100/100)
  Gradient Boosting: 1.0000 (100/100)
  SVM: 1.0000 (100/100)

BEST MODEL: Logistic Regression (1.0000)


In [5]:
class SentimentPredictor:
    """Apply a fitted sentiment model to new feedback records.

    Bundles the fitted classifier with the fitted TF-IDF vectorizer, the label
    encoder and the list of one-hot ``Evaluation_Result`` columns, so that new
    records are turned into features in the same column order: TF-IDF columns
    first, then the one-hot columns.

    Args:
        model: Fitted classifier exposing ``predict``.
        tfidf_vectorizer: Fitted vectorizer exposing ``transform``.
        label_encoder: Fitted encoder exposing ``inverse_transform``.
        cat_encoder_cols: Ordered one-hot column names used at training time.
    """

    def __init__(self, model, tfidf_vectorizer, label_encoder, cat_encoder_cols):
        self.model = model
        self.tfidf = tfidf_vectorizer
        self.le = label_encoder
        self.cat_cols = list(cat_encoder_cols)

    @staticmethod
    def _clean_text(value):
        """Return ``value`` as text, mapping missing scalars (None/NaN/NA) to ''."""
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ''
        return str(value)

    def _build_features(self, texts, eval_results):
        """Build the feature matrix for parallel lists of texts and evaluation results."""
        X_tfidf = self.tfidf.transform(pd.Series(texts)).toarray()

        X_cat = pd.DataFrame({'Evaluation_Result': list(eval_results)})
        # Align to the training-time columns: categories absent here become
        # zero columns and unseen categories are dropped. dtype=float keeps
        # the result numeric instead of a bool/int mix (object array).
        X_cat_encoded = (
            pd.get_dummies(X_cat, drop_first=False, dtype=float)
            .reindex(columns=self.cat_cols, fill_value=0.0)
        )
        return np.hstack([X_tfidf, X_cat_encoded.values])

    def _prepare_features(self, skill_fb1, skill_fb2, skill_fb3, overall_fb, eval_result):
        """Return the 1 x n_features matrix for a single record.

        Missing feedback pieces count as empty strings, matching the
        ``fillna('')`` used when the training text was assembled, rather than
        leaking the literal words "nan"/"None" into the text.
        """
        combined_text = ' '.join(
            self._clean_text(piece) for piece in (skill_fb1, skill_fb2, skill_fb3, overall_fb)
        )
        return self._build_features([combined_text], [eval_result])

    def predict_single(self, skill_feedback_1, skill_feedback_2, skill_feedback_3,
                      overall_feedback, evaluation_result):
        """Predict the sentiment label for one record.

        Returns:
            The decoded label, as produced by the label encoder.
        """
        X = self._prepare_features(skill_feedback_1, skill_feedback_2, skill_feedback_3,
                                   overall_feedback, evaluation_result)
        pred_encoded = self.model.predict(X)[0]
        return self.le.inverse_transform([pred_encoded])[0]

    def predict_batch(self, dataframe, skill_fb1_col='Skill_Feedback_1',
                     skill_fb2_col='Skill_Feedback_2', skill_fb3_col='Skill_Feedback_3',
                     overall_fb_col='Overall_Feedback', eval_result_col='Evaluation_Result'):
        """Predict sentiment for every row of ``dataframe``.

        All rows are vectorised and scored in one pass rather than one
        transform/predict call per row.

        Args:
            dataframe: Records to score; it is not modified.
            skill_fb1_col, skill_fb2_col, skill_fb3_col, overall_fb_col:
                Names of the feedback text columns.
            eval_result_col: Name of the evaluation result column.

        Returns:
            A copy of ``dataframe`` with a ``Predicted_Sentiment`` column.

        Raises:
            KeyError: If a named column is missing from a non-empty frame.
        """
        result_df = dataframe.copy()
        if len(dataframe) == 0:
            # Nothing to score; the model is not called on a 0-row matrix.
            result_df['Predicted_Sentiment'] = []
            return result_df

        text_columns = [skill_fb1_col, skill_fb2_col, skill_fb3_col, overall_fb_col]
        texts = [
            ' '.join(self._clean_text(piece) for piece in pieces)
            for pieces in dataframe[text_columns].itertuples(index=False, name=None)
        ]
        X = self._build_features(texts, dataframe[eval_result_col].tolist())
        labels = self.le.inverse_transform(self.model.predict(X))

        # Assign positionally (plain list) so the frame's index is irrelevant.
        result_df['Predicted_Sentiment'] = list(labels)
        return result_df

    def predict_from_json(self, filepath, output_filepath=None):
        """Score records stored in a JSON file.

        The file holds either one JSON object or a list of objects whose keys
        are the default column names of ``predict_batch``.

        Args:
            filepath: Path of the JSON file to read (decoded as UTF-8).
            output_filepath: Optional path; when given, predictions are also
                written there as CSV without the index.

        Returns:
            DataFrame of the records with a ``Predicted_Sentiment`` column.
        """
        # JSON interchange is UTF-8; do not depend on the platform default.
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if isinstance(data, dict):
            data = [data]

        df = pd.DataFrame(data)
        predictions_df = self.predict_batch(df)

        if output_filepath:
            predictions_df.to_csv(output_filepath, index=False)
            print(f"Predictions saved: {output_filepath}")

        return predictions_df

# Wire the winning model together with the fitted preprocessing objects.
predictor = SentimentPredictor(
    best_model,
    tfidf,
    le,
    X_train_cat_encoded.columns.tolist(),
)

# One status block: which model was wrapped and its accuracy from `results`.
print(
    "SentimentPredictor initialized",
    f"Model: {best_model_name}",
    f"Accuracy: {results[best_model_name]['accuracy']:.2%}",
    sep="\n",
)

SentimentPredictor initialized
Model: Logistic Regression
Accuracy: 100.00%


In [6]:
print("DETAILED TEST SET PREDICTIONS & EVALUATION\n")

# Test rows plus the best model's decoded prediction, the true label and a
# per-row hit flag.
predictions_df = test_df.assign(
    Predicted_Sentiment=le.inverse_transform(best_predictions),
    True_Sentiment=y_test,
)
predictions_df['Correct'] = predictions_df['Predicted_Sentiment'].eq(predictions_df['True_Sentiment'])

total_records = len(predictions_df)
total_correct = predictions_df['Correct'].sum()
overall_accuracy = total_correct / total_records
print(f"OVERALL ACCURACY: {overall_accuracy:.2%} ({total_correct}/{total_records})\n")

print("ACCURACY BY SENTIMENT CLASS:")
for sentiment in le.classes_:
    # Hit flags of the rows whose true label is this class.
    class_hits = predictions_df.loc[predictions_df['True_Sentiment'] == sentiment, 'Correct']
    total = len(class_hits)
    correct = class_hits.sum()
    acc = correct / total if total > 0 else 0
    print(f"  {sentiment.upper()}: {correct}/{total} ({acc:.2%})")

predictions_df.to_csv('model_predictions.csv', index=False)
print("\nPredictions saved: model_predictions.csv")

DETAILED TEST SET PREDICTIONS & EVALUATION

OVERALL ACCURACY: 100.00% (100/100)

ACCURACY BY SENTIMENT CLASS:
  NEGATIVE: 21/21 (100.00%)
  NEUTRAL: 34/34 (100.00%)
  POSITIVE: 45/45 (100.00%)

Predictions saved: model_predictions.csv


In [7]:
print("GENERATING FINAL FEEDBACK BASED ON PREDICTED SENTIMENTS\n")

def generate_concise_feedback(predicted_sentiment):
    """Return the canned one-line feedback for a predicted sentiment label.

    'positive' and 'neutral' each have their own message; any other value
    receives the "requires improvement" message.
    """
    advancement = "Ready for advancement. Demonstrates strong capabilities and readiness for growth."
    conditional = "Conditional progression. Shows promise but requires targeted development in identified areas."
    improvement = "Requires improvement. Needs additional support and training with mentorship."

    if predicted_sentiment == 'positive':
        return advancement
    if predicted_sentiment == 'neutral':
        return conditional
    return improvement

# --- Attach canned feedback to each test record and export -------------------
FINAL_CSV_PATH = 'final_feedback_with_all_columns.csv'
N_SAMPLE_ROWS = 5

predictions_df = test_df.copy()
predictions_df['Predicted_Sentiment'] = le.inverse_transform(best_predictions)
predictions_df['Final_Feedback'] = predictions_df['Predicted_Sentiment'].apply(generate_concise_feedback)

# NOTE(review): this prints associate names into the notebook output; clear
# the output before sharing if the records describe real people.
print("Sample Final Feedbacks:\n")
# head() shows at most N_SAMPLE_ROWS rows, so a test set smaller than that
# no longer raises IndexError.
sample_rows = predictions_df[['Associate_Name', 'Predicted_Sentiment', 'Final_Feedback']].head(N_SAMPLE_ROWS)
for position, (associate, label, feedback) in enumerate(sample_rows.itertuples(index=False, name=None), start=1):
    print(f"{position}. {associate} ({label}): {feedback}")

# Original input columns, the true label, the prediction and the feedback;
# the helper `combined_feedback` column is left out.
final_df = predictions_df[[
    'Employee_ID', 'Associate_Name', 'Department', 'Evaluation_Result',
    'Skill_Feedback_1', 'Skill_Feedback_2', 'Skill_Feedback_3', 'Overall_Feedback',
    'Sentiment', 'Predicted_Sentiment', 'Final_Feedback'
]]

final_df.to_csv(FINAL_CSV_PATH, index=False)
print(f"\nFinal CSV saved: {FINAL_CSV_PATH}")
print(f"Total records: {len(final_df)}")
print(f"Columns: {list(final_df.columns)}")

GENERATING FINAL FEEDBACK BASED ON PREDICTED SENTIMENTS

Sample Final Feedbacks:

1. Rohan Chopra (positive): Ready for advancement. Demonstrates strong capabilities and readiness for growth.
2. Kavya Reddy (neutral): Conditional progression. Shows promise but requires targeted development in identified areas.
3. Kavya Desai (positive): Ready for advancement. Demonstrates strong capabilities and readiness for growth.
4. Shreya Sharma (negative): Requires improvement. Needs additional support and training with mentorship.
5. Shreya Singh (negative): Requires improvement. Needs additional support and training with mentorship.

Final CSV saved: final_feedback_with_all_columns.csv
Total records: 100
Columns: ['Employee_ID', 'Associate_Name', 'Department', 'Evaluation_Result', 'Skill_Feedback_1', 'Skill_Feedback_2', 'Skill_Feedback_3', 'Overall_Feedback', 'Sentiment', 'Predicted_Sentiment', 'Final_Feedback']
