In [9]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

In [None]:
test_df = pd.read_csv('../datasets/Train_Test_data/NLP_features_Test.csv')

# Logistic regression model trained on sentiment score

In [14]:
X = np.array(test_df['Sentiment_Score'].values)
X = X.reshape(-1, 1)

# Replace NaNs with zero
X = np.nan_to_num(X, nan=0.0)

# Labels
y = test_df['Emotion']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Predict
predictions = model.predict(X_test)

# Evaluation
report = classification_report(y_test, predictions)

# Output metrics
print(report)


              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        16
        fear       0.00      0.00      0.00         5
   happiness       0.31      0.41      0.35        44
     neutral       0.34      0.61      0.44        49
     sadness       0.00      0.00      0.00         3
    surprise       0.00      0.00      0.00        28

    accuracy                           0.33       145
   macro avg       0.11      0.17      0.13       145
weighted avg       0.21      0.33      0.26       145



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


---

# Second attempt at logistic regression model trained on sentiment score (with class imbalance fix)

In [19]:
from collections import Counter
from imblearn.over_sampling import RandomOverSampler

# Drop rows with missing values
df = test_df[['Sentiment_Score', 'Emotion']].dropna()

# Prepare features and labels
X = df['Sentiment_Score'].values.reshape(-1, 1)
y = df['Emotion']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check class distribution in training data
print("Class distribution in y_train before oversampling:", Counter(y_train))

# Apply Random Over Sampling
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Check class distribution after oversampling
print("Class distribution in y_train after oversampling:", Counter(y_train_resampled))

# Train logistic regression on resampled data
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_resampled, y_train_resampled)

# Predict on original test data
predictions = model.predict(X_test)

# Evaluate model
print(classification_report(y_test, predictions))


Class distribution in y_train before oversampling: Counter({'happiness': 194, 'neutral': 192, 'surprise': 113, 'anger': 32, 'sadness': 30, 'fear': 13, 'disgust': 2})
Class distribution in y_train after oversampling: Counter({'anger': 194, 'surprise': 194, 'happiness': 194, 'neutral': 194, 'fear': 194, 'sadness': 194, 'disgust': 194})
              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        14
     disgust       0.00      0.00      0.00         0
        fear       0.00      0.00      0.00         4
   happiness       0.40      0.43      0.42        49
     neutral       0.36      0.59      0.45        44
     sadness       0.00      0.00      0.00         0
    surprise       0.50      0.03      0.06        34

    accuracy                           0.33       145
   macro avg       0.18      0.15      0.13       145
weighted avg       0.36      0.33      0.29       145



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
