In [27]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder

In [21]:
# Location of the pre-computed NLP feature table (relative to this notebook).
NLP_FEATURES_CSV = '../task4/NLP_features 2.csv'
test_df = pd.read_csv(NLP_FEATURES_CSV)

In [22]:
# Preview the first five rows to sanity-check the loaded columns.
test_df.head(n=5)

Unnamed: 0,Sentence,Emotion,POS_tags,TF-IDF,Pretrained_Embeddings,Custom_Embeddings,Sentiment_Score
0,"Maar zie het als een compliment, want eigenlij...",happiness,"[('Maar', 'CCONJ'), ('zie', 'VERB'), ('het', '...",0.003596,[ 1.26762390e-02 1.73162282e-01 7.43213966e-...,[-0.0084755 0.26464984 -0.05773885 0.017975...,0.2023
1,zien als de grootste bedreiging voor hun relatie.,fear,"[('zien', 'VERB'), ('als', 'SCONJ'), ('de', 'D...",0.002601,[-2.97851562e-02 1.28133133e-01 4.68241386e-...,[-0.01170979 0.2735433 -0.04595882 0.015996...,0.2942
2,"OkÃ©, hier zijn ze, de koppels!",happiness,"[('OkÃ', 'PROPN'), ('©', 'PROPN'), (',', 'PUNC...",0.002231,[ 0.02016602 0.1265625 0.09594727 -0.085205...,[-0.02661045 0.25985995 -0.05596739 0.018820...,-0.5719
3,"De koppels zien elkaar een laatste keer terug,...",sadness,"[('De', 'DET'), ('koppels', 'NOUN'), ('zien', ...",0.003266,[ 0.01642761 0.07313013 0.01586151 -0.002490...,[-0.01235101 0.2364183 -0.04538872 0.017792...,0.6486
4,"Dat is super zenuwachtig, want je weet niet ho...",fear,"[('Dat', 'PRON'), ('is', 'AUX'), ('super', 'NO...",0.002982,[ 1.02195047e-01 1.92493781e-01 2.88835000e-...,[-0.01652984 0.24785426 -0.05721172 0.017966...,0.6369


---

# Logistic Regression Model: 1st attempt

In [23]:
# Baseline: logistic regression on the pretrained sentence embeddings.

# Each embedding is stored in the CSV as the text of a printed NumPy vector
# ("[ 0.01 0.17 ... ]"); strip the brackets and parse the whitespace-separated
# floats back into an array.
# NOTE(review): assumes every row has the same embedding length so that the
# result is a regular 2-D matrix -- confirm against the data.
X = test_df['Pretrained_Embeddings'].apply(lambda x: np.fromstring(x.strip('[]'), sep=' ')).tolist()
X = np.array(X)

# Labels (emotion names as strings)
y = test_df['Emotion']

# Train/test split: 80/20 with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model (max_iter raised to 1000 to give the solver room to converge)
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Predict
predictions = model.predict(X_test)

# Evaluation
# Some classes are never predicted by this baseline, which makes their
# precision undefined. zero_division=0 reports those scores as 0.0 (the same
# numbers as the default) without emitting an UndefinedMetricWarning each time.
report = classification_report(y_test, predictions, zero_division=0)

# Output metrics
print(report)


              precision    recall  f1-score   support

       anger       0.00      0.00      0.00         8
        fear       0.00      0.00      0.00         4
   happiness       0.46      0.55      0.50        55
     neutral       0.47      0.65      0.54        48
     sadness       0.00      0.00      0.00         4
    surprise       0.28      0.16      0.20        31

    accuracy                           0.44       150
   macro avg       0.20      0.23      0.21       150
weighted avg       0.38      0.44      0.40       150



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


---

# Logistic Regression model: second attempt (applying SMOTE for class imbalance)

As seen in the classification report, the model fails to predict any class other than neutral, happiness and surprise. This is because of class imbalance. For this iteration I applied SMOTE to address the class imbalance, so that the new model is able to predict all classes instead of just three.

In [24]:
# Oversample the minority emotions with SMOTE. Only the training split is
# resampled; the test split is left as-is so the evaluation stays honest.
smote = SMOTE(k_neighbors=1, random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

# Refit the same logistic-regression configuration on the balanced data.
model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_sm, y_train_sm)

# Score on the untouched test split.
predictions = model.predict(X_test)

# Per-class metrics after SMOTE
report = classification_report(y_test, predictions)
print(report)




              precision    recall  f1-score   support

       anger       0.14      0.50      0.22         8
        fear       0.09      0.25      0.13         4
   happiness       0.59      0.31      0.40        55
     neutral       0.58      0.44      0.50        48
     sadness       0.09      0.50      0.15         4
    surprise       0.13      0.10      0.11        31

    accuracy                           0.32       150
   macro avg       0.27      0.35      0.25       150
weighted avg       0.44      0.32      0.35       150



With SMOTE applied, all classes are being predicted. The accuracy and F1 score have dropped, but that is because the model is now actually trying to predict the classes that are less represented in the dataset. This makes the model less accurate because it has 6 classes to choose from instead of just 3. It is still not great at correctly predicting those minority classes, but it is better than random guessing: the F1 score for uniform random guessing over 6 classes should be roughly 0.17 (1/6), while this model reaches a weighted F1 of 0.35 (macro F1 of 0.25).

---

In [None]:
# Same SMOTE + logistic-regression pipeline, but with the emotion labels
# integer-encoded before training and decoded again for the report.

# Parse the stringified embedding vectors back into a float matrix.
X = np.array([
    np.fromstring(raw.strip('[]'), sep=' ')
    for raw in test_df['Pretrained_Embeddings']
])

# Target column (emotion names)
y = test_df['Emotion']

# Map emotion names to integer ids
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# Balance the classes in the training split only
smote = SMOTE(k_neighbors=1, random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

# Fit on the resampled training data
model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_sm, y_train_sm)

# Predict on the untouched test split
predictions = model.predict(X_test)

# Turn integer ids back into emotion names so the report is readable
predictions_decoded = le.inverse_transform(predictions)
y_test_decoded = le.inverse_transform(y_test)

# Per-class metrics
report = classification_report(y_test_decoded, predictions_decoded)
print(report)


              precision    recall  f1-score   support

       anger       0.14      0.50      0.22         8
        fear       0.09      0.25      0.13         4
   happiness       0.59      0.31      0.40        55
     neutral       0.58      0.44      0.50        48
     sadness       0.09      0.50      0.15         4
    surprise       0.13      0.10      0.11        31

    accuracy                           0.32       150
   macro avg       0.27      0.35      0.25       150
weighted avg       0.44      0.32      0.35       150

