In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re, regex
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import make_scorer, classification_report, precision_score, recall_score, f1_score, accuracy_score, jaccard_score
from sklearn.model_selection import GridSearchCV
import os


In [2]:
train_df = pd.read_csv("../data/train.csv")
val_df   = pd.read_csv("../data/val.csv")
test_df  = pd.read_csv("../data/test.csv")

In [18]:
TARGET_EMOTIONS = ['joy','sadness','anger','fear','surprise','disgust','neutral','love']
X_train, y_train = train_df["clean_text"], train_df[TARGET_EMOTIONS]
X_val, y_val     = val_df["clean_text"], val_df[TARGET_EMOTIONS]
X_test, y_test   = test_df["clean_text"], test_df[TARGET_EMOTIONS]


In [None]:
print("Number of NaN values in training data:", X_train.isna().sum())
print("Number of NaN values in validation data:", X_val.isna().sum())
print("Number of NaN values in test data:", X_test.isna().sum())

mask_train = X_train.notna()
X_train = X_train[mask_train]
y_train = y_train[mask_train]

mask_val = X_val.notna()
X_val = X_val[mask_val]
y_val = y_val[mask_val]

mask_test = X_test.notna()
X_test = X_test[mask_test]
y_test = y_test[mask_test]

# Verify the NaN values are gone
print("\nAfter dropping NaN values:")
print("Number of NaN values in training data:", X_train.isna().sum())
print("Number of NaN values in validation data:", X_val.isna().sum())
print("Number of NaN values in test data:", X_test.isna().sum())

Number of NaN values in training data: 2
Number of NaN values in validation data: 2
Number of NaN values in test data: 1

After dropping NaN values:
Number of NaN values in training data: 0
Number of NaN values in validation data: 0
Number of NaN values in test data: 0


In [22]:
tfidf = TfidfVectorizer(
    max_features=30000,
    ngram_range=(1, 3),
    sublinear_tf=True,
    stop_words="english"
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf   = tfidf.transform(X_val)
X_test_tfidf  = tfidf.transform(X_test)

In [23]:
X_train_tfidf.shape, X_val_tfidf.shape, X_test_tfidf.shape

((31747, 30000), (6801, 30000), (6803, 30000))

In [24]:
def evaluate_model(clf, X, y, dataset_name="Dataset", get_classification_report=False):
    y_pred = clf.predict(X)

    print(f"{dataset_name} Report:")
    if get_classification_report:
        print("\nValidation Classification Report:")
        print(classification_report(y, y_pred, target_names=TARGET_EMOTIONS, digits=3))

    micro_p = precision_score(y, y_pred, average="micro")
    micro_r = recall_score(y, y_pred, average="micro")
    micro_f1 = f1_score(y, y_pred, average="micro")

    macro_p = precision_score(y, y_pred, average="macro")
    macro_r = recall_score(y, y_pred, average="macro")
    macro_f1 = f1_score(y, y_pred, average="macro")

    print(f"Micro Precision: {micro_p:.3f}, Micro Recall: {micro_r:.3f}, Micro F1: {micro_f1:.3f}")
    print(f"Macro Precision: {macro_p:.3f}, Macro Recall: {macro_r:.3f}, Macro F1: {macro_f1:.3f}")

    subset_acc = accuracy_score(y, y_pred)
    jaccard_acc = jaccard_score(y, y_pred, average="samples")

    print(f"Subset Accuracy (Exact Match): {subset_acc:.3f}")
    print(f"Jaccard Accuracy (Sample-based): {jaccard_acc:.3f}")
    return {
        "micro_precision": micro_p,
        "micro_recall": micro_r,
        "micro_f1": micro_f1,
        "macro_precision": macro_p,
        "macro_recall": macro_r,
        "macro_f1": macro_f1,
        "subset_accuracy": subset_acc,
        "jaccard_accuracy": jaccard_acc
    }

### Logistic Regression

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)


In [26]:
base_clf = LogisticRegression()
clf = OneVsRestClassifier(base_clf, n_jobs=-1)

In [27]:


param_grid = {
    "estimator__solver": ['liblinear', 'saga'],
    "estimator__C": [0.5, 1.0, 2.0],
    "estimator__penalty": ['l1', 'l2'],
    "estimator__class_weight": [None, 'balanced'],
    "estimator__max_iter": [500, 1000, 1500]
}
f1_micro = make_scorer(f1_score, average='micro')

grid = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring=f1_micro,
    cv=3,
    n_jobs=-1
)

grid.fit(X_train_tfidf, y_train)
print("\nGrid Search Complete.")
print("Best Params:", grid.best_params_)
print("Best Cross-Validated Micro-F1:", grid.best_score_)
best_model = grid.best_estimator_
evaluate_model(best_model, X_val_tfidf, y_val, dataset_name="Validation Set", get_classification_report=True)

best_result = evaluate_model(best_model, X_test_tfidf, y_test, dataset_name="Test Set")
result = {'Model' : 'Logistic Regression (One-vs-Rest)'}

result.update(best_result)
result_df = pd.DataFrame([result])
result_df.to_csv('../results/best_model_test_results.csv', index=False)

results = pd.DataFrame(grid.cv_results_)
os.makedirs("../results", exist_ok=True)
results.to_csv("../results/gridsearch_LR_results.csv", index=False)




Grid Search Complete.
Best Params: {'estimator__C': 2.0, 'estimator__class_weight': None, 'estimator__max_iter': 500, 'estimator__penalty': 'l1', 'estimator__solver': 'saga'}
Best Cross-Validated Micro-F1: 0.7029103571137912
Validation Set Report:

Validation Classification Report:
              precision    recall  f1-score   support

         joy      0.729     0.404     0.520       866
     sadness      0.698     0.381     0.493       662
       anger      0.644     0.351     0.454       866
        fear      0.753     0.407     0.529       307
    surprise      0.732     0.350     0.474       554
     disgust      0.613     0.280     0.384       600
     neutral      0.816     0.889     0.851      4750
        love      0.844     0.708     0.770       641

   micro avg      0.785     0.656     0.715      9246
   macro avg      0.729     0.471     0.559      9246
weighted avg      0.765     0.656     0.688      9246
 samples avg      0.788     0.723     0.730      9246

Micro Preci

### Naive Bayes

In [28]:
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=UserWarning)

In [29]:
clf = OneVsRestClassifier(MultinomialNB())

In [30]:
param_grid = {
    "estimator": [MultinomialNB()],  
    "estimator__alpha": [0.1, 0.5, 1.0, 2.0],        
    "estimator__fit_prior": [True, False]
}
f1_micro = make_scorer(f1_score, average='micro')
grid = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring=f1_micro,
    cv=3,
    n_jobs=-1,
    verbose=0
)
grid.fit(X_train_tfidf, y_train)
print("\nGrid Search Complete.")
print("Best Params:", grid.best_params_)
print("Best Cross-Validated Micro-F1:", grid.best_score_)

best_model = grid.best_estimator_
evaluate_model(best_model, X_val_tfidf, y_val, dataset_name="Validation Set", get_classification_report=True)
best_result = evaluate_model(best_model, X_test_tfidf, y_test, dataset_name="Test Set")
result = {'Model' : 'Multinomial Naive Bayes (One-vs-Rest)'}

result.update(best_result)
result_df = pd.read_csv('../results/best_model_test_results.csv') if os.path.exists('../results/best_model_test_results.csv') else pd.DataFrame()
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)
os.makedirs("../results", exist_ok=True)
result_df.to_csv("../results/best_model_test_results.csv", index=False)

results = pd.DataFrame(grid.cv_results_)
os.makedirs("../results", exist_ok=True)
results.to_csv("../results/gridsearch_NB_results.csv", index=False)


Grid Search Complete.
Best Params: {'estimator': MultinomialNB(), 'estimator__alpha': 0.5, 'estimator__fit_prior': True}
Best Cross-Validated Micro-F1: 0.6030919668041305
Validation Set Report:

Validation Classification Report:
              precision    recall  f1-score   support

         joy      0.869     0.099     0.178       866
     sadness      0.939     0.047     0.089       662
       anger      0.894     0.048     0.092       866
        fear      0.750     0.020     0.038       307
    surprise      0.917     0.020     0.039       554
     disgust      1.000     0.022     0.042       600
     neutral      0.746     0.960     0.839      4750
        love      0.912     0.097     0.175       641

   micro avg      0.752     0.521     0.615      9246
   macro avg      0.878     0.164     0.187      9246
weighted avg      0.823     0.521     0.481      9246
 samples avg      0.702     0.590     0.624      9246

Micro Precision: 0.752, Micro Recall: 0.521, Micro F1: 0.615
Macr

In [31]:
clf = OneVsRestClassifier(ComplementNB())
param_grid = {
    "estimator": [ComplementNB()],  
    "estimator__alpha": [0.1, 0.5, 1.0, 2.0],        
    "estimator__fit_prior": [True, False]
}
f1_micro = make_scorer(f1_score, average='micro')
grid = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring=f1_micro,
    cv=3,
    n_jobs=-1,
    verbose=0
)
grid.fit(X_train_tfidf, y_train)
print("\nGrid Search Complete.")
print("Best Params:", grid.best_params_)
print("Best Cross-Validated Micro-F1:", grid.best_score_)

best_model = grid.best_estimator_
evaluate_model(best_model, X_val_tfidf, y_val, dataset_name="Validation Set", get_classification_report=True)
best_result = evaluate_model(best_model, X_test_tfidf, y_test, dataset_name="Test Set")
result = {'Model' : 'Complement Naive Bayes (One-vs-Rest)'}

result.update(best_result)
result_df = pd.read_csv('../results/best_model_test_results.csv')
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)
result_df.to_csv("../results/best_model_test_results.csv", index=False)

results = pd.DataFrame(grid.cv_results_)
os.makedirs("../results", exist_ok=True)
results.to_csv("../results/gridsearch_NB_results.csv", index=False)


Grid Search Complete.
Best Params: {'estimator': ComplementNB(), 'estimator__alpha': 2.0, 'estimator__fit_prior': True}
Best Cross-Validated Micro-F1: 0.5809530897257161
Validation Set Report:

Validation Classification Report:
              precision    recall  f1-score   support

         joy      0.591     0.225     0.326       866
     sadness      0.483     0.169     0.251       662
       anger      0.571     0.204     0.301       866
        fear      0.270     0.130     0.176       307
    surprise      0.388     0.137     0.203       554
     disgust      0.403     0.128     0.195       600
     neutral      0.776     0.879     0.824      4750
        love      0.653     0.329     0.438       641

   micro avg      0.712     0.547     0.619      9246
   macro avg      0.517     0.275     0.339      9246
weighted avg      0.646     0.547     0.561      9246
 samples avg      0.697     0.612     0.630      9246

Micro Precision: 0.712, Micro Recall: 0.547, Micro F1: 0.619
Macro

### Linear SVM

In [32]:
from sklearn.svm import LinearSVC, SVC

In [33]:
clf = OneVsRestClassifier(LinearSVC())

In [34]:
param_grid = {
    "estimator__C": [0.1, 0.5, 1.0, 2.0, 5.0],
    "estimator__class_weight": [None, "balanced"],
    "estimator__loss": ["hinge", "squared_hinge"],
    "estimator__max_iter": [500, 1000, 1500, 2000]
}
f1_micro = make_scorer(f1_score, average="micro")

grid = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring=f1_micro,
    cv=3,
    n_jobs=-1,
    verbose=0
)
print("Starting Grid Search for Linear SVM...")
grid.fit(X_train_tfidf, y_train)
print("\nGrid Search Complete.")
print("Best Params:", grid.best_params_)
print("Best Cross-Validated Micro-F1:", grid.best_score_)

best_model = grid.best_estimator_
evaluate_model(best_model, X_val_tfidf, y_val, dataset_name="Validation Set", get_classification_report=True)
best_result = evaluate_model(best_model, X_test_tfidf, y_test, dataset_name="Test Set")
result = {'Model' : 'Linear SVM (One-vs-Rest)'}

result.update(best_result)
result_df = pd.read_csv('../results/best_model_test_results.csv')
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)
result_df.to_csv("../results/best_model_test_results.csv", index=False)

results = pd.DataFrame(grid.cv_results_)
os.makedirs("../results", exist_ok=True)
results.to_csv("../results/gridsearch_Linear_SVC_results.csv", index=False)

Starting Grid Search for Linear SVM...





Grid Search Complete.
Best Params: {'estimator__C': 1.0, 'estimator__class_weight': None, 'estimator__loss': 'hinge', 'estimator__max_iter': 1000}
Best Cross-Validated Micro-F1: 0.7020444395157158
Validation Set Report:

Validation Classification Report:
              precision    recall  f1-score   support

         joy      0.816     0.359     0.499       866
     sadness      0.775     0.343     0.475       662
       anger      0.715     0.311     0.433       866
        fear      0.793     0.388     0.521       307
    surprise      0.768     0.341     0.472       554
     disgust      0.688     0.232     0.347       600
     neutral      0.812     0.902     0.854      4750
        love      0.863     0.697     0.771       641

   micro avg      0.804     0.647     0.717      9246
   macro avg      0.779     0.447     0.547      9246
weighted avg      0.793     0.647     0.682      9246
 samples avg      0.800     0.716     0.733      9246

Micro Precision: 0.804, Micro Recall: 0

In [35]:
clf = OneVsRestClassifier(SVC())

In [36]:
param_grid = {
    "estimator__C": [0.1, 0.5, 1.0, 2.0, 5.0],
    "estimator__class_weight": [None, "balanced"],
    "estimator__loss": ["hinge", "squared_hinge"],
    "estimator__max_iter": [1000, 1500]
}
f1_micro = make_scorer(f1_score, average="micro")

grid = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring=f1_micro,
    cv=3,
    n_jobs=-1,
    verbose=0
)
print("Starting Grid Search for Linear SVM...")
grid.fit(X_train_tfidf, y_train)
print("\nGrid Search Complete.")
print("Best Params:", grid.best_params_)
print("Best Cross-Validated Micro-F1:", grid.best_score_)

best_model = grid.best_estimator_
evaluate_model(best_model, X_val_tfidf, y_val, dataset_name="Validation Set", get_classification_report=True)
best_result = evaluate_model(best_model, X_test_tfidf, y_test, dataset_name="Test Set")
result = {'Model' : 'Linear SVM (One-vs-Rest)'}

result.update(best_result)
result_df = pd.read_csv('../results/best_model_test_results.csv')
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)
result_df.to_csv("../results/best_model_test_results.csv", index=False)

results = pd.DataFrame(grid.cv_results_)
os.makedirs("../results", exist_ok=True)
results.to_csv("../results/gridsearch_NB_results.csv", index=False)

Starting Grid Search for Linear SVM...


ValueError: Invalid parameter 'loss' for estimator SVC(C=0.1). Valid parameters are: ['C', 'break_ties', 'cache_size', 'class_weight', 'coef0', 'decision_function_shape', 'degree', 'gamma', 'kernel', 'max_iter', 'probability', 'random_state', 'shrinking', 'tol', 'verbose'].

### Ridge Classifier

In [None]:
from sklearn.linear_model import RidgeClassifier

In [None]:
clf = OneVsRestClassifier(RidgeClassifier())

In [None]:
param_grid = {
    "estimator__alpha": [0.1, 0.5, 1.0, 2.0, 5.0],
    "estimator__tol": [1e-3, 1e-4],
    "estimator__solver": ["auto", "sparse_cg", "lsqr"]
}
f1_micro = make_scorer(f1_score, average="micro")
grid = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring=f1_micro,
    cv=3,
    n_jobs=-1,
    verbose=0
)

print("Starting Grid Search for Ridge Classifier...")
grid.fit(X_train_tfidf, y_train)
print("\nGrid Search Complete.")
print("Best Params:", grid.best_params_)
print("Best Cross-Validated Micro-F1:", grid.best_score_)

best_model = grid.best_estimator_
evaluate_model(best_model, X_val_tfidf, y_val, dataset_name="Validation Set", get_classification_report=True)
best_result = evaluate_model(best_model, X_test_tfidf, y_test, dataset_name="Test Set")
result = {'Model' : 'Ridge Classifier (One-vs-Rest)'}

result.update(best_result)
result_df = pd.read_csv('../results/best_model_test_results.csv')
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)
result_df.to_csv("../results/best_model_test_results.csv", index=False)

results = pd.DataFrame(grid.cv_results_)
os.makedirs("../results", exist_ok=True)
results.to_csv("../results/gridsearch_Ridge_Classifier_results.csv", index=False)

Starting Grid Search for Ridge Classifier...

Grid Search Complete.
Best Params: {'estimator__alpha': 1.0, 'estimator__solver': 'lsqr', 'estimator__tol': 0.001}
Best Cross-Validated Micro-F1: 0.6747333570269541
Validation Set Report:

Validation Classification Report:
              precision    recall  f1-score   support

         joy      0.767     0.334     0.465       866
     sadness      0.760     0.325     0.455       662
       anger      0.679     0.289     0.405       866
        fear      0.790     0.319     0.455       307
    surprise      0.778     0.298     0.431       554
     disgust      0.673     0.223     0.335       600
     neutral      0.804     0.890     0.845      4752
        love      0.862     0.555     0.676       641

   micro avg      0.793     0.620     0.696      9248
   macro avg      0.764     0.404     0.508      9248
weighted avg      0.779     0.620     0.658      9248
 samples avg      0.766     0.688     0.703      9248

Micro Precision: 0.793, Mi

### SGD Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

In [None]:
clf = OneVsRestClassifier(SGDClassifier(random_state=42))

In [None]:
param_grid = {
    "estimator__loss": ["hinge", "log_loss", "modified_huber"],
    "estimator__penalty": ["l2", "l1", "elasticnet"],
    "estimator__alpha": [1e-5, 1e-4, 1e-3],
    "estimator__max_iter": [1000, 1500],
    "estimator__tol": [1e-3, 1e-4],
    "estimator__class_weight": [None, "balanced"]
}
f1_micro = make_scorer(f1_score, average="micro")

In [None]:
grid = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring=f1_micro,
    cv=3,
    n_jobs=-1,
    verbose=0
)
print("Starting Grid Search for SGD Classifier...")
grid.fit(X_train_tfidf, y_train)
print("\nGrid Search Complete.")
print("Best Params:", grid.best_params_)
print("Best Cross-Validated Micro-F1:", grid.best_score_)

best_model = grid.best_estimator_
evaluate_model(best_model, X_val_tfidf, y_val, dataset_name="Validation Set", get_classification_report=True)
best_result = evaluate_model(best_model, X_test_tfidf, y_test, dataset_name="Test Set")
result = {'Model' : 'SGD Classifier (One-vs-Rest)'}

result.update(best_result)
result_df = pd.read_csv('../results/best_model_test_results.csv')
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)
result_df.to_csv("../results/best_model_test_results.csv", index=False)

results = pd.DataFrame(grid.cv_results_)
os.makedirs("../results", exist_ok=True)
results.to_csv("../results/gridsearch_SGD_Classifier_results.csv", index=False)

Starting Grid Search for SGD Classifier...

Grid Search Complete.
Best Params: {'estimator__alpha': 0.0001, 'estimator__class_weight': None, 'estimator__loss': 'hinge', 'estimator__max_iter': 1000, 'estimator__penalty': 'l1', 'estimator__tol': 0.0001}
Best Cross-Validated Micro-F1: 0.7034871849957444
Validation Set Report:

Validation Classification Report:
              precision    recall  f1-score   support

         joy      0.825     0.311     0.451       866
     sadness      0.795     0.293     0.428       662
       anger      0.713     0.269     0.391       866
        fear      0.777     0.375     0.505       307
    surprise      0.811     0.318     0.457       554
     disgust      0.702     0.220     0.335       600
     neutral      0.805     0.916     0.857      4752
        love      0.870     0.658     0.750       641

   micro avg      0.803     0.637     0.711      9248
   macro avg      0.787     0.420     0.522      9248
weighted avg      0.795     0.637     0.668 

Ensemble (Voting)

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

In [None]:
lr = LogisticRegression(C=2.0, solver='liblinear', max_iter=1000)
nb = MultinomialNB(alpha=0.5)
lr2 = LogisticRegression(C=1.0, solver='liblinear', max_iter=1000)

voting_clf = OneVsRestClassifier(
    VotingClassifier(
        estimators=[('lr', lr), ('nb', nb), ('lr2', lr2)],
        voting='soft'
    )
)


In [None]:
voting_clf.fit(X_train_tfidf, y_train)
best_model = voting_clf
evaluate_model(best_model, X_val_tfidf, y_val, dataset_name="Validation Set", get_classification_report=True)
best_result = evaluate_model(best_model, X_test_tfidf, y_test, dataset_name="Test Set")
result = {'Model' : 'Voting (lr, nb, svm) (One-vs-Rest)'}

result.update(best_result)
result_df = pd.read_csv('../results/best_model_test_results.csv')
result_df = pd.concat([result_df, pd.DataFrame([result])], ignore_index=True)
result_df.to_csv("../results/best_model_test_results.csv", index=False)

Validation Set Report:

Validation Classification Report:
              precision    recall  f1-score   support

         joy      0.833     0.237     0.369       866
     sadness      0.837     0.193     0.314       662
       anger      0.832     0.194     0.315       866
        fear      0.717     0.107     0.187       307
    surprise      0.811     0.155     0.261       554
     disgust      0.753     0.112     0.194       600
     neutral      0.785     0.936     0.854      4752
        love      0.906     0.538     0.675       641

   micro avg      0.796     0.593     0.679      9248
   macro avg      0.809     0.309     0.396      9248
weighted avg      0.803     0.593     0.607      9248
 samples avg      0.767     0.665     0.692      9248

Micro Precision: 0.796, Micro Recall: 0.593, Micro F1: 0.679
Macro Precision: 0.809, Macro Recall: 0.309, Macro F1: 0.396
Subset Accuracy (Exact Match): 0.525
Jaccard Accuracy (Sample-based): 0.648
Test Set Report:
Micro Precision: 0.790