In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os
from google.colab import drive
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [2]:
# Mount Google Drive at /content/drive so the project files are reachable.
# Colab-only: `drive` is imported from google.colab, so this cell will not run elsewhere.
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
# Project folder on the mounted Drive that holds the CSV files read below.
# NOTE(review): this cell's execution count (13) is out of sequence with its
# neighbours, so it was re-run after later cells; confirm with Restart & Run All.
folder_path = '/content/drive/My Drive/nlp'

# Switch the working directory so later cells can open files by bare name
# (e.g. 'dataset.csv'). This changes process-wide state for the whole kernel.
os.chdir(folder_path)

# Displayed as the cell output: a quick check that the expected files are present.
os.listdir()

['glove.6B.100d.txt',
 'training.csv',
 'test.csv',
 'validation.csv',
 'dataset.csv',
 'glove.6B.100d.word2vec.txt',
 'fine-tune embeddings.ipynb',
 'torchmoji.ipynb',
 'DeepMoji.ipynb',
 'train.csv',
 'train1.csv',
 'test1.csv',
 'Untitled0.ipynb',
 '1_dataset_preparation.ipynb',
 'torchmoji_model',
 '2_task.ipynb',
 'roberta-finetuned',
 'task_1_to_7.ipynb',
 'ml_model_implementations.ipynb']

# Task 8

In [4]:
# Load the full dataset. The path is relative, so this relies on the working
# directory having been set by the os.chdir(folder_path) cell above.
df = pd.read_csv('dataset.csv')

In [5]:
# Quick sanity check of the loaded frame: size, schema and class balance.
print(f"Rows: {len(df)}")
print(f"Columns: {df.columns.tolist()}")
print(f"Label distribution:\n {df['label'].value_counts()}")

Rows: 20000
Columns: ['text', 'label']
Label distribution:
 label
joy         6761
sadness     5797
fear        2709
love        2373
anger       1641
surprise     719
Name: count, dtype: int64


In [6]:
# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class proportions the same in both partitions. The seed makes the split repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)
print(f"\nTrain size: {len(train_df)} Test size: {len(test_df)}")


Train size: 16000 Test size: 4000


In [7]:
def train_and_evaluate(X_train, y_train, X_test, y_test, vectorizer, vectorizer_name="vect"):
    """
    Fit a `vectorizer -> LinearSVC` pipeline on the training split, evaluate it
    on the test split, print the headline metrics and return them.

    Parameters
    ----------
    X_train, X_test : sequences of raw text documents fed to the vectorizer.
    y_train, y_test : sequences of labels aligned with X_train / X_test.
    vectorizer : an instantiated vectorizer (e.g. TfidfVectorizer(...)).
        It becomes the first pipeline step, so fitting the pipeline fits this
        same object.
    vectorizer_name : label used in the printed header and stored in the result.

    Returns
    -------
    dict with keys:
        'vectorizer'       -- the `vectorizer_name` label
        'accuracy'         -- test accuracy
        'macro_f1'         -- macro-averaged F1 on the test split
        'report'           -- text classification report (4 decimal places)
        'y_pred'           -- predictions for X_test
        'confusion_matrix' -- confusion matrix of y_test vs. y_pred
        'model'            -- the fitted pipeline, for reuse / error analysis
    """
    model = Pipeline([
        ('vect', vectorizer),
        ('svc', LinearSVC(max_iter=5000, random_state=42))
    ])

    # Fit on the training split only, then predict the held-out documents.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, digits=4)
    # Same report in structured form; used only to expose macro-F1 as a number
    # instead of leaving it buried in the formatted text.
    report_dict = classification_report(y_test, y_pred, output_dict=True)
    macro_f1 = report_dict['macro avg']['f1-score']
    cm = confusion_matrix(y_test, y_pred)

    print(f"=== Results ({vectorizer_name}) ===")
    print(f"Accuracy: {acc:.4f}\n")
    print("Classification report:\n", report)

    # Existing keys are unchanged; 'macro_f1' and 'model' are additions.
    return {
        'vectorizer': vectorizer_name,
        'accuracy': acc,
        'report': report,
        'y_pred': y_pred,
        'confusion_matrix': cm,
        'macro_f1': macro_f1,
        'model': model,
    }

### TF-IDF features + SVM (no stopword removal)

In [8]:
# Baseline run: TF-IDF with default settings (stopwords are kept) feeding a LinearSVC.

tfidf = TfidfVectorizer()
results_tfidf = train_and_evaluate(
    X_train=train_df['text'].astype(str).tolist(),
    y_train=train_df['label'].tolist(),
    X_test=test_df['text'].astype(str).tolist(),
    y_test=test_df['label'].tolist(),
    vectorizer=tfidf,
    vectorizer_name="TF-IDF (no stopwords)",
)

=== Results (TF-IDF (no stopwords)) ===
Accuracy: 0.8908

Classification report:
               precision    recall  f1-score   support

       anger     0.8070    0.7774    0.7919       328
        fear     0.8839    0.8708    0.8773       542
         joy     0.9006    0.9246    0.9124      1352
        love     0.8725    0.8211    0.8460       475
     sadness     0.9238    0.9413    0.9325      1159
    surprise     0.7836    0.7292    0.7554       144

    accuracy                         0.8908      4000
   macro avg     0.8619    0.8441    0.8526      4000
weighted avg     0.8898    0.8908    0.8901      4000



### TF-IDF with stopword removal + SVM

In [9]:
# Same setup as the TF-IDF baseline, but with English stopwords removed by the vectorizer.

tfidf_stop = TfidfVectorizer(stop_words='english')
results_tfidf_stop = train_and_evaluate(
    X_train=train_df['text'].astype(str).tolist(),
    y_train=train_df['label'].tolist(),
    X_test=test_df['text'].astype(str).tolist(),
    y_test=test_df['label'].tolist(),
    vectorizer=tfidf_stop,
    vectorizer_name="TF-IDF (stop_words='english')",
)

=== Results (TF-IDF (stop_words='english')) ===
Accuracy: 0.8825

Classification report:
               precision    recall  f1-score   support

       anger     0.7981    0.7713    0.7845       328
        fear     0.8684    0.8764    0.8724       542
         joy     0.8885    0.9135    0.9008      1352
        love     0.8775    0.8295    0.8528       475
     sadness     0.9159    0.9206    0.9182      1159
    surprise     0.8030    0.7361    0.7681       144

    accuracy                         0.8825      4000
   macro avg     0.8586    0.8412    0.8495      4000
weighted avg     0.8819    0.8825    0.8820      4000



### CountVectorizer (no stopword removal) + SVM

In [10]:
# Raw bag-of-words counts (stopwords kept) feeding a LinearSVC.

count_vect = CountVectorizer()
results_count = train_and_evaluate(
    X_train=train_df['text'].astype(str).tolist(),
    y_train=train_df['label'].tolist(),
    X_test=test_df['text'].astype(str).tolist(),
    y_test=test_df['label'].tolist(),
    vectorizer=count_vect,
    vectorizer_name="CountVectorizer (no stopwords)",
)

=== Results (CountVectorizer (no stopwords)) ===
Accuracy: 0.8848

Classification report:
               precision    recall  f1-score   support

       anger     0.7795    0.7866    0.7830       328
        fear     0.8629    0.8708    0.8669       542
         joy     0.9156    0.9068    0.9112      1352
        love     0.8445    0.8232    0.8337       475
     sadness     0.9210    0.9353    0.9281      1159
    surprise     0.7552    0.7500    0.7526       144

    accuracy                         0.8848      4000
   macro avg     0.8464    0.8454    0.8459      4000
weighted avg     0.8846    0.8848    0.8847      4000



### CountVectorizer with stopword removal + SVM

In [11]:
# Bag-of-words counts with English stopwords removed, for comparison against
# the TF-IDF run that also removes stopwords.

count_vect_stop = CountVectorizer(stop_words='english')
results_count_stop = train_and_evaluate(
    X_train=train_df['text'].astype(str).tolist(),
    y_train=train_df['label'].tolist(),
    X_test=test_df['text'].astype(str).tolist(),
    y_test=test_df['label'].tolist(),
    vectorizer=count_vect_stop,
    vectorizer_name="CountVectorizer (stop_words='english')",
)

=== Results (CountVectorizer (stop_words='english')) ===
Accuracy: 0.8790

Classification report:
               precision    recall  f1-score   support

       anger     0.7751    0.7988    0.7868       328
        fear     0.8615    0.8838    0.8725       542
         joy     0.8979    0.9038    0.9008      1352
        love     0.8531    0.8189    0.8357       475
     sadness     0.9197    0.9094    0.9145      1159
    surprise     0.7692    0.7639    0.7666       144

    accuracy                         0.8790      4000
   macro avg     0.8461    0.8464    0.8461      4000
weighted avg     0.8793    0.8790    0.8790      4000



### Summarize results

In [12]:
# One row per experiment: the run's label and its test accuracy.
summary = pd.DataFrame([
    {'setting': run['vectorizer'], 'accuracy': run['accuracy']}
    for run in (results_tfidf, results_tfidf_stop, results_count, results_count_stop)
])
# Show the best-performing configuration first.
display(summary.sort_values(by='accuracy', ascending=False))


Unnamed: 0,setting,accuracy
0,TF-IDF (no stopwords),0.89075
2,CountVectorizer (no stopwords),0.88475
1,TF-IDF (stop_words='english'),0.8825
3,CountVectorizer (stop_words='english'),0.879


Among all models, TF-IDF with an SVM (without stopword removal) achieved the highest accuracy (≈89%). This suggests that removing stopwords does not always benefit emotion classification, because words on the stop list (e.g., “am,” “is,” “not”) can carry contextual information that matters for emotional meaning. The results indicate that a linear SVM on TF-IDF features is a strong baseline for text-based emotion detection tasks.

Across all four experiments, the SVM classifier predicted emotions well, with accuracy ranging from 87.9% to 89.1%. Among the four setups, the TF-IDF representation without stopword removal achieved the highest accuracy (89.08%) and the highest macro F1-score (0.85), suggesting that preserving stopwords provides additional contextual cues useful for emotion detection (e.g., “I am so happy”, “not feeling good”).

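When stopwords were removed, accuracy dropped slightly for both TF-IDF (89.08% → 88.25%) and CountVectorizer (88.48% → 87.90%), suggesting a minor loss of information that contributes to emotional tone. Within each stopword setting, CountVectorizer also scored slightly lower than TF-IDF, likely because TF-IDF down-weights common words and so emphasizes emotion-indicative ones. Note, however, that CountVectorizer without stopword removal (88.48%) still outperformed TF-IDF with stopword removal (88.25%), so in this experiment keeping stopwords mattered more than the choice of weighting scheme.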

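Overall, the comparison shows that, among the four setups, TF-IDF without stopword removal offers the most balanced and context-aware representation for emotion classification. This is consistent with the linguistic nature of emotional expressions, where small function words can meaningfully alter sentiment. The differences are small (just over one percentage point between the best and worst setup) and come from a single train/test split, so they should be read as indicative rather than conclusive.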