In [None]:
import os
import re
import nltk
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import torch
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn import set_config

# Download the NLTK resources this notebook relies on: the stop-word lists
# (read via nltk.corpus.stopwords in processText) and the Punkt tokenizer
# data ('punkt' and 'punkt_tab'; presumably what nltk.word_tokenize needs).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Report the CUDA environment visible to PyTorch.
# The device name is only queried when a GPU is actually present:
# torch.cuda.get_device_name(0) raises on a CPU-only machine, which would
# abort a "Restart Kernel -> Run All" before any data is loaded.
cuda_available = torch.cuda.is_available()
print(cuda_available)
print(torch.cuda.device_count())
if cuda_available:
    print(torch.cuda.get_device_name(0))
else:
    print('No CUDA device available')

In [None]:
# Build the path <parent of the working directory>/data/goemotions.csv,
# read it into a DataFrame and print the first rows as a quick sanity check.
data = 'data'
cwd = os.getcwd()
parent = os.path.abspath(os.path.join(cwd, '..'))
path = os.path.join(parent, data, 'goemotions.csv')

df = pd.read_csv(path)
preview = df.head()
print(preview)

                                                text       id  \
0                                    That game hurt.  eew5j0j   
1   >sexuality shouldn’t be a grouping category I...  eemcysk   
2     You do right, if you don't care then fuck 'em!  ed2mah1   
3                                 Man I love reddit.  eeibobj   
4  [NAME] was nowhere near them, he was by the Fa...  eda6yn6   

                author            subreddit    link_id   parent_id  \
0                Brdd9                  nrl  t3_ajis4z  t1_eew18eq   
1          TheGreen888     unpopularopinion  t3_ai4q37   t3_ai4q37   
2             Labalool          confessions  t3_abru74  t1_ed2m7g7   
3        MrsRobertshaw             facepalm  t3_ahulml   t3_ahulml   
4  American_Fascist713  starwarsspeculation  t3_ackt2f  t1_eda65q2   

    created_utc  rater_id  example_very_unclear  admiration  ...  love  \
0  1.548381e+09         1                 False           0  ...     0   
1  1.548084e+09        37               

In [3]:
# Print the column index first, then the (rows, columns) shape of the frame.
for summary in (df.columns, df.shape):
    print(summary)

Index(['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id',
       'created_utc', 'rater_id', 'example_very_unclear', 'admiration',
       'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
       'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust',
       'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy',
       'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief',
       'remorse', 'sadness', 'surprise', 'neutral'],
      dtype='object')
(70000, 37)


In [4]:
def processText(text):
    """Normalise one piece of text for the TF-IDF vectoriser.

    Steps: lower-case, drop every character that is not an ASCII letter or
    whitespace, tokenise with ``nltk.word_tokenize`` and remove English
    stop words.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example
            ``None`` or a pandas ``NaN``) is treated as empty text.

    Returns:
        The remaining tokens joined by single spaces; ``''`` when nothing
        is left or when ``text`` is not a string.
    """
    # Missing values would otherwise fail on ``.lower()`` and abort a whole
    # ``Series.apply`` run.
    if not isinstance(text, str):
        return ''

    # The stop-word list never changes between calls, so it is loaded once
    # and cached on the function object instead of being rebuilt per row.
    stop_words = getattr(processText, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(nltk.corpus.stopwords.words('english'))
        processText._stop_words = stop_words

    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    tokens = nltk.word_tokenize(text)

    return ' '.join(token for token in tokens if token not in stop_words)

In [None]:
# The label matrix is derived here so this cell also works on a fresh kernel:
# in notebook order nothing above this cell defines `y`.
# Columns 0-8 of df are metadata (text ... example_very_unclear, see the
# column listing printed above); everything from position 9 on is a label.
y = df.iloc[:, 9:]

# Per-label column sums and per-label column minima.
class_counts = y.sum(axis=0)
min_counts = y.min(axis=0)

print(class_counts)
print(min_counts)

admiration         5647
amusement          3081
anger              2589
annoyance          4443
approval           5928
caring             1988
confusion          2471
curiosity          3267
desire             1248
disappointment     2771
disapproval        3774
disgust            1704
embarrassment       817
excitement         1900
fear               1048
gratitude          3863
grief               227
joy                2607
love               2745
nervousness         598
optimism           2887
pride               452
realization        2867
relief              452
remorse             849
sadness            2193
surprise           1806
neutral           18423
dtype: int64
admiration        0
amusement         0
anger             0
annoyance         0
approval          0
caring            0
confusion         0
curiosity         0
desire            0
disappointment    0
disapproval       0
disgust           0
embarrassment     0
excitement        0
fear              0
gratitude      

In [16]:
# Plain-text preview of the first rows of the label matrix.
label_preview = y.head()
print(label_preview)


   admiration  amusement  anger  annoyance  approval  caring  confusion  \
0           0          0      0          0         0       0          0   
1           0          0      0          0         0       0          0   
2           0          0      0          0         0       0          0   
3           0          0      0          0         0       0          0   
4           0          0      0          0         0       0          0   

   curiosity  desire  disappointment  ...  love  nervousness  optimism  pride  \
0          0       0               0  ...     0            0         0      0   
1          0       0               0  ...     0            0         0      0   
2          0       0               0  ...     0            0         0      0   
3          0       0               0  ...     1            0         0      0   
4          0       0               0  ...     0            0         0      0   

   realization  relief  remorse  sadness  surprise  neutral  


In [17]:
# Value frequencies of the first label column of y.
first_label = y.iloc[:, 0]
label_counts = first_label.value_counts()
print(label_counts)

admiration
0    64353
1     5647
Name: count, dtype: int64


In [18]:
# Targets: every column of df from position 9 onwards.
y = df.iloc[:, 9:]
# Features: the 'text' column after cleaning each entry with processText.
x = df['text'].apply(processText)

# Hold out 20% of the rows with a fixed seed. Stratification uses one key per
# row: the column name returned by idxmax(axis=1).
# NOTE(review): for a row whose labels are all 0, idxmax presumably returns
# the first column - confirm whether such rows exist.
stratify_key = y.idxmax(axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=stratify_key,
)

In [None]:
# Text features: TF-IDF over 1- to 3-grams, capped at 20000 features, with
# sub-linear term frequency and smoothed IDF.
tfidf_step = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.95,
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

# Base learner; MultiOutputClassifier wraps it for the multi-column target.
forest = RandomForestClassifier(
    n_estimators=400,
    max_depth=40,
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight='balanced_subsample',
    bootstrap=True,
    random_state=42,
    n_jobs=-1,
)

# Vectorise -> scale (with_mean=False, i.e. no centring) -> classify.
pipeline = Pipeline(steps=[
    ('tfidf', tfidf_step),
    ('scaler', StandardScaler(with_mean=False)),
    ('classifier', MultiOutputClassifier(forest)),
])

# Render estimators as diagrams and display the pipeline as the cell output.
set_config(display='diagram')
pipeline

In [20]:
# Search space, keyed by <pipeline step>__<parameter>; the classifier's
# parameters are reached through the wrapper's `estimator` attribute.
# 3 * 3 * 2 * 3 * 2 * 2 = 216 candidate combinations.
param_grid = dict(
    tfidf__max_features=[10000, 15000, 20000],
    tfidf__ngram_range=[(1, 1), (1, 2), (1, 3)],
    classifier__estimator__n_estimators=[200, 400],
    classifier__estimator__max_depth=[20, 40, None],
    classifier__estimator__min_samples_split=[2, 5],
    classifier__estimator__min_samples_leaf=[1, 2],
)

# Exhaustive 5-fold search scored by accuracy; error_score='raise' makes a
# failing fit raise instead of being recorded as a score.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
    error_score='raise',
)

In [None]:
# `device` is kept for any later PyTorch code; the grid search does not use it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The train/test splits are deliberately NOT converted to torch tensors:
#   * X_train / X_test hold cleaned text, and torch.tensor() cannot be built
#     from strings - the pipeline's TfidfVectorizer expects the raw text;
#   * scikit-learn estimators work on NumPy / pandas data, not CUDA tensors;
#   * the evaluation cells below index y_test with .iloc, so it must remain
#     a DataFrame.
# fit() returns the fitted GridSearchCV object itself, so `history` is simply
# another reference to `grid_search`.
history = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


In [18]:
# Predict with the refitted best estimator of the grid search.
y_pred = grid_search.predict(X_test)

# Label names come from the target frame `y`; the previously used `labels`
# object is not defined anywhere in this notebook.
# zero_division=0 reports 0 for labels without predicted/true samples and
# avoids the warnings otherwise emitted for undefined precision/recall.
print("\nClassification Report:")
print(classification_report(
    y_test,
    y_pred,
    target_names=list(y.columns),
    zero_division=0,
))
# NOTE(review): with a multi-column target this is exact-match (subset)
# accuracy, i.e. every label of a row has to be right - confirm this is the
# intended headline metric.
print("\nAccuracy Score:", accuracy_score(y_test, y_pred))


Classification Report:
                precision    recall  f1-score   support

    admiration       0.39      0.59      0.47      1129
     amusement       0.54      0.68      0.60       626
         anger       0.25      0.43      0.32       534
     annoyance       0.16      0.37      0.23       884
      approval       0.20      0.33      0.25      1212
        caring       0.14      0.33      0.20       380
     confusion       0.14      0.35      0.20       505
     curiosity       0.16      0.24      0.19       615
        desire       0.19      0.45      0.26       240
disappointment       0.13      0.28      0.18       560
   disapproval       0.16      0.40      0.23       787
       disgust       0.17      0.42      0.25       343
 embarrassment       0.11      0.28      0.16       163
    excitement       0.17      0.35      0.23       376
          fear       0.32      0.41      0.36       206
     gratitude       0.68      0.80      0.74       739
         grief       0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
# Accuracy of each label column taken on its own.
# Label names come from the target frame `y`; the previously used `labels`
# object is not defined anywhere in this notebook. Column i of y_test and
# column i of y_pred refer to the same label.
print("\nEmotion accuracy:")
per_emotion_accuracy = {}
for i, emotion in enumerate(y.columns):
    acc = accuracy_score(y_test.iloc[:, i], y_pred[:, i])
    per_emotion_accuracy[emotion] = acc
    print(f"{emotion}: {acc:.4f}")


Emotion accuracy:
admiration: 0.8929
amusement: 0.9598
anger: 0.9298
annoyance: 0.8416
approval: 0.8304
caring: 0.9269
confusion: 0.9021
curiosity: 0.9101
desire: 0.9572
disappointment: 0.8966
disapproval: 0.8509
disgust: 0.9371
embarrassment: 0.9659
excitement: 0.9364
fear: 0.9785
gratitude: 0.9698
grief: 0.9942
joy: 0.9381
love: 0.9590
nervousness: 0.9811
optimism: 0.9349
pride: 0.9887
realization: 0.8866
relief: 0.9843
remorse: 0.9841
sadness: 0.9424
surprise: 0.9497
neutral: 0.5852
