Import necessary libraries

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

# For text processing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# For model building
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# For evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

Download the data

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("shivamb/go-emotions-google-emotions-dataset")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\Leo\.cache\kagglehub\datasets\shivamb\go-emotions-google-emotions-dataset\versions\1


Load and display the data

In [3]:
data = pd.read_csv(path+"/go_emotions_dataset.csv")
print(data.head())

        id                                               text  \
0  eew5j0j                                    That game hurt.   
1  eemcysk   >sexuality shouldn’t be a grouping category I...   
2  ed2mah1     You do right, if you don't care then fuck 'em!   
3  eeibobj                                 Man I love reddit.   
4  eda6yn6  [NAME] was nowhere near them, he was by the Fa...   

   example_very_unclear  admiration  amusement  anger  annoyance  approval  \
0                 False           0          0      0          0         0   
1                  True           0          0      0          0         0   
2                 False           0          0      0          0         0   
3                 False           0          0      0          0         0   
4                 False           0          0      0          0         0   

   caring  confusion  ...  love  nervousness  optimism  pride  realization  \
0       0          0  ...     0            0         0      0 

# Logistic Regression Model

Data Preprocessing

In [4]:
# Download necessary NLTK data files
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize lemmatizer and define stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize and remove stop words
    tokens = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    # Join tokens back to string
    return ' '.join(tokens)

# Apply preprocessing to the text column
data['clean_text'] = data['text'].apply(preprocess_text)

# Display the cleaned text
print("\nCleaned text:")
print(data['clean_text'].head(10))

[nltk_data] Error loading stopwords: <urlopen error [Errno 11004]
[nltk_data]     getaddrinfo failed>
[nltk_data] Error loading wordnet: <urlopen error [Errno 11004]
[nltk_data]     getaddrinfo failed>



Cleaned text:
0                                            game hurt
1    sexuality shouldnt grouping category make diff...
2                              right dont care fuck em
3                                      man love reddit
4                             name nowhere near falcon
5    right considering important document know damn...
6    isnt big he still quite popular ive heard thin...
7    thats crazy went super religion high school th...
8                                   thats adorable asf
9    sponge blurb pub quaw haha gurr ha aaa finale ...
Name: clean_text, dtype: object


Split the Data

In [5]:
# Features and targets
X = data['clean_text']
Y = data.loc[:, 'admiration':'neutral']
emotion_columns = data.columns[3:-1]

# Split into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

Build the Model: Use a pipeline to vectorize text and train a logistic regression classifier in a one-vs-rest fashion

In [6]:
# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='liblinear')))
])

# Train the model
pipeline.fit(X_train, Y_train)

# Make predictions on the test set
Y_pred = pipeline.predict(X_test)

Evaluate the model

In [7]:
# Classification report
print("\nClassification Report:")
print(classification_report(Y_test, Y_pred, target_names=emotion_columns))

# Calculate overall metrics
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred, average='macro', zero_division=0)
recall = recall_score(Y_test, Y_pred, average='macro', zero_division=0)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")


Classification Report:
                precision    recall  f1-score   support

    admiration       0.66      0.25      0.36      3456
     amusement       0.60      0.29      0.39      1891
         anger       0.53      0.08      0.14      1628
     annoyance       0.33      0.02      0.03      2722
      approval       0.56      0.03      0.06      3418
        caring       0.49      0.04      0.07      1147
     confusion       0.61      0.03      0.05      1463
     curiosity       0.72      0.04      0.07      1941
        desire       0.48      0.06      0.11       758
disappointment       0.59      0.01      0.02      1671
   disapproval       0.42      0.01      0.02      2289
       disgust       0.58      0.07      0.13      1074
 embarrassment       0.54      0.03      0.06       502
    excitement       0.57      0.05      0.09      1121
          fear       0.63      0.15      0.25       625
     gratitude       0.90      0.70      0.79      2330
         grief       0.

Hyperparameter tuning

In [8]:
# Define hyperparameters to tune
parameters = {
    'tfidf__max_df': [0.9, 1.0],
    'tfidf__ngram_range': [(1,1), (1,2)],
    'clf__estimator__C': [1, 10]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, parameters, cv=3, scoring='f1_macro')

# Fit the model
grid_search.fit(X_train, Y_train)

# Best parameters
print("\nBest Parameters:")
print(grid_search.best_params_)

# Use the best estimator to make predictions
best_model = grid_search.best_estimator_
Y_pred_best = best_model.predict(X_test)


Best Parameters:
{'clf__estimator__C': 10, 'tfidf__max_df': 0.9, 'tfidf__ngram_range': (1, 2)}


Re-evaluate the model

In [9]:
# Classification report with the best model
print("\nClassification Report with Best Model:")
print(classification_report(Y_test, Y_pred_best, target_names=emotion_columns))

# Calculate overall metrics
accuracy_best = accuracy_score(Y_test, Y_pred_best)
precision_best = precision_score(Y_test, Y_pred_best, average='macro', zero_division=0)
recall_best = recall_score(Y_test, Y_pred_best, average='macro', zero_division=0)

print(f"Best Model Accuracy: {accuracy_best:.4f}")
print(f"Best Model Precision: {precision_best:.4f}")
print(f"Best Model Recall: {recall_best:.4f}")


Classification Report with Best Model:
                precision    recall  f1-score   support

    admiration       0.59      0.39      0.47      3456
     amusement       0.54      0.37      0.44      1891
         anger       0.44      0.18      0.26      1628
     annoyance       0.28      0.09      0.14      2722
      approval       0.31      0.11      0.16      3418
        caring       0.35      0.13      0.19      1147
     confusion       0.35      0.11      0.17      1463
     curiosity       0.45      0.17      0.25      1941
        desire       0.34      0.11      0.17       758
disappointment       0.28      0.08      0.12      1671
   disapproval       0.32      0.11      0.16      2289
       disgust       0.40      0.13      0.19      1074
 embarrassment       0.47      0.10      0.16       502
    excitement       0.39      0.11      0.17      1121
          fear       0.54      0.26      0.35       625
     gratitude       0.87      0.73      0.79      2330
       

# Random Forest Model

In [None]:
# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(RandomForestClassifier(random_state=42)))
])

# Train the model
pipeline.fit(X_train, Y_train)

# Make predictions on the test set
Y_pred = pipeline.predict(X_test)

In [None]:
# Classification report
print("\nClassification Report:")
print(classification_report(Y_test, Y_pred, target_names=emotion_columns))

# Calculate overall metrics
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred, average='macro', zero_division=0)
recall = recall_score(Y_test, Y_pred, average='macro', zero_division=0)

print(f"\nAccuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

In [None]:
# Define hyperparameters to tune
parameters = {
    'tfidf__max_df': [0.9, 1.0],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'clf__estimator__n_estimators': [100, 200],
    'clf__estimator__max_depth': [None, 10, 20],
    'clf__estimator__min_samples_split': [2, 5],
}

# Initialize GridSearchCV
grid_search = GridSearchCV(
    pipeline,
    parameters,
    cv=3,
    scoring='f1_macro',
    verbose=1,
    n_jobs=-1
)

# Fit the model
grid_search.fit(X_train, Y_train)

# Best parameters
print("\nBest Parameters:")
print(grid_search.best_params_)

# Best estimator
best_model = grid_search.best_estimator_

# Make predictions with the best model
Y_pred_best = best_model.predict(X_test)

In [None]:
# Classification report with the best model
print("\nClassification Report with Best Model:")
print(classification_report(Y_test, Y_pred_best, target_names=emotion_columns))

# Calculate overall metrics
accuracy_best = accuracy_score(Y_test, Y_pred_best)
precision_best = precision_score(Y_test, Y_pred_best, average='macro', zero_division=0)
recall_best = recall_score(Y_test, Y_pred_best, average='macro', zero_division=0)

print(f"\nBest Model Accuracy: {accuracy_best:.4f}")
print(f"Best Model Precision: {precision_best:.4f}")
print(f"Best Model Recall: {recall_best:.4f}")