In [87]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
# !unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [88]:
train_root = "/kaggle/input/emotions-dataset-for-nlp/train.txt"
val_root =  "/kaggle/input/emotions-dataset-for-nlp/val.txt"
test_root =  "/kaggle/input/emotions-dataset-for-nlp/test.txt"

In [91]:
train_df = pd.read_csv(train_root, delimiter=";")
val_df = pd.read_csv(val_root, delimiter=";")
test_df = pd.read_csv(test_root, delimiter=";")
train_df.columns = ["text", "emotions"]
val_df.columns = ["text", "emotions"]
test_df.columns = ["text", "emotions"]

val_df = val_df.drop(val_df[(val_df['emotions'] == 'surprise') | (val_df['emotions'] == 'love')].index)
test_df = test_df.drop(test_df[(test_df['emotions'] == 'surprise') | (test_df['emotions'] == 'love')].index)
train_df = train_df.drop(train_df[(train_df['emotions'] == 'surprise') | (train_df['emotions'] == 'love')].index)
train_df['emotions'].value_counts()

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df["emotions"])
y_val = label_encoder.fit_transform(val_df["emotions"])
y_test = label_encoder.fit_transform(test_df["emotions"])

X_train = train_df["text"]
X_val = val_df["text"]
X_test = test_df["text"]

In [92]:
def get_test_val_acc(model):
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    print(f"Val accuracy: {accuracy_score(preds, y_val)}")
    preds = model.predict(X_test)
    print(f"Test accuracy: {accuracy_score(preds, y_test)}")


In [93]:
models = [RandomForestClassifier(), LogisticRegression(), SVC(), KNeighborsClassifier()]

for model in models:
    clf = make_pipeline(CountVectorizer(), model)
    
    clf_name = type(clf).__name__  # Get the class name of the classifier
    print(f"Classifier: {model}")
    get_test_val_acc(clf)
    print("#" * 20)


Classifier: RandomForestClassifier()
Val accuracy: 0.9057471264367816
Test accuracy: 0.9216459977452086
####################
Classifier: LogisticRegression()


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Val accuracy: 0.9321839080459771
Test accuracy: 0.939120631341601
####################
Classifier: SVC()
Val accuracy: 0.8591954022988506
Test accuracy: 0.8641488162344984
####################
Classifier: KNeighborsClassifier()
Val accuracy: 0.45
Test accuracy: 0.4492671927846674
####################


In [94]:
models = [RandomForestClassifier(), LogisticRegression(), SVC(), KNeighborsClassifier()]

for model in models:
    clf = make_pipeline(TfidfVectorizer(stop_words="english", ngram_range=(1, 2)), model)
    
    clf_name = type(clf).__name__  # Get the class name of the classifier
    print(f"Classifier: {model}")
    get_test_val_acc(clf)
    print("#" * 20)


Classifier: RandomForestClassifier()
Val accuracy: 0.9356321839080459
Test accuracy: 0.9368658399098083
####################
Classifier: LogisticRegression()


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Val accuracy: 0.9149425287356322
Test accuracy: 0.9255918827508456
####################
Classifier: SVC()
Val accuracy: 0.9132183908045977
Test accuracy: 0.9182638105975197
####################
Classifier: KNeighborsClassifier()
Val accuracy: 0.8040229885057472
Test accuracy: 0.8151071025930101
####################


In [95]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess(text):    
    text_lower = text.lower()
    processed_text = re.sub('[^a-zA-Z]', ' ', text_lower)
    text_list = processed_text.split()
    stemmed_list = [stemmer.stem(word) for word in text_list if word not in stop_words]
    
    return " ".join(stemmed_list)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [96]:
preprocessed_X_train = train_df["text"].apply(preprocess)
preprocessed_X_val = val_df["text"].apply(preprocess)
preprocessed_X_test = test_df["text"].apply(preprocess)

In [97]:
preprocessed_X_train[:5]

0    go feel hopeless damn hope around someon care ...
1                 im grab minut post feel greedi wrong
3                                         feel grouchi
4                ive feel littl burden late wasnt sure
6            feel confus life teenag jade year old man
Name: text, dtype: object

In [98]:
models = [RandomForestClassifier(), LogisticRegression(), SVC(), KNeighborsClassifier()]

for model in models:
    clf = make_pipeline(TfidfVectorizer(), model)
    
    clf_name = type(clf).__name__  # Get the class name of the classifier
    print(f"Classifier: {model}")
    clf.fit(preprocessed_X_train, y_train)
    preds = clf.predict(preprocessed_X_val)
    print(f"Val accuracy: {accuracy_score(preds, y_val)}")
    preds = clf.predict(preprocessed_X_test)
    print(f"Test accuracy: {accuracy_score(preds, y_test)}")
    print("#" * 20)


Classifier: RandomForestClassifier()
Val accuracy: 0.8982758620689655
Test accuracy: 0.9126268320180383
####################
Classifier: LogisticRegression()


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Val accuracy: 0.889080459770115
Test accuracy: 0.8979706877113867
####################
Classifier: SVC()
Val accuracy: 0.8833333333333333
Test accuracy: 0.8957158962795941
####################
Classifier: KNeighborsClassifier()
Val accuracy: 0.8097701149425287
Test accuracy: 0.8201803833145435
####################


In [99]:
models = [RandomForestClassifier(), LogisticRegression(), SVC(), KNeighborsClassifier()]

for model in models:
    clf = make_pipeline(CountVectorizer(), model)
    
    clf_name = type(clf).__name__  # Get the class name of the classifier
    print(f"Classifier: {model}")
    clf.fit(preprocessed_X_train, y_train)
    preds = clf.predict(preprocessed_X_val)
    print(f"Val accuracy: {accuracy_score(preds, y_val)}")
    preds = clf.predict(preprocessed_X_test)
    print(f"Test accuracy: {accuracy_score(preds, y_test)}")
    print("#" * 20)


Classifier: RandomForestClassifier()
Val accuracy: 0.9005747126436782
Test accuracy: 0.9092446448703495
####################
Classifier: LogisticRegression()


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Val accuracy: 0.9017241379310345
Test accuracy: 0.911499436302142
####################
Classifier: SVC()
Val accuracy: 0.8637931034482759
Test accuracy: 0.8765501691093573
####################
Classifier: KNeighborsClassifier()
Val accuracy: 0.6994252873563218
Test accuracy: 0.7001127395715896
####################


In [108]:
import xgboost as xgb
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

vectorizer = CountVectorizer()

X_train_vec = vectorizer.fit_transform(preprocessed_X_train)
X_val_vec = vectorizer.transform(preprocessed_X_val)
X_test_vec = vectorizer.transform(preprocessed_X_test)

clf = xgb.XGBClassifier(objective='multi:softmax', 
                            num_class=4,
                            learning_rate=0.5,
                            n_estimators=200,  # Number of boosting rounds
                            early_stopping_rounds=10,  # Stop if no improvement for 10 rounds
                            eval_metric=['merror', 'mlogloss'],  # Evaluation metrics
                            seed=42)

clf.fit(X_train_vec, y_train, 
        eval_set=[(X_val_vec, y_val)],
        verbose=True)  # Set verbose to see training progress

# Evaluate on test set
preds_test = clf.predict(X_test_vec)
test_accuracy = accuracy_score(y_test, preds_test)
print(f"Test accuracy: {test_accuracy:.4f}")

[0]	validation_0-merror:0.52989	validation_0-mlogloss:1.27385
[1]	validation_0-merror:0.46897	validation_0-mlogloss:1.19239
[2]	validation_0-merror:0.39540	validation_0-mlogloss:1.12548
[3]	validation_0-merror:0.34655	validation_0-mlogloss:1.06977
[4]	validation_0-merror:0.31092	validation_0-mlogloss:1.02378
[5]	validation_0-merror:0.29425	validation_0-mlogloss:0.98629
[6]	validation_0-merror:0.26954	validation_0-mlogloss:0.94866
[7]	validation_0-merror:0.25632	validation_0-mlogloss:0.92037
[8]	validation_0-merror:0.24253	validation_0-mlogloss:0.89120
[9]	validation_0-merror:0.21494	validation_0-mlogloss:0.86159
[10]	validation_0-merror:0.18678	validation_0-mlogloss:0.83458
[11]	validation_0-merror:0.17011	validation_0-mlogloss:0.81357
[12]	validation_0-merror:0.15000	validation_0-mlogloss:0.79018
[13]	validation_0-merror:0.14023	validation_0-mlogloss:0.77415
[14]	validation_0-merror:0.14023	validation_0-mlogloss:0.75730
[15]	validation_0-merror:0.13391	validation_0-mlogloss:0.74435
[1

In [110]:
import xgboost as xgb
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

vectorizer = TfidfVectorizer()

X_train_vec = vectorizer.fit_transform(preprocessed_X_train)
X_val_vec = vectorizer.transform(preprocessed_X_val)
X_test_vec = vectorizer.transform(preprocessed_X_test)

clf = xgb.XGBClassifier(objective='multi:softmax', 
                            num_class=4,
                            learning_rate=0.5,
                            n_estimators=200,  # Number of boosting rounds
                            early_stopping_rounds=10,  # Stop if no improvement for 10 rounds
                            eval_metric=['merror', 'mlogloss'],  # Evaluation metrics
                            seed=42)

clf.fit(X_train_vec, y_train, 
        eval_set=[(X_val_vec, y_val)],
        verbose=True)  # Set verbose to see training progress

# Evaluate on test set
preds_test = clf.predict(X_test_vec)
test_accuracy = accuracy_score(y_test, preds_test)
print(f"Test accuracy: {test_accuracy:.4f}")

[0]	validation_0-merror:0.53391	validation_0-mlogloss:1.27528
[1]	validation_0-merror:0.47126	validation_0-mlogloss:1.19577
[2]	validation_0-merror:0.40632	validation_0-mlogloss:1.13079
[3]	validation_0-merror:0.34770	validation_0-mlogloss:1.06911
[4]	validation_0-merror:0.31667	validation_0-mlogloss:1.02220
[5]	validation_0-merror:0.28793	validation_0-mlogloss:0.98380
[6]	validation_0-merror:0.26782	validation_0-mlogloss:0.94587
[7]	validation_0-merror:0.25747	validation_0-mlogloss:0.92025
[8]	validation_0-merror:0.23506	validation_0-mlogloss:0.89188
[9]	validation_0-merror:0.20920	validation_0-mlogloss:0.86183
[10]	validation_0-merror:0.18793	validation_0-mlogloss:0.83529
[11]	validation_0-merror:0.17011	validation_0-mlogloss:0.81260
[12]	validation_0-merror:0.15172	validation_0-mlogloss:0.79241
[13]	validation_0-merror:0.14425	validation_0-mlogloss:0.77310
[14]	validation_0-merror:0.14195	validation_0-mlogloss:0.75647
[15]	validation_0-merror:0.14023	validation_0-mlogloss:0.74109
[1

# **Conclusion**
# The best performing model was logistic regression achieveing 93.9% on the test set.
# It was also the fastest one.