In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.naive_bayes import MultinomialNB # for text classification this is best
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [None]:
# Load the ';'-separated file 'train.txt'. header=None means the first line is
# treated as data, so the two columns are named explicitly here.
df = pd.read_csv('train.txt', sep = ';', header = None, names = ['text', 'emotions'])

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,text,emotions
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [None]:
# Count missing values per column.
df.isnull().sum()

Unnamed: 0,0
text,0
emotions,0


In [None]:
# Give every distinct emotion label an integer code, numbered in the order
# the labels are returned by Series.unique(), then replace the label column
# with those codes.
unique_emotions = df['emotions'].unique()
emotions_numbers = {label: code for code, label in enumerate(unique_emotions)}

df['emotions'] = df['emotions'].map(emotions_numbers)


In [None]:
# Preview the frame after the emotion labels were mapped to integer codes.
df.head()

Unnamed: 0,text,emotions
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1


In [None]:
# Lower-case every document.
# The vectorised .str accessor replaces the per-row lambda and passes missing
# values through as NaN instead of raising AttributeError on them.
df['text'] = df['text'].str.lower()

In [None]:
# Removing punctuations like @ ,# $%^&,.
import string
def remove_punc(txt):
  """Return ``txt`` with every character listed in ``string.punctuation`` deleted.

  Args:
    txt: The string to clean.

  Returns:
    A new string; all other characters keep their original order.
  """
  # Mapping a code point to None tells str.translate to drop that character.
  drop_table = {ord(symbol): None for symbol in string.punctuation}
  return txt.translate(drop_table)

In [None]:
# Run remove_punc over every row of the text column.
df['text'] = df['text'].apply(remove_punc)

In [None]:
# Remove Numbers
def remove_numbers(txt):
  """Return ``txt`` without any character for which ``str.isdigit()`` is true.

  Args:
    txt: The string to clean.

  Returns:
    A new string made of the remaining characters in their original order.
  """
  kept = [ch for ch in txt if not ch.isdigit()]
  return "".join(kept)
# Run remove_numbers over every row of the text column.
df['text'] = df['text'].apply(remove_numbers)

In [None]:
# Remove Links and Urls
import re

def remove_links(text):
    """Delete every run of non-whitespace that starts at 'http', 'www' or 'https'.

    The pattern is not anchored to word boundaries, so a match can begin in
    the middle of a word.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each match replaced by the empty string.
    """
    url_pattern = re.compile(r'http\S+|www\S+|https\S+')
    return url_pattern.sub('', text)

# Run remove_links over every row of the text column.
# NOTE(review): punctuation was already stripped by an earlier cell, so URLs
# reaching this step no longer contain '.', ':' or '/' — confirm the order.
df['text'] = df['text'].apply(remove_links)

In [None]:
# Remove emoji
def remove_emoji(txt):
  """Return ``txt`` with every non-ASCII character removed.

  The previous version reset its accumulator on each loop iteration and then
  returned the original ``txt``, so nothing was ever removed.

  Note that this drops *all* non-ASCII characters (emoji, but also accented
  letters and other symbols), because the filter is ``str.isascii()``.

  Args:
    txt: The string to clean.

  Returns:
    A new string containing only the ASCII characters of ``txt``, in order.
  """
  return "".join(ch for ch in txt if ch.isascii())
# Run remove_emoji over every row of the text column.
df['text'] = df['text'].apply(remove_emoji)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Download the NLTK resources used by this notebook ('stopwords' backs
# stopwords.words() in a later cell).
# NOTE(review): word_tokenize is imported but never called in the visible
# cells — confirm whether the 'punkt' download is still needed.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Keep the English stop-word list as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

In [None]:
# Show one sample document before stop-word removal, for comparison.
df.loc[1, 'text']

'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'

In [None]:
# Drop stop words from a whitespace-split string
def remove(txt):
  """Return ``txt`` with every token found in ``stop_words`` removed.

  The text is split on whitespace, tokens are compared verbatim against the
  module-level ``stop_words`` collection, and the surviving tokens are
  re-joined with single spaces.

  Args:
    txt: The string to filter.

  Returns:
    The filtered string ('' when no token survives).
  """
  return ' '.join(token for token in txt.split() if token not in stop_words)

In [None]:
# Strip stop words from every row of the text column.
df['text'] = df['text'].apply(remove)

In [None]:
# Show the same sample document after stop-word removal.
df.loc[1, 'text']

'go feeling hopeless damned hopeful around someone cares awake'

In [None]:
# Preview the fully cleaned frame.
df.head()

Unnamed: 0,text,emotions
0,didnt feel humiliated,0
1,go feeling hopeless damned hopeful around some...,0
2,im grabbing minute post feel greedy wrong,1
3,ever feeling nostalgic fireplace know still pr...,2
4,feeling grouchy,1


In [None]:
# Split the corpus 80/20 into train and test; random_state is fixed so the
# split is reproducible.
# NOTE(review): the split is not stratified — consider
# stratify=df['emotions'] if the classes are imbalanced; confirm.
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['emotions'], test_size=0.20, random_state=42)

In [None]:
# Inspect the training texts (index values are the original row labels).
X_train

Unnamed: 0,text
676,refers course though cant help feeling somehow...
12113,im starting feel im suffering fatigue
7077,feel like probably would liked book little bit...
13005,really feel awkward
12123,im feeling little grumpy today lame weather te...
...,...
13418,love leave reader feeling confused slightly de...
5390,feel delicate
860,starting feel little stressed
15795,feel stressed tired worn shape neglected


In [None]:
# Bag of Words: CountVectorizer with its default settings.
bow_vectorizer = CountVectorizer()

In [None]:
# Fit the vectorizer on the training texts only, then encode the test texts
# with that already-fitted vectorizer (no refit on test data).
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

In [None]:
# Inspect the encoded test matrix (its repr shows shape and sparsity).
X_test_bow

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 26936 stored elements and shape (3200, 13359)>

In [None]:
# Train a Multinomial Naive Bayes classifier (default settings) on the
# bag-of-words training matrix.
nb_model = MultinomialNB()
nb_model.fit(X_train_bow, y_train)

In [None]:
# Accuracy of the bag-of-words Naive Bayes model on the held-out test split.
y_pred_bow = nb_model.predict(X_test_bow)
print(accuracy_score(y_test, y_pred_bow))

0.768125


In [None]:
# Inspect the predicted integer labels.
y_pred_bow

array([0, 5, 0, ..., 5, 5, 0])

In [None]:
# Inspect the true integer labels for comparison with the predictions.
y_test

Unnamed: 0,emotions
8756,0
4660,5
6095,0
304,5
8241,0
...,...
15578,5
5746,5
6395,5
7624,5


In [None]:
# TF-IDF Vectorizer (default settings)
tfidf_vectorizer = TfidfVectorizer()

# Fit on the training texts only; the test texts are encoded with the
# already-fitted vectorizer.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train a second Multinomial Naive Bayes model, this time on TF-IDF features
nb2_model = MultinomialNB()
nb2_model.fit(X_train_tfidf, y_train)

In [None]:
# Accuracy of the TF-IDF Naive Bayes model on the held-out test split.
y_pred_tfidf = nb2_model.predict(X_test_tfidf)
print(accuracy_score(y_test, y_pred_tfidf))

0.6609375


In [None]:
# Logistic Regression on the TF-IDF features; max_iter is raised to 1000
# (presumably so the solver has room to converge — confirm).
logistic_model = LogisticRegression(max_iter= 1000)
logistic_model.fit(X_train_tfidf, y_train)

In [None]:
# Accuracy of the logistic-regression model on the held-out test split.
y_pred_lr = logistic_model.predict(X_test_tfidf)
print(accuracy_score(y_test, y_pred_lr))

0.863125


In [None]:
# Support Vector Machine (SVC with default settings) on the TF-IDF features.
svc_model = SVC()
svc_model.fit(X_train_tfidf, y_train)

In [None]:
# Accuracy of the SVC model on the held-out test split.
y_pred_svc = svc_model.predict(X_test_tfidf)
print(accuracy_score(y_test, y_pred_svc))

0.85125


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for logistic regression: 5 values of C x 2 solvers.
param_grid_logistic_model = {
    'C': [0.01, 0.1, 1, 10, 50],
    'solver': ['liblinear', 'lbfgs']
}

# 5-fold cross-validated grid search scored by accuracy, using all CPU cores
# (n_jobs=-1). logistic_model is passed as the estimator to tune.
grid_logistic_model = GridSearchCV(
    logistic_model,
    param_grid_logistic_model,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

# NOTE(review): X_train_tfidf comes from a vectorizer fitted on the whole
# training split, so each fold's validation rows already influenced the
# vocabulary/IDF weights. Wrapping TfidfVectorizer + LogisticRegression in a
# Pipeline and searching on the raw text would avoid that — confirm whether
# it matters here.
grid_logistic_model.fit(X_train_tfidf, y_train)

# Report the best parameter combination and its mean cross-validated accuracy.
# NOTE(review): this is a CV score on the training split; the tuned model is
# not evaluated on X_test_tfidf in the visible cells.
print("Best Logistic Regression Params:", grid_logistic_model.best_params_)
print("Best CV Accuracy:", grid_logistic_model.best_score_)

Best Logistic Regression Params: {'C': 10, 'solver': 'liblinear'}
Best CV Accuracy: 0.881796875
