In [44]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import joblib
import os
# Point the NLTK_DATA environment variable at ./nltk_data (absolute path).
# NOTE(review): `nltk` is already imported above, and a later cell clears
# nltk.data.path and appends a different directory, so this assignment may
# have no effect on where NLTK actually looks for data — confirm.
os.environ['NLTK_DATA'] = os.path.abspath("nltk_data")

In [45]:
# Load the training data: a semicolon-separated file without a header row;
# the two columns are named `text` and `emotion` here.
df = pd.read_csv(
    'train.txt',
    sep=';',
    header=None,
    names=['text', 'emotion'],
)

In [46]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [47]:
# Count missing values per column.
df.isnull().sum()

text       0
emotion    0
dtype: int64

In [48]:
# Replace the contents of NLTK's search-path list in place so that only the
# project data directory is consulted.
nltk.data.path[:] = ['d:/NLP/nltk_data']

In [49]:
# Download the 'punkt' and 'stopwords' packages into the project's NLTK data
# directory. The final call stays the cell's last expression so its return
# value is still displayed.
nltk_download_dir = 'd:/NLP/nltk_data'
nltk.download('punkt', download_dir=nltk_download_dir)
nltk.download('stopwords', download_dir=nltk_download_dir)

[nltk_data] Downloading package punkt to d:/NLP/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to d:/NLP/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [50]:
# Integer id assigned to each emotion label.
emotion_numbers = {
    'joy': 0,
    'fear': 1,
    'anger': 2,
    'sadness': 3,
    'surprise': 4,
    'love': 5
}

# Encode the label column.
# A plain `.map(emotion_numbers)` turns every value that is not a key of the
# dict into NaN. That silently hides unknown labels and, if this cell is run a
# second time, wipes the already-encoded column. Instead:
#   * fail loudly on any value that is neither a known label nor a known id;
#   * leave values that are already ids untouched, so the cell is idempotent.
known_ids = set(emotion_numbers.values())
unexpected = {
    label
    for label in df['emotion'].unique()
    if label not in emotion_numbers and label not in known_ids
}
if unexpected:
    raise ValueError(
        f"Unmapped emotion labels: {sorted(unexpected, key=str)}"
    )
df['emotion'] = df['emotion'].map(
    lambda label: emotion_numbers.get(label, label)
)

In [51]:
# Lower-case every text entry.
df['text'] = df['text'].map(str.lower)

In [52]:
def remove_punc(txt):
    """Return ``txt`` with every character of ``string.punctuation`` removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is affected;
    all other characters are kept in their original order.
    """
    # A translation table that maps a character to None deletes it.
    strip_table = str.maketrans(dict.fromkeys(string.punctuation))
    return txt.translate(strip_table)

In [53]:
# Strip punctuation from every text entry.
df['text'] = df['text'].map(remove_punc)

In [54]:
def remove_nums(txt):
    """Return ``txt`` without any character for which ``str.isdigit`` is true.

    The remaining characters keep their original order; spacing is untouched.
    """
    return ''.join(ch for ch in txt if not ch.isdigit())

# Drop digit characters from every text entry.
df['text'] = df['text'].map(remove_nums)

In [55]:
def remove_emojis(txt):
    """Return ``txt`` with every non-ASCII character removed.

    Despite the name, this drops *all* characters outside the ASCII range
    (emojis, but also accented letters and other non-ASCII symbols).
    """
    # Encoding to ASCII while ignoring errors discards exactly the characters
    # that are not ASCII; decoding restores a str.
    return txt.encode('ascii', errors='ignore').decode('ascii')

# Drop non-ASCII characters from every text entry.
df['text'] = df['text'].map(remove_emojis)

In [81]:
# English stop-word list from NLTK, held in a set for fast membership tests.
# NOTE(review): punctuation has already been stripped from df['text'] by an
# earlier cell; if this list contains entries with apostrophes (contractions
# such as "don't"), those entries can never match the cleaned text — confirm
# whether that is intended.
stop_words = set(stopwords.words('english'))

In [82]:
def remove_stopwords(text):
    """Lower-case ``text``, split it on whitespace and drop stop words.

    Tokens found in the module-level ``stop_words`` collection are discarded;
    the remaining tokens are re-joined with single spaces.
    """
    kept = []
    for token in text.lower().split():
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

# Remove stop words from every text entry.
df['text'] = df['text'].map(remove_stopwords)

In [78]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['emotion'],
    test_size=0.20,
    random_state=42,
)

In [79]:
# Count-based (bag-of-words) features.
bow_vectorizer = CountVectorizer()

# Fit the vectorizer on the training split only, then reuse the fitted
# vectorizer to transform the held-out split.
x_train_bow = bow_vectorizer.fit_transform(X_train)
x_test_bow = bow_vectorizer.transform(X_test)

In [80]:
# Candidate classifiers, keyed by a short display name. All of them use the
# library defaults.
model = dict(
    NB=MultinomialNB(),
    LR=LogisticRegression(),
    SVM=SVC(),
)

In [61]:
# Collects one {'model', 'Accuracy'} record per classifier for the
# bag-of-words features.
result_bow = []

In [62]:
# Fit every candidate on the bag-of-words features and record its accuracy on
# the held-out split.
# The result list is rebuilt from scratch here so that re-running this cell
# does not append duplicate rows on top of a previous run's results.
result_bow = []
for name, mdl in model.items():
    mdl.fit(x_train_bow, y_train)
    y_prediction = mdl.predict(x_test_bow)
    acc = accuracy_score(y_test, y_prediction)
    result_bow.append({
        'model': name,
        'Accuracy': round(acc, 4),
    })

In [63]:
# Show the bag-of-words accuracy records.
print(f"\n{result_bow}")


[{'model': 'NB', 'Accuracy': 0.7681}, {'model': 'LR', 'Accuracy': 0.8897}, {'model': 'SVM', 'Accuracy': 0.8225}]


In [64]:
# TF-IDF weighted features.
tfid_vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training split only, then reuse the fitted
# vectorizer to transform the held-out split.
x_train_tfid = tfid_vectorizer.fit_transform(X_train)
x_test_tfid = tfid_vectorizer.transform(X_test)

In [65]:
# Collects one {'model', 'Accuracy'} record per classifier for the TF-IDF
# features.
result_tfid = []

In [66]:
# Re-fit the same estimator objects on the TF-IDF features and record their
# accuracy on the held-out split. After this loop the entries of `model` hold
# the TF-IDF-trained state.
# The result list is rebuilt from scratch here so that re-running this cell
# does not append duplicate rows on top of a previous run's results.
result_tfid = []
for name, mdl in model.items():
    mdl.fit(x_train_tfid, y_train)
    y_prediction = mdl.predict(x_test_tfid)
    acc = accuracy_score(y_test, y_prediction)
    result_tfid.append({
        'model': name,
        'Accuracy': round(acc, 4),
    })

In [67]:
# Show the TF-IDF accuracy records.
print(f"\n{result_tfid}")


[{'model': 'NB', 'Accuracy': 0.6609}, {'model': 'LR', 'Accuracy': 0.8628}, {'model': 'SVM', 'Accuracy': 0.8509}]


In [68]:
# Persist the artifacts under ./models (created if missing): the fitted
# TF-IDF vectorizer, the 'LR' entry of `model`, and the label -> id mapping.
# NOTE(review): model['LR'] is whatever state the estimator was last fitted
# to; with the cells run top to bottom that is the TF-IDF fit, which matches
# the vectorizer saved here — confirm the execution order before relying on it.
os.makedirs('models', exist_ok=True)
joblib.dump(tfid_vectorizer, 'models/tfidf_vectorizer.pkl')
joblib.dump(model['LR'], 'models/emotion_model.pkl')
joblib.dump(emotion_numbers, 'models/label_mapping.pkl')


['models/label_mapping.pkl']