In [47]:
import numpy as np
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import joblib
import os

In [48]:
# train.txt carries no header row; every line is "<text>;<emotion>".
df = pd.read_csv(
    'train.txt',
    sep=';',
    header=None,
    names=['text', 'emotion'],
)

In [49]:
# Peek at the first five rows to confirm the two columns were parsed as expected.
df.head(5)

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [50]:
# Per-column count of missing values (isna is the same check as isnull).
df.isna().sum()

text       0
emotion    0
dtype: int64

In [51]:
# Integer id assigned to each emotion label; persisted so the UI can decode predictions.
emotion_numbers = {
    'joy': 0,
    'fear': 1,
    'anger': 2,
    'sadness': 3,
    'surprise': 4,
    'love': 5
}
joblib.dump(emotion_numbers, 'label_mapping.pkl')  # Save for Streamlit UI

# Encode into a temporary Series first: Series.map turns every value that is not a
# key of the dict into NaN, which would otherwise corrupt the label column silently
# (this also happens when the cell is re-run on an already-encoded column).
encoded_emotion = df['emotion'].map(emotion_numbers)
unmapped_labels = df.loc[encoded_emotion.isna(), 'emotion'].unique().tolist()
if unmapped_labels:
    raise ValueError(
        f"Labels not present in emotion_numbers: {unmapped_labels}. "
        "Reload the raw data before re-running this cell."
    )
df['emotion'] = encoded_emotion  # Apply to your DataFrame

In [52]:
# Normalise case so that "Happy" and "happy" end up as the same token.
df['text'] = df['text'].map(str.lower)

In [53]:
def remove_punc(txt):
  """Return ``txt`` with every character found in ``string.punctuation`` dropped.

  Only the ASCII punctuation listed in ``string.punctuation`` is removed; all
  other characters are kept in their original order.
  """
  kept_chars = [ch for ch in txt if ch not in string.punctuation]
  return ''.join(kept_chars)

In [54]:
# Strip punctuation from every row of the text column.
df['text'] = df['text'].map(remove_punc)

In [55]:
def remove_nums(txt):
  """Return ``txt`` with every digit character removed.

  A character is dropped when ``str.isdigit()`` is true for it, which covers
  the ASCII digits 0-9 as well as Unicode digits such as superscripts.
  All remaining characters (including surrounding whitespace) are kept as-is.
  """
  # str.join builds the result in one pass instead of re-allocating the
  # string on every kept character.
  return ''.join(ch for ch in txt if not ch.isdigit())

# Drop digit characters from every row of the text column.
df['text'] = df['text'].map(remove_nums)

In [56]:
def remove_emojis(txt):
    """Return ``txt`` reduced to its ASCII characters.

    Despite the name, every non-ASCII character is discarded, so accented
    letters and other non-English symbols are removed together with emojis.
    """
    # Encoding with errors='ignore' silently skips anything outside ASCII.
    ascii_only = txt.encode('ascii', errors='ignore')
    return ascii_only.decode('ascii')

# Keep only ASCII characters in every row of the text column.
df['text'] = df['text'].map(remove_emojis)

In [57]:
# Absolute path of an 'nltk_data' folder located in the current working directory.
nltk_data_dir = os.path.normpath(os.path.join(os.getcwd(), 'nltk_data'))

In [58]:
# Make the project-local data folder the first place NLTK looks.
# Any existing entry is removed first so re-running this cell does not
# accumulate duplicate entries in nltk.data.path.
if nltk_data_dir in nltk.data.path:
    nltk.data.path.remove(nltk_data_dir)
nltk.data.path.insert(0, nltk_data_dir)

In [59]:
# Resources needed further down: tokenizer models for word_tokenize and the
# English stop-word list. 'punkt' is what older NLTK releases load, while newer
# releases look for 'punkt_tab' instead, so both are fetched to keep the
# notebook runnable on a fresh environment regardless of the installed version.
for nltk_package in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_package, download_dir=nltk_data_dir)

[nltk_data] Downloading package punkt to d:\NLP\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to d:\NLP\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [60]:
# Set (not list) so the membership test in the stop-word filter is O(1) per token.
stop_words = {word for word in stopwords.words('english')}

In [61]:
def remove_stopwords(text):
    """Return ``text`` lower-cased, tokenized, and re-joined without stop words.

    Tokens come from ``word_tokenize``; any token contained in the module-level
    ``stop_words`` set is discarded and the survivors are joined with single spaces.
    """
    tokens = word_tokenize(text.lower())
    kept_tokens = filter(lambda token: token not in stop_words, tokens)
    return ' '.join(kept_tokens)

# Filter stop words out of every row of the text column.
df['text'] = df['text'].map(remove_stopwords)

In [62]:
# Toy corpus used only to illustrate what a bag-of-words matrix looks like.
documents = ["I Like Pizza", "Pizza is the best", "I Love Pasta", "Pasta is great"]

# ngram_range=(1, 3) counts single words, word pairs and word triples.
vectorizer = CountVectorizer(ngram_range=(1, 3))
vectorizer.fit(documents)
X = vectorizer.transform(documents)

learned_terms = vectorizer.get_feature_names_out()
print("Vocabulary:", learned_terms)
print("\nBoW Matrix:\n", X.toarray())

Vocabulary: ['best' 'great' 'is' 'is great' 'is the' 'is the best' 'like' 'like pizza'
 'love' 'love pasta' 'pasta' 'pasta is' 'pasta is great' 'pizza'
 'pizza is' 'pizza is the' 'the' 'the best']

BoW Matrix:
 [[0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0]
 [1 0 1 0 1 1 0 0 0 0 0 0 0 1 1 1 1 1]
 [0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0]
 [0 1 1 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0]]


In [63]:
# Same toy corpus, this time weighted with TF-IDF instead of raw counts.
documents = ["I Like Pizza", "Pizza is the best", "I Love Pasta", "Pasta is great"]

vectorizer = TfidfVectorizer()
vectorizer.fit(documents)
X = vectorizer.transform(documents)

learned_terms = vectorizer.get_feature_names_out()
print("Vocabulary:", learned_terms)
print("Tfid Matrix:", X.toarray())

Vocabulary: ['best' 'great' 'is' 'like' 'love' 'pasta' 'pizza' 'the']
Tfid Matrix: [[0.         0.         0.         0.78528828 0.         0.
  0.6191303  0.        ]
 [0.55528266 0.         0.43779123 0.         0.         0.
  0.43779123 0.55528266]
 [0.         0.         0.         0.         0.78528828 0.6191303
  0.         0.        ]
 [0.         0.66767854 0.52640543 0.         0.         0.52640543
  0.         0.        ]]


In [64]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['emotion'],
    test_size=0.20,
    random_state=42,
)

In [65]:
# Learn the vocabulary from the training texts only, then encode both splits
# with it so that no information from the test set reaches the features.
bow_vectorizer = CountVectorizer()
bow_vectorizer.fit(X_train)

x_train_bow = bow_vectorizer.transform(X_train)
x_test_bow = bow_vectorizer.transform(X_test)

In [66]:
# Candidate classifiers keyed by a short display name; all use default hyper-parameters.
model = dict(
    NB=MultinomialNB(),
    LR=LogisticRegression(),
    SVM=SVC(),
)

In [67]:
# Collects one {'model', 'Accuracy'} record per classifier for the BoW features.
result_bow = list()

In [68]:
# Start from an empty list every time so that re-running this cell replaces the
# scores instead of appending a second copy of each row.
result_bow = []

# Fit every candidate on the bag-of-words features and record its test accuracy.
for name, mdl in model.items():
    mdl.fit(x_train_bow, y_train)
    y_prediction = mdl.predict(x_test_bow)
    acc = accuracy_score(y_test, y_prediction)
    result_bow.append({
        'model': name,
        'Accuracy': round(acc, 4),
    })

In [69]:
# Blank line first, then the list of BoW accuracy records.
print("", result_bow, sep="\n")


[{'model': 'NB', 'Accuracy': 0.7678}, {'model': 'LR', 'Accuracy': 0.8888}, {'model': 'SVM', 'Accuracy': 0.8225}]


In [70]:
# Learn vocabulary and IDF weights from the training texts only, then encode
# both splits with the fitted vectorizer.
tfid_vectorizer = TfidfVectorizer()
tfid_vectorizer.fit(X_train)

x_train_tfid = tfid_vectorizer.transform(X_train)
x_test_tfid = tfid_vectorizer.transform(X_test)

In [71]:
# Collects one {'model', 'Accuracy'} record per classifier for the TF-IDF features.
result_tfid = list()

In [72]:
# Start from an empty list every time so that re-running this cell replaces the
# scores instead of appending a second copy of each row.
result_tfid = []

# Re-fit the same estimator objects on the TF-IDF features and record test accuracy.
# After this loop the entries of `model` hold the TF-IDF-trained versions.
for name, mdl in model.items():
    mdl.fit(x_train_tfid, y_train)
    y_prediction = mdl.predict(x_test_tfid)
    acc = accuracy_score(y_test, y_prediction)
    result_tfid.append({
        'model': name,
        'Accuracy': round(acc, 4),
    })

In [73]:
# Blank line first, then the list of TF-IDF accuracy records.
print()
print(result_tfid)


[{'model': 'NB', 'Accuracy': 0.6609}, {'model': 'LR', 'Accuracy': 0.8616}, {'model': 'SVM', 'Accuracy': 0.8512}]


In [74]:
# The estimators in `model` are shared between the BoW and TF-IDF experiments, so
# model['LR'] holds whichever fit ran last. Re-fit it on the TF-IDF features right
# before export so the saved classifier always matches the saved vectorizer, even
# if the cells above were executed out of order.
model['LR'].fit(x_train_tfid, y_train)

# Persist the classifier together with the vectorizer that produced its features.
joblib.dump(model['LR'], 'emotion_model.pkl')
joblib.dump(tfid_vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']