In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns




In [3]:
# train.txt is read as ';'-separated with no header row, so the two column
# names are supplied explicitly.
df = pd.read_csv(
    'train.txt',
    sep=';',
    header=None,
    names=['text', 'emotion'],
)
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [4]:
# Encode each emotion label as an integer, numbered in order of first appearance.
#
# The dtype guard keeps this cell idempotent: without it, re-running the cell
# would rebuild `emotion_numbers` from the already-encoded integers
# ({0: 0, 1: 1, ...}) and predict_emotion's reverse lookup would then return
# numbers instead of emotion names.
if not pd.api.types.is_numeric_dtype(df['emotion']):
    unique_emotions = df['emotion'].unique()
    emotion_numbers = {emotion: code for code, emotion in enumerate(unique_emotions)}
    df['emotion'] = df['emotion'].map(emotion_numbers)
print(emotion_numbers)


{'sadness': 0, 'anger': 1, 'love': 2, 'surprise': 3, 'fear': 4, 'joy': 5}


In [5]:
# Lowercase every text with the vectorised string accessor rather than a
# per-row Python lambda.
df['text'] = df['text'].str.lower()
import string 
# Built once instead of on every call; with a third argument, str.maketrans
# maps each listed character to None, i.e. deletes it during translate().
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return `text` with every character in `string.punctuation` removed.

    Only the ASCII punctuation characters listed in `string.punctuation` are
    stripped; any other symbol (for example curly quotes) is left in place.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from every row of the text column
df['text'] = df['text'].map(remove_punctuation)
def remove_numbers(text):
    """Return `text` without the characters for which `str.isdigit()` is true."""
    kept = []
    for ch in text:
        if ch.isdigit():
            continue
        kept.append(ch)
    return ''.join(kept)
# Strip digit characters from every row, then display the frame
df['text'] = df['text'].map(remove_numbers)
df

Unnamed: 0,text,emotion
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,5
15998,i feel like this was such a rude comment and i...,1


In [6]:
def remove_emojis(text):
    """Return `text` with every non-ASCII character removed.

    The filter is `str.isascii()`, so besides emojis this also drops any other
    non-ASCII character (accented letters, curly quotes, ...).
    """
    # A single join replaces the repeated string concatenation in a loop.
    return ''.join(char for char in text if char.isascii())
# Drop non-ASCII characters from every row of the text column
df['text'] = df['text'].map(remove_emojis)

In [7]:
# Next, remove stop words such as "the", "is", and "and", because they carry little information for ML-based NLP.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages this notebook relies on: the punkt tokenizer
# data (under both package names) and the stop-word lists.
for nltk_package in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_package)

# English stop words, kept in a set for the membership checks in remove_stopwords
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Tokenize `text` and return the non-stop-word tokens joined by single spaces."""
    kept_words = []
    for word in word_tokenize(text):
        # membership is checked on the lowercased token
        if word.lower() in stop_words:
            continue
        kept_words.append(word)
    return ' '.join(kept_words)

# Run stop-word removal over every row of the text column
df['text'] = df['text'].map(remove_stopwords)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Person\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Person\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Person\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Preview the first rows of the frame after the cleaning steps above
df.head()

Unnamed: 0,text,emotion
0,didnt feel humiliated,0
1,go feeling hopeless damned hopeful around some...,0
2,im grabbing minute post feel greedy wrong,1
3,ever feeling nostalgic fireplace know still pr...,2
4,feeling grouchy,1


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['emotion'],
    test_size=0.2,
    random_state=42,
)

# --- Bag-of-words features: vocabulary is learned on the training split only ---
bgw_vectorizer = CountVectorizer()
x_train_cv = bgw_vectorizer.fit_transform(X_train)
x_test_cv = bgw_vectorizer.transform(X_test)

from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
y_pred_cv = model.fit(x_train_cv, y_train).predict(x_test_cv)
accuracy_cv = accuracy_score(y_test, y_pred_cv)
print(f'Accuracy with CountVectorizer: {accuracy_cv}')

# --- TF-IDF features ---
idf_vectorizer = TfidfVectorizer()
x_train_idf = idf_vectorizer.fit_transform(X_train)
x_test_idf = idf_vectorizer.transform(X_test)
# The same MultinomialNB object is refitted here, so after this cell `model`
# holds the TF-IDF fit, not the bag-of-words one.
y_pred_idf = model.fit(x_train_idf, y_train).predict(x_test_idf)
accuracy_idf = accuracy_score(y_test, y_pred_idf)
print(f'Accuracy with TfidfVectorizer: {accuracy_idf}')



Accuracy with CountVectorizer: 0.7678125
Accuracy with TfidfVectorizer: 0.6609375


In [None]:
from sklearn.linear_model import LogisticRegression
# Train one logistic-regression classifier per feature representation.
#
# Each representation gets its own estimator. Previously a single object was
# fitted on the counts and then refitted on TF-IDF, which left `logistic_model`
# trained on TF-IDF values while predict_emotion feeds it raw counts from
# `bgw_vectorizer` by default. `logistic_model` now stays the count-based model.
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(x_train_cv, y_train)
y_pred_logistic_cv = logistic_model.predict(x_test_cv)
accuracy_logistic_cv = accuracy_score(y_test, y_pred_logistic_cv)
print(f'Accuracy with Logistic Regression: {accuracy_logistic_cv}')

# Separate estimator for the TF-IDF features (pair it with `idf_vectorizer`).
logistic_model_idf = LogisticRegression(max_iter=1000)
logistic_model_idf.fit(x_train_idf, y_train)
y_pred_logistic = logistic_model_idf.predict(x_test_idf)
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
print(f'Accuracy with Logistic Regression: {accuracy_logistic}')

Accuracy with Logistic Regression: 0.88875
Accuracy with Logistic Regression: 0.8615625


In [27]:
def predict_emotion(text, vectorizer=bgw_vectorizer, model=logistic_model, emotion_map=emotion_numbers):
    """Predict the emotion label for a single raw text string.

    Parameters
    ----------
    text : str
        Raw input sentence.
    vectorizer : fitted vectorizer, default `bgw_vectorizer`
        Turns the cleaned text into a one-row feature matrix via `.transform`.
        It should be the vectorizer whose features `model` was trained on.
    model : fitted classifier, default `logistic_model`
        Its `.predict` result is decoded through `emotion_map`.
    emotion_map : dict, default `emotion_numbers`
        Mapping of emotion name -> integer code; inverted here to decode the
        predicted code.

    Returns
    -------
    The key of `emotion_map` whose value equals the predicted code.

    Notes
    -----
    The default arguments are bound when this function is defined, so re-run
    this cell after rebinding any of those names.
    """
    # Same cleaning steps, in the same order, as applied to the training column
    cleaned = text.lower()
    cleaned = remove_punctuation(cleaned)
    cleaned = remove_numbers(cleaned)
    cleaned = remove_emojis(cleaned)
    cleaned = remove_stopwords(cleaned)

    # The vectorizer expects an iterable of documents, hence the one-element list
    features = vectorizer.transform([cleaned])
    predicted_code = model.predict(features)[0]

    # Invert name -> code so the predicted code can be turned back into a name
    code_to_emotion = {code: name for name, code in emotion_map.items()}
    return code_to_emotion[predicted_code]

# Example usage:
print(predict_emotion("i feel irritated"))

1
anger
