In [215]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 

In [216]:
# Load the training set from train.txt (path is relative to the working directory).
# The file is ';'-separated and has no header row, so column names are supplied here.
df = pd.read_csv('train.txt',sep=';',header=None,names=['text','emotions'])

In [217]:
# Preview the first rows to confirm both columns parsed as expected.
df.head()

Unnamed: 0,text,emotions
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [218]:
# Count missing values per column.
df.isnull().sum()

text        0
emotions    0
dtype: int64

In [219]:
# Encode each emotion label as an integer id, numbered in order of first appearance.
# The guard keeps this cell idempotent: once the column is numeric, re-running it
# would rebuild `emotion_numbers` as {0: 0, 1: 1, ...} and lose the label names that
# predict_emotion needs for its id -> name lookup.
if not pd.api.types.is_numeric_dtype(df['emotions']):
    unique_emotions = df['emotions'].unique()
    emotion_numbers = {emo: idx for idx, emo in enumerate(unique_emotions)}
    df['emotions'] = df['emotions'].map(emotion_numbers)

In [220]:
# Display the frame to check that the emotions column now holds integer ids.
df

Unnamed: 0,text,emotions
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
...,...,...
15995,i just had a very brief time in the beanbag an...,0
15996,i am now turning and i feel pathetic that i am...,0
15997,i feel strong and good overall,5
15998,i feel like this was such a rude comment and i...,1


In [221]:
# Lowercase every text.
# The vectorized .str accessor replaces the per-row Python lambda.
df['text'] = df['text'].str.lower()

In [222]:
# Removing punctuation
import string
# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call to remove_punc.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punc(txt):
    """Return ``txt`` with every character of ``string.punctuation`` removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is deleted;
    all other characters, including whitespace, are kept unchanged.

    Args:
        txt: The string to clean.

    Returns:
        A new string without ASCII punctuation.
    """
    return txt.translate(_PUNCTUATION_TABLE)


In [223]:
# Strip punctuation from every row of the text column.
df['text'] = df['text'].apply(remove_punc)

In [224]:
# Removing Numbers
def remove_numbers(txt):
    """Return ``txt`` with every digit character removed.

    A character is dropped when ``str.isdigit()`` is true for it, so Unicode
    digits such as superscripts are removed along with ASCII 0-9.

    Args:
        txt: The string to clean.

    Returns:
        A new string containing only the non-digit characters, in order.
    """
    # A single join avoids rebuilding the string for every kept character.
    return ''.join(ch for ch in txt if not ch.isdigit())

# Strip digits from every row of the text column.
df['text'] = df['text'].apply(remove_numbers)

In [225]:
# Remove Emojis
def remove_emojis(txt):
    """Return ``txt`` with every non-ASCII character removed.

    Despite the name, the filter is not emoji-specific: any character for
    which ``str.isascii()`` is false is dropped, including accented letters.

    Args:
        txt: The string to clean.

    Returns:
        A new string containing only the ASCII characters, in order.
    """
    # A single join avoids rebuilding the string for every kept character.
    return ''.join(ch for ch in txt if ch.isascii())

# Drop non-ASCII characters (emojis included) from every row of the text column.
df['text'] = df['text'].apply(remove_emojis)

In [226]:
import nltk

In [227]:
# Importing stopwords to remove commonly used words (like "the", "is", "in") that do not add meaningful information during text analysis
from nltk.corpus import stopwords

# Importing word_tokenize to split input text into individual words for processing
from nltk.tokenize import word_tokenize


In [228]:
# Fetch the NLTK resources used below (a no-op when they are already up to date):
#   punkt / punkt_tab - tokenizer models behind word_tokenize
#   stopwords         - the English stopword list
# Recent NLTK releases load 'punkt_tab' instead of the pickled 'punkt' models,
# so both are requested to keep the notebook runnable across versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [229]:
# English stopwords used to filter out common words during token processing.
# Stored as a set so the per-token membership test in the filter is a hash lookup.
stop_words = set(stopwords.words('english'))

In [230]:
# Inspect one sample before stopword removal.
# A single .loc[row, column] lookup replaces the chained df.loc[1]['text'] indexing.
df.loc[1, 'text']

'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'

In [231]:
# Removing Stopwords
def remove(txt):
    """Return ``txt`` without English stopwords.

    The text is split with ``word_tokenize``. Tokens found in the module-level
    ``stop_words`` set are discarded, and the rest are joined back together
    with single spaces.
    """
    kept_tokens = [token for token in word_tokenize(txt) if token not in stop_words]
    return ' '.join(kept_tokens)


In [232]:
# Remove stopwords from every row of the text column.
df['text'] = df['text'].apply(remove)

In [233]:
# Inspect the same sample after stopword removal.
# A single .loc[row, column] lookup replaces the chained df.loc[1]['text'] indexing.
df.loc[1, 'text']

'go feeling hopeless damned hopeful around someone cares awake'

----

In [234]:
# Preview the cleaned text alongside the integer labels.
df.head()

Unnamed: 0,text,emotions
0,didnt feel humiliated,0
1,go feeling hopeless damned hopeful around some...,0
2,im grabbing minute post feel greedy wrong,1
3,ever feeling nostalgic fireplace know still pr...,2
4,feeling grouchy,1


In [235]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. random_state pins the shuffle so the
# split is the same on every run. No stratify argument is passed, so the split
# is not stratified by label.
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['emotions'], test_size=0.20, random_state=42)


In [236]:
# Display the training texts; the index shows the original row positions after shuffling.
X_train

676      refers course though cant help feeling somehow...
12113                im starting feel im suffering fatigue
7077     feel like probably would liked book little bit...
13005                                  really feel awkward
12123    im feeling little grumpy today lame weather te...
                               ...                        
13418    love leave reader feeling confused slightly de...
5390                                         feel delicate
860                          starting feel little stressed
15795             feel stressed tired worn shape neglected
7270         feel someone rude wrongly done something lose
Name: text, Length: 12800, dtype: object

----

## **BAG OF WORDS (`BOW`) Implementation**

**📌 Final rule, in plain words:
✅ On the training data → call `fit_transform()`
✅ On the test data → call only `transform()`, so nothing is learned from the test set**

In [237]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Fit the bag-of-words vectorizer on the training texts only, then encode both
# splits with it; the test split is transformed with the already-fitted vectorizer.
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# Train a multinomial Naive Bayes classifier on the bag-of-words features.
nb_model = MultinomialNB()
nb_model.fit(X_train_bow,y_train)

In [238]:
# Score the bag-of-words Naive Bayes model on the held-out split.
pred_bow = nb_model.predict(X_test_bow)
print('Bag Of Words Accuracy Score:', accuracy_score(y_test,pred_bow))

Bag Of Words Accuracy Score: 0.7678125


---


In [239]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Fit the TF-IDF vectorizer on the training texts only, then encode both splits
# with it. The logistic regression and SVM cells below reuse these features.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train a second multinomial Naive Bayes classifier, this time on TF-IDF features.
nb2_model = MultinomialNB()
nb2_model.fit(X_train_tfidf,y_train)

In [240]:
# Predict labels for the held-out split with the TF-IDF Naive Bayes model.
y_pred  = nb2_model.predict(X_test_tfidf)

In [241]:
# Accuracy of the TF-IDF Naive Bayes model on the held-out split.
print(accuracy_score(y_test,y_pred))

0.6609375


---

In [242]:
from sklearn.linear_model import LogisticRegression

In [243]:
# Logistic regression classifier; the solver is allowed up to 1000 iterations.
logistic_model = LogisticRegression(max_iter=1000)

In [244]:
# Train on the TF-IDF features of the training split.
logistic_model.fit(X_train_tfidf,y_train)

In [245]:
# Predict labels for the held-out TF-IDF features.
log_pred = logistic_model.predict(X_test_tfidf)

In [246]:
# Accuracy of the logistic regression model on the held-out split.
print(accuracy_score(y_test,log_pred))

0.8615625


----

In [247]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

# 2. Train the SVM model on the TF-IDF features.
# random_state is pinned (matching the seed used for the train/test split) because
# LinearSVC's solver can shuffle the data internally; without it, repeated runs
# may not reproduce the same model.
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train_tfidf, y_train)

# 3. Make predictions on the held-out split
svm_predictions = svm_model.predict(X_test_tfidf)

# 4. Accuracy score
accuracy = accuracy_score(y_test, svm_predictions)
print("SVM Accuracy:", accuracy)




SVM Accuracy: 0.891875


---

In [248]:
# Step 1: Preprocessing function (same as training)
def preprocess(text):
    """Clean raw input text with the same steps applied to the training column.

    Steps, in order: lowercase, strip ASCII punctuation, strip digits, strip
    non-ASCII characters, then tokenize and drop English stopwords.

    Args:
        text: The raw sentence to clean.

    Returns:
        The cleaned text, with the remaining tokens joined by single spaces.
    """
    # Calling the training helpers, rather than re-implementing each step, keeps
    # inference in step with training. It also reuses the module-level stopword
    # set instead of rebuilding it on every call.
    text = text.lower()
    text = remove_punc(text)
    text = remove_numbers(text)
    text = remove_emojis(text)
    return remove(text)

# Step 2: Prediction function
def predict_emotion(user_input):
    """Return the emotion name that ``logistic_model`` predicts for ``user_input``.

    The sentence is cleaned with ``preprocess``, encoded with the fitted
    ``tfidf_vectorizer``, and classified. The predicted integer id is then
    mapped back to its label through the inverse of ``emotion_numbers``.
    """
    # The vectorizer is given a list of documents, so the single sentence is wrapped in one.
    features = tfidf_vectorizer.transform([preprocess(user_input)])
    predicted_id = logistic_model.predict(features)[0]

    # emotion_numbers maps name -> id; invert it to look the name up by id.
    id_to_emotion = {number: name for name, number in emotion_numbers.items()}
    return id_to_emotion[predicted_id]


In [249]:
# Interactive demo: keep classifying sentences until the user types "stop".
while True:

    # Surrounding whitespace is stripped so " stop " and blank lines are handled cleanly.
    user_text = input("Enter a sentence to predict emotion: ").strip()

    # The stop command is matched case-insensitively.
    if user_text.lower() == 'stop':
        print("Prediction Stopped...")
        break

    # Nothing to classify: prompt again rather than predicting from empty text.
    if not user_text:
        continue

    emotion = predict_emotion(user_text)
    print(user_text)
    print("Predicted Emotion:", emotion)


this is the good things about this project
Predicted Emotion: joy
Prediction Stopped...


In [250]:
import pickle

# Persist the fitted TF-IDF vectorizer so new text can be encoded the same way as the training data.
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(tfidf_vectorizer, file)

# Persist the LinearSVC under the file name that advertises it. Writing
# logistic_model into 'svm_model.pkl' would leave the file holding a different
# estimator than its name claims.
with open('svm_model.pkl', 'wb') as file:
    pickle.dump(svm_model, file)

# Persist the label -> id mapping too: the classifier predicts integer ids, and
# this mapping is required to turn them back into emotion names.
with open('emotion_numbers.pkl', 'wb') as file:
    pickle.dump(emotion_numbers, file)

