In [26]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import nltk
from nltk.corpus import stopwords
import joblib

In [27]:
# Download NLTK stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nazla\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [28]:
# Load Dataset and display the first few rows of the dataset
df = pd.read_csv('emotion_dataset.csv')
print(df.head())

   Emotion                                               Text  \
0  neutral                                             Why ?    
1      joy    Sage Act upgrade on my to do list for tommorow.   
2  sadness  ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ...   
3      joy   Such an eye ! The true hazel eye-and so brill...   
4      joy  @Iluvmiasantos ugh babe.. hugggzzz for u .!  b...   

                                          Clean_Text  
0                                                NaN  
1                     Sage Act upgrade list tommorow  
2  WAY HOMEGIRL BABY FUNERAL MAN HATE FUNERALS SH...  
3  eye  true hazel eyeand brilliant  Regular feat...  
4    ugh babe hugggzzz u  babe naamazed nga ako e...  


In [29]:
# Preprocess Data
stop_words = set(stopwords.words('english'))

def preprocess_text(Text):
    words = Text.split()
    words = [word.lower() for word in words if word.isalpha() and word.lower() not in stop_words]
    return ' '.join(words)

df['Text'] = df['Text'].apply(preprocess_text)

In [30]:
# Ensure only the six specified categories are included
df = df[df['Emotion'].isin(['neutral', 'anger', 'joy', 'disgust', 'fear', 'sadness', 'surprise'])]

In [31]:
# Encode Labels
label_encoder = LabelEncoder()
df['Emotion'] = label_encoder.fit_transform(df['Emotion'])

In [32]:
# Splitting the Data
X = df['Text']
y = df['Emotion']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
# Build the Model
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

model = MultinomialNB()
model.fit(X_train_vec, y_train)

In [34]:
# Evaluate the Model
y_pred = model.predict(X_test_vec)
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))
print("Accuracy:", accuracy_score(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

       anger       0.64      0.47      0.54       840
     disgust       0.90      0.06      0.11       161
        fear       0.69      0.54      0.60      1072
         joy       0.52      0.84      0.64      2285
     neutral       0.76      0.14      0.23       469
     sadness       0.55      0.55      0.55      1332
    surprise       0.61      0.30      0.40       771

    accuracy                           0.57      6930
   macro avg       0.67      0.41      0.44      6930
weighted avg       0.60      0.57      0.54      6930

Accuracy: 0.5653679653679654


In [35]:
# Save the Model, Vectorizer, and Label Encoder using joblib
joblib.dump(model, 'emotion_detection_model.pkl')
joblib.dump(vectorizer, 'emotion_detection_vectorizer.pkl')
joblib.dump(label_encoder, 'emotion_detection_label_encoder.pkl')

['emotion_detection_label_encoder.pkl']

In [36]:
# Predicting on a new text sample (example)
new_text = "I am very happy today!"
processed_text = preprocess_text(new_text)
processed_text_vec = vectorizer.transform([processed_text])
prediction = model.predict(processed_text_vec)[0]
predicted_emotion = label_encoder.inverse_transform([prediction])[0]
print(f"The predicted emotion for the text '{new_text}' is: {predicted_emotion}")

The predicted emotion for the text 'I am very happy today!' is: joy
