In [47]:
import pandas as pd
import re
import nltk

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [48]:
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [49]:
df = pd.read_csv("tweet_emotions.csv")
df.head()


Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [50]:
df = df[['content', 'sentiment']]
df.columns = ['text', 'emotion']


In [51]:
label_map = {
    'happiness': 'happy',
    'sadness': 'sad'
}

df['emotion'] = df['emotion'].replace(label_map)


In [52]:
allowed_emotions = ['happy', 'sad', 'anger', 'fear', 'neutral']
df = df[df['emotion'].isin(allowed_emotions)]

print("Emotion counts before balancing:")
print(df['emotion'].value_counts())


Emotion counts before balancing:
emotion
neutral    8638
happy      5209
sad        5165
anger       110
Name: count, dtype: int64


In [53]:
min_count = df['emotion'].value_counts().min()
print("Samples per emotion after balancing:", min_count)

df = df.groupby('emotion').apply(
    lambda x: x.sample(n=min_count, random_state=42)
).reset_index(drop=True)

print("Balanced dataset:")
print(df['emotion'].value_counts())


Samples per emotion after balancing: 110
Balanced dataset:
emotion
anger      110
happy      110
neutral    110
sad        110
Name: count, dtype: int64


  df = df.groupby('emotion').apply(


In [54]:
def clean_text(text):
    text = text.lower()
    text = re.sub('[^a-zA-Z]', ' ', text)
    words = text.split()
    words = [w for w in words if w not in stopwords.words('english')]
    return ' '.join(words)

df['clean_text'] = df['text'].apply(clean_text)


In [55]:
vectorizer = TfidfVectorizer(
    max_features=3000,
    ngram_range=(1, 2)
)

X = vectorizer.fit_transform(df['clean_text'])
y = df['emotion']


In [56]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [57]:
model = LogisticRegression(
    max_iter=1000,
    solver='liblinear'
)

model.fit(X_train, y_train)


In [58]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.375


In [59]:
def predict_emotion(text):
    text = clean_text(text)
    vector = vectorizer.transform([text])

    probs = model.predict_proba(vector)[0]
    emotion = model.classes_[probs.argmax()]
    confidence = probs.max()

    return emotion, round(confidence * 100, 2)


In [61]:
while True:
    user_text = input("Enter text (or 'bye'): ")
    if user_text.lower() == "bye":
        print("Goodbye!")
        break

    emotion, conf = predict_emotion(user_text)
    print("Predicted Emotion:", emotion)
    print("Confidence:", conf, "%")


Enter text (or 'bye'): This is so frustrating 
Predicted Emotion: anger
Confidence: 25.88 %
Enter text (or 'bye'): I am scared about the exam
Predicted Emotion: sad
Confidence: 30.66 %
Enter text (or 'bye'): I am extremely happy today
Predicted Emotion: happy
Confidence: 36.14 %
Enter text (or 'bye'): I ate my dinner
Predicted Emotion: neutral
Confidence: 29.93 %
Enter text (or 'bye'): I love going on vacationss
Predicted Emotion: sad
Confidence: 28.41 %
Enter text (or 'bye'): bye
Goodbye!
