In [1]:
import pandas as pd
import numpy as np
import pickle
import string
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords

In [2]:
# Download required NLTK data
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

In [3]:
# Text preprocessing functions
def remove_punc(txt):
    return txt.translate(str.maketrans('', '', string.punctuation))

def remove_numbers(txt):
    return ''.join([i for i in txt if not i.isdigit()])

def remove_stopwords(txt):
    words = txt.split()
    cleaned = [word for word in words if word not in stop_words]
    return ' '.join(cleaned)

def preprocess_text(text):
    """Complete preprocessing pipeline"""
    text = text.lower()
    text = remove_punc(text)
    text = remove_numbers(text)
    text = remove_stopwords(text)
    return text

In [4]:
# Load and preprocess data
print("Loading data...")
df = pd.read_csv('train.txt', sep=';', header=None, names=['text', 'emotions'])

Loading data...


In [21]:
df.head()

Unnamed: 0,text,emotions
0,didnt feel humiliated,0
1,go feeling hopeless damned hopeful around some...,0
2,im grabbing minute post feel greedy wrong,1
3,ever feeling nostalgic fireplace know still pr...,2
4,feeling grouchy,1


In [5]:
print(f"Dataset size: {len(df)} samples")
print(f"Emotions: {df['emotions'].unique()}")

Dataset size: 16000 samples
Emotions: ['sadness' 'anger' 'love' 'surprise' 'fear' 'joy']


In [6]:
# Create emotion to number mapping
unique_emotions = df['emotions'].unique()
emotions_numbers = {em: i for i, em in enumerate(unique_emotions)}
number_to_emotions = {i: em for em, i in emotions_numbers.items()}

In [7]:
# Save emotion mappings
with open('emotion_mappings.pkl', 'wb') as f:
    pickle.dump({'emotions_to_numbers': emotions_numbers, 
                 'numbers_to_emotions': number_to_emotions}, f)

df['emotions'] = df['emotions'].map(emotions_numbers)

In [8]:
# Preprocess text
print("Preprocessing text...")
df['text'] = df['text'].apply(preprocess_text)

Preprocessing text...


In [9]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['emotions'], 
    test_size=0.20, 
    random_state=42
)

In [10]:
# Create TF-IDF vectorizer
print("Creating TF-IDF vectorizer...")
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

Creating TF-IDF vectorizer...


In [11]:
# Train best model with GridSearchCV
print("Training optimized Logistic Regression model...")
param_grid = {
    'C': [0.01, 0.1, 1, 10, 50],
    'solver': ['liblinear', 'lbfgs'],
    'max_iter': [1000]
}

grid_model = GridSearchCV(
    LogisticRegression(),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

grid_model.fit(X_train_tfidf, y_train)

print(f"\nBest Parameters: {grid_model.best_params_}")
print(f"Best CV Accuracy: {grid_model.best_score_:.4f}")

Training optimized Logistic Regression model...
Fitting 5 folds for each of 10 candidates, totalling 50 fits

Best Parameters: {'C': 10, 'max_iter': 1000, 'solver': 'liblinear'}
Best CV Accuracy: 0.8818


In [12]:
# Evaluate on test set
best_model = grid_model.best_estimator_
y_pred = best_model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

Test Set Accuracy: 0.8869


In [15]:
# Show classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=[number_to_emotions[i] for i in sorted(number_to_emotions.keys())]))


Classification Report:
              precision    recall  f1-score   support

     sadness       0.92      0.93      0.93       946
       anger       0.90      0.86      0.88       427
        love       0.84      0.72      0.78       296
    surprise       0.87      0.67      0.76       113
        fear       0.87      0.84      0.85       397
         joy       0.87      0.94      0.91      1021

    accuracy                           0.89      3200
   macro avg       0.88      0.83      0.85      3200
weighted avg       0.89      0.89      0.89      3200



In [16]:
# Save the best model
print("\nSaving model and vectorizer...")
with open('best_emotion_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)

print("\n✓ Model saved as 'best_emotion_model.pkl'")
print("✓ Vectorizer saved as 'tfidf_vectorizer.pkl'")
print("✓ Emotion mappings saved as 'emotion_mappings.pkl'")


Saving model and vectorizer...

✓ Model saved as 'best_emotion_model.pkl'
✓ Vectorizer saved as 'tfidf_vectorizer.pkl'
✓ Emotion mappings saved as 'emotion_mappings.pkl'


In [17]:
# Example prediction function
def predict_emotion(text):
    """Predict emotion from text"""
    # Load saved models
    with open('best_emotion_model.pkl', 'rb') as f:
        model = pickle.load(f)
    with open('tfidf_vectorizer.pkl', 'rb') as f:
        vectorizer = pickle.load(f)
    with open('emotion_mappings.pkl', 'rb') as f:
        mappings = pickle.load(f)