In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
import neattext.functions as nfx
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Load dataset
df = pd.read_csv("emotion_dataset_raw.csv", encoding="latin1")

# Preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = nfx.remove_stopwords(text)
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    return text

# Apply preprocessing
df['Clean_Text'] = df['Text'].apply(preprocess_text)

# Define features and labels
X = df['Clean_Text']
y = df['Emotion']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define ML models with TF-IDF
models = {
    'Logistic Regression': Pipeline([('tfidf', TfidfVectorizer()), ('lr', LogisticRegression())]),
    'SVM': Pipeline([('tfidf', TfidfVectorizer()), ('svm', SVC(kernel='rbf', C=1.0))]),
    'Random Forest': Pipeline([('tfidf', TfidfVectorizer()), ('rf', RandomForestClassifier(n_estimators=100))])
}

# Train and evaluate models
for name, model in models.items():
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    print(f'{name} Accuracy: {score:.4f}')

# Hyperparameter tuning for SVM
grid_params = {'svm__C': [0.1, 1, 10], 'svm__kernel': ['linear', 'rbf']}
grid_search = GridSearchCV(models['SVM'], grid_params, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
print("Best SVM Parameters:", grid_search.best_params_)
print("Best SVM Score:", grid_search.best_score_)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\KIIT\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.6098
SVM Accuracy: 0.6314
Random Forest Accuracy: 0.6153
Best SVM Parameters: {'svm__C': 10, 'svm__kernel': 'rbf'}
Best SVM Score: 0.6308616684168171


In [21]:
pipe_lr = Pipeline([
    ('tfidf', TfidfVectorizer()),  # Text vectorization
    ('svm', SVC(probability=True))  # SVM classifier with probability estimates
])

# Train the model
pipe_lr.fit(X_train, y_train)

# Save the trained model
joblib.dump(pipe_lr, "emotion_model.pkl")

print("Model trained and saved successfully!")

Model trained and saved successfully!
