In [4]:
import numpy as np
import pandas as pd
import re
import nltk
import csv  # Import csv for quoting constants
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import pickle
import joblib

In [5]:
# Install imbalanced-learn for SMOTE
!pip install imbalanced-learn

from imblearn.over_sampling import SMOTE

# Download NLTK data once at the beginning
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [6]:

# Both splits are parsed with identical reader settings, so they are declared
# once and unpacked into each read_csv call.
csv_read_options = {
    'encoding': 'ISO-8859-1',
    'engine': 'python',
    'quoting': csv.QUOTE_ALL,
}
train_dataset = pd.read_csv('/content/train_split.csv', **csv_read_options)
test_dataset = pd.read_csv('/content/test_split.csv', **csv_read_options)


In [7]:
# Both splits must provide a 'text' column. When one does not, print the
# columns that were actually loaded before aborting, so the mismatch is easy
# to diagnose.
has_text_column = all(
    'text' in frame.columns for frame in (train_dataset, test_dataset)
)
if not has_text_column:
    print("train_dataset columns:", train_dataset.columns)
    print("test_dataset columns:", test_dataset.columns)
    raise ValueError("The datasets must contain a 'text' column.")

# Discard rows whose 'text' value is missing. Note that this rebinds the two
# dataset names to the filtered frames.
train_dataset, test_dataset = (
    frame.dropna(subset=['text']) for frame in (train_dataset, test_dataset)
)

# NLP helpers: a WordNet lemmatizer and NLTK's English stopword list.
lemmatizer = WordNetLemmatizer()
all_stopwords = stopwords.words('english')

# Function to preprocess text
def preprocess_text(text):
    """Normalise one raw document into a space-separated string of lemmas.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and split on whitespace, and each token is passed
    through the module-level ``lemmatizer``. Stopwords are deliberately kept
    so that the surrounding context of each word is preserved.

    Args:
        text: The raw document as a string.

    Returns:
        The lemmatised tokens joined by single spaces; an empty string when
        the input contains no ASCII letters.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    tokens = letters_only.lower().split()
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

# Clean every document in both splits with the same preprocessing function.
train_corpus = train_dataset['text'].apply(preprocess_text)
test_corpus = test_dataset['text'].apply(preprocess_text)

# TF-IDF over unigrams, bigrams and trigrams. The vectorizer is fitted on the
# training corpus only and then reused, unchanged, to transform the test
# corpus. .toarray() converts the sparse results to dense arrays, which is
# what the classifiers further down are fitted on.
tfidf = TfidfVectorizer(ngram_range=(1, 3))
x_train = tfidf.fit_transform(train_corpus).toarray()
x_test = tfidf.transform(test_corpus).toarray()

# Save the fitted vectorizer for later use. The context manager guarantees the
# file handle is flushed and closed even if pickling raises; passing a bare
# open(...) to pickle.dump would leave the handle open.
with open('/content/c1_TFIDF_Sentiment_Model.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)

# Collapse the emotion columns into one label per row: the name of the column
# holding that row's largest value.
# NOTE(review): on ties idxmax keeps the first matching column, so a row whose
# emotion values are all equal would be labelled 'Joy' -- confirm that this is
# the intended handling for such rows.
emotion_columns = ['Joy', 'Fear', 'Anger', 'Sadness', 'Surprise']
for frame in (train_dataset, test_dataset):
    frame['label'] = frame[emotion_columns].idxmax(axis=1)

# Turn label names into integer codes. The encoder is fitted on the training
# labels and only applied to the test labels.
le = LabelEncoder()
y_train = le.fit_transform(train_dataset['label'])
y_test = le.transform(test_dataset['label'])

# Report how many rows fall into each class, for each split in turn.
for heading, frame in (
    ("Training set class distribution:", train_dataset),
    ("\nTest set class distribution:", test_dataset),
):
    print(heading)
    print(frame['label'].value_counts())


# No resampling is performed here: the "*_resampled" names are plain aliases
# of the original training features and labels, so the counts printed below
# are the unchanged training-set counts (keyed by encoded label).
x_train_resampled = x_train
y_train_resampled = y_train

# Tally the encoded labels that the classifiers will be fitted on.
from collections import Counter
resampled_label_counts = Counter(y_train_resampled)
print("\nResampled training set class distribution:")
print(resampled_label_counts)

# Candidate models keyed by display name. Logistic Regression, Random Forest
# and the SVM use class_weight='balanced' to counter the class imbalance;
# GaussianNB is constructed with its defaults (it takes no class_weight).
classifiers = {
    "Naive Bayes": GaussianNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight='balanced'),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=0, class_weight='balanced'),
    # probability=True makes SVC fit an internal, randomised calibration step;
    # the fixed seed keeps the saved model's probability estimates repeatable,
    # matching the seeded Random Forest above.
    "Support Vector Machine": SVC(kernel='linear', probability=True, class_weight='balanced', random_state=0)
}

# Encoded class ids, in the same order as le.classes_ (LabelEncoder assigns
# codes 0..n_classes-1). Passing them explicitly keeps the confusion matrix at
# a fixed n_classes x n_classes shape and keeps target_names aligned even when
# a class never appears in y_test or in the predictions.
class_ids = np.arange(len(le.classes_))

# Train every candidate model, persist it, and evaluate it on the test split.
for name, classifier in classifiers.items():
    print(f"\nTraining {name}...")
    classifier.fit(x_train_resampled, y_train_resampled)

    # Save the fitted model; spaces in the display name become underscores.
    joblib.dump(classifier, f"/content/{name.replace(' ', '_')}_Sentiment_Model.pkl")

    # Predict on the test set
    y_pred = classifier.predict(x_test)

    # Evaluate performance. zero_division=0 reports 0.0 for classes that were
    # never predicted -- the same value as the default -- without emitting an
    # UndefinedMetricWarning for each such class.
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    report = classification_report(
        y_test,
        y_pred,
        labels=class_ids,
        target_names=le.classes_,
        zero_division=0,
    )
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Confusion Matrix for {name}:\n{cm}")
    print(f"\nClassification Report for {name}:\n{report}")
    print(f"Accuracy for {name}: {accuracy:.4f}")



Training set class distribution:
label
Fear        874
Joy         513
Sadness      89
Surprise     69
Anger        55
Name: count, dtype: int64

Test set class distribution:
label
Fear        222
Joy         126
Surprise     18
Sadness      18
Anger        16
Name: count, dtype: int64

Resampled training set class distribution:
Counter({1: 874, 2: 513, 3: 89, 4: 69, 0: 55})

Training Naive Bayes...
Confusion Matrix for Naive Bayes:
[[  1   9   5   1   0]
 [  1 165  49   3   4]
 [  1  77  43   4   1]
 [  0  13   3   2   0]
 [  0  10   7   1   0]]

Classification Report for Naive Bayes:
              precision    recall  f1-score   support

       Anger       0.33      0.06      0.11        16
        Fear       0.60      0.74      0.67       222
         Joy       0.40      0.34      0.37       126
     Sadness       0.18      0.11      0.14        18
    Surprise       0.00      0.00      0.00        18

    accuracy                           0.53       400
   macro avg       0.30    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Confusion Matrix for Random Forest:
[[  0  15   1   0   0]
 [  0 216   5   1   0]
 [  0 117   9   0   0]
 [  0  17   1   0   0]
 [  0  18   0   0   0]]

Classification Report for Random Forest:
              precision    recall  f1-score   support

       Anger       0.00      0.00      0.00        16
        Fear       0.56      0.97      0.71       222
         Joy       0.56      0.07      0.13       126
     Sadness       0.00      0.00      0.00        18
    Surprise       0.00      0.00      0.00        18

    accuracy                           0.56       400
   macro avg       0.23      0.21      0.17       400
weighted avg       0.49      0.56      0.44       400

Accuracy for Random Forest: 0.5625

Training Support Vector Machine...
Confusion Matrix for Support Vector Machine:
[[  0  10   5   1   0]
 [  1 153  66   2   0]
 [  0  53  72   1   0]
 [  1   8   8   1   0]
 [  0  11   7   0   0]]

Classification Report for Support Vector Machine:
              precision    recall 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
