In [9]:
import pandas as pd
import numpy as np
import seaborn as sns
import neattext.functions as nfx
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import contractions


# Load the dataset
data = pd.read_csv('./data/tweet_emotions.csv')

# Clean the text
# User Handles
data['Clean_Text'] =data['Text'].apply(nfx.remove_userhandles)

# Stopwords
data['Clean_Text'] =data['Clean_Text'].apply(nfx.remove_shortwords)

# Remove Special Characters
data['Clean_Text'] =data['Clean_Text'].apply(nfx.remove_special_characters)

# Remove unnecessary characters and symbols
data['Clean_Text'] = data['Text'].apply(lambda x: re.sub(r"[^a-zA-Z\s]", "", x))
# Remove Numbers
data['Clean_Text'] = data['Clean_Text'].apply(nfx.remove_numbers)

# Function to expand contractions in text
def expand_contractions(text):
    expanded_text = contractions.fix(text)
    return expanded_text

# Apply contraction expansion to 'Clean_Text' column
data['Clean_Text'] = data['Clean_Text'].apply(expand_contractions)

# Features and labels
X_features = data['Clean_Text']
y_labels = data['Emotion']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_features, y_labels, test_size=0.2, random_state=42)

# Create a pipeline with feature scaling and SVM classifier
pipeline = Pipeline([
    ('cv', CountVectorizer()),
    ('scaler', StandardScaler(with_mean=False)),
    ('svm', SVC())
])

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Generate classification report
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)


Accuracy: 0.2755


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report:
               precision    recall  f1-score   support

       anger       0.00      0.00      0.00        19
     boredom       0.00      0.00      0.00        31
       empty       0.00      0.00      0.00       162
  enthusiasm       0.00      0.00      0.00       163
         fun       0.00      0.00      0.00       338
   happiness       0.28      0.02      0.03      1028
        hate       0.00      0.00      0.00       268
        love       0.54      0.08      0.14       762
     neutral       0.28      0.52      0.36      1740
      relief       0.00      0.00      0.00       352
     sadness       0.22      0.01      0.02      1046
    surprise       0.00      0.00      0.00       425
       worry       0.27      0.72      0.39      1666

    accuracy                           0.28      8000
   macro avg       0.12      0.10      0.07      8000
weighted avg       0.23      0.28      0.18      8000

Confusion Matrix:
 [[   0    0    0    0    0    1    0

  _warn_prf(average, modifier, msg_start, len(result))
