In [None]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


In [57]:
# Load the two splits used throughout the notebook. The cells below read a
# 'Sentence' text column and an 'Emotion' label column from both frames.
# Training split: everything that gets fitted is fitted on this frame.
train_df = pd.read_csv('../datasets/Train_Test_data/Training_dataset.csv')
# Evaluation split: only transformed and predicted on.
test_df = pd.read_csv('../task4/NLP_features 2.csv')

In [58]:
# Show the label vocabulary of each split side by side, so a class that is
# present in only one of them is easy to spot before any model is evaluated.
train_labels = train_df['Emotion'].unique()
test_labels = test_df['Emotion'].unique()
print("Unique labels in train_df:", train_labels)
print("Unique labels in test_df:", test_labels)

Unique labels in train_df: ['surprise' 'sadness' 'disgust' 'fear' 'anger' 'neutral' 'happiness']
Unique labels in test_df: ['happiness' 'fear' 'sadness' 'surprise' 'neutral' 'anger' 'disgust']


In [None]:
# Bag-of-words features. The vocabulary is learned from the training sentences
# only; the test sentences are just projected onto that vocabulary.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df['Sentence'])
X_test = vectorizer.transform(test_df['Sentence'])

# Labels
y_train = train_df['Emotion']
y_test = test_df['Emotion']

# GaussianNB needs dense input, so convert the sparse count matrices.
# NOTE: this materialises full (n_samples x vocabulary_size) arrays.
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# Train Gaussian Naive Bayes on the training split, then predict the test split.
gnb = GaussianNB()
gnb.fit(X_train_dense, y_train)
y_pred = gnb.predict(X_test_dense)

# Evaluate the model.
# `labels=` pins the report rows to the training classes. Passing only
# `target_names=` relies on every class showing up in y_test or y_pred and
# fails with a size mismatch when one does not.
# `zero_division=0` reports 0.0 (without a warning) for undefined metrics.
class_labels = np.unique(y_train)
report = classification_report(y_test, y_pred, labels=class_labels, zero_division=0)
print(report)


              precision    recall  f1-score   support

       anger       0.07      0.13      0.09        47
     disgust       0.01      0.50      0.03         2
        fear       0.02      0.18      0.04        17
   happiness       0.48      0.30      0.37       250
     neutral       0.36      0.15      0.21       255
     sadness       0.08      0.35      0.13        31
    surprise       0.22      0.09      0.13       147

    accuracy                           0.20       749
   macro avg       0.18      0.24      0.14       749
weighted avg       0.33      0.20      0.23       749



---

# Predictions using the Multinomial Naive Bayes model

In [60]:
# NOTE: despite its name, `gnb` is rebound here to a MultinomialNB; the name is
# kept unchanged in case later cells refer to it.
gnb = MultinomialNB()

# Train Multinomial Naive Bayes on the sparse count matrices and predict the test split.
y_pred = gnb.fit(X_train, y_train).predict(X_test)

# Evaluate the model.
# The class names come from `y_train`; the previous `np.unique(y)` referenced a
# name that is never defined in this notebook and raised NameError on a fresh kernel.
# `labels=` pins the report rows to the training classes, and `zero_division=0`
# reports 0.0 (without a warning) for classes that are never predicted.
class_labels = np.unique(y_train)
report = classification_report(y_test, y_pred, labels=class_labels, zero_division=0)
print(report)



              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        47
     disgust       0.00      0.00      0.00         2
        fear       0.05      0.24      0.08        17
   happiness       0.58      0.35      0.44       250
     neutral       0.41      0.32      0.36       255
     sadness       0.08      0.42      0.13        31
    surprise       0.25      0.24      0.24       147

    accuracy                           0.30       749
   macro avg       0.19      0.22      0.18       749
weighted avg       0.39      0.30      0.32       749



---

# Predictions using Multinomial Naive Bayes with SMOTE to handle class imbalance

In [61]:
# Oversample the training data only; the test split is left untouched.
# The fixed random_state keeps the resampling reproducible.
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train_dense, y_train)

# Train Multinomial Naive Bayes on the resampled training set.
mnb = MultinomialNB()
mnb.fit(X_train_sm, y_train_sm)

# Predict on the original (non-resampled) test data.
y_pred = mnb.predict(X_test_dense)

# Evaluate the model.
# The class names come from `y_train`; the previous `np.unique(y)` referenced a
# name that is never defined in this notebook and raised NameError on a fresh kernel.
# `labels=` pins the report rows to the training classes, and `zero_division=0`
# reports 0.0 (without a warning) for undefined metrics.
class_labels = np.unique(y_train)
report = classification_report(y_test, y_pred, labels=class_labels, zero_division=0)
print(report)

              precision    recall  f1-score   support

       anger       0.07      0.09      0.08        47
     disgust       0.00      0.00      0.00         2
        fear       0.04      0.18      0.06        17
   happiness       0.54      0.40      0.46       250
     neutral       0.35      0.20      0.26       255
     sadness       0.07      0.29      0.11        31
    surprise       0.25      0.19      0.22       147

    accuracy                           0.26       749
   macro avg       0.19      0.19      0.17       749
weighted avg       0.36      0.26      0.29       749



---

# Apply tokenization and lemmatization to sentences

In [62]:
# Fetch the NLTK resources requested by this notebook, then build the
# lemmatizer that `lemmatize_text` uses.
# NOTE(review): newer NLTK releases may also require the 'punkt_tab' resource
# for word_tokenize; confirm against the installed version.
for nltk_resource in ('punkt', 'wordnet'):
    nltk.download(nltk_resource)

lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Return `text` with each token replaced by its lemma.

    The text is split with `word_tokenize`, every token is passed through the
    module-level `lemmatizer`, and the results are joined with single spaces.
    No part-of-speech tag is supplied, so the lemmatizer's default applies.

    Args:
        text: The sentence to process.

    Returns:
        A single space-separated string of lemmatized tokens.
    """
    return ' '.join(map(lemmatizer.lemmatize, word_tokenize(text)))

# Apply lemmatization to the training and test data; the result is stored in a
# new column, leaving the original 'Sentence' column intact.
train_df['Lemmatized_Sentence'] = train_df['Sentence'].apply(lemmatize_text)
test_df['Lemmatized_Sentence'] = test_df['Sentence'].apply(lemmatize_text)

# Vectorize the lemmatized sentences (vocabulary learned from training data only).
# NOTE: this refits the shared `vectorizer` in place, so from this cell onwards
# its vocabulary is the lemmatized one.
X_train_lemmatized = vectorizer.fit_transform(train_df['Lemmatized_Sentence'])
X_test_lemmatized = vectorizer.transform(test_df['Lemmatized_Sentence'])

# Train Multinomial Naive Bayes on lemmatized data.
mnb_lemmatized = MultinomialNB()
mnb_lemmatized.fit(X_train_lemmatized, y_train)

# Predict on test data with lemmatized sentences.
y_pred_lemmatized = mnb_lemmatized.predict(X_test_lemmatized)

# Evaluate the model with lemmatized sentences.
# The class names come from `y_train`; the previous `np.unique(y)` referenced a
# name that is never defined in this notebook and raised NameError on a fresh kernel.
# `labels=` pins the report rows to the training classes, and `zero_division=0`
# reports 0.0 (without a warning) for classes that are never predicted.
class_labels = np.unique(y_train)
report_lemmatized = classification_report(
    y_test, y_pred_lemmatized, labels=class_labels, zero_division=0
)
print(report_lemmatized)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Beheerder\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Beheerder\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        47
     disgust       0.00      0.00      0.00         2
        fear       0.04      0.18      0.06        17
   happiness       0.57      0.33      0.42       250
     neutral       0.40      0.33      0.36       255
     sadness       0.08      0.45      0.14        31
    surprise       0.24      0.22      0.23       147

    accuracy                           0.29       749
   macro avg       0.19      0.22      0.17       749
weighted avg       0.38      0.29      0.31       749



---

# Apply SMOTE and Lemmatizer

In [63]:
# Apply SMOTE to the lemmatized training features only; the test features are
# left untouched. The fixed random_state keeps the resampling reproducible.
smote_lemmatized = SMOTE(random_state=42)
X_train_lemmatized_sm, y_train_lemmatized_sm = smote_lemmatized.fit_resample(
    X_train_lemmatized, y_train
)

# Train Multinomial Naive Bayes on lemmatized data with SMOTE.
mnb_lemmatized_sm = MultinomialNB()
mnb_lemmatized_sm.fit(X_train_lemmatized_sm, y_train_lemmatized_sm)

# Predict on test data with lemmatized sentences.
y_pred_lemmatized_sm = mnb_lemmatized_sm.predict(X_test_lemmatized)

# Evaluate the model with lemmatized sentences and SMOTE.
# The class names come from `y_train`; the previous `np.unique(y)` referenced a
# name that is never defined in this notebook and raised NameError on a fresh kernel.
# `labels=` pins the report rows to the training classes, and `zero_division=0`
# reports 0.0 (without a warning) for undefined metrics.
class_labels = np.unique(y_train)
report_lemmatized_sm = classification_report(
    y_test, y_pred_lemmatized_sm, labels=class_labels, zero_division=0
)
print(report_lemmatized_sm)

              precision    recall  f1-score   support

       anger       0.07      0.09      0.08        47
     disgust       0.00      0.00      0.00         2
        fear       0.03      0.12      0.04        17
   happiness       0.53      0.39      0.45       250
     neutral       0.36      0.21      0.26       255
     sadness       0.08      0.32      0.13        31
    surprise       0.23      0.17      0.20       147

    accuracy                           0.26       749
   macro avg       0.18      0.18      0.16       749
weighted avg       0.35      0.26      0.29       749



---

# Apply TF-IDF vectorization

In [65]:
# Use TfidfVectorizer instead of CountVectorizer with Multinomial Naive Bayes.
# A dedicated vectorizer instance is used, so the count-based `vectorizer`
# from the earlier cells is not touched.
tfidf_vectorizer = TfidfVectorizer()

# Vectorize: weights are learned from the training sentences only.
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['Sentence'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['Sentence'])

# Train Multinomial Naive Bayes on the TF-IDF features.
mnb_tfidf = MultinomialNB()
mnb_tfidf.fit(X_train_tfidf, y_train)

# Predict on test data with the TF-IDF features.
y_pred_tfidf = mnb_tfidf.predict(X_test_tfidf)

# Evaluate the model with TfidfVectorizer.
# The class names come from `y_train`; the previous `np.unique(y)` referenced a
# name that is never defined in this notebook and raised NameError on a fresh kernel.
# `labels=` pins the report rows to the training classes. `zero_division=0`
# reports 0.0 for classes that are never predicted instead of emitting the
# undefined-metric warnings this cell previously produced.
class_labels = np.unique(y_train)
report_tfidf = classification_report(
    y_test, y_pred_tfidf, labels=class_labels, zero_division=0
)
print(report_tfidf)



              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        47
     disgust       0.00      0.00      0.00         2
        fear       0.05      0.24      0.08        17
   happiness       0.66      0.28      0.39       250
     neutral       0.40      0.40      0.40       255
     sadness       0.10      0.52      0.17        31
    surprise       0.23      0.23      0.23       147

    accuracy                           0.30       749
   macro avg       0.21      0.24      0.18       749
weighted avg       0.41      0.30      0.32       749



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


---