In [3]:
import pandas as pd
import numpy as np
import string
import nltk

# Loading the dataset

In [4]:
# Location of the raw CSV file.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists.
DATA_PATH = "C:\\Users\\user\\Downloads\\nlp_dataset.csv"

df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear
...,...,...
5932,i begun to feel distressed for you,fear
5933,i left feeling annoyed and angry thinking that...,anger
5934,i were to ever get married i d have everything...,joy
5935,i feel reluctant in applying there because i w...,fear


In [5]:
# Preview the first five rows of the dataset.
df.head(n=5)

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [6]:
# Preview the last five rows of the dataset.
df.tail(n=5)

Unnamed: 0,Comment,Emotion
5932,i begun to feel distressed for you,fear
5933,i left feeling annoyed and angry thinking that...,anger
5934,i were to ever get married i d have everything...,joy
5935,i feel reluctant in applying there because i w...,fear
5936,i just wanted to apologize to you because i fe...,anger


# Preprocessing
## 1. convert to lowercase and removing punctuation
Converting text to lowercase standardizes words by removing case sensitivity. Removing punctuation eliminates symbols that don't contribute to meaning, simplifying text analysis. Both steps improve the consistency of the data fed to the models.

In [7]:
# Translation table that deletes every ASCII punctuation character. Unlike a
# hand-built regex character class, it needs no escaping of `\`, `]`, `^` or `-`,
# so every character in string.punctuation is reliably removed.
punctuation_table = str.maketrans('', '', string.punctuation)

# Assign the cleaned text back to the DataFrame: the .str methods return new
# Series, so without the assignment `df` would be left unchanged.
df['Comment'] = df['Comment'].str.translate(punctuation_table).str.lower()
df['Emotion'] = df['Emotion'].str.translate(punctuation_table).str.lower()

# Display the cleaned labels as the cell output.
df['Emotion']


0        fear
1       anger
2        fear
3         joy
4        fear
        ...  
5932     fear
5933    anger
5934      joy
5935     fear
5936    anger
Name: Emotion, Length: 5937, dtype: object

In [10]:
# Plain-text dump of the DataFrame; pandas elides the middle rows with "...".
print(df)

                                                Comment Emotion
0     i seriously hate one subject to death but now ...    fear
1                    im so full of life i feel appalled   anger
2     i sit here to write i start to dig out my feel...    fear
3     ive been really angry with r and i feel like a...     joy
4     i feel suspicious if there is no one outside l...    fear
...                                                 ...     ...
5932                 i begun to feel distressed for you    fear
5933  i left feeling annoyed and angry thinking that...   anger
5934  i were to ever get married i d have everything...     joy
5935  i feel reluctant in applying there because i w...    fear
5936  i just wanted to apologize to you because i fe...   anger

[5937 rows x 2 columns]


## 2. Stemming and Lemmatization
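The cells below apply both stemming and lemmatization to each comment. Stemming reduces words to their root forms, such as turning "playing" into "play," while lemmatization transforms words into their dictionary form, like changing "better" to "good." Note that `WordNetLemmatizer.lemmatize` is called here without a part-of-speech argument, so every token is treated as a noun and verbs and adjectives (e.g. "appalled") are left unchanged, as the preview below shows. These techniques help standardize text data, improving NLP model accuracy by reducing word variations.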

In [14]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer


In [15]:
# Fetch the NLTK resources used by the tokenizer and lemmatizer cells below.
# `punkt_tab` is requested in addition to `punkt` because newer NLTK releases
# load the tokenizer models for `word_tokenize` from it; downloading both keeps
# the notebook runnable across NLTK versions.
for resource in ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [16]:
# One shared instance of each normalizer, reused by the cells that follow.
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

In [17]:
# Tokenize each comment, stem every token, and re-join the stems with spaces.
df['stemmed_comment'] = [
    ' '.join(map(stemmer.stem, nltk.word_tokenize(comment)))
    for comment in df['Comment']
]

In [18]:
# Tokenize each comment, lemmatize every token, and re-join with spaces.
df['lemmatized_comment'] = [
    ' '.join(map(lemmatizer.lemmatize, nltk.word_tokenize(comment)))
    for comment in df['Comment']
]

In [19]:
# Compare the original text with its stemmed and lemmatized versions.
df.loc[:, ['Comment', 'stemmed_comment', 'lemmatized_comment']].head()

Unnamed: 0,Comment,stemmed_comment,lemmatized_comment
0,i seriously hate one subject to death but now ...,i serious hate one subject to death but now i ...,i seriously hate one subject to death but now ...
1,im so full of life i feel appalled,im so full of life i feel appal,im so full of life i feel appalled
2,i sit here to write i start to dig out my feel...,i sit here to write i start to dig out my feel...,i sit here to write i start to dig out my feel...
3,ive been really angry with r and i feel like a...,ive been realli angri with r and i feel like a...,ive been really angry with r and i feel like a...
4,i feel suspicious if there is no one outside l...,i feel suspici if there is no one outsid like ...,i feel suspicious if there is no one outside l...


## 3. Tokenization
The cell below uses `nltk.word_tokenize` to split each comment into individual tokens (words). Tokenization breaks sentences down into smaller parts.

In [11]:
from nltk.tokenize import word_tokenize
import nltk

In [20]:
# Store each comment as a list of word tokens.
df['tokenized_comment'] = df['Comment'].map(nltk.word_tokenize)


In [21]:
# Show the original text next to its token list.
df.loc[:, ['Comment', 'tokenized_comment']].head()

Unnamed: 0,Comment,tokenized_comment
0,i seriously hate one subject to death but now ...,"[i, seriously, hate, one, subject, to, death, ..."
1,im so full of life i feel appalled,"[im, so, full, of, life, i, feel, appalled]"
2,i sit here to write i start to dig out my feel...,"[i, sit, here, to, write, i, start, to, dig, o..."
3,ive been really angry with r and i feel like a...,"[ive, been, really, angry, with, r, and, i, fe..."
4,i feel suspicious if there is no one outside l...,"[i, feel, suspicious, if, there, is, no, one, ..."


## 4. Splitting data into training and testing sets

scikit-learn's `train_test_split` is used to divide the dataset into a training set (80%) and a testing set (20%). This process is essential in machine learning to evaluate model performance, as it allows the model to learn from one portion of the data and be tested on another to ensure it generalizes well to new data.

In [29]:
from sklearn.model_selection import train_test_split

# Features are the comment text; targets are the emotion labels.
X, y = df['Comment'], df['Emotion']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# Feature Extraction

## Using Count Vectorizer

A `CountVectorizer` is used to convert the text data into numerical feature vectors. This method counts the occurrences of each word in the text and represents them as a vector, making it easier for machine learning models to analyze and process the text data.

In [30]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training split only, then encode both splits
# against that same vocabulary so their feature columns line up.
vectorizer = CountVectorizer()
vectorizer.fit(X_train)
X_train_vectorized = vectorizer.transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [31]:
# Report (n_documents, n_features) for each bag-of-words matrix.
for split_name, matrix in (
    ("Training", X_train_vectorized),
    ("Testing", X_test_vectorized),
):
    print(f"{split_name} data shape:", matrix.shape)


Training data shape: (4749, 7900)
Testing data shape: (1188, 7900)


## Using TF-IDF vector
A `TfidfVectorizer` is used to convert the text data into numerical feature vectors. This method calculates the Term Frequency-Inverse Document Frequency (TF-IDF) of words, which reflects how important a word is in a document relative to a collection of documents. It helps to downscale common words and emphasize words that are more informative for the analysis.

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [40]:
# Build the TF-IDF representation, limiting the vocabulary to at most 5000 features.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit on the training split only; the test split is encoded with the same
# fitted vocabulary and IDF weights.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [41]:
# Report (n_documents, n_features) for each TF-IDF matrix.
for split_name, matrix in (
    ("Training", X_train_tfidf),
    ("Testing", X_test_tfidf),
):
    print(f"{split_name} data shape (TF-IDF):", matrix.shape)

Training data shape (TF-IDF): (4749, 5000)
Testing data shape (TF-IDF): (1188, 5000)


# Training Models
## Using Naive Bayes

In [42]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the TF-IDF training features.
nb_model = MultinomialNB()
nb_model.fit(X=X_train_tfidf, y=y_train)

In [55]:
# Predict an emotion label for every comment in the held-out test split.
y_pred_nb = nb_model.predict(X=X_test_tfidf)

In [45]:
from sklearn.metrics import accuracy_score, classification_report

# Evaluate the Naive Bayes predictions against the true test labels.
# `y_pred_nb` is the name the prediction cell defines; the previous `y_pred`
# was not defined anywhere in the notebook and only worked via leftover
# kernel state.
accuracy = accuracy_score(y_test, y_pred_nb)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred_nb))

Accuracy: 90.74%

Classification Report:
               precision    recall  f1-score   support

       anger       0.88      0.94      0.91       392
        fear       0.93      0.89      0.91       416
         joy       0.92      0.89      0.91       380

    accuracy                           0.91      1188
   macro avg       0.91      0.91      0.91      1188
weighted avg       0.91      0.91      0.91      1188



## Using Support Vector Machine

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC


In [49]:
# Same feature/target columns as selected before the train/test split;
# the split itself is not recomputed here.
X, y = df['Comment'], df['Emotion']

In [50]:
# Train a linear-kernel support vector classifier on the TF-IDF training features.
svm_model = SVC(kernel='linear')
svm_model.fit(X=X_train_tfidf, y=y_train)

In [58]:
# Predict an emotion label for every comment in the held-out test split.
y_pred_svm = svm_model.predict(X=X_test_tfidf)

In [52]:
# Evaluate the SVM predictions against the true test labels.
# `y_pred_svm` is the name the prediction cell defines; the previous `y_pred`
# was not defined anywhere in the notebook and only worked via leftover
# kernel state.
accuracy = accuracy_score(y_test, y_pred_svm)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred_svm))

Accuracy: 93.77%

Classification Report:
               precision    recall  f1-score   support

       anger       0.91      0.95      0.93       392
        fear       0.96      0.92      0.94       416
         joy       0.94      0.95      0.94       380

    accuracy                           0.94      1188
   macro avg       0.94      0.94      0.94      1188
weighted avg       0.94      0.94      0.94      1188



# Model Comparison

In [53]:
from sklearn.metrics import f1_score

## Naive Bayes

In [57]:
# Compute every Naive Bayes metric first, then print the summary.
nb_accuracy = accuracy_score(y_test, y_pred_nb)
# Weighted averaging combines the per-class F1 scores in proportion to class support.
nb_f1 = f1_score(y_test, y_pred_nb, average='weighted')
nb_report = classification_report(y_test, y_pred_nb)

print("Naive Bayes Model Evaluation:")
print(f"Naive Bayes Accuracy: {nb_accuracy * 100:.2f}%")
print(f"Naive Bayes F1-Score: {nb_f1:.2f}")
print("Classification Report for Naive Bayes:\n", nb_report)

Naive Bayes Model Evaluation:
Naive Bayes Accuracy: 90.74%
Naive Bayes F1-Score: 0.91
Classification Report for Naive Bayes:
               precision    recall  f1-score   support

       anger       0.88      0.94      0.91       392
        fear       0.93      0.89      0.91       416
         joy       0.92      0.89      0.91       380

    accuracy                           0.91      1188
   macro avg       0.91      0.91      0.91      1188
weighted avg       0.91      0.91      0.91      1188



## Support Vector Machine

In [59]:
# Compute every SVM metric first, then print the summary.
svm_accuracy = accuracy_score(y_test, y_pred_svm)
# Weighted averaging combines the per-class F1 scores in proportion to class support.
svm_f1 = f1_score(y_test, y_pred_svm, average='weighted')
svm_report = classification_report(y_test, y_pred_svm)

print("\nSupport Vector Machine (SVM) Model Evaluation:")
print(f"SVM Accuracy: {svm_accuracy * 100:.2f}%")
print(f"SVM F1-Score: {svm_f1:.2f}")
print("Classification Report for SVM:\n", svm_report)


Support Vector Machine (SVM) Model Evaluation:
SVM Accuracy: 93.77%
SVM F1-Score: 0.94
Classification Report for SVM:
               precision    recall  f1-score   support

       anger       0.91      0.95      0.93       392
        fear       0.96      0.92      0.94       416
         joy       0.94      0.95      0.94       380

    accuracy                           0.94      1188
   macro avg       0.94      0.94      0.94      1188
weighted avg       0.94      0.94      0.94      1188



# Model Comparison:
## Accuracy:

The Support Vector Machine (SVM) model has an accuracy of 93.77%, outperforming the Naive Bayes model, which has an accuracy of 90.74%.
## F1-Score:

The SVM achieves an F1-Score of 0.94, indicating better overall performance compared to Naive Bayes, which has an F1-Score of 0.91.
## Precision:

In terms of precision, the SVM model demonstrates higher values across all emotion categories, notably achieving a precision of 0.96 for fear, while Naive Bayes's highest precision is 0.93.
## Recall:
The SVM also excels in recall, with scores of 0.95 for anger and joy, compared to Naive Bayes, which has a maximum recall of 0.94, reflecting its superior ability to correctly identify instances of each emotion.
