# Importing Libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [2]:
import nltk

In [None]:
# Fetch only the NLTK resources this notebook needs. Calling nltk.download()
# with no arguments opens the interactive downloader, which stalls
# "Restart & Run All" and cannot be used on a headless kernel.
#   punkt / punkt_tab  -> word_tokenize and sent_tokenize (which of the two is
#                         required depends on the installed NLTK release)
#   stopwords          -> nltk.corpus.stopwords
#   wordnet / omw-1.4  -> WordNetLemmatizer
for nltk_resource in ("punkt", "punkt_tab", "stopwords", "wordnet", "omw-1.4"):
    # download() reports a missing/unknown resource by returning False rather
    # than raising, so an id unknown to the installed release is simply skipped.
    nltk.download(nltk_resource, quiet=True)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


# Loading and Preprocessing

In [None]:
import os

# Location of the emotion dataset. Set the NLP_DATASET_PATH environment variable
# to run the notebook on another machine; when it is unset the original absolute
# path is used, so existing setups keep working unchanged.
DATASET_PATH = os.environ.get(
    "NLP_DATASET_PATH",
    "D:/Machine_Learning/Study_Material/nlp_dataset.csv",
)

df = pd.read_csv(DATASET_PATH)
df.head()

#### Convert to lowercase

In [3]:
# Normalise case in both the text column and the label column so that
# differently-cased spellings of the same word/label are treated as equal.
for column in ('Comment', 'Emotion'):
    df[column] = df[column].str.lower()

df.head()

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [4]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


#### Tokenization

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

# Materialise the comment column once; both tokenizers iterate over it.
comment_texts = df['Comment'].tolist()

print("word tokenize")

# One list of word tokens per comment.
tokenized_docs = [word_tokenize(doc) for doc in comment_texts]
# Show only the first few documents; printing every tokenized comment floods
# the cell output and bloats the saved notebook.
print(tokenized_docs[:5])

print("\nSentence tokenization")

# One list of sentences per comment.
sent_token = [sent_tokenize(doc) for doc in comment_texts]
print(sent_token[:5])

#### Punctuation Removal

In [None]:
import re
import string

# Character class matching any single character from string.punctuation.
# re.escape() keeps characters that are special inside a pattern (']', '\',
# '^', '-') literal.
regex = re.compile('[%s]' % re.escape(string.punctuation))

# For every document, strip punctuation from each token and drop tokens that
# consisted of punctuation only (they become the empty string, which is falsy).
tokenized_docs_no_punctuation = [
    [stripped for stripped in (regex.sub('', token) for token in review) if stripped]
    for review in tokenized_docs
]

# Preview the first few documents instead of printing the whole corpus.
print(tokenized_docs_no_punctuation[:5])

#### Removing Stopwords

In [None]:
from nltk.corpus import stopwords

# Build the stop-word lookup once. The previous version called
# stopwords.words('english') for every single token and then searched the
# returned list linearly; a set built up front gives the same filtering result
# with constant-time membership tests.
english_stopwords = set(stopwords.words('english'))

# Keep, per document, only the tokens that are not English stop words.
tokenized_docs_no_stopwords = [
    [word for word in doc if word not in english_stopwords]
    for doc in tokenized_docs_no_punctuation
]

# Preview the first few documents instead of printing the whole corpus.
print(tokenized_docs_no_stopwords[:5])

In [None]:
# Display NLTK's English stop-word list, i.e. the words the stop-word removal
# cell filters out of each document.
stopwords.words('english')

#### Stemming and Lemmatization

In [None]:
# Stemming and Lemmatization
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

porter = PorterStemmer()       # rule-based suffix stripper, kept available for experiments
wordnet = WordNetLemmatizer()  # dictionary-based lemmatizer used below

# Lemmatize every remaining token of every document.
# To compare against stemming, replace wordnet.lemmatize with porter.stem.
# NOTE(review): lemmatize() is called without a part-of-speech argument, so
# NLTK applies its default POS to every token - confirm this is intended.
preprocessed_docs = [
    [wordnet.lemmatize(word) for word in doc]
    for doc in tokenized_docs_no_stopwords
]

# Preview the first few documents instead of printing the whole corpus.
print(preprocessed_docs[:5])

<b>Preprocessing:</b>
<ul>
<li><b>Text Cleaning:</b> Convert the text to lowercase and remove punctuation and special characters.</li>
<li><b>Tokenization:</b> Split the text into individual words (tokens).</li>
<li><b>Stopword Removal:</b> Remove commonly occurring words (like 'the', 'is', 'in') that don’t add much meaning to the classification task.</li>
<li><b>Impact on Model Performance:</b> Cleaning reduces noise in the data, stopword removal focuses the model on more relevant terms, and tokenization lets the model treat individual words as features.</li>
</ul>

# Feature Extraction

In [None]:
# Target labels: the emotion assigned to each comment.
y = df['Emotion']

# Turn the comment text into a TF-IDF weighted document-term matrix, letting
# the vectorizer drop scikit-learn's built-in English stop words.
# NOTE(review): the vectorizer reads df['Comment'] directly, so the
# preprocessed_docs built in the preprocessing cells are not used here.
# NOTE(review): fit_transform runs on the full dataset before the train/test
# split, so the IDF weights are computed with the test comments included.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['Comment'])

X

In [None]:
# Hold out 20% of the samples for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Model Development

In [None]:
# Fit a multinomial Naive Bayes model on the training split
# (fit() returns the estimator itself, so the call can be chained).
classifier = MultinomialNB().fit(X_train, y_train)

# Predicted emotion for every held-out comment
predictions = classifier.predict(X_test)

In [None]:
# SVC is not among the names imported in the notebook's import cell, so it is
# imported here; without this line the cell fails with NameError on a fresh kernel.
from sklearn.svm import SVC

# Train a support vector classifier with a linear kernel on the training split
SvcClassifier = SVC(kernel='linear')
SvcClassifier.fit(X_train, y_train)

# Make predictions on the test set
SvcPredictions = SvcClassifier.predict(X_test)

# Model Comparison

In [None]:
# Share of held-out comments for which Naive Bayes predicted the right emotion
accuracyNB = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy:", accuracyNB)

In [None]:
# Share of held-out comments for which the SVC predicted the right emotion
accuracySVC = accuracy_score(y_true=y_test, y_pred=SvcPredictions)
print("Accuracy:", accuracySVC)

In [14]:
# Per-class precision / recall / F1 for the Naive Bayes predictions,
# printed under a heading line.
print(
    "Classification Report Naive Bayes:",
    classification_report(y_true=y_test, y_pred=predictions),
    sep="\n",
)

Classification Report:
              precision    recall  f1-score   support

       anger       0.88      0.95      0.91       392
        fear       0.92      0.92      0.92       416
         joy       0.94      0.87      0.91       380

    accuracy                           0.91      1188
   macro avg       0.91      0.91      0.91      1188
weighted avg       0.91      0.91      0.91      1188



In [None]:
# Per-class precision / recall / F1 for the SVC predictions,
# printed under a heading line.
print(
    "Classification Report SVC:",
    classification_report(y_true=y_test, y_pred=SvcPredictions),
    sep="\n",
)

<b>Naive Bayes is simple and fast, making it a good baseline for text classification tasks. SVM often performs better on complex data like text because it handles high-dimensional feature spaces well and maximizes the margin between classes.</b>