# Apply Multi class classification  Using Naive Bayes
The sentiment labels are:
1. negative
2. positive

In [75]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

In [32]:
#The dataset is comprised of tab-separated files with phrases from the Rotten Tomatoes dataset.
# Load the dataset
df = pd.read_csv('train.tsv', sep='\t')

#Check the first ten rows of the dataset
df.head(10)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2
5,6,1,of escapades demonstrating the adage that what...,2
6,7,1,of,2
7,8,1,escapades demonstrating the adage that what is...,2
8,9,1,escapades,2
9,10,1,demonstrating the adage that what is good for ...,2


In [33]:
# Preprocessing
df = df.drop(['PhraseId', 'SentenceId'], axis=1) # remove irrelevant columns
df = df.dropna() # drop any rows with missing values
df = df[(df['Sentiment'] != 2) & (df['Sentiment'] != 3)& (df['Sentiment'] != 4)] # remove the 'neutral' class (sentiment=2) along with 'somewhat negative', 'somewhat postive' class
df.Sentiment.unique() # check the remaining sentiment values as we will label them into 'negative and positive'

array([1, 0])

In [34]:
df.head(5) # keeping only the columns of interest

Unnamed: 0,Phrase,Sentiment
0,A series of escapades demonstrating the adage ...,1
33,"the gander , some of which occasionally amuses...",1
47,but none of which amounts to much of a story,1
49,none of which amounts to much of a story,1
81,"Even fans of Ismail Merchant 's work , I suspe...",1


In [35]:
# Map the sentiment labels to new labels
sentiment_map = {0: 'negative', 1: 'positive'}
df['Sentiment'] = df['Sentiment'].map(sentiment_map)

In [40]:
df[df['Sentiment'] == 'negative'] #filtering our negative sentiments

Unnamed: 0,Phrase,Sentiment
101,would have a hard time sitting through this one,negative
103,have a hard time sitting through this one,negative
157,Aggressive self-glorification and a manipulati...,negative
159,self-glorification and a manipulative whitewash,negative
201,Trouble Every Day is a plodding mess .,negative
...,...,...
155965,has turned out nearly 21\/2 hours of unfocused...,negative
155967,"turned out nearly 21\/2 hours of unfocused , e...",negative
155970,"of unfocused , excruciatingly tedious cinema",negative
155971,"unfocused , excruciatingly tedious cinema",negative


In [42]:
df[df['Sentiment'] == 'positive'] #filtering our postitive sentiments

Unnamed: 0,Phrase,Sentiment
0,A series of escapades demonstrating the adage ...,positive
33,"the gander , some of which occasionally amuses...",positive
47,but none of which amounts to much of a story,positive
49,none of which amounts to much of a story,positive
81,"Even fans of Ismail Merchant 's work , I suspe...",positive
...,...,...
156036,substitute plot for personality,positive
156047,quietly suggesting the sadness and obsession b...,positive
156051,sadness and obsession,positive
156052,sadness and,positive


In [52]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [51]:
# Feature extraction
vectorizer = CountVectorizer(lowercase=True, stop_words='english')
X = vectorizer.fit_transform(df['Phrase'])
y = df['Sentiment']

In [55]:
# Train and evaluate MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB()

In [58]:
# Train and evaluate BernoulliNB
bnb = BernoulliNB(binarize=0.0)
bnb.fit(X_train, y_train)

BernoulliNB()

In [60]:
# Evaluate the performance of MultinomialNB
y_pred_mnb = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_mnb)
precision = precision_score(y_test, y_pred_mnb, average='weighted')
recall = recall_score(y_test, y_pred_mnb, average='weighted')
f1 = f1_score(y_test, y_pred_mnb, average='weighted')
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1)

Accuracy: 0.7833745814529044
Precision: 0.7672800478019751
Recall: 0.7833745814529044
F1-score: 0.7736674325440367


In [61]:
# Evaluate the performance of BernoulliNB
y_pred_bnb = bnb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_bnb)
precision = precision_score(y_test, y_pred_bnb, average='weighted')
recall = recall_score(y_test, y_pred_bnb, average='weighted')
f1 = f1_score(y_test, y_pred_bnb, average='weighted')
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1)

Accuracy: 0.7825010918619887
Precision: 0.7536022836172377
Recall: 0.7825010918619887
F1-score: 0.7623680567291775


# sklearn.metrics.classification_report

In [71]:
from sklearn.metrics import confusion_matrix

cm1 = confusion_matrix(y_test,y_pred_mnb)
print("Confusion Matrix under MultinomialNB")
cm1

Confusion Matrix under MultinomialNB


array([[ 531,  892],
       [ 596, 4850]])

In [73]:
cm2 = confusion_matrix(y_test,y_pred_bnb)
print("Confusion Matrix under BernoulliNB")
cm2

Confusion Matrix under BernoulliNB


array([[ 408, 1015],
       [ 479, 4967]])

In [63]:
from sklearn.metrics import classification_report

#                            Actual, un-seen-data
print(classification_report(y_test,y_pred_mnb))

              precision    recall  f1-score   support

    negative       0.47      0.37      0.42      1423
    positive       0.84      0.89      0.87      5446

    accuracy                           0.78      6869
   macro avg       0.66      0.63      0.64      6869
weighted avg       0.77      0.78      0.77      6869



In [74]:
#                            Actual, un-seen-data
print(classification_report(y_test,y_pred_bnb))

              precision    recall  f1-score   support

    negative       0.46      0.29      0.35      1423
    positive       0.83      0.91      0.87      5446

    accuracy                           0.78      6869
   macro avg       0.65      0.60      0.61      6869
weighted avg       0.75      0.78      0.76      6869





# Based on the confusion matrices, we can see that both models have similar overall accuracy, with the MultinomialNB model having a slightly higher accuracy of 0.7834 compared to the BernoulliNB model with an accuracy of 0.7825.





# However, when we look at the confusion matrices, we see that the MultinomialNB model has more false positives (892) than the BernoulliNB model (1015), meaning that the MultinomialNB model is more likely to incorrectly classify a neutral or positive review as negative. On the other hand, the BernoulliNB model has more false negatives (479) than the MultinomialNB model (596), meaning that the BernoulliNB model is more likely to incorrectly classify a negative review as neutral or positive.