In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
train_df = pd.read_csv('../data/processed/processed_train.csv')
test_df = pd.read_csv('../data/processed/processed_test.csv')

general_df = pd.concat([train_df, test_df], ignore_index=True)
general_df.columns

Index(['Unnamed: 0', 'review', 'sentiment', 'number_of_chars',
       'percentage_of_signs', 'number_of_excl_marks',
       'number_of_question_marks', 'number_of_ellipses',
       'number_of_uppercase_words', 'cleaned_review', 'tokenized_review',
       'lemmatized_review', 'stemmed_review'],
      dtype='object')

In [3]:
len(general_df)

45999

In [4]:
def vectorize_review(
        text_data: pd.Series,
        vectorizer
):
    return vectorizer.fit_transform(text_data)

In [5]:
count_vectorizer = CountVectorizer()
count_x = vectorize_review(general_df['stemmed_review'], count_vectorizer)
count_X_train, count_X_test, count_y_train, count_y_test = train_test_split(count_x, general_df['sentiment'], test_size=len(train_df), shuffle=False)

tfidf_vectorizer = TfidfVectorizer()
tfidf_x = vectorize_review(general_df['stemmed_review'], tfidf_vectorizer)
tfidf_X_train, tfidf_X_test, tfidf_y_train, tfidf_y_test = train_test_split(tfidf_x, general_df['sentiment'], test_size=len(train_df), shuffle=False)

# Models training.

## Complement Naive Bayes 

In [6]:
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import classification_report, confusion_matrix
CNB_with_count_v = ComplementNB()
CNB_with_count_v.fit(count_X_train, count_y_train)

### Count vectorizer

In [7]:
from sklearn import metrics
predicted = CNB_with_count_v.predict(count_X_test)
accuracy_score = metrics.accuracy_score(predicted, count_y_test)

print('ComplementNB model accuracy is',str('{:04.2f}'.format(accuracy_score * 100)) + '%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(count_y_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(count_y_test, predicted))

ComplementNB model accuracy is 84.43%
------------------------------------------------
Confusion Matrix:
       0      1
0  16158   2284
1   3442  14886
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

    negative       0.82      0.88      0.85     18442
    positive       0.87      0.81      0.84     18328

    accuracy                           0.84     36770
   macro avg       0.85      0.84      0.84     36770
weighted avg       0.85      0.84      0.84     36770



### TF-IDF vectorizer for naive bayes

In [8]:
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import classification_report, confusion_matrix
CNB_with_tfidf_v = ComplementNB()
CNB_with_tfidf_v.fit(tfidf_X_train, tfidf_y_train)

In [10]:
from sklearn import metrics
predicted = CNB_with_tfidf_v.predict(tfidf_X_test)
accuracy_score = metrics.accuracy_score(predicted, tfidf_y_test)

print('ComplementNB model accuracy is',str('{:04.2f}'.format(accuracy_score * 100)) + '%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(tfidf_y_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(tfidf_y_test, predicted))

ComplementNB model accuracy is 83.76%
------------------------------------------------
Confusion Matrix:
       0      1
0  16837   1605
1   4366  13962
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

    negative       0.79      0.91      0.85     18442
    positive       0.90      0.76      0.82     18328

    accuracy                           0.84     36770
   macro avg       0.85      0.84      0.84     36770
weighted avg       0.85      0.84      0.84     36770



## Bernoulli naive bayes

### Count vectorizer

In [11]:
from sklearn.naive_bayes import BernoulliNB

BNB = BernoulliNB()
BNB.fit(count_X_train, count_y_train)

In [13]:
from sklearn import metrics
predicted = BNB.predict(count_X_test)
accuracy_score = metrics.accuracy_score(predicted, count_y_test)

print('ComplementNB model accuracy is',str('{:04.2f}'.format(accuracy_score * 100)) + '%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(count_y_test, predicted)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(count_y_test, predicted))

ComplementNB model accuracy is 85.13%
------------------------------------------------
Confusion Matrix:
       0      1
0  16040   2402
1   3065  15263
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

    negative       0.84      0.87      0.85     18442
    positive       0.86      0.83      0.85     18328

    accuracy                           0.85     36770
   macro avg       0.85      0.85      0.85     36770
weighted avg       0.85      0.85      0.85     36770



### TF-IDF vectorizer