# Load data and split to train and test

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the raw review data from Reviews.csv (path is relative to the working directory).
reviews = pd.read_csv('Reviews.csv')

In [None]:
# Preview the first 10 rows.
reviews.head(10)

In [None]:
def classifier(row):
    """Return 'Positive' when the row's Score is greater than 3, otherwise 'Negative'."""
    return 'Positive' if row['Score'] > 3 else 'Negative'

# Add a binary sentiment label by calling classifier on every row (axis=1).
reviews['Response'] = reviews.apply(classifier, axis=1)

In [None]:
# Preview the first 10 rows, now including the Response column.
reviews.head(10)

In [None]:
total_size = len(reviews)
total_size

In [None]:
train_size = int(0.70*total_size)
train_size

In [None]:
trainSet = reviews.head(train_size)
testSet = reviews.tail(total_size-train_size)

In [None]:
# Score distribution over the whole training split.
trainSet.Score.value_counts()

In [None]:
# Score distribution over the whole test split.
testSet.Score.value_counts()

In [None]:
# Score distribution over the first 50 training rows (the subset used for fitting below).
trainSet.head(50).Score.value_counts()

In [None]:
# Score distribution over the first 50 test rows (the subset used for evaluation below).
testSet.head(50).Score.value_counts()

# Preprocess texts

In [None]:
from nltk.tokenize import word_tokenize
import re
import spacy
import nltk
# Load spaCy's small English pipeline with the parser and NER components
# disabled; preprocess_text only reads token lemmas and punctuation flags.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

In [None]:
# Token-level patterns, compiled once instead of being looked up on every token.
_NON_LETTER_RE = re.compile(r"[^A-Za-z@]")
_DOTCOM_RE = re.compile(r"\S+com")
_EMAIL_RE = re.compile(r"\S+@\S+")

# Lower-cased stopword set; filled in by the first call to _load_stopwords().
_STOPWORDS = None


def _load_stopwords():
    """Return the combined NLTK + stopwords_en.txt stopword set, lower-cased.

    The set is built on first use and cached, so the stopword file is read
    once per session rather than once per document.
    """
    global _STOPWORDS
    if _STOPWORDS is None:
        stopwords_nltk = nltk.corpus.stopwords.words('english')
        with open('stopwords_en.txt', 'r') as f:
            stopwords_file = [line.replace('\n', '') for line in f]
        _STOPWORDS = frozenset(
            word.lower() for word in stopwords_file + stopwords_nltk
        )
    return _STOPWORDS


def preprocess_text(input_str):
    """Clean one review text and return it as a space-separated token string.

    Steps:
      1. Tokenise with NLTK's word_tokenize.
      2. Strip every character that is not an ASCII letter or '@', then blank
         out tokens matching the ".com" and e-mail patterns.
      3. Drop tokens that became empty (previously they were kept and turned
         into extra whitespace in the text handed to spaCy).
      4. Lemmatise with the module-level spaCy pipeline ``nlp``, skipping
         punctuation tokens.
      5. Lower-case the lemmas and remove stopwords and whitespace-only tokens.

    Args:
        input_str: Raw review text.

    Returns:
        The cleaned, lower-cased lemmas joined by single spaces (an empty
        string when nothing survives the filtering).
    """
    stopwords = _load_stopwords()

    cleaned = []
    for word in word_tokenize(input_str):
        word = _NON_LETTER_RE.sub("", word)
        # NOTE(review): punctuation is already stripped at this point, so this
        # pattern also cuts ordinary words that contain "com" after their first
        # letter (e.g. "welcome" -> "e"). Kept as-is; confirm the intent.
        word = _DOTCOM_RE.sub("", word)
        word = _EMAIL_RE.sub("", word)
        if word:
            cleaned.append(word)

    lemmas = (
        token.lemma_.lower()
        for token in nlp(' '.join(cleaned))
        if not token.is_punct
    )
    # strip() also rejects whitespace-only lemmas of any length, not just ' '.
    return ' '.join(
        lemma for lemma in lemmas if lemma.strip() and lemma not in stopwords
    )

In [None]:
train_texts = []
test_texts = []

# remove the [:50] on a more powerful computer
for text in trainSet['Text'].tolist()[:50]:
    train_texts.append(preprocess_text(text))
    
for text in testSet['Text'].tolist()[:50]:
    test_texts.append(preprocess_text(text))

In [None]:
train_texts

In [None]:
test_texts

# Vectorise

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words features: learn the vocabulary from the training texts only,
# then encode the test texts with that same fitted vectoriser.
vect = CountVectorizer()
x_train_dtm = vect.fit_transform(train_texts)
x_test_dtm = vect.transform(test_texts)

In [None]:
# Show the test document-term matrix.
x_test_dtm

In [None]:
# Show the training document-term matrix.
x_train_dtm

# Build and evaluate model

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier created with its default settings.
nb = MultinomialNB()

Build model

In [None]:
# train model using x_train_dtm
# The target is the raw Score column of the first 50 training rows, matching
# the [:50] slice used to build train_texts.
# NOTE(review): the Response label created earlier is never used as the
# target — confirm that predicting the raw Score is intended.
%time nb.fit(x_train_dtm, trainSet.head(50).Score)

In [None]:
# Predict a Score class for each vectorised test text.
y_pred_class_nb = nb.predict(x_test_dtm)

Evaluate model

In [None]:
from sklearn import metrics
# Accuracy of the predicted Score against the true Score of the first 50 test rows.
metrics.accuracy_score(testSet.head(50).Score, y_pred_class_nb)

In [None]:
# Confusion matrix: true Score passed first, predicted Score second.
con_metrics_nb = metrics.confusion_matrix(testSet.head(50).Score, y_pred_class_nb)
con_metrics_nb

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# confusion_matrix orders its rows/columns by the sorted union of the true and
# predicted labels; rebuild that list so the ticks show the actual Score
# values instead of the positional indices 0..k-1.
class_labels = np.union1d(testSet.head(50).Score, y_pred_class_nb)
ax = sns.heatmap(
    con_metrics_nb,
    annot=True,
    fmt='d',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
# Rows are the true classes and columns the predicted classes.
ax.set_xlabel('Predicted score')
ax.set_ylabel('Actual score')
plt.title('Confusion Matrix: Naive Bayes')
plt.show()

In [None]:
# See the Hugging Face website (huggingface.co) for the transformers library
pip install transformers

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 for the first 50 test reviews.
print(classification_report(testSet.head(50).Score, y_pred_class_nb))

In [None]:
# roc_auc_score raises ValueError when given a multiclass target together with
# hard class predictions, which is what the raw 1-5 Score produces. Reduce the
# problem to positive-vs-negative with the same "Score > 3" rule as classifier
# and rank the reviews by the model's total probability of a positive score.
# predict_proba columns follow the order of nb.classes_.
positive_classes = nb.classes_ > 3
positive_proba = nb.predict_proba(x_test_dtm)[:, positive_classes].sum(axis=1)
auc_nb = metrics.roc_auc_score(testSet.head(50).Score > 3, positive_proba)
print(auc_nb)

In [None]:
# Predicted Score for each vectorised test text.
y_pred_class_nb

In [None]:
# Same expression that was used to build testSet.
reviews.tail(total_size-train_size)

In [None]:
# Cleaned test texts that were fed to the vectoriser.
test_texts

In [None]:
# True Score of the first 50 test rows.
testSet.head(50).Score

In [None]:
# Full held-out split.
testSet