# Sentiment Analysis on Yelp dataset

## Loading libraries


In [1]:
import joblib
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report


## Loading the dataset


In [2]:
IO_TRAIN = "../input/yelp-sent-analysis-preprocess/train_processed.csv"
ylp_processed = pd.read_csv(IO_TRAIN, dtype={"sentiment": "category", "review": str})
ylp_processed.head()


Unnamed: 0,sentiment,review
0,NEG,unfortunately frustration dr goldberg patient ...
1,POS,going dr goldberg years think st patients star...
2,NEG,know dr goldberg like moving arizona let tell ...
3,NEG,writing review heads doctor office staff admin...
4,POS,food great best thing wings wings simply fanta...


## Variation on naive Bayes

after making a benchmark of naive bayes with unigrams, and got accuracy of $0.875$

let's try a variant of a naive Bayes with mixture of unigrams, bigrams, & trigrams

> The dataset is too large for classic ML to run trigrams or mixture of polygrams, even for pyspark, locally or on cloud (kaggle, colab)


In [None]:
X = ylp_processed["review"]
y = ylp_processed["sentiment"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [5]:
# the dataset is too large to be handled locally or on cloud, even with pyspark
# so check the full dataset in bigram
vectoriser = TfidfVectorizer(ngram_range=(2,2))
clf = MultinomialNB()

nb_bigram = make_pipeline(vectoriser, clf)


In [6]:
scores_bigram = cross_validate(nb_bigram, X, y, cv=5, n_jobs=-1, verbose=1)
np.mean(scores_bigram["test_score"])


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  4.8min finished


0.902598126606194

using a bigram has raised the accuracy to $0.9$


In [7]:
_ = nb_bigram.fit(X_train, y_train)
y_pred = nb_bigram.predict(X_test)

print(classification_report(y_test, y_pred, digits=4))


              precision    recall  f1-score   support

         NEG     0.9090    0.9019    0.9054     55986
         POS     0.9027    0.9097    0.9062     55996

    accuracy                         0.9058    111982
   macro avg     0.9058    0.9058    0.9058    111982
weighted avg     0.9058    0.9058    0.9058    111982



In [None]:
_ = joblib.dump(nb_bigram, "bigram-naive-bayes.joblib")


In [9]:
!cp '../input/yelp-sent-analysis-preprocess/train_processed.csv' './'
