# Sentiment Analysis on Yelp dataset

## Loading libraries


In [1]:
import joblib
import numpy as np
import pandas as pd
import spacy
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
from gensim.parsing.preprocessing import STOPWORDS
from nltk.corpus import stopwords


## Loading the dataset


In [2]:
# Location of the pre-processed training CSV.
IO_TRAIN = "../input/yelp-sent-analysis-preprocess/train_processed.csv"

# The first CSV column is used as the index; labels are loaded as a
# categorical column and the review text as plain strings.
COLUMN_DTYPES = {"sentiment": "category", "review": str}
ylp_processed = pd.read_csv(IO_TRAIN, index_col=0, dtype=COLUMN_DTYPES)

# Preview the first rows.
ylp_processed.head()


Unnamed: 0,sentiment,review
0,NEG,unfortunately frustration dr goldberg patient ...
1,POS,going dr goldberg years think st patients star...
2,NEG,know dr goldberg like moving arizona let tell ...
3,NEG,writing review heads doctor office staff admin...
4,POS,food great best thing wings wings simply fanta...


To make the class labels human-readable, they were recoded using the mapping documented in `readme.txt`: class 1 is `NEG` (negative) and class 2 is `POS` (positive).


In [3]:
# Large English spaCy pipeline; reused further down for the pre-trained model.
nlp = spacy.load("en_core_web_lg")

# Merge the stop-word lists of all three libraries (gensim, NLTK and spaCy)
# into a single set.
nltk_stopwords = frozenset(stopwords.words("english"))
STOPWORDS = STOPWORDS.union(nltk_stopwords, nlp.Defaults.stop_words)


## Variation on naive Bayes

A benchmark naive Bayes model trained on unigrams reached an accuracy of $0.875$.

Let's try a naive Bayes variant that uses a mixture of unigrams, bigrams, and trigrams.

> The dataset is too large for classic ML to run trigrams or a mixture of n-grams, even with PySpark, whether locally or in the cloud (Kaggle, Colab), so only bigrams are used below.


In [4]:
# Features are the review texts, targets are the sentiment labels.
X, y = ylp_processed["review"], ylp_processed["sentiment"]

# Stratified 80/20 hold-out split with a fixed seed so the split is repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [5]:
# Baseline: TF-IDF features with default settings fed into multinomial naive Bayes.
tfidf_unigram, nb_unigram = TfidfVectorizer(), MultinomialNB()
benchmark = make_pipeline(tfidf_unigram, nb_unigram)

# Train on the training split only, then persist the fitted pipeline to disk.
_ = benchmark.fit(X_train, y_train)
joblib.dump(value=benchmark, filename="benchmark-unigram-naive-bayes.joblib")


['benchmark-unigram-naive-bayes.joblib']

In [6]:
# the dataset is too large to be handled locally or on cloud, even with pyspark
# so check the full dataset in bigram
# The dataset is too large to be handled locally or on cloud (even with
# pyspark), so the full dataset is only checked with bigram features.
clf = MultinomialNB()
vectoriser = TfidfVectorizer(ngram_range=(2, 2))  # bigrams only

nb_bigram = make_pipeline(vectoriser, clf)


In [7]:
# 5-fold cross-validation of the bigram pipeline on the full dataset,
# running the folds in parallel on every available core.
scores_bigram = cross_validate(
    estimator=nb_bigram, X=X, y=y, cv=5, n_jobs=-1, verbose=1
)

# Mean of the per-fold test scores.
scores_bigram["test_score"].mean()


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  5.7min finished


0.902598126606194

Using bigrams raised the mean 5-fold cross-validated accuracy to about $0.90$.


In [8]:
# Fit the bigram pipeline on the training split and predict the held-out split.
_ = nb_bigram.fit(X_train, y_train)
y_pred = nb_bigram.predict(X_test)

# Per-class precision / recall / F1, printed with four decimals.
report_bigram = classification_report(y_test, y_pred, digits=4)
print(report_bigram)


              precision    recall  f1-score   support

         NEG     0.9090    0.9019    0.9054     55986
         POS     0.9027    0.9097    0.9062     55996

    accuracy                         0.9058    111982
   macro avg     0.9058    0.9058    0.9058    111982
weighted avg     0.9058    0.9058    0.9058    111982



In [9]:
# Persist the fitted bigram pipeline next to the notebook.
joblib.dump(value=nb_bigram, filename="bigram-naive-bayes.joblib")


['bigram-naive-bayes.joblib']

In [10]:
# Copy the pre-processed training CSV from the input directory into the
# current working directory (presumably so it is kept with the notebook's
# output files — TODO confirm the intent).
!cp '../input/yelp-sent-analysis-preprocess/train_processed.csv' './'


---

## Using a pre-trained model

Using [spacytextblob](https://spacy.io/universe/project/spacy-textblob), a pre-trained sentiment component for spaCy, for sentiment analysis.


In [11]:
!python3 -m pip install spacytextblob


Collecting spacytextblob
  Downloading spacytextblob-4.0.0-py3-none-any.whl (4.5 kB)
Collecting textblob<0.16.0,>=0.15.3
  Downloading textblob-0.15.3-py2.py3-none-any.whl (636 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m636.5/636.5 kB[0m [31m863.8 kB/s[0m eta [36m0:00:00[0m
Collecting typing-extensions<4.0.0.0,>=3.7.4
  Downloading typing_extensions-3.10.0.2-py3-none-any.whl (26 kB)
Installing collected packages: typing-extensions, textblob, spacytextblob
  Attempting uninstall: typing-extensions
    Found existing installation: typing_extensions 4.2.0
    Uninstalling typing_extensions-4.2.0:
      Successfully uninstalled typing_extensions-4.2.0
  Attempting uninstall: textblob
    Found existing installation: textblob 0.17.1
    Uninstalling textblob-0.17.1:
      Successfully uninstalled textblob-0.17.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the sour

In [12]:
# `SpacyTextBlob` is not referenced directly below; the import is kept because
# the component is added by its string name and importing the package is
# presumably what registers that factory with spaCy — TODO confirm.
from spacytextblob.spacytextblob import SpacyTextBlob

# Guard the call so re-running this cell is safe: adding a component whose
# name is already present in the pipeline makes spaCy raise an error.
if "spacytextblob" not in nlp.pipe_names:
    _ = nlp.add_pipe("spacytextblob")


In [13]:
def predict(doc: str, threshold: float = 0.0, pipeline=None) -> str:
    """
    Classify a single review as negative or positive from its polarity score.

    The text is run through a spaCy-style pipeline and the polarity exposed
    at ``._.blob.polarity`` of the resulting document is compared against
    ``threshold``.

    Parameters
    ----------
    doc : str
        Review text to classify.
    threshold : float, default 0.0
        Polarity strictly below this value is labelled ``"NEG"``; a polarity
        equal to or above it is labelled ``"POS"``.
    pipeline : callable, optional
        Callable that turns a text into a document exposing
        ``._.blob.polarity``. Defaults to the notebook-level ``nlp`` pipeline.

    Returns
    -------
    str
        ``"NEG"`` or ``"POS"``, matching the labels of the ``sentiment`` column.
    """
    # Fall back to the notebook-level pipeline unless the caller supplies one.
    tagger = nlp if pipeline is None else pipeline
    polarity = tagger(doc)._.blob.polarity
    # A polarity exactly at the threshold (e.g. neutral text at 0.0) counts as positive.
    return "NEG" if polarity < threshold else "POS"


In [14]:
# Label every review in the dataset with the pre-trained polarity model.
reviews = ylp_processed["review"]
y_pred = reviews.apply(predict)

# Compare against the true labels; metrics are printed with four decimals.
report_pretrained = classification_report(y, y_pred, digits=4)
print(report_pretrained)


              precision    recall  f1-score   support

         NEG     0.9365    0.3943    0.5549    279931
         POS     0.6164    0.9732    0.7548    279976

    accuracy                         0.6838    559907
   macro avg     0.7764    0.6838    0.6549    559907
weighted avg     0.7764    0.6838    0.6549    559907



The pre-trained model gave a weak accuracy of $0.68$ (macro F1-score $0.65$); the unigram naive Bayes benchmark does better, with an accuracy of $0.875$.
