In [1]:
import pandas as pd
import numpy as np

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

import pickle

from sklearn.preprocessing import LabelEncoder

#plot imports
from math import pi


from bokeh.layouts import gridplot
from bokeh.layouts import column
#https://docs.bokeh.org/en/latest/docs/user_guide/basic/layouts.html

from bokeh.palettes import Category20c
from bokeh.plotting import figure, show
from bokeh.transform import cumsum



from sklearn.feature_extraction.text import TfidfVectorizer


#regex
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# Load the pre-trained model that was serialized to disk with pickle.
# NOTE(security): unpickling can execute arbitrary code, so only load model
# files that come from a trusted source.
# The context manager makes sure the file handle is closed after loading.
with open('/content/LRTrained2.sav', 'rb') as model_file:
    model = pickle.load(model_file)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [3]:

import joblib

# Load the previously saved (pickled) vectorizer from disk.
# https://stackoverflow.com/questions/58344350/how-to-save-and-load-vocabulary-from-a-countvectorizer
# A freshly constructed TfidfVectorizer has no vocabulary and cannot transform
# text, so none is created here: only the saved vectorizer is used downstream.
# NOTE(security): joblib.load unpickles the file, so only load trusted files.
vectorizer = joblib.load("MultiModelVector.pkl")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [4]:
# Read the review data to score and preview the first rows.
# NOTE(review): the 'Unnamed: 0' column in the preview looks like a row index
# that was written into the CSV - confirm whether it should be read as the index.
n_dat= pd.read_csv('../content/test.csv')
n_dat.head()

Unnamed: 0,class_index,review_text
0,1,I got 'new' tires from them and within two wee...
1,1,Don't waste your time. We had two different p...
2,1,All I can say is the worst! We were the only 2...
3,1,I have been to this restaurant twice and was d...
4,1,Food was NOT GOOD at all! My husband & I ate h...


In [5]:
# Count how many reviews fall under each rating (class_index) to check class balance.
n_dat['class_index'].value_counts()

1    10000
3    10000
2    10000
4    10000
5    10000
Name: class_index, dtype: int64

In [6]:
# English stop words, held in a set so the per-word membership test used during
# text cleaning is a constant-time lookup instead of a scan over a list.
stop_words = set(stopwords.words('english'))
# Keep 'not' in the text (presumably because negation matters for sentiment);
# discard() does not raise if the word is already absent.
stop_words.discard('not')

In [7]:
import re
import string
english_punctuations = string.punctuation
punctuations_list = english_punctuations

# Built once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans("", "", punctuations_list)
# A URL is 'www.' or 'http(s)://' followed by any run of non-whitespace characters.
_URL_PATTERN = re.compile(r'(www\.\S+)|(https?://\S+)')


def cleaning_text(x):
    """Normalize one review into lowercase, letters-only, space-separated text.

    Steps, in order:
      1. drop whitespace-separated tokens found in the global ``stop_words``;
      2. replace URLs with a space;
      3. strip ASCII punctuation;
      4. remove digits;
      5. collapse every remaining run of non-alphanumeric characters
         (including whitespace) into a single space;
      6. lowercase the result.

    Parameters
    ----------
    x : any
        The raw review; it is converted with ``str(x)`` first, so non-string
        values (e.g. ``None`` or NaN) are cleaned as their string form.

    Returns
    -------
    str
        The cleaned text. It may start or end with a single space.

    Notes
    -----
    URL removal runs *before* punctuation stripping: once '.', ':' and '/' are
    gone a URL can no longer be recognized, and a loose pattern applied at that
    point can swallow the words that follow it.

    NOTE(review): the stop-word test happens before lowercasing, so capitalized
    tokens such as "I" or "We" are not removed. That ordering is kept on
    purpose here so the text stays comparable to what the saved vectorizer
    expects - confirm against the training notebook, and keep both notebooks
    on the same version of this function.
    """
    temp = " ".join([w for w in str(x).split() if w not in stop_words])
    temp = _URL_PATTERN.sub(' ', temp)
    temp = temp.translate(_PUNCTUATION_TABLE)
    temp = re.sub('[0-9]+', '', temp)
    #  remove special characters
    temp = re.sub(r"[^a-zA-Z0-9]+", ' ', temp)
    return temp.lower()


# Overwrite the raw review text with its cleaned form and preview the result.
# NOTE(review): this replaces the column in place, so re-running the cell
# cleans already-cleaned text; because stop words are filtered before
# lowercasing, a second pass may drop further words. Reload the data (the
# read_csv cell) before re-running.
n_dat['review_text'] = n_dat['review_text'].apply(cleaning_text)

n_dat.head()

Unnamed: 0,class_index,review_text
0,1,i got new tires within two weeks got flat i to...
1,1,dont waste time we two different people come h...
2,1,all i say worst we people place lunch place fr...
3,1,i restaurant twice disappointed times i go bac...
4,1,food not good all my husband i ate couple week...


In [8]:
# Turn the cleaned reviews into model features with the loaded vectorizer
# (transform only - the vectorizer is not refitted on this data).
new_data = vectorizer.transform(n_dat['review_text'])

In [9]:
# Predict a label for every review from its vectorized features.
prediction = model.predict(new_data)

In [10]:
# Attach the predicted labels to the dataframe as a new 'predictions' column,
# then preview them next to the original ratings.
n_dat['predictions'] = prediction
n_dat.head()

Unnamed: 0,class_index,review_text,predictions
0,1,i got new tires within two weeks got flat i to...,0
1,1,dont waste time we two different people come h...,0
2,1,all i say worst we people place lunch place fr...,0
3,1,i restaurant twice disappointed times i go bac...,0
4,1,food not good all my husband i ate couple week...,0


In [11]:
# Count how many reviews were assigned to each predicted label.
n_dat['predictions'].value_counts()

1    26861
0    23139
Name: predictions, dtype: int64

In [12]:
# Count the reviews for each Yelp rating and collect the counts in a dictionary
# that feeds the rating pie chart.
# https://docs.bokeh.org/en/latest/docs/examples/topics/pie/pie.html
# value_counts() is computed once and reused; .get(rating, 0) means a rating
# that does not occur in the data yields a zero count instead of a KeyError.
rating_counts = n_dat['class_index'].value_counts()
x = {f'Rated {rating}': rating_counts.get(rating, 0) for rating in range(1, 6)}

# Convert each count into its fraction of the whole, expressed as a wedge
# angle in radians for the pie chart.
data = pd.Series(x).reset_index(name='value').rename(columns={'index': 'rating'})
data['angle'] = data['value']/data['value'].sum() * 2*pi
# One palette colour per wedge.
data['color'] = Category20c[len(x)]

In [13]:
# Create the dictionary of sentiment counts for the sentiment pie chart.
# The prediction preview shows the one-star reviews being labelled 0, so
# label 0 is the negative class and label 1 the positive class.
# NOTE(review): confirm this encoding against the training notebook.
# value_counts() is computed once; .get(label, 0) avoids a KeyError when the
# model predicts only one of the two classes.
prediction_counts = n_dat['predictions'].value_counts()
s = {'Positive': prediction_counts.get(1, 0),
     'Negative': prediction_counts.get(0, 0)}

In [14]:
# Transformations for the sentiment pie: one row per sentiment with its count,
# its wedge angle in radians, and its colour.
d = pd.Series(s).reset_index(name='value').rename(columns={'index': 'sentiment'})
d['angle'] = d['value']/d['value'].sum() * 2*pi
#https://docs.bokeh.org/en/latest/docs/reference/palettes.html
# Pick the colour from the sentiment label itself rather than from the row
# position, so the colours stay attached to the right wedge if rows are reordered.
d['color'] = d['sentiment'].map({'Positive': 'skyblue', 'Negative': 'crimson'})

In [15]:
# Configure Bokeh to display its plots inline in the notebook.
from bokeh.io import output_notebook
output_notebook()

In [16]:
def _pie_figure(source, label_field, title):
    """Build one Bokeh pie chart.

    Parameters
    ----------
    source : pandas.DataFrame
        One row per wedge. Must provide the columns 'value', 'angle' (wedge
        size in radians), 'color', and the column named by ``label_field``.
    label_field : str
        Column holding the wedge labels; used for the legend and the tooltip.
    title : str
        Title shown above the chart.

    Returns
    -------
    The configured Bokeh figure.
    """
    # Both pies share the same x_range so they are framed identically.
    fig = figure(height=350, title=title, toolbar_location=None,
                 tools="hover", tooltips=f"@{label_field}: @value",
                 x_range=(-0.5, 1.0))

    # create the pie chart
    # https://docs.bokeh.org/en/latest/docs/examples/topics/pie/pie.html
    # cumsum stacks the wedge angles so each wedge starts where the previous ends.
    fig.wedge(x=0, y=1, radius=0.4,
              start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'),
              line_color="white", fill_color='color', legend_field=label_field,
              source=source)

    # A pie has no meaningful axes or grid, so hide them.
    fig.axis.axis_label = None
    fig.axis.visible = False
    fig.grid.grid_line_color = None
    return fig


# pie of the Yelp ratings
rate = _pie_figure(data, 'rating', "Yelp Rating")
# pie of the predicted sentiment
sent = _pie_figure(d, 'sentiment', "sentiment")

# show plots
# https://docs.bokeh.org/en/latest/docs/user_guide/basic/layouts.html

# plot column of pies
show(column(rate, sent))
