## This Jupyter notebook will train a sentiment analysis model using BOW-method

In [1]:
# Load in all of the required modules at the beginning
import warnings
warnings.filterwarnings('ignore')

import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm

# Read in and balance the datasets

In [2]:
# Read in the Yelp review data
yelp = pd.read_csv('yelp.csv')
yelp = yelp.sample(frac=1)

# Read in Amazon review data
data_automotive = pd.read_json('reviews_Automotive_5.json', lines=True)
data_automotive = data_automotive.sample(frac=1)

data_office_products = pd.read_json('reviews_Office_Products_5.json', lines=True)
data_office_products  = data_office_products.sample(frac=1)

#data_home_kitchen = pd.read_json('reviews_Home_and_Kitchen_5.json',lines=True)
#data_home_kitchen = data_home_kitchen.sample(frac=1)

# Create a new dataframe which will be used to aggregate all reviews
columns = ['review','rating']
df = pd.DataFrame(columns=columns)

#-------------------------------------------------------------------
# These are the parameters of the model we will train
#-------------------------------------------------------------------
n_features = 1000 # The number of features to use for constructing the dictionary (size of the vocabulary)
n_reviews = 10000 # The number of reviews to use from each data sets
train_size = 0.8 # Splits data into two groups which will be further divided
test_size = 0.8 # Splits the remaining set into two groups
random_state = 10 # For reproducibility 
#-------------------------------------------------------------------

print('---------------------------------------------')
print('Total Yelp reviews: ',len(yelp))
print('Total Amazon automotive reviews: ',len(data_automotive))
print('Total Amazon office reviews: ',len(data_office_products))
#print('Total Amazon home and kitchen reviews: ',len(data_home_kitchen))
print('---------------------------------------------')

# Here we read in the reviews from all of the data sets and aggregate them into one data frame
for i in tqdm(range(0,min(n_reviews,len(yelp)))):
    df=df.append({"review":yelp["text"][i],"rating":yelp["stars"][i]},ignore_index=True)

for i in tqdm(range(0,min(n_reviews,len(data_automotive)))):
    df=df.append({"review":data_automotive['reviewText'][i],"rating": int(data_automotive["overall"][i])},ignore_index=True)

for i in tqdm(range(0,min(n_reviews,len(data_office_products)))):
    df=df.append({"review":data_office_products['reviewText'][i],"rating": int(data_office_products["overall"][i])},ignore_index=True)

#for i in tqdm(range(0,min(n_reviews,len(data_home_kitchen)))):
#    df=df.append({"review":data_home_kitchen['reviewText'][i],"rating": int(data_home_kitchen["overall"][i])},ignore_index=True)    
    
# Find the specific star rated reviews
df_1_star = df[df["rating"]== 1]
df_2_star = df[df["rating"]== 2]
df_3_star = df[df["rating"]== 3]
df_4_star = df[df["rating"]== 4]
df_5_star = df[df["rating"]== 5]

print("")
print('Raw reviews: ')
print("1-Star reviews:",len(df_1_star))
print("2-Star reviews:",len(df_2_star))
print("3-Star reviews:",len(df_3_star))
print("4-Star reviews:",len(df_4_star))
print("5-Star reviews:",len(df_5_star))
print('\n')

# Aggregate all of the review data
min_stars= min(df_1_star.count()[0],df_2_star.count()[0],df_3_star.count()[0],df_4_star.count()[0],df_5_star.count()[0])

# Randomize the data set
df = df.sample(frac=1)

df_1_star = df[df["rating"]== 1].head(min_stars)
df_2_star = df[df["rating"]== 2].head(min_stars)
df_3_star = df[df["rating"]== 3].head(min_stars)
df_4_star = df[df["rating"]== 4].head(min_stars)
df_5_star = df[df["rating"]== 5].head(min_stars)

print("")
print('Balaced reviews: ')
print("1-Star reviews:",len(df_1_star))
print("2-Star reviews:",len(df_2_star))
print("3-Star reviews:",len(df_3_star))
print("4-Star reviews:",len(df_4_star))
print("5-Star reviews:",len(df_5_star))
print('\n')

# Combine all of the dataframes
frames = [df_1_star,df_2_star,df_3_star,df_4_star,df_5_star]

df_balanced = pd.concat(frames)

# Shuffle the data frame to randomize everything
df_balanced = df_balanced.sample(frac=1)
df_balanced.index = range(0,df_balanced.shape[0]) # Relabel the indices
df_balanced.head()

print('The total number of reviews: ', len(df_balanced))

  0%|          | 46/10000 [00:00<00:21, 453.60it/s]

---------------------------------------------
Total Yelp reviews:  10000
Total Amazon automotive reviews:  20473
Total Amazon office reviews:  53258
---------------------------------------------


100%|██████████| 10000/10000 [00:33<00:00, 295.79it/s]
100%|██████████| 10000/10000 [00:56<00:00, 175.53it/s]
100%|██████████| 10000/10000 [01:39<00:00, 100.48it/s]


Raw reviews: 
1-Star reviews: 1221
2-Star reviews: 1428
3-Star reviews: 2768
4-Star reviews: 7528
5-Star reviews: 17055



Balaced reviews: 
1-Star reviews: 1221
2-Star reviews: 1221
3-Star reviews: 1221
4-Star reviews: 1221
5-Star reviews: 1221


The total number of reviews:  6105





# Define the preprocessing functions, apply them to the balanced data set, and store the cleaned up documents

In [3]:
from process_text import clean_up_text 


# Let us now generate a subset of the data to create a vocabulary with
n_sample = len(df_balanced)
dict_sample = []

# Now we preprocess the entire balanced data set
for i in tqdm(range(0,n_sample)):
    sentence = df_balanced["review"][i]
    dict_sample.append(clean_up_text(sentence))

# Convert to numpy array
dict_sample = np.asarray(dict_sample)

  0%|          | 0/6105 [00:00<?, ?it/s]

[nltk_data] Downloading package wordnet to /home/javier/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/javier/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/javier/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


100%|██████████| 6105/6105 [00:38<00:00, 159.44it/s]


# Using preprocessed text, custom stop words, applying TFIDF transform build a vocabulary and generate a vectorizer

In [6]:
# we can pass in the same preprocessing parameters
from sklearn.feature_extraction.text import TfidfVectorizer
import operator

custom_stop_words = []
with open( "stopwords.txt", "r" ) as fin:
    for line in fin.readlines():
        custom_stop_words.append( line.strip() )

#----------------------------------------------------------------------------------------------
# Here we vectorize the dictionary sample
vectorizer = TfidfVectorizer(stop_words = custom_stop_words,min_df = 20)
A = vectorizer.fit_transform(dict_sample)
print( "Created %d X %d TF-IDF-normalized document-term matrix" % (A.shape[0], A.shape[1]) )
print("")
#----------------------------------------------------------------------------------------------

# extract the resulting vocabulary
terms = vectorizer.get_feature_names()

def rank_terms( A, terms ):
    # get the sums over each column
    sums = A.sum(axis=0)
    # map weights to the terms
    weights = {}
    for col, term in enumerate(terms):
        weights[term] = sums[0,col]
    # rank the terms by their weight over all documents
    return sorted(weights.items(), key=operator.itemgetter(1), reverse=True)

ranking = rank_terms( A, terms )
for i, pair in enumerate( ranking[0:20] ):
    print( "%02d. %s (%.2f)" % ( i+1, pair[0], pair[1] ) )

Created 6105 X 1963 TF-IDF-normalized document-term matrix

01. not (262.21)
02. get (189.75)
03. good (189.25)
04. go (175.08)
05. like (173.72)
06. place (168.26)
07. food (166.41)
08. great (144.03)
09. work (142.40)
10. time (141.65)
11. can (123.33)
12. no (119.82)
13. dont (118.26)
14. order (114.81)
15. really (113.18)
16. price (110.00)
17. buy (107.98)
18. only (104.00)
19. look (103.25)
20. try (102.99)


# Take the five-start ratings and condense into two outputs; +1 or -1, where +1 is the positive review, and -1 is a negative review

In [7]:
from sklearn.model_selection import train_test_split

# Now we must transform our original review data into a feature Matrix
y = df_balanced["rating"]

# Now we must transform our original review data into a feature Matrix

X = []
y = []

# converting the data frame into a feature matrix
for i in tqdm(range(0,df_balanced.shape[0])):
    
    r = df_balanced["rating"][i]
    
    if(r==1):
        yi =-1
        X.append(df_balanced["review"][i])
        y.append(yi)
    elif(r==5):
        yi=1
        X.append(df_balanced["review"][i])
        y.append(yi)
print('Balanced Dataset: ',len(y))        

## We store the remaining data 
df_final = df_balanced.drop(df_balanced[(df_balanced.rating == 2) | (df_balanced.rating == 3)| (df_balanced.rating== 4)].index)


X = np.asarray(X)
y = np.asarray(y)

# Converts the Document matrix consisting of strings into arrays according to the dictionary that we built previously
X= vectorizer.transform(X)

print('X- Feature matrix: ', X.shape)

# Now we split the data into a training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, random_state=random_state)
X_test, X_validation, y_test, y_validation = train_test_split(X_test, y_test, train_size=test_size, random_state=random_state)

print('X_train: ', len(y_train))
print('X_test: ', len(y_test))
print('X_validation: ', len(y_validation))

100%|██████████| 6105/6105 [00:00<00:00, 50734.23it/s]


Balanced Dataset:  2442
X- Feature matrix:  (2442, 1963)
X_train:  1953
X_test:  391
X_validation:  98


# Fit and test different models

In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from numpy.random import randint
from textblob import TextBlob

# Now we train different models
nb = MultinomialNB()
dt = DecisionTreeClassifier(random_state=0)
rf = RandomForestClassifier(max_depth=20, random_state=0)
lr = LogisticRegression(multi_class='multinomial',solver='newton-cg')

# Fit the different machine learning models
nb.fit(X_train, y_train)
dt.fit(X_train, y_train)
rf.fit(X_train, y_train)
lr.fit(X_train, y_train)

# The baseline model generates (-1 or 1) randomly
preds_bl = np.ones(len(y_test))-2*randint(0,2,len(y_test))
preds_nb = nb.predict(X_test)
preds_dt = dt.predict(X_test)
preds_rf = rf.predict(X_test)
preds_lr = lr.predict(X_test)
preds_lr_val = lr.predict(X_validation)

print('================================================================\n')
print('Train data set size: ', len(y_train),'\n')
print('Test data set size: ', len(y_test),'\n')
print('================================================================\n')
print("Baseline Model: \n",classification_report(y_test,preds_bl))
print('================================================================\n')
print("Naive Bayes: \n" ,classification_report(y_test,preds_nb))
print('================================================================\n')
print("Desicion Tree: \n",classification_report(y_test,preds_dt))
print('================================================================\n')
print("Random Forests: \n",classification_report(y_test,preds_rf))
print('================================================================\n')
print("Logistic Regression: \n",classification_report(y_test,preds_lr))
print('================================================================\n')

  from numpy.core.umath_tests import inner1d



Train data set size:  1953 

Test data set size:  391 


Baseline Model: 
              precision    recall  f1-score   support

         -1       0.50      0.50      0.50       189
          1       0.53      0.53      0.53       202

avg / total       0.51      0.51      0.51       391


Naive Bayes: 
              precision    recall  f1-score   support

         -1       0.80      0.76      0.78       189
          1       0.78      0.83      0.80       202

avg / total       0.79      0.79      0.79       391


Desicion Tree: 
              precision    recall  f1-score   support

         -1       0.73      0.66      0.69       189
          1       0.71      0.77      0.74       202

avg / total       0.72      0.72      0.71       391


Random Forests: 
              precision    recall  f1-score   support

         -1       0.76      0.76      0.76       189
          1       0.77      0.78      0.78       202

avg / total       0.77      0.77      0.77       391


Logistic R

## Now we save the models and the text used to generate the vectorizer

In [9]:
# Here we save the model
terms = vectorizer.get_feature_names()

filename1 = 'lr_sentiment_model.sav'
filename2 = 'nb_sentiment_model.sav'
pickle.dump(lr, open(filename1, 'wb'))
pickle.dump(nb, open(filename2, 'wb'))
joblib.dump((X,terms,dict_sample), "articles-raw.pkl")

['articles-raw.pkl']

## Load the model, test the models on some example text

In [12]:
# Load the models and use them to make a prediction
import pickle
from sklearn.externals import joblib

# The names of the files containing the weights of the model
filename1 = 'lr_sentiment_model.sav'
filename2 = 'nb_sentiment_model.sav'

# Now we load in the trained models
loaded_lr = pickle.load(open(filename1, 'rb'))
loaded_nb = pickle.load(open(filename2, 'rb'))

sample_text = 'This car does not deserve a rating'
s = str(clean_up_text(sample_text))

print(sample_text)

# Transform the text
X0 = vectorizer.transform([s])
print(X0)
preds_nb = loaded_nb.predict(X0)
preds_lr = loaded_lr.predict(X0)

prob_nb = np.max(loaded_nb.predict_proba(X0))
prob_lr = np.max(loaded_lr.predict_proba(X0))

print("")
print('lr: ',preds_lr,prob_lr)
print('nb: ',preds_nb,prob_nb)
print('TxtBlob: ', TextBlob(s).sentiment.polarity)

This car does not deserve a rating
  (0, 1354)	0.5468934265496899
  (0, 1128)	0.19415689274663434
  (0, 465)	0.7034788815927865
  (0, 254)	0.4102781302344461

lr:  [-1] 0.6409175104484928
nb:  [-1] 0.6270737905528316
TxtBlob:  0.0
