## This Jupyter notebook will train a sentiment analysis model using BOW-method

In [1]:
# Load in all of the required modules at the beginning
import warnings
warnings.filterwarnings('ignore')

import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm

# Read in and balance the datasets

In [3]:
# Read in the Yelp review data
yelp = pd.read_csv('model_training_data/yelp.csv')
yelp = yelp.sample(frac=1)

# Read in Amazon review data
data_automotive = pd.read_json('model_training_data/reviews_Automotive_5.json', lines=True)
data_automotive = data_automotive.sample(frac=1)

data_office_products = pd.read_json('model_training_data/reviews_Office_Products_5.json', lines=True)
data_office_products  = data_office_products.sample(frac=1)


#=====================================================================================================
# Here we read in the twitter sentiment data and map it to our prefered labels
#=====================================================================================================

def func(true_sentiment):
    
    y_true = None
    
    if(true_sentiment==0):
        y_true = 1
    elif(true_sentiment==4):
        y_true = 5
    
    return y_true

# Specify the name of the file, along with the name of the columns
cols = ['sentiment','id','date','query_string','user','text']

# Read in the data, randomly shuffle and reset the index
data_twitter=pd.read_csv('model_training_data/training.1600000.processed.noemoticon.csv',header=None, names=cols,encoding = "ISO-8859-1")
data_twitter=data_twitter.sample(frac=1)
data_twitter=data_twitter.reset_index(drop=True)

# Take a subset of the data for faster evaluation
data_twitter = data_twitter[0:70000]
data_twitter["text"]=data_twitter["text"].apply(func)

#=====================================================================================================

#data_home_kitchen = pd.read_json('reviews_Home_and_Kitchen_5.json',lines=True)
#data_home_kitchen = data_home_kitchen.sample(frac=1)

# Create a new dataframe which will be used to aggregate all reviews
columns = ['review','rating']
df = pd.DataFrame(columns=columns)

#-------------------------------------------------------------------
# These are the parameters of the model we will train
#-------------------------------------------------------------------
n_reviews = 70000 # The number of reviews to use from each data sets
train_size = 0.8 # Splits data into two groups which will be further divided
random_state = 10 # For reproducibility 
#-------------------------------------------------------------------

print('---------------------------------------------')
print('Total Yelp reviews: ',len(yelp))
print('Total Amazon automotive reviews: ',len(data_automotive))
print('Total Amazon office reviews: ',len(data_office_products))
print("Total Twitter Data reviews: ", len(data_twitter))
#print('Total Amazon home and kitchen reviews: ',len(data_home_kitchen))
print('---------------------------------------------')

# Here we read in the reviews from all of the data sets and aggregate them into one data frame
for i in tqdm(range(0,min(n_reviews,len(yelp)))):
    df=df.append({"review":yelp["text"][i],"rating":yelp["stars"][i]},ignore_index=True)
    
for i in tqdm(range(0,min(n_reviews,len(data_twitter)))):
    df=df.append({"review":data_twitter["text"][i],"rating":data_twitter["sentiment"][i]},ignore_index=True)

for i in tqdm(range(0,min(n_reviews,len(data_automotive)))):
    df=df.append({"review":data_automotive['reviewText'][i],"rating": int(data_automotive["overall"][i])},ignore_index=True)

for i in tqdm(range(0,min(n_reviews,len(data_office_products)))):
    df=df.append({"review":data_office_products['reviewText'][i],"rating": int(data_office_products["overall"][i])},ignore_index=True)

#for i in tqdm(range(0,min(n_reviews,len(data_home_kitchen)))):
#    df=df.append({"review":data_home_kitchen['reviewText'][i],"rating": int(data_home_kitchen["overall"][i])},ignore_index=True)    
    
# Find the specific star rated reviews
df_1_star = df[df["rating"]== 1]
df_2_star = df[df["rating"]== 2]
df_3_star = df[df["rating"]== 3]
df_4_star = df[df["rating"]== 4]
df_5_star = df[df["rating"]== 5]

print("")
print('Raw reviews: ')
print("1-Star reviews:",len(df_1_star))
print("2-Star reviews:",len(df_2_star))
print("3-Star reviews:",len(df_3_star))
print("4-Star reviews:",len(df_4_star))
print("5-Star reviews:",len(df_5_star))
print('\n')

# Aggregate all of the review data
# find the minimum of the extreme star reviews which will be used for training
#min_stars= min(df_1_star.count()[0],df_2_star.count()[0],df_3_star.count()[0],df_4_star.count()[0],df_5_star.count()[0])
min_stars= min(df_1_star.count()[0],df_5_star.count()[0])


# Randomize the data set
df = df.sample(frac=1)

df_1_star = df[df["rating"]== 1].head(min_stars)
df_2_star = df[df["rating"]== 2].head(min_stars)
df_3_star = df[df["rating"]== 3].head(min_stars)
df_4_star = df[df["rating"]== 4].head(min_stars)
df_5_star = df[df["rating"]== 5].head(min_stars)

print("")
print('Balaced reviews: ')
print("1-Star reviews:",len(df_1_star))
print("2-Star reviews:",len(df_2_star))
print("3-Star reviews:",len(df_3_star))
print("4-Star reviews:",len(df_4_star))
print("5-Star reviews:",len(df_5_star))
print('\n')

# Combine all of the dataframes
frames = [df_1_star,df_2_star,df_3_star,df_4_star,df_5_star]

df_balanced = pd.concat(frames)

# Shuffle the data frame to randomize everything
df_balanced = df_balanced.sample(frac=1)
df_balanced.index = range(0,df_balanced.shape[0]) # Relabel the indices
df_balanced.head()

print('The total number of reviews: ', len(df_balanced))

  0%|          | 16/10000 [00:00<01:03, 156.44it/s]

---------------------------------------------
Total Yelp reviews:  10000
Total Amazon automotive reviews:  20473
Total Amazon office reviews:  53258
Total Twitter Data reviews:  70000
---------------------------------------------


100%|██████████| 10000/10000 [01:20<00:00, 124.16it/s]
100%|██████████| 70000/70000 [13:45<00:00, 84.81it/s] 
100%|██████████| 20473/20473 [16:51<00:00, 20.24it/s]
100%|██████████| 53258/53258 [1:05:42<00:00, 13.51it/s]



Raw reviews: 
1-Star reviews: 2421
2-Star reviews: 3259
3-Star reviews: 7951
4-Star reviews: 57340
5-Star reviews: 47592



Balaced reviews: 
1-Star reviews: 2421
2-Star reviews: 2421
3-Star reviews: 2421
4-Star reviews: 2421
5-Star reviews: 2421


The total number of reviews:  12105


# Define the preprocessing functions, apply them to the balanced data set, and store the cleaned up documents

In [4]:
from process_text import clean_up_text 


# Let us now generate a subset of the data to create a vocabulary with
n_sample = len(df_balanced)
dict_sample = []

# Now we preprocess the entire balanced data set
for i in tqdm(range(0,n_sample)):
    sentence = str(df_balanced["review"][i])
    #print(sentence)
    dict_sample.append(clean_up_text(sentence))

# Convert to numpy array
dict_sample = np.asarray(dict_sample)

[nltk_data] Downloading package wordnet to /home/javier/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/javier/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/javier/nltk_data...


  0%|          | 0/12105 [00:00<?, ?it/s]

[nltk_data]   Package punkt is already up-to-date!


100%|██████████| 12105/12105 [03:46<00:00, 53.49it/s]


# Using preprocessed text, custom stop words, applying TFIDF transform build a vocabulary and generate a vectorizer

In [5]:
# we can pass in the same preprocessing parameters
from sklearn.feature_extraction.text import TfidfVectorizer
import operator

custom_stop_words = []
with open( "stopwords.txt", "r" ) as fin:
    for line in fin.readlines():
        custom_stop_words.append( line.strip() )

#----------------------------------------------------------------------------------------------
# Here we vectorize the dictionary sample
vectorizer = TfidfVectorizer(stop_words = custom_stop_words,min_df = 20)
A = vectorizer.fit_transform(dict_sample)
print( "Created %d X %d TF-IDF-normalized document-term matrix" % (A.shape[0], A.shape[1]) )
print("")
#----------------------------------------------------------------------------------------------

# extract the resulting vocabulary
terms = vectorizer.get_feature_names()

def rank_terms( A, terms ):
    # get the sums over each column
    sums = A.sum(axis=0)
    # map weights to the terms
    weights = {}
    for col, term in enumerate(terms):
        weights[term] = sums[0,col]
    # rank the terms by their weight over all documents
    return sorted(weights.items(), key=operator.itemgetter(1), reverse=True)

ranking = rank_terms( A, terms )
for i, pair in enumerate( ranking[0:20] ):
    print( "%02d. %s (%.2f)" % ( i+1, pair[0], pair[1] ) )
    
# Write the vocabulary to a file
f = open("vocabulary.txt", "w")
for i, pair in enumerate( ranking):
    f.write( "%02d. %s (%.2f) \n" % ( i+1, pair[0], pair[1] ) )
f.close()

Created 12105 X 2880 TF-IDF-normalized document-term matrix

01. nan (1477.05)
02. not (483.40)
03. get (313.45)
04. work (291.72)
05. like (283.83)
06. printer (279.66)
07. good (252.83)
08. can (242.46)
09. go (239.06)
10. product (225.25)
11. time (214.87)
12. print (213.22)
13. great (211.87)
14. no (208.41)
15. paper (204.17)
16. dont (200.01)
17. buy (197.94)
18. ink (194.55)
19. need (188.68)
20. look (184.50)


# Take the five-start ratings and condense into two outputs; +1 or -1, where +1 is the positive review, and -1 is a negative review

In [7]:
from sklearn.model_selection import train_test_split

# Now we must transform our original review data into a feature Matrix
y = df_balanced["rating"]

# Now we must transform our original review data into a feature Matrix

X = []
y = []

# converting the data frame into a feature matrix
for i in tqdm(range(0,df_balanced.shape[0])):
    
    r = df_balanced["rating"][i]
    
    if(r==1 or r==2):
        yi =-1
        X.append(df_balanced["review"][i])
        y.append(yi)
    elif(r==5 or r==4):
        yi=1
        X.append(df_balanced["review"][i])
        y.append(yi)
print('Balanced Dataset: ',len(y))        

## We store the remaining data 
df_final = df_balanced.drop(df_balanced[(df_balanced.rating == 2) | (df_balanced.rating == 3)| (df_balanced.rating== 4)].index)


X = np.asarray(X)
y = np.asarray(y)

# Converts the Document matrix consisting of strings into arrays according to the dictionary that we built previously
X= vectorizer.transform(X)

print('X- Feature matrix: ', X.shape)

# Now we split the data into a training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, random_state=random_state)

print('X_train: ', len(y_train))
print('X_test: ', len(y_test))

100%|██████████| 12105/12105 [00:01<00:00, 10129.00it/s]


Balanced Dataset:  9684
X- Feature matrix:  (9684, 2880)
X_train:  7747
X_test:  1937


# Fit and test different models

In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from numpy.random import randint
from textblob import TextBlob

# Now we train different models
nb = MultinomialNB()
dt = DecisionTreeClassifier(random_state=0)
rf = RandomForestClassifier(max_depth=20, random_state=0)
lr = LogisticRegression(multi_class='multinomial',solver='newton-cg')

# Fit the different machine learning models
nb.fit(X_train, y_train)
dt.fit(X_train, y_train)
rf.fit(X_train, y_train)
lr.fit(X_train, y_train)

# The baseline model generates (-1 or 1) randomly
preds_bl = np.ones(len(y_test))-2*randint(0,2,len(y_test))
preds_nb = nb.predict(X_test)
preds_dt = dt.predict(X_test)
preds_rf = rf.predict(X_test)
preds_lr = lr.predict(X_test)

print('================================================================\n')
print('Train data set size: ', len(y_train),'\n')
print('Test data set size: ', len(y_test),'\n')
print('================================================================\n')
print("Baseline Model: \n",classification_report(y_test,preds_bl))
print('================================================================\n')
print("Naive Bayes: \n" ,classification_report(y_test,preds_nb))
print('================================================================\n')
print("Desicion Tree: \n",classification_report(y_test,preds_dt))
print('================================================================\n')
print("Random Forests: \n",classification_report(y_test,preds_rf))
print('================================================================\n')
print("Logistic Regression: \n",classification_report(y_test,preds_lr))
print('================================================================\n')


Train data set size:  7747 

Test data set size:  1937 


Baseline Model: 
              precision    recall  f1-score   support

         -1       0.49      0.51      0.50       954
          1       0.51      0.49      0.50       983

avg / total       0.50      0.50      0.50      1937


Naive Bayes: 
              precision    recall  f1-score   support

         -1       0.80      0.84      0.82       954
          1       0.84      0.79      0.81       983

avg / total       0.82      0.82      0.82      1937


Desicion Tree: 
              precision    recall  f1-score   support

         -1       0.74      0.73      0.73       954
          1       0.74      0.75      0.75       983

avg / total       0.74      0.74      0.74      1937


Random Forests: 
              precision    recall  f1-score   support

         -1       0.74      0.81      0.77       954
          1       0.79      0.73      0.76       983

avg / total       0.77      0.77      0.77      1937


Logistic 

## Now we save the models and the text used to generate the vectorizer

In [9]:
from sklearn.externals import joblib

# Here we save the model
terms = vectorizer.get_feature_names()

filename1 = 'lr_sentiment_model.sav'
filename2 = 'nb_sentiment_model.sav'
pickle.dump(lr, open(filename1, 'wb'))
pickle.dump(nb, open(filename2, 'wb'))
joblib.dump((X,terms,dict_sample), "articles-raw.pkl")

['articles-raw.pkl']

## Load the model, test the models on some example text

In [10]:
# Load the models and use them to make a prediction
import pickle
from sklearn.externals import joblib

# The names of the files containing the weights of the model
filename1 = 'lr_sentiment_model.sav'
filename2 = 'nb_sentiment_model.sav'

# Now we load in the trained models
loaded_lr = pickle.load(open(filename1, 'rb'))
loaded_nb = pickle.load(open(filename2, 'rb'))

sample_text = 'The best car is here'
s = str(clean_up_text(sample_text))

print(sample_text)

# Transform the text
X0 = vectorizer.transform([s])
print(X0)
preds_nb = loaded_nb.predict(X0)
preds_lr = loaded_lr.predict(X0)

prob_nb = np.max(loaded_nb.predict_proba(X0))
prob_lr = np.max(loaded_lr.predict_proba(X0))

print("")
print('lr: ',preds_lr,prob_lr)
print('nb: ',preds_nb,prob_nb)
print('TxtBlob: ', TextBlob(s).sentiment.polarity)

The best car is here
  (0, 357)	0.729356165256
  (0, 234)	0.684134185817

lr:  [1] 0.788777675056
nb:  [1] 0.620906800591
TxtBlob:  1.0
