In [1]:
# Read in the Yelp review data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords


In [2]:
# Read in the Yelp review data
yelp = pd.read_csv('yelp.csv')

# Read in Amazon review data
data_automotive = pd.read_json('reviews_Automotive_5.json', lines=True)
data_office_products = pd.read_json('reviews_Office_Products_5.json', lines=True)

# Combine the reviews into [ID,text,review]
columns = ['review','rating']
df = pd.DataFrame(columns=columns)


print('---------------------------------------------')
print(len(yelp))
print(len(data_automotive))
print(len(data_office_products))
print('---------------------------------------------')

for i in range(0,5000):
    df=df.append({"review":yelp["text"][i],"rating":yelp["stars"][i]},ignore_index=True)

for i in range(0,5000):
    df=df.append({"review":data_automotive['reviewText'][i],"rating": int(data_automotive["overall"][i])},ignore_index=True)

for i in range(0,5000):
    df=df.append({"review":data_office_products['reviewText'][i],"rating": int(data_office_products["overall"][i])},ignore_index=True)
    

df_1_star = df[df["rating"]== 1]
df_2_star = df[df["rating"]== 2]
df_3_star = df[df["rating"]== 3]
df_4_star = df[df["rating"]== 4]
df_5_star = df[df["rating"]== 5]

min_stars= min(df_1_star.count()[0],df_2_star.count()[0],df_3_star.count()[0],df_4_star.count()[0],df_5_star.count()[0])

# Randomize the data set
df = df.sample(frac=1)

df_1_star = df[df["rating"]== 1].head(min_stars)
df_2_star = df[df["rating"]== 2].head(min_stars)
df_3_star = df[df["rating"]== 3].head(min_stars)
df_4_star = df[df["rating"]== 4].head(min_stars)
df_5_star = df[df["rating"]== 5].head(min_stars)

print("")
print("1-Star reviews:",len(df_1_star))
print("2-Star reviews:",len(df_2_star))
print("3-Star reviews:",len(df_3_star))
print("4-Star reviews:",len(df_4_star))
print("5-Star reviews:",len(df_5_star))


# Combine all of the dataframes
frames = [df_1_star,df_2_star,df_3_star,df_4_star,df_5_star]

df_balanced = pd.concat(frames)

# Shuffle the data frame to randomize everything
df_balanced = df_balanced.sample(frac=1)
df_balanced.index = range(0,df_balanced.shape[0]) # Relabel the indices
df_balanced.head()

print(len(df_balanced))

---------------------------------------------
10000
20473
53258
---------------------------------------------

1-Star reviews: 575
2-Star reviews: 575
3-Star reviews: 575
4-Star reviews: 575
5-Star reviews: 575
2875


In [3]:
# Now we need to process the text. We define here a function that will remove the punctuation and stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import string
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import re
import operator
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter 

# In case we dont have the nltk stopwords documents
nltk.download("stopwords")

ps = PorterStemmer()

@np.vectorize
def preprocess(sentence):
    sentence = sentence.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(sentence)
    words = filter(lambda x: x not in stopwords.words('english'), tokens)
    # Remove punctuation
    #words = re.sub(r'[^\w\s]','',words)
    #words = [char for char in text if char not in string.punctuation]
    return " ".join(words)


ps = PorterStemmer()
wnl = WordNetLemmatizer()

def get_pos( word ):
    '''
    Part-Of-Speech Tagger
    '''
    w_synsets = wordnet.synsets(word)
    
    
    # n-noun, v-verb, a-adjective, r-
    pos_counts = Counter()
    pos_counts["n"] = len(  [ item for item in w_synsets if item.pos()=="n"]  )
    pos_counts["v"] = len(  [ item for item in w_synsets if item.pos()=="v"]  )
    pos_counts["a"] = len(  [ item for item in w_synsets if item.pos()=="a"]  )
    pos_counts["r"] = len(  [ item for item in w_synsets if item.pos()=="r"]  )
    
    most_common_pos_list = pos_counts.most_common(3)
    return most_common_pos_list[0][0]

def clean_up_tweet(text):
    # Remove @ from the string
    s=re.sub(r'@[A-Za-z0-9]+','',text)    
    
    # Remove URLS
    s=re.sub('https?://[A-Za-z0-9./]+','',s)
    
    # Set to lower case
    s=s.lower()
    
    # Remove numbers
    s= re.sub(r'[0-9]+', '', s) 
    
    # Remove punctuation
    s = re.sub(r'[^\w\s]','',s)
    
    # Remove underscore
    s=s.replace("_", " ")
    
    # Remove RT from the tweet
    s=s.replace("rt", "")
    
    # Tokenize the words
    words = word_tokenize(s)
    
    # Lemmatize the words
    s_stem = ''
    for wi in words:
        #wi = ps.stem(wi)
        wi = wnl.lemmatize(wi,get_pos(wi))
        s_stem += ' ' + wi
    
    return s_stem


# Let us now generate a subset of the data to create a vocabulary with
n_sample = len(df_balanced)
n_features = 1000 # Number of features to use for constructing the dictionary
dict_sample = []

# Now we preprocess the entire balanced data set
# yelp_balanced.shape[0]
for i in tqdm(range(0,n_sample)):
    sentence = df_balanced["review"][i]
    dict_sample.append(clean_up_tweet(sentence))

    # Convert to numpy array
dict_sample = np.asarray(dict_sample)

  0%|          | 0/2875 [00:00<?, ?it/s]

[nltk_data] Downloading package stopwords to /home/javier/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


100%|██████████| 2875/2875 [00:53<00:00, 54.15it/s]


In [4]:
# we can pass in the same preprocessing parameters
vectorizer = TfidfVectorizer( min_df = 20)
A = vectorizer.fit_transform(dict_sample)
print( "Created %d X %d TF-IDF-normalized document-term matrix" % (A.shape[0], A.shape[1]) )
print("")

# extract the resulting vocabulary
terms = vectorizer.get_feature_names()
print("Vocabulary has %d distinct terms" % len(terms))



def rank_terms( A, terms ):
    # get the sums over each column
    sums = A.sum(axis=0)
    # map weights to the terms
    weights = {}
    for col, term in enumerate(terms):
        weights[term] = sums[0,col]
    # rank the terms by their weight over all documents
    return sorted(weights.items(), key=operator.itemgetter(1), reverse=True)

ranking = rank_terms( A, terms )
for i, pair in enumerate( ranking[0:20] ):
    print( "%02d. %s (%.2f)" % ( i+1, pair[0], pair[1] ) )

Created 2875 X 1312 TF-IDF-normalized document-term matrix

Vocabulary has 1312 distinct terms
01. the (435.55)
02. be (382.01)
03. and (270.07)
04. to (242.90)
05. it (223.35)
06. of (164.83)
07. have (154.40)
08. for (148.31)
09. this (138.79)
10. in (130.07)
11. that (122.17)
12. my (115.96)
13. you (115.57)
14. not (111.21)
15. but (110.34)
16. on (110.30)
17. with (104.73)
18. they (103.24)
19. use (86.29)
20. we (80.65)


In [5]:
from sklearn.model_selection import train_test_split

# Now we must transform our original review data into a feature Matrix
y = df_balanced["rating"]


# Now we must transform our original review data into a feature Matrix

X = []
y = []

# converting the data frame into a feature matrix
for i in range(0,df_balanced.shape[0]):
    
    r = df_balanced["rating"][i]
    
    if(r==1 or r==2):
        yi =-1
        X.append(df_balanced["review"][i])
        y.append(yi)
    elif(r==4 or r==5):
        yi=1
        X.append(df_balanced["review"][i])
        y.append(yi)

X = np.asarray(X)
y = np.asarray(y)

X = vectorizer.fit_transform(X)

# Now we split the data into a training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=101)

In [6]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from numpy.random import randint

# Now we train different models
nb = MultinomialNB()
dt = DecisionTreeClassifier(random_state=0)
rf = RandomForestClassifier(max_depth=20, random_state=0)
lr = LogisticRegression(multi_class='multinomial',solver='newton-cg')


nb.fit(X_train, y_train)
dt.fit(X_train, y_train)
rf.fit(X_train, y_train)
lr.fit(X_train, y_train)


preds_bl = randint(-1,1,len(y_test))
preds_nb = nb.predict(X_test)
preds_dt = dt.predict(X_test)
preds_rf = rf.predict(X_test)
preds_lr = lr.predict(X_test)

print('================================================================\n')
print("Baseline Model: \n",classification_report(y_test,preds_bl))
print('================================================================\n')
print("Naive Bayes: \n" ,classification_report(y_test,preds_nb))
print('================================================================\n')
print("Desicion Tree: \n",classification_report(y_test,preds_dt))
print('================================================================\n')
print("Random Forests: \n",classification_report(y_test,preds_rf))
print('================================================================\n')
print("Logistic Regression: \n",classification_report(y_test,preds_lr))
print('================================================================\n')


Baseline Model: 
              precision    recall  f1-score   support

         -1       0.50      0.49      0.49       928
          0       0.00      0.00      0.00         0
          1       0.00      0.00      0.00       912

avg / total       0.25      0.25      0.25      1840


Naive Bayes: 
              precision    recall  f1-score   support

         -1       0.80      0.74      0.77       928
          1       0.76      0.81      0.79       912

avg / total       0.78      0.78      0.78      1840


Desicion Tree: 
              precision    recall  f1-score   support

         -1       0.63      0.58      0.61       928
          1       0.61      0.66      0.63       912

avg / total       0.62      0.62      0.62      1840


Random Forests: 
              precision    recall  f1-score   support

         -1       0.69      0.62      0.65       928
          1       0.65      0.71      0.68       912

avg / total       0.67      0.67      0.67      1840


Logistic Regre

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [11]:
#print(df_balanced["review"][310])
#print(vectorizer.transform([df_balanced["review"][0]]))
#print(X_train[0])
text = "I sure loved it"
s = clean_up_tweet(text)

print(text)
print(s)

X0 = vectorizer.transform([s])
preds_lr = nb.predict(X0)
prob_lr = nb.predict_proba(X0)

print("")
print(preds_lr,prob_lr)

I sure loved it
 i sure love it

[1] [[ 0.29031207  0.70968793]]
