In [18]:
import re, collections
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from spacy.en import English ##Note you'll need to install Spacy and download its dependencies
parser = English()
import string

In [19]:
# A custom stoplist
STOPLIST = set(stopwords.words('english') + ["n't", "'s", "'m", "ca"] + list(ENGLISH_STOP_WORDS))
# List of symbols we don't care about
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-----", "---", "...", "“", "”", "'ve"]

In [20]:
# A custom function to clean the text before sending it into the vectorizer
def cleanText(text):
    # get rid of newlines
    text = text.strip().replace("\n", " ").replace("\r", " ")
    
    # replace twitter @mentions
    mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
    text = mentionFinder.sub("@MENTION", text)
    text = re.sub('[^a-zA-Z ]','',text)
    # replace HTML symbols
    text = text.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")
    
    # lowercase
    text = text.lower()

    return text

# A custom function to tokenize the text using spaCy
# and convert to lemmas
def tokenizeText(sample):

    # get the tokens using spaCy
    tokens = parser(str(TextBlob(sample).correct()))

    # lemmatize
    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas

    # stoplist the tokens
    tokens = [tok for tok in tokens if tok not in STOPLIST]

    # stoplist symbols
    tokens = [tok for tok in tokens if tok not in SYMBOLS]

    # remove large strings of whitespace
    while "" in tokens:
        tokens.remove("")
    while " " in tokens:
        tokens.remove(" ")
    while "\n" in tokens:
        tokens.remove("\n")
    while "\n\n" in tokens:
        tokens.remove("\n\n")

    return tokens

In [32]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = CountVectorizer(ngram_range=(1,3),min_df=3, max_features=1000,tokenizer=tokenizeText)
tfvectorizer = TfidfVectorizer(ngram_range=(1,3),min_df = 3,max_features=1000,tokenizer=tokenizeText,sublinear_tf=True)

### Take note of the assumptions made in the vectorizer specifications. There are two kinds of vectorizers initialized: count and tfidf. I've limited both to consider only n-grams that appear at least ten times. I've also limited the feature set into the top 1,000 n-grams that appear the most often in the reviews. Also it only extracts unigrams to trigrams. You can edit any of the parameters.

### You can switch between count and tfidf vectorizers by changing between "vectorizer" and "tfvectorizer" in one of the cells below

In [23]:
import pandas as pd
import pickle
with open("C:/Users/kennd/Documents/Github/mcnulty_yelp/data/review_business_df.pkl", 'rb') as picklefile: 
    review_business_df = pickle.load(picklefile)

In [25]:
sample_df = review_business_df.sample(10000,random_state=1).dropna().reset_index(drop=True)

### Note that I've only extract 10,000 sample reviews. This is just to manage its tracktability. Lemmatizing takes a really long time. Indirectly, 10,000 reviews is also big enough relative to the 1,000 features (discussed above).

In [35]:
short_df = sample_df.iloc[:,0:2]
short_df.text= short_df.text.apply(cleanText)

In [None]:
## Gets the count of each word in each sentence
countfeature = vectorizer.fit_transform(short_df.text)
tffeature = tfvectorizer.fit_transform(short_df.text)

In [None]:
## Turns count/tfidf matrix into a dataframe
countfeaturedf = pd.DataFrame(countfeature.A, columns=vectorizer.get_feature_names())
tffeaturedf = pd.DataFrame(tffeature.A, columns=tfvectorizer.get_feature_names())

In [None]:
## Concat Y and X
new_df_count = pd.concat((new_df,countfeaturedf),axis=1)
new_df_tf = pd.concat((new_df,tffeaturedf),axis=1)

In [None]:
print(new_df_count.info())
print(new_df_tf.info())

In [None]:
import pickle
with open('C:/Users/kennd/Documents/Github/mcnulty_yelp/data/new_df_count.pkl', 'wb') as picklefile: 
    pickle.dump(new_df_count, picklefile)
import pickle
with open('C:/Users/kennd/Documents/Github/mcnulty_yelp/data/new_df_tf.pkl', 'wb') as picklefile: 
    pickle.dump(new_df_tf, picklefile)    

In [14]:
def string(x):
    if x == 5:
        return "five"
    elif x == 4:
        return 'four'
    elif x == 3:
        return 'three'
    elif x == 2:
        return 'two'
    elif x == 1:
        return 'one'

In [15]:
new_df['stars_x'] = short_df['stars_x'].apply(string)

In [16]:
new_df.head()

Unnamed: 0,s_spell_lem,stars_x,100,able,absolutely,ache,across,actually,add,added,...,would definitely,would recommend,wrap,wrong,year,yelp,yes,yet,york,young
0,i go out to eat often and have been to many p...,four,0.0,0.0,0.0,0.0,0.126137,0.103447,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,always happysatisfied when i leave they have ...,five,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,i dont like fried chicken there i said it sou...,five,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,love this little hidden gem best go by far in...,five,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,a bit conflict about this cafe because last n...,three,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062492,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
import pickle
with open('C:/Users/kenndanielso/Documents/Github/mcnulty_yelp/data/kenn_review_df.pkl', 'wb') as picklefile: 
    pickle.dump(new_df, picklefile)

In [10]:
import pickle
import pandas as pd
import numpy as np
with open("C:/Users/kennd/Documents/GitHub/mcnulty_yelp/data/review_business_df.pkl", 'rb') as picklefile: 
    new_df = pickle.load(picklefile)

In [11]:
collapse_df = new_df.copy()

In [13]:
##Apply this if you want to collapse the 5-star ratings by removing 3-star reviews and combining 1 & 2 stars 
##and 4 & 5 ratings

def collapse(x):
    if x == 'four' or x == 'five':
        return 'four/five'
    elif x == 'one' or x == 'two':
        return 'one/two'
    elif x == 'three':
        return 'three'

collapse_df.stars_x = new_df.stars_x.apply(collapse)

In [4]:
# Run if you want to delete 3-stars
# collapse_df = collapse_df[new_df['stars_x'] != 'three']

In [14]:
collapse_df.head()

Unnamed: 0,text,stars_x,date,business_id,user_id,review_id,name,city,open,review_count,stars_y,state,latitude,longitude
0,"Mr Hoagie is an institution. Walking in, it do...",,2012-08-01,5UmKMjUEUNdYWqANhGckJw,PUFPaY9KxDAcGqfsorJp3Q,Ya85v4eqdd6k9Od8HbQjyA,Mr Hoagie,Dravosburg,True,4,4.5,PA,40.354327,-79.900706
1,Excellent food. Superb customer service. I mis...,,2014-02-13,5UmKMjUEUNdYWqANhGckJw,Iu6AxdBYGR4A0wspR9BYHA,KPvLNJ21_4wbYNctrOwWdQ,Mr Hoagie,Dravosburg,True,4,4.5,PA,40.354327,-79.900706
2,Yes this place is a little out dated and not o...,,2015-10-31,5UmKMjUEUNdYWqANhGckJw,auESFwWvW42h6alXgFxAXQ,fFSoGV46Yxuwbr3fHNuZig,Mr Hoagie,Dravosburg,True,4,4.5,PA,40.354327,-79.900706
3,This place was DELICIOUS!! My parents saw a r...,,2012-12-01,mVHrayjG3uZ_RLHkLj-AMg,LWbYpcangjBMm4KPxZGOKg,6w6gMZ3iBLGcUM4RBIuifQ,Emil's Lounge,rankin,True,20,5.0,PA,40.413464,-79.880247
4,Can't miss stop for the best Fish Sandwich in ...,,2013-03-15,mVHrayjG3uZ_RLHkLj-AMg,m1FpV3EAeggaAdfPx0hBRQ,jVVv_DA5mCDB6mediuwHAw,Emil's Lounge,rankin,True,20,5.0,PA,40.413464,-79.880247


In [6]:
##Adds sentiment as a feature. Note that I added 1 because some algorithms won't accept negative sentiment scores
##Sentiment scores is based on TextBlob where it goes from -1.0 to 1.0 (negative to positive)
##Change the reference DF if you want to go back to using the 5-star categories

from textblob import TextBlob

collapse_df['senti'] = collapse_df['s_spell_lem'].apply(lambda x: TextBlob(x).sentiment[0] + 1)

collapse_df.head()

Unnamed: 0,s_spell_lem,stars_x,100,able,absolutely,ache,across,actually,add,added,...,would recommend,wrap,wrong,year,yelp,yes,yet,york,young,senti
0,i go out to eat often and have been to many p...,four/five,0.0,0.0,0.0,0.0,0.126137,0.103447,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.234848
1,always happysatisfied when i leave they have ...,four/five,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.578571
2,i dont like fried chicken there i said it sou...,four/five,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.18203
3,love this little hidden gem best go by far in...,four/five,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.249167
4,a bit conflict about this cafe because last n...,three,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.062492,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.279425


# Model testing

In [15]:
##Split into train and test at 75/25
##Change the reference DF if you want to go back to using the 5-star categories
from sklearn.cross_validation import train_test_split 
train, test = train_test_split(collapse_df,test_size = 0.25)

In [16]:
##Split X & Y
X_train = train.iloc[:,2:]
Y_train = train.iloc[:,1]
X_test = test.iloc[:,2:]
Y_test = test.iloc[:,1]

In [17]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [17]:
## KNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train,Y_train)
knn_Y_pred = knn.predict(X_test)
print("KNN Accuracy: ",np.mean(knn_Y_pred == np.array(Y_test)))
print(classification_report(Y_test,knn_Y_pred))

KNN Accuracy:  0.5644
[[ 162  213  107]
 [  37  204  132]
 [  66  534 1045]]
             precision    recall  f1-score   support

  four/five       0.81      0.64      0.71      1645
    one/two       0.61      0.34      0.43       482
      three       0.21      0.55      0.31       373

avg / total       0.69      0.56      0.60      2500



In [10]:
## NaiveBayes
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB().fit(X_train,Y_train)
nb_Y_pred = nb.predict(X_test)
print("NB Accuracy: ",np.mean(nb_Y_pred == np.array(Y_test)))
print(classification_report(Y_test,nb_Y_pred))

NB Accuracy:  0.7848
[[ 345   50   87]
 [  80  105  188]
 [  64   69 1512]]
             precision    recall  f1-score   support

  four/five       0.85      0.92      0.88      1645
    one/two       0.71      0.72      0.71       482
      three       0.47      0.28      0.35       373

avg / total       0.76      0.78      0.77      2500



In [11]:
## Logistic
from sklearn.linear_model import LogisticRegression
log = LogisticRegression().fit(X_train,Y_train)
log_Y_pred = log.predict(X_test)
print("Logistc Accuracy: ",np.mean(log_Y_pred == np.array(Y_test)))
print(classification_report(Y_test,log_Y_pred))

Logistc Accuracy:  0.792
[[ 350   40   92]
 [  83   75  215]
 [  43   47 1555]]
             precision    recall  f1-score   support

  four/five       0.84      0.95      0.89      1645
    one/two       0.74      0.73      0.73       482
      three       0.46      0.20      0.28       373

avg / total       0.76      0.79      0.77      2500



In [14]:
## Linear SVC
from sklearn.svm import SVC
svcl = SVC(kernel='linear').fit(X_train,Y_train)
svcl_Y_pred = svcl.predict(X_test)
print("SVC Linear Accuracy: ",np.mean(svcl_Y_pred == np.array(Y_test)))
print(classification_report(Y_test,svcl_Y_pred))

SVC Linear Accuracy:  0.7824
[[ 348   55   79]
 [  87   86  200]
 [  53   70 1522]]
             precision    recall  f1-score   support

  four/five       0.85      0.93      0.88      1645
    one/two       0.71      0.72      0.72       482
      three       0.41      0.23      0.29       373

avg / total       0.75      0.78      0.76      2500



In [15]:
## RBF SVC
from sklearn.svm import SVC
svcrbf = SVC(kernel='rbf', gamma=1).fit(X_train,Y_train)
svcrbf_Y_pred = svcrbf.predict(X_test)
print("SVC RBF Accuracy: ",np.mean(svcrbf_Y_pred == np.array(Y_test)))
print(classification_report(Y_test,svcrbf_Y_pred))

SVC RBF Accuracy:  0.7404
[[ 212    2  268]
 [  24    0  349]
 [   6    0 1639]]
             precision    recall  f1-score   support

  four/five       0.73      1.00      0.84      1645
    one/two       0.88      0.44      0.59       482
      three       0.00      0.00      0.00       373

avg / total       0.65      0.74      0.67      2500



In [12]:
## Decision Tree
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier().fit(X_train,Y_train)
dt_Y_pred = dt.predict(X_test)
print("Decision Tree Accuracy: ",np.mean(dt_Y_pred == np.array(Y_test)))
print(classification_report(Y_test,dt_Y_pred))

Decision Tree Accuracy:  0.6716
[[ 241  108  133]
 [  81   99  193]
 [ 108  198 1339]]
             precision    recall  f1-score   support

  four/five       0.80      0.81      0.81      1645
    one/two       0.56      0.50      0.53       482
      three       0.24      0.27      0.25       373

avg / total       0.67      0.67      0.67      2500



In [13]:
## Random Forests
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier().fit(X_train,Y_train)
rf_Y_pred = rf.predict(X_test)
print("Random Forests Accuracy: ",np.mean(rf_Y_pred == np.array(Y_test)))
print(classification_report(Y_test,rf_Y_pred))

Random Forests Accuracy:  0.7544
[[ 279   39  164]
 [  73   44  256]
 [  43   39 1563]]
             precision    recall  f1-score   support

  four/five       0.79      0.95      0.86      1645
    one/two       0.71      0.58      0.64       482
      three       0.36      0.12      0.18       373

avg / total       0.71      0.75      0.72      2500

