In [1]:
import re, collections
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from spacy.en import English ##Note you'll need to install Spacy and download its dependencies
parser = English()
import string

In [2]:
# Custom stoplist: the union of NLTK's English stopwords, scikit-learn's
# English stopwords, and a few extra fragments ("n't", "'s", "'m", "ca").
STOPLIST = (
    set(stopwords.words('english'))
    | set(ENGLISH_STOP_WORDS)
    | {"n't", "'s", "'m", "ca"}
)
# Symbols we don't care about: every character of string.punctuation as its
# own entry, followed by a handful of multi-character / typographic extras.
SYMBOLS = list(string.punctuation) + ["-----", "---", "...", "“", "”", "'ve"]

In [3]:
# A custom function to clean the text before sending it into the vectorizer
def cleanText(text):
    """Normalise a raw review/tweet string before vectorisation.

    Steps, in order:
      1. strip the ends and turn newlines / carriage returns into spaces;
      2. decode the HTML entities ``&amp;``, ``&gt;`` and ``&lt;``;
      3. replace Twitter @mentions with the placeholder ``@MENTION``;
      4. drop every character that is not an ASCII letter or a space;
      5. lowercase the result.

    The entity decoding has to happen before step 4. Previously it ran
    afterwards, by which point ``&`` and ``;`` had already been removed, so
    ``&amp;`` ended up as the token ``amp`` and the replacement never matched.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        Lowercase text containing only the letters a-z and spaces.
    """
    # get rid of newlines
    text = text.strip().replace("\n", " ").replace("\r", " ")

    # replace HTML symbols (must precede the letters-only filter below)
    text = text.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")

    # replace twitter @mentions; the "@" is removed by the filter below, so
    # each mention ends up as the single token "mention"
    text = re.sub(r"@[a-z0-9_]{1,15}", "@MENTION", text, flags=re.IGNORECASE)

    # keep only ASCII letters and spaces
    text = re.sub('[^a-zA-Z ]', '', text)

    # lowercase
    text = text.lower()
#     text = str(TextBlob(text).correct())
    return text

# A custom function to tokenize the text using spaCy
# and convert to lemmas
def tokenizeText(sample):
    """Tokenize `sample` with the module-level spaCy `parser` and return its filtered lemmas.

    Each token is replaced by its lowercased, stripped lemma; tokens whose
    lemma is the "-PRON-" placeholder keep their lowercased surface form
    instead. Entries found in STOPLIST or SYMBOLS, and the whitespace-only
    entries "", " ", "\\n" and "\\n\\n", are dropped.

    Returns a list of strings in original token order.
    """
    # lemmatize
    lemmas = [
        tok.lower_ if tok.lemma_ == "-PRON-" else tok.lemma_.lower().strip()
        for tok in parser(sample)
    ]

    # stoplist words and symbols, then discard whitespace-only leftovers
    whitespace_tokens = ("", " ", "\n", "\n\n")
    return [
        lemma
        for lemma in lemmas
        if lemma not in STOPLIST
        and lemma not in SYMBOLS
        and lemma not in whitespace_tokens
    ]

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Settings shared by both vectorizers: unigrams through trigrams, min_df=3,
# at most 3,000 features, and tokenizeText as the tokenizer.
_shared_vectorizer_kwargs = dict(
    ngram_range=(1, 3),
    min_df=3,
    max_features=3000,
    tokenizer=tokenizeText,
)
vectorizer = CountVectorizer(**_shared_vectorizer_kwargs)
tfvectorizer = TfidfVectorizer(sublinear_tf=True, **_shared_vectorizer_kwargs)

### Take note of the assumptions made in the vectorizer specifications. Two kinds of vectorizers are initialized: count and tf-idf. Both only consider n-grams that appear in at least three reviews (`min_df=3`). Both are also limited to the 3,000 n-grams that appear most often in the reviews (`max_features=3000`), and both extract only unigrams through trigrams (`ngram_range=(1,3)`). You can edit any of these parameters.

### You can switch between count and tfidf vectorizers by changing between "vectorizer" and "tfvectorizer" in one of the cells below

In [5]:
import pandas as pd
import pickle
# Load the pickled review/business object (used as a DataFrame in the cells below).
# NOTE(review): this is an absolute, machine-specific path; the cell will not
# run on another machine without editing it.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that come from a trusted source.
with open("C:/Users/kenndanielso/Documents/Github/mcnulty_yelp/data/review_business_df.pkl", 'rb') as picklefile: 
    review_business_df = pickle.load(picklefile)

In [6]:
# Draw a reproducible 10,000-row sample, drop rows with missing values, and
# renumber the remaining rows from 0.
sample_df = (
    review_business_df
    .sample(n=10000, random_state=1)
    .dropna()
    .reset_index(drop=True)
)

### Note that I've only extracted a sample of 10,000 reviews. This is just to keep the run time tractable, since lemmatizing takes a really long time. 10,000 reviews is also large enough relative to the 3,000 features (discussed above).

In [56]:
# Keep only the first two columns of the sample (used below as the review text
# and its star rating) and clean the text.
# .copy() makes short_df an independent frame, so the column assignment cannot
# raise SettingWithCopyWarning or write through to sample_df. Bracket
# assignment is used because attribute assignment is ambiguous on frames.
short_df = sample_df.iloc[:, 0:2].copy()
short_df['text'] = short_df['text'].apply(cleanText)

In [57]:
## Gets the count of each word in each sentence
# Each call fits the vectorizer on the cleaned text and returns the matching
# document-term matrix: one from the count vectorizer, one from the tf-idf one.
# Every document goes through tokenizeText here, once per vectorizer.
countfeature = vectorizer.fit_transform(short_df.text)
tffeature = tfvectorizer.fit_transform(short_df.text)

In [58]:
## Turns count/tfidf matrix into a dataframe
# `.A` converts the fitted matrix to a dense array; the columns are labelled
# with the vectorizer's feature names.
countfeaturedf = pd.DataFrame(countfeature.A, columns=vectorizer.get_feature_names())
tffeaturedf = pd.DataFrame(tffeature.A, columns=tfvectorizer.get_feature_names())

In [59]:
## Concat Y and X
# Column-wise concat: the two short_df columns come first, followed by one
# column per vectorizer feature.
new_df_count = pd.concat((short_df,countfeaturedf),axis=1)
new_df_tf = pd.concat((short_df,tffeaturedf),axis=1)

In [60]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only added a stray "None" line after each summary.
new_df_count.info()
new_df_tf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 3002 entries, text to zucchini
dtypes: int64(3001), object(1)
memory usage: 229.0+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 3002 entries, text to zucchini
dtypes: float64(3000), int64(1), object(1)
memory usage: 229.0+ MB
None


In [61]:
## Do not run until the above cells are run
# Writes both feature frames to pickle files in the working directory; the
# following cell reads them back from the same file names.
import pickle
new_df_count.to_pickle('new_df_count.pkl')
new_df_tf.to_pickle('new_df_tf.pkl')

In [7]:
import pickle
import pandas as pd
# Reload the cached feature frames from the working directory, rebinding
# new_df_count / new_df_tf to the pickled versions.
new_df_count = pd.read_pickle("new_df_count.pkl")
new_df_tf = pd.read_pickle('new_df_tf.pkl')

In [8]:
def string(x):
    """Spell out a star rating from 1 to 5 as a lowercase English word.

    Returns "one" ... "five" for values equal to 1 ... 5, and None for
    anything else.

    NOTE: defining this function rebinds the name `string`, hiding the
    standard-library `string` module imported at the top of the notebook.
    """
    for stars, word in ((5, "five"), (4, 'four'), (3, 'three'), (2, 'two'), (1, 'one')):
        if x == stars:
            return word

In [9]:
# Replace the numeric star ratings with their word labels in both frames.
# Not idempotent: `string` returns None for anything that is not 1-5, so
# running this cell a second time would wipe the labels.
for _frame in (new_df_count, new_df_tf):
    _frame['stars_x'] = _frame['stars_x'].apply(string)

In [10]:
# Work on copies so the five-class frames stay available unchanged.
collapse_df_count, collapse_df_tf = new_df_count.copy(), new_df_tf.copy()

In [11]:
##Apply this if you want to collapse the 5-star ratings into three classes:
##1 & 2 stars, 3 stars, and 4 & 5 stars (3-star reviews can optionally be dropped afterwards)

def collapse(x):
    """Map a spelled-out star rating onto one of three coarse classes.

    'four' / 'five' -> 'four/five', 'one' / 'two' -> 'one/two',
    'three' -> 'three'. Any other value yields None.
    """
    if x in ('four', 'five'):
        return 'four/five'
    if x in ('one', 'two'):
        return 'one/two'
    if x == 'three':
        return 'three'
    return None

# Run if you want to delete 3-stars
# collapse_df_count = collapse_df_count[collapse_df_count['stars_x'] != 'three']
# collapse_df_tf = collapse_df_tf[collapse_df_tf['stars_x'] != 'three']

# Overwrite the rating column of each copy with its collapsed class label.
collapse_df_count['stars_x'] = new_df_count['stars_x'].apply(collapse)
collapse_df_tf['stars_x'] = new_df_tf['stars_x'].apply(collapse)

In [12]:
# Share of all rows that falls into each collapsed rating class.
class_counts = collapse_df_count['stars_x'].value_counts()
class_counts / len(collapse_df_count)

four/five    0.6553
one/two      0.1953
three        0.1494
Name: stars_x, dtype: float64

In [13]:
##Adds sentiment as a feature. Note that I added 1 because some algorithms won't accept negative sentiment scores
##Sentiment scores is based on TextBlob where it goes from -1.0 to 1.0 (negative to positive)
##Change the reference DF if you want to go back to using the 5-star categories

from textblob import TextBlob

def _shifted_polarity(text):
    """Return the first component of TextBlob's sentiment for `text`, plus 1."""
    return TextBlob(text).sentiment[0] + 1


collapse_df_count['senti'] = collapse_df_count['text'].apply(_shifted_polarity)
collapse_df_tf['senti'] = collapse_df_tf['text'].apply(_shifted_polarity)

# Model testing

In [14]:
##Split into train and test at 75/25
# NOTE(review): `sklearn.cross_validation` is the legacy module name; newer
# scikit-learn releases expose train_test_split from `sklearn.model_selection`
# -- confirm against the installed version.
from sklearn.cross_validation import train_test_split 
# `.values` hands the whole frame over as a single array, so every row of
# train/test carries all of the frame's columns, not just the features.
# Only the count-based frame is split here; collapse_df_tf is not used.
train, test = train_test_split(collapse_df_count.values,test_size = 0.25,random_state=1)

In [15]:
##Split X & Y
# Column 1 is used as the target; columns 2 onward are used as the features
# (column 0 is left out of both).
X_train, Y_train = train[:, 2:], train[:, 1]
X_test, Y_test = test[:, 2:], test[:, 1]

In [16]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
import numpy as np

In [None]:
## KNN
from sklearn.neighbors import KNeighborsClassifier
# Grid-search k, the vote weighting and the distance metric with 5-fold CV.
# 'euclidean' is no longer listed: KNeighborsClassifier's 'minkowski' metric
# with its default p=2 is the Euclidean distance, so listing both fitted the
# same models twice and made this already slow search a third larger.
knn_param_grid = [{
    'n_neighbors': [1, 2, 3, 4, 5],
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski', 'manhattan'],
}]
knn = GridSearchCV(KNeighborsClassifier(), knn_param_grid, cv=5, n_jobs=-1)
knn.fit(X_train, Y_train)
knn_Y_pred = knn.predict(X_test)
print(knn.best_estimator_)
# Accuracy = fraction of held-out rows whose predicted class matches the label.
print("KNN Accuracy: ", np.mean(knn_Y_pred == np.array(Y_test)))
print(confusion_matrix(Y_test, knn_Y_pred))
print(classification_report(Y_test, knn_Y_pred))

In [40]:
## NaiveBayes
from sklearn.naive_bayes import MultinomialNB
# Grid-search the smoothing parameter alpha with 5-fold CV on the training split.
nb_param_grid = [{'alpha':[0.01,0.1,0.25,0.5,0.75,1.0]}]
nb = GridSearchCV(MultinomialNB(),nb_param_grid,cv=5,n_jobs=-1)
nb.fit(X_train,Y_train)
# The fitted `nb` is reused further down to score the tweets.
nb_Y_pred = nb.predict(X_test)
print(nb.best_estimator_)
# Accuracy = fraction of held-out rows whose predicted class matches the label.
print("NB Accuracy: ",np.mean(nb_Y_pred == np.array(Y_test)))
print(confusion_matrix(Y_test,nb_Y_pred))
print(classification_report(Y_test,nb_Y_pred))

MultinomialNB(alpha=0.75, class_prior=None, fit_prior=True)
NB Accuracy:  0.766
[[1442   65  139]
 [  71  326   81]
 [ 156   73  147]]
             precision    recall  f1-score   support

  four/five       0.86      0.88      0.87      1646
    one/two       0.70      0.68      0.69       478
      three       0.40      0.39      0.40       376

avg / total       0.76      0.77      0.76      2500



In [41]:
## Logistic
from sklearn.linear_model import LogisticRegression
# Grid-search the inverse regularisation strength C, the penalty type and the
# class weighting with 5-fold CV on the training split.
# NOTE(review): no random_state is passed to LogisticRegression -- confirm
# whether the solver in use needs one for repeatable results.
log_param_grid = [{'C':[0.01,0.1,1,10,100,1000], 'penalty':['l1','l2'],'class_weight':[None,'balanced']}]
log = GridSearchCV(LogisticRegression(),log_param_grid,cv=5,n_jobs=-1)
log.fit(X_train,Y_train)
log_Y_pred = log.predict(X_test)
print(log.best_estimator_)
# Accuracy = fraction of held-out rows whose predicted class matches the label.
print("Logistc Accuracy: ",np.mean(log_Y_pred == np.array(Y_test)))
print(confusion_matrix(Y_test,log_Y_pred))
print(classification_report(Y_test,log_Y_pred))

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Logistc Accuracy:  0.7872
[[1571   29   46]
 [ 118  319   41]
 [ 224   74   78]]
             precision    recall  f1-score   support

  four/five       0.82      0.95      0.88      1646
    one/two       0.76      0.67      0.71       478
      three       0.47      0.21      0.29       376

avg / total       0.76      0.79      0.76      2500



In [47]:
## Linear SVC
from sklearn.svm import LinearSVC
# Grid-search C, the loss function and the class weighting with 5-fold CV on
# the training split.
# NOTE(review): no random_state is passed to LinearSVC -- confirm whether
# repeated runs reproduce the reported numbers exactly.
svcl_param_grid = [{'C':[0.01,0.1,1,10,100], 'loss':['hinge','squared_hinge'],'class_weight':[None,'balanced']}]
svcl = GridSearchCV(LinearSVC(),svcl_param_grid,cv=5,n_jobs=-1)
svcl.fit(X_train,Y_train)
svcl_Y_pred = svcl.predict(X_test)
# Accuracy = fraction of held-out rows whose predicted class matches the label.
print("SVC Linear Accuracy: ",np.mean(svcl_Y_pred == np.array(Y_test)))
print(svcl.best_estimator_)
print(confusion_matrix(Y_test,svcl_Y_pred))
print(classification_report(Y_test,svcl_Y_pred))

SVC Linear Accuracy:  0.7888
LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
[[1572   33   41]
 [ 114  329   35]
 [ 223   82   71]]
             precision    recall  f1-score   support

  four/five       0.82      0.96      0.88      1646
    one/two       0.74      0.69      0.71       478
      three       0.48      0.19      0.27       376

avg / total       0.76      0.79      0.76      2500



In [None]:
## Non-linear SVC
from sklearn.svm import SVC
# Grid-search kernel, gamma and class weighting with 5-fold CV.
# `degree` only affects the polynomial kernel, so it is searched for 'poly'
# alone. Crossing it with 'rbf' and 'sigmoid', as the single-dict grid did,
# fitted every one of those configurations twice for no gain.
_svc_gammas = [0.001, 0.01, 0.1, 1, 2, 3]
_svc_class_weights = [None, 'balanced']
svcrbf_param_grid = [
    {
        'kernel': ['rbf', 'sigmoid'],
        'gamma': _svc_gammas,
        'class_weight': _svc_class_weights,
    },
    {
        'kernel': ['poly'],
        'degree': [2, 3],
        'gamma': _svc_gammas,
        'class_weight': _svc_class_weights,
    },
]
svcrbf = GridSearchCV(SVC(), svcrbf_param_grid, cv=5, n_jobs=-1)
svcrbf.fit(X_train, Y_train)
svcrbf_Y_pred = svcrbf.predict(X_test)
# Accuracy = fraction of held-out rows whose predicted class matches the label.
print("SVC RBF Accuracy: ", np.mean(svcrbf_Y_pred == np.array(Y_test)))
# Report the winning configuration, as the other model cells do.
print(svcrbf.best_estimator_)
print(confusion_matrix(Y_test, svcrbf_Y_pred))
print(classification_report(Y_test, svcrbf_Y_pred))

In [None]:
## Random Forests
from sklearn.ensemble import RandomForestClassifier
# Grid-search the split criterion and class weighting with 5-fold CV.
rf_param_grid = [{'criterion': ['gini', 'entropy'], 'class_weight': [None, 'balanced']}]
# Random forests are stochastic. The seed is fixed (1, matching the sampling
# and train/test split above) so the search and the reported scores are
# reproducible from run to run.
rf = GridSearchCV(RandomForestClassifier(random_state=1), rf_param_grid, cv=5, n_jobs=-1)
rf.fit(X_train, Y_train)
rf_Y_pred = rf.predict(X_test)
# Accuracy = fraction of held-out rows whose predicted class matches the label.
print("Random Forests Accuracy: ", np.mean(rf_Y_pred == np.array(Y_test)))
# Report the winning configuration, as the other model cells do.
print(rf.best_estimator_)
print(confusion_matrix(Y_test, rf_Y_pred))
print(classification_report(Y_test, rf_Y_pred))

# Read twitter feeds

In [28]:
# NOTE(review): absolute, machine-specific path; the cell will not run on
# another machine without editing it.
tweet_df = pd.read_pickle('C:/Users/kenndanielso/Documents/Github/mcnulty_yelp/data/tweets_clean.pkl')
# Drop rows whose `text` duplicates an earlier row, then renumber rows from 0.
tweet_df = tweet_df.drop_duplicates('text').reset_index(drop=True)

# Encode the tweets with the count vectorizer so they share the review feature columns.
# NOTE(review): transform() requires `vectorizer` to have been fitted (the
# fit_transform cell) in the current kernel session.
# NOTE(review): the review text went through cleanText before fitting, but this
# cell does not call it on the tweets -- confirm tweets_clean.pkl was cleaned
# the same way.
tweet_sparse_matrix = vectorizer.transform(tweet_df.text)
tweet_count_df = pd.DataFrame(tweet_sparse_matrix.A, columns=vectorizer.get_feature_names())

In [32]:
# tweet_count_df.to_pickle("tweet_count_df.pkl")
# Reload the cached tweet count features from the working directory. This
# rebinds tweet_count_df, replacing any value computed earlier in the session.
tweet_count_df = pd.read_pickle("tweet_count_df.pkl")

In [33]:
import pandas as pd
# Tweet columns and count features side by side, plus the +1-shifted
# TextBlob sentiment score as the last column.
tweet_df2 = pd.concat([tweet_df, tweet_count_df], axis=1)
tweet_senti = tweet_df['text'].apply(lambda tweet: TextBlob(tweet).sentiment[0] + 1)
tweet_df2['senti'] = tweet_senti

In [34]:
# The model features are the trailing block of tweet_df2: every column of
# tweet_count_df followed by 'senti'. Counting from the end replaces the
# hard-coded offset 6, which was only right while tweet_df had exactly six
# columns when tweet_df2 was built.
X_tweet = tweet_df2.iloc[:, -(tweet_count_df.shape[1] + 1):]

In [35]:
## Naive Bayes
# Score the tweets with the grid-searched Naive Bayes model `nb`: one
# predicted class per tweet, plus the per-class probabilities.
# NOTE(review): `nb` must have been fitted in the current kernel session.
tweet_pred = nb.predict(X_tweet)
tweet_pred_prob = nb.predict_proba(X_tweet)

In [36]:
# Adds the predicted class to tweet_df in place.
tweet_df['pred'] = tweet_pred

# NOTE(review): the column labels assume predict_proba's columns come in the
# order four/five, one/two, three -- confirm against nb.classes_.
tweet_prob_df = pd.DataFrame(tweet_pred_prob,columns = ['prob_4/5','prob_1/2','prob_3'])
# Rebinds tweet_df with the probability columns appended; running this cell
# again would append a second set of them.
tweet_df = pd.concat((tweet_df,tweet_prob_df),axis =1)


In [37]:
# Persist the tweets together with their predictions to the working directory.
tweet_df.to_pickle("tweet_df_pred.pkl")

In [40]:
import pandas as pd
# Read the saved predictions back from the working directory.
tweet_df_pred = pd.read_pickle("tweet_df_pred.pkl")

In [41]:
# Export the predictions as CSV in the working directory (the index is
# written too, since to_csv is called with its defaults).
tweet_df_pred.to_csv("tweet_df_pred.csv")