# 0.0 Load modules

In [9]:
#Standard libraries
import re, collections
import numpy as np
import pandas as pd
import pickle

#Machine learning modules
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.externals import joblib

#Text processing libraries
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords ##Note you'll need to download NLTK and corpuses
from spacy.en import English ##Note you'll need to install Spacy and download its dependencies
parser = English()
import string

In [10]:
# A custom stoplist
STOPLIST = set(stopwords.words('english') + ["n't", "'s", "'m", "ca"] + list(ENGLISH_STOP_WORDS))
# List of symbols we don't care about
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-----", "---", "...", "“", "”", "'ve"]

In [11]:
# A custom function to clean the text before sending it into the vectorizer
def cleanText(text):
    # get rid of newlines
    text = text.strip().replace("\n", " ").replace("\r", " ")
    
    # replace twitter @mentions
    mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
    text = mentionFinder.sub("@MENTION", text)
    text = re.sub('[^a-zA-Z ]','',text)
    # replace HTML symbols
    text = text.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")
    
    # lowercase
    text = text.lower()
#     text = str(TextBlob(text).correct())
    return text

# A custom function to tokenize the text using spaCy
# and convert to lemmas
def tokenizeText(sample):
    # get the tokens using spaCy
    tokens = parser(sample)

    # lemmatize
    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas

    # stoplist the tokens
    tokens = [tok for tok in tokens if tok not in STOPLIST]

    # stoplist symbols
    tokens = [tok for tok in tokens if tok not in SYMBOLS]

    # remove large strings of whitespace
    while "" in tokens:
        tokens.remove("")
    while " " in tokens:
        tokens.remove(" ")
    while "\n" in tokens:
        tokens.remove("\n")
    while "\n\n" in tokens:
        tokens.remove("\n\n")

    return tokens

# 1.0 Data Munging

In [28]:
#Let's get our data. If you don't have the pickle its in the dropbox
with open("C:/Users/Administrator/Documents/Github/mcnulty_yelp/data/review_business_df.pkl", 'rb') as picklefile: 
    review_business_df = pickle.load(picklefile)

sample_df = review_business_df.sample(10000,random_state=1).dropna().reset_index(drop=True)

#### Note that I've only extract 10,000 sample reviews. This is just to manage its tracktability. Lemmatizing takes a really long time. Indirectly, 10,000 reviews is also big enough relative to the 3,000 features (refer to discussion on vectorizer below).

In [30]:
short_df = sample_df.iloc[:,0:2]
short_df.text= short_df.text.apply(cleanText)

## 1.1 Bag of Words Vectorizing

In [None]:
#Define your vectorizers. There's no need for tfidfvectorizer since we'll use tfidftransformer below
vectorizer = CountVectorizer(ngram_range=(1,3),min_df=3, max_features=3000,tokenizer=tokenizeText)
# tfvectorizer = TfidfVectorizer(ngram_range=(1,3),min_df = 3,max_features=3000,tokenizer=tokenizeText,sublinear_tf=True)
##This transforms our count vectors into tfidf vectors
tffeature = TfidfTransformer().fit_transform(countfeature)

#### Take note of the assumptions made in the vectorizer specifications. There are two kinds of vectorizers initialized: count and tfidf. I've limited both to consider only (1 to 3) n-grams that appear in at least 3 documents. I've also limited the feature set into the top 3,000 n-grams that appear the most often in the reviews. You can edit any of the parameters.

In [31]:
## Gets the count of each word in each sentence
countfeature = vectorizer.fit_transform(short_df.text)

In [35]:
## Turns count/tfidf matrices into dataframes
countfeaturedf = pd.DataFrame(countfeature.A, columns=vectorizer.get_feature_names())
tffeaturedf = pd.DataFrame(tffeature.A, columns=vectorizer.get_feature_names())

In [36]:
## Concat Y and X. Note that I'll be concatinating and transforming both the count and tfidf data frames
new_df_count = pd.concat((short_df,countfeaturedf),axis=1)
new_df_tf = pd.concat((short_df,tffeaturedf),axis=1)

In [37]:
print(new_df_count.info())
print(new_df_tf.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 3002 entries, text to zucchini
dtypes: int64(3001), object(1)
memory usage: 229.0+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 3002 entries, text to zucchini
dtypes: float64(3000), int64(1), object(1)
memory usage: 229.0+ MB
None


### Pickle Checkpoint

In [38]:
## Do not run until the above cells are run or else you'll replace your pickle files with empty files.
import pickle
new_df_count.to_pickle('C:/Users/Administrator/Documents/Github/mcnulty_yelp/data/new_df_count.pkl')
new_df_tf.to_pickle('C:/Users/Administrator/Documents/Github/mcnulty_yelp/data/new_df_tf.pkl')

In [25]:
import pickle
import pandas as pd
new_df_count = pd.read_pickle("C:/Users/Administrator/Documents/Github/mcnulty_yelp/data/new_df_count.pkl")
new_df_tf = pd.read_pickle('C:/Users/Administrator/Documents/Github/mcnulty_yelp/data/new_df_tf.pkl')

## 1.2 Transforming other features

In [26]:
##Turn star rating from integers to text
def string(x):
    if x == 5:
        return "five"
    elif x == 4:
        return 'four'
    elif x == 3:
        return 'three'
    elif x == 2:
        return 'two'
    elif x == 1:
        return 'one'

##Apply this if you want to collapse the 5-star ratings by removing 3-star reviews and combining 1 & 2 stars 
##and 4 & 5 ratings

def collapse(x):
    if x == 'four' or x == 'five':
        return 'four/five'
    elif x == 'one' or x == 'two':
        return 'one/two'
    elif x == 'three':
        return 'three'

# Run if you want to delete 3-stars
# collapse_df = collapse_df[new_df['stars_x'] != 'three']    

In [27]:
new_df_count['stars_x'] = new_df_count['stars_x'].apply(string)
new_df_tf['stars_x'] = new_df_tf['stars_x'].apply(string)
##Just assigning to new dataframes, in case we want to refer to the un-collapsed dataframes
collapse_df_count = new_df_count.copy()
collapse_df_tf = new_df_tf.copy()
collapse_df_count.stars_x = new_df_count.stars_x.apply(collapse)
collapse_df_tf.stars_x = new_df_tf.stars_x.apply(collapse)

In [28]:
collapse_df_count.stars_x.value_counts()/collapse_df_count.shape[0]

four/five    0.6553
one/two      0.1953
three        0.1494
Name: stars_x, dtype: float64

In [29]:
##Adds sentiment as a feature. Note that I added 1 because some algorithms won't accept negative sentiment scores
##Sentiment scores is based on TextBlob where it goes from -1.0 to 1.0 (negative to positive)
##Change the reference DF if you want to go back to using the 5-star categories

from textblob import TextBlob

collapse_df_count['senti'] = collapse_df_count['text'].apply(lambda x: TextBlob(x).sentiment[0] + 1)
collapse_df_tf['senti'] = collapse_df_tf['text'].apply(lambda x: TextBlob(x).sentiment[0] + 1)

In [86]:
##To be used for vectorization of non-Yelp data (or outside of those sampled)
pickle.dump(np.array(collapse_df_count.columns)[2:-1], open("C:/Users/Administrator/Documents/Github/mcnulty_yelp/data/vocab.pkl","wb")) 

# 2.0 Model Testing

## 2.1 Using count vectors

In [31]:
##Split into train and test at 75/25
from sklearn.cross_validation import train_test_split 
train, test = train_test_split(collapse_df_count.values,test_size = 0.25,random_state=1)

##Split X & Y
X_train = train[:,2:]
Y_train = train[:,1]
X_test = test[:,2:]
Y_test = test[:,1]

### 2.1.0 Neural network trial (can be ingnored and skip to 2.1.1)

In [33]:
##Integrize
def destring(x):
    if x == 'four/five':
        return 5
    elif x == 'three':
        return 3
    elif x == 'one/two':
        return 1

In [41]:
nn_df_count = collapse_df_count.copy()
nn_df_tf = collapse_df_tf.copy()
nn_df_count.stars_x = collapse_df_count.stars_x.apply(destring)
nn_df_tf.stars_x = collapse_df_tf.stars_x.apply(destring)
##Split into train and test at 75/25
from sklearn.cross_validation import train_test_split 
train, test = train_test_split(collapse_df_count.values,test_size = 0.25,random_state=1)

##Split X & Y
X_train = train[:,2:]
Y_train = train[:,1]
X_test = test[:,2:]
Y_test = test[:,1]

In [49]:
from sknn.mlp import Classifier, Layer

nn = Classifier(
    layers=[
        Layer("Tanh", units=100),
        Layer("Softmax",units=3)],
    learning_rate=0.001,
    n_iter=25)
nn.fit(X_train, Y_train)

Classifier(batch_size=1, callback=None, debug=False, dropout_rate=None,
      f_stable=0.001,
      hidden0=<sknn.nn.Layer `Tanh`: units=100, frozen=False, name='hidden0'>,
      layers=[<sknn.nn.Layer `Tanh`: units=100, frozen=False, name='hidden0'>, <sknn.nn.Layer `Softmax`: units=3, frozen=False, name='output'>],
      learning_momentum=0.9, learning_rate=0.001, learning_rule='sgd',
      loss_type=None, n_iter=25, n_stable=10, normalize=None,
      output=<sknn.nn.Layer `Softmax`: units=3, frozen=False, name='output'>,
      parameters=None, random_state=None, regularize=None, valid_set=None,

In [50]:
nn_Y_pred = nn.predict(X_test)
print("NN: ",np.mean(nn_Y_pred == np.array(Y_test)))
print(confusion_matrix(Y_test,nn_Y_pred))
print(classification_report(Y_test,nn_Y_pred))

[(2500, 3)]
NN:  0.51353088
[[1493   43  110]
 [  82  327   69]
 [ 176   79  121]]
             precision    recall  f1-score   support

  four/five       0.85      0.91      0.88      1646
    one/two       0.73      0.68      0.71       478
      three       0.40      0.32      0.36       376

avg / total       0.76      0.78      0.77      2500



In [52]:
##Split into train and test at 75/25
from sklearn.cross_validation import train_test_split 
train, test = train_test_split(collapse_df_tf.values,test_size = 0.25,random_state=1)

##Split X & Y
X_train = train[:,2:]
Y_train = train[:,1]
X_test = test[:,2:]
Y_test = test[:,1]

nn = Classifier(
    layers=[
        Layer("Tanh", units=100),
        Layer("Softmax",units=3)],
    learning_rate=0.001,
    n_iter=25)
nn.fit(X_train, Y_train)

nn_Y_pred = nn.predict(X_test)
print("NN: ",np.mean(nn_Y_pred == np.array(Y_test)))
print(confusion_matrix(Y_test,nn_Y_pred))
print(classification_report(Y_test,nn_Y_pred))

[(2500, 3)]
NN:  0.54405728
[[1577   55   14]
 [  97  368   13]
 [ 220  116   40]]
             precision    recall  f1-score   support

  four/five       0.83      0.96      0.89      1646
    one/two       0.68      0.77      0.72       478
      three       0.60      0.11      0.18       376

avg / total       0.77      0.79      0.75      2500



### 2.1.1 Relatively quick models

In [49]:
## NaiveBayes
from sklearn.naive_bayes import MultinomialNB
nb_param_grid = [{'alpha':[0.01,0.1,0.25,0.5,0.75,1.0]}]
nb = GridSearchCV(MultinomialNB(),nb_param_grid,cv=5,n_jobs=-1)
nb.fit(X_train,Y_train)
nb_Y_pred = nb.predict(X_test)
print(nb.best_estimator_)
print("NB Accuracy: ",np.mean(nb_Y_pred == np.array(Y_test)))
print(confusion_matrix(Y_test,nb_Y_pred))
print(classification_report(Y_test,nb_Y_pred))

MultinomialNB(alpha=0.75, class_prior=None, fit_prior=True)
NB Accuracy:  0.766
[[1442   65  139]
 [  71  326   81]
 [ 156   73  147]]
             precision    recall  f1-score   support

  four/five       0.86      0.88      0.87      1646
    one/two       0.70      0.68      0.69       478
      three       0.40      0.39      0.40       376

avg / total       0.76      0.77      0.76      2500



In [50]:
## Logistic
from sklearn.linear_model import LogisticRegression
log_param_grid = [{'C':[0.01,0.1,1,10], 'penalty':['l1','l2'],'class_weight':[None,'balanced']}]
log = GridSearchCV(LogisticRegression(),log_param_grid,cv=5,n_jobs=-1)
log.fit(X_train,Y_train)
log_Y_pred = log.predict(X_test)
print(log.best_estimator_)
print("Logistc Accuracy: ",np.mean(log_Y_pred == np.array(Y_test)))
print(confusion_matrix(Y_test,log_Y_pred))
print(classification_report(Y_test,log_Y_pred))

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Logistc Accuracy:  0.7872
[[1571   29   46]
 [ 118  319   41]
 [ 224   74   78]]
             precision    recall  f1-score   support

  four/five       0.82      0.95      0.88      1646
    one/two       0.76      0.67      0.71       478
      three       0.47      0.21      0.29       376

avg / total       0.76      0.79      0.76      2500



In [51]:
## Linear SVC
from sklearn.svm import LinearSVC
svcl_param_grid = [{'C':[0.01,0.1,1,10,100], 'loss':['hinge','squared_hinge'],'class_weight':[None,'balanced']}]
svcl = GridSearchCV(LinearSVC(),svcl_param_grid,cv=5,n_jobs=-1)
svcl.fit(X_train,Y_train)
svcl_Y_pred = svcl.predict(X_test)
print("SVC Linear Accuracy: ",np.mean(svcl_Y_pred == np.array(Y_test)))
print(svcl.best_estimator_)
print(confusion_matrix(Y_test,svcl_Y_pred))
print(classification_report(Y_test,svcl_Y_pred))

SVC Linear Accuracy:  0.7888
LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
[[1572   33   41]
 [ 114  329   35]
 [ 223   82   71]]
             precision    recall  f1-score   support

  four/five       0.82      0.96      0.88      1646
    one/two       0.74      0.69      0.71       478
      three       0.48      0.19      0.27       376

avg / total       0.76      0.79      0.76      2500



### 2.1.2 Relatively slow models 
Grid search will not be performed on the models below, takes too long.
(Warning: do not run unless you are using a high-powered cloud computer, for real.)

In [53]:
## KNN
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train,Y_train)
knn_Y_pred = knn.predict(X_test)
print("KNN Accuracy: ",np.mean(knn_Y_pred == np.array(Y_test)))
print(confusion_matrix(Y_test,knn_Y_pred))
print(classification_report(Y_test,knn_Y_pred))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=2, p=2,
           weights='uniform')
KNN Accuracy:  0.652
[[1457  165   24]
 [ 306  164    8]
 [ 287   80    9]]
             precision    recall  f1-score   support

  four/five       0.71      0.89      0.79      1646
    one/two       0.40      0.34      0.37       478
      three       0.22      0.02      0.04       376

avg / total       0.58      0.65      0.60      2500



In [56]:
## Non-linear SVC
from sklearn.svm import SVC
svcrbf = SVC(kernel='rbf')
svcrbf.fit(X_train,Y_train)
svcrbf_Y_pred = svcrbf.predict(X_test)
print("SVC RBF Accuracy: ",np.mean(svcrbf_Y_pred == np.array(Y_test)))
print(confusion_matrix(Y_test,svcrbf_Y_pred))
print(classification_report(Y_test,svcrbf_Y_pred))

SVC RBF Accuracy:  0.6828
[[1638    8    0]
 [ 409   69    0]
 [ 371    5    0]]
             precision    recall  f1-score   support

  four/five       0.68      1.00      0.81      1646
    one/two       0.84      0.14      0.25       478
      three       0.00      0.00      0.00       376

avg / total       0.61      0.68      0.58      2500



  'precision', 'predicted', average, warn_for)


In [57]:
## Random Forests
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train,Y_train)
rf_Y_pred = rf.predict(X_test)
print("Random Forests Accuracy: ",np.mean(rf_Y_pred == np.array(Y_test)))
print(confusion_matrix(Y_test,rf_Y_pred))
print(classification_report(Y_test,rf_Y_pred))

Random Forests Accuracy:  0.7416
[[1569   46   31]
 [ 202  254   22]
 [ 278   67   31]]
             precision    recall  f1-score   support

  four/five       0.77      0.95      0.85      1646
    one/two       0.69      0.53      0.60       478
      three       0.37      0.08      0.13       376

avg / total       0.69      0.74      0.69      2500



### 2.1.3 Pickle models

In [70]:
##Pickle models
joblib.dump(nb,'C:/Users/Administrator/Documents/Github/mcnulty_yelp/model/model_nb.pkl')
joblib.dump(log,'C:/Users/Administrator/Documents/Github/mcnulty_yelp/model/model_log.pkl')
joblib.dump(svcl,'C:/Users/Administrator/Documents/Github/mcnulty_yelp/model/model_svcl.pkl')
joblib.dump(knn,'C:/Users/Administrator/Documents/Github/mcnulty_yelp/model/model_knn.pkl')
joblib.dump(svcrbf,'C:/Users/Administrator/Documents/Github/mcnulty_yelp/model/model_svcrbf.pkl')
joblib.dump(rf,'C:/Users/Administrator/Documents/Github/mcnulty_yelp/model/model_rf.pkl');

## 2.2 Using tfidf vectors

In [61]:
##Split into train and test at 75/25
##Note the change to collase_df_tf dataframe below
from sklearn.cross_validation import train_test_split 
train, test = train_test_split(collapse_df_tf.values,test_size = 0.25,random_state=1)

##Split X & Y
X_train = train[:,2:]
Y_train = train[:,1]
X_test = test[:,2:]
Y_test = test[:,1]

### 2.2.1 Relatively fast models

In [62]:
## NaiveBayes
from sklearn.naive_bayes import MultinomialNB
nb_param_grid = [{'alpha':[0.01,0.1,0.25,0.5,0.75,1.0]}]
nb_tf = GridSearchCV(MultinomialNB(),nb_param_grid,cv=5,n_jobs=-1)
nb_tf.fit(X_train,Y_train)
nb_tf_Y_pred = nb_tf.predict(X_test)
print(nb_tf.best_estimator_)
print("NB Accuracy: ",np.mean(nb_tf_Y_pred == np.array(Y_test)))
print(confusion_matrix(Y_test,nb_tf_Y_pred))
print(classification_report(Y_test,nb_tf_Y_pred))

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)
NB Accuracy:  0.7632
[[1618   20    8]
 [ 194  262   22]
 [ 304   44   28]]
             precision    recall  f1-score   support

  four/five       0.76      0.98      0.86      1646
    one/two       0.80      0.55      0.65       478
      three       0.48      0.07      0.13       376

avg / total       0.73      0.76      0.71      2500



In [63]:
## Logistic
from sklearn.linear_model import LogisticRegression
log_param_grid = [{'C':[0.01,0.1,1,10], 'penalty':['l1','l2'],'class_weight':[None,'balanced']}]
log_tf = GridSearchCV(LogisticRegression(),log_param_grid,cv=5,n_jobs=-1)
log_tf.fit(X_train,Y_train)
log_tf_Y_pred = log_tf.predict(X_test)
print(log_tf.best_estimator_)
print("Logistc Accuracy: ",np.mean(log_tf_Y_pred == np.array(Y_test)))
print(confusion_matrix(Y_test,log_tf_Y_pred))
print(classification_report(Y_test,log_tf_Y_pred))

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Logistc Accuracy:  0.7968
[[1602   33   11]
 [ 114  343   21]
 [ 251   78   47]]
             precision    recall  f1-score   support

  four/five       0.81      0.97      0.89      1646
    one/two       0.76      0.72      0.74       478
      three       0.59      0.12      0.21       376

avg / total       0.77      0.80      0.76      2500



In [65]:
## Linear SVC
from sklearn.svm import LinearSVC
svcl_param_grid = [{'C':[0.01,0.1,1,10,100], 'loss':['hinge','squared_hinge'],'class_weight':[None,'balanced']}]
svcl_tf = GridSearchCV(LinearSVC(),svcl_param_grid,cv=5,n_jobs=-1)
svcl_tf.fit(X_train,Y_train)
svcl_tf_Y_pred = svcl_tf.predict(X_test)
print("SVC Linear Accuracy: ",np.mean(svcl_tf_Y_pred == np.array(Y_test)))
print(svcl_tf.best_estimator_)
print(confusion_matrix(Y_test,svcl_tf_Y_pred))
print(classification_report(Y_test,svcl_tf_Y_pred))

SVC Linear Accuracy:  0.7948
LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=None, tol=0.0001, verbose=0)
[[1588   49    9]
 [ 101  356   21]
 [ 232  101   43]]
             precision    recall  f1-score   support

  four/five       0.83      0.96      0.89      1646
    one/two       0.70      0.74      0.72       478
      three       0.59      0.11      0.19       376

avg / total       0.77      0.79      0.75      2500



### 2.2.2 Relatively slow models

In [66]:
## KNN
from sklearn.neighbors import KNeighborsClassifier
knn_tf = KNeighborsClassifier(n_neighbors=2)
knn_tf.fit(X_train,Y_train)
knn_tf_Y_pred = knn_tf.predict(X_test)
print("KNN Accuracy: ",np.mean(knn_tf_Y_pred == np.array(Y_test)))
print(confusion_matrix(Y_test,knn_tf_Y_pred))
print(classification_report(Y_test,knn_tf_Y_pred))

KNN Accuracy:  0.2868
[[ 254 1392    0]
 [  14  463    1]
 [  27  349    0]]
             precision    recall  f1-score   support

  four/five       0.86      0.15      0.26      1646
    one/two       0.21      0.97      0.35       478
      three       0.00      0.00      0.00       376

avg / total       0.61      0.29      0.24      2500



In [67]:
## Non-linear SVC
from sklearn.svm import SVC
svcrbf_tf = SVC(kernel='rbf')
svcrbf_tf.fit(X_train,Y_train)
svcrbf_tf_Y_pred = svcrbf_tf.predict(X_test)
print("SVC RBF Accuracy: ",np.mean(svcrbf_tf_Y_pred == np.array(Y_test)))
print(confusion_matrix(Y_test,svcrbf_tf_Y_pred))
print(classification_report(Y_test,svcrbf_tf_Y_pred))

SVC RBF Accuracy:  0.6584
[[1646    0    0]
 [ 478    0    0]
 [ 376    0    0]]
             precision    recall  f1-score   support

  four/five       0.66      1.00      0.79      1646
    one/two       0.00      0.00      0.00       478
      three       0.00      0.00      0.00       376

avg / total       0.43      0.66      0.52      2500



  'precision', 'predicted', average, warn_for)


In [68]:
## Random Forests
from sklearn.ensemble import RandomForestClassifier
rf_tf = RandomForestClassifier()
rf_tf.fit(X_train,Y_train)
rf_tf_Y_pred = rf_tf.predict(X_test)
print("Random Forests Accuracy: ",np.mean(rf_tf_Y_pred == np.array(Y_test)))
print(confusion_matrix(Y_test,rf_tf_Y_pred))
print(classification_report(Y_test,rf_tf_Y_pred))

Random Forests Accuracy:  0.7552
[[1581   46   19]
 [ 178  283   17]
 [ 280   72   24]]
             precision    recall  f1-score   support

  four/five       0.78      0.96      0.86      1646
    one/two       0.71      0.59      0.64       478
      three       0.40      0.06      0.11       376

avg / total       0.71      0.76      0.70      2500



### 2.2.3 Pickle models

In [69]:
##Pickle models
joblib.dump(nb_tf,'C:/Users/Administrator/Documents/Github/mcnulty_yelp/model/model_nb_tf.pkl')
joblib.dump(log_tf,'C:/Users/Administrator/Documents/Github/mcnulty_yelp/model/model_log_tf.pkl')
joblib.dump(svcl_tf,'C:/Users/Administrator/Documents/Github/mcnulty_yelp/model/model_svcl.pkl')
joblib.dump(knn_tf,'C:/Users/Administrator/Documents/Github/mcnulty_yelp/model/model_knn_tf.pkl')
joblib.dump(svcrbf_tf,'C:/Users/Administrator/Documents/Github/mcnulty_yelp/model/model_svcrbf.pkl')
joblib.dump(rf_tf,'C:/Users/Administrator/Documents/Github/mcnulty_yelp/model/model_rf.pkl');

# 3.0 Applying model to tweets

In [102]:
##Load vocab list from previously run vectorizer so we can extract the same words that were used in our model
vocab_list = pickle.load( open( "C:/Users/Administrator/Documents/Github/mcnulty_yelp/data/vocab.pkl", "rb" ) )

In [103]:
#Download data
tweet_df = pd.read_pickle('C:/Users/Administrator/Documents/Github/mcnulty_yelp/data/tweets_clean.pkl')
tweet_df = tweet_df.drop_duplicates('text').reset_index(drop=True)

##Extract the same feature set we have in Yelp from tweets
vectorizer2 = CountVectorizer(ngram_range=(1,3),min_df=3, max_features=3000,tokenizer=tokenizeText,vocabulary=vocab_list)
tweet_sparse_matrix = vectorizer2.transform(tweet_df.text)
tweet_sparse_matrix_tf = TfidfTransformer().fit_transform(tweet_sparse_matrix)
tweet_count_df = pd.DataFrame(tweet_sparse_matrix.A, columns=vocab_list)
tweet_tf_df = pd.DataFrame(tweet_sparse_matrix_tf.A, columns=vocab_list)

##Pickle pickle
tweet_count_df.to_pickle("C:/Users/Administrator/Documents/Github/mcnulty_yelp/data/tweet_count_df.pkl")
tweet_count_df = pd.read_pickle("C:/Users/Administrator/Documents/Github/mcnulty_yelp/data/tweet_count_df.pkl")

tweet_tf_df.to_pickle("C:/Users/Administrator/Documents/Github/mcnulty_yelp/data/tweet_tf_df.pkl")
tweet_tf_df = pd.read_pickle("C:/Users/Administrator/Documents/Github/mcnulty_yelp/data/tweet_tf_df.pkl")

In [105]:
##Combine and add sentiment score
tweet_count_df = pd.concat((tweet_df,tweet_count_df),axis=1)
tweet_count_df['senti'] = tweet_df.text.apply(lambda x: TextBlob(x).sentiment[0] + 1)

tweet_tf_df = pd.concat((tweet_df,tweet_tf_df),axis=1)
tweet_tf_df['senti'] = tweet_df.text.apply(lambda x: TextBlob(x).sentiment[0] + 1)

## 3.1 Apply best model

In [106]:
## Get feature set
X_tweet = tweet_tf_df.iloc[:,6:]

In [108]:
## Best model was Tfidf Linear SVC
## Predicts categories and probabilities
tweet_pred = log_tf.predict(X_tweet)
tweet_pred_prob = log_tf.predict_proba(X_tweet)

In [109]:
## Combine into one dataframe
tweet_df['pred'] = tweet_pred
tweet_prob_df = pd.DataFrame(tweet_pred_prob,columns = ['prob_4/5','prob_1/2','prob_3'])
tweet_df = pd.concat((tweet_df,tweet_prob_df),axis =1)

In [112]:
##Pickle, pickle
tweet_df.to_pickle("C:/Users/Administrator/Documents/Github/mcnulty_yelp/data/tweet_df_pred.pkl")
tweet_df_pred = pd.read_pickle("C:/Users/Administrator/Documents/Github/mcnulty_yelp/data/tweet_df_pred.pkl")

In [113]:
##Save as CSV
tweet_df_pred.to_csv("C:/Users/Administrator/Documents/Github/mcnulty_yelp/datatweet_df_pred.csv")