In [134]:
import numpy as np
import pandas as pd
import sqlite3
#viz
import matplotlib.pyplot as plt
import seaborn as sns
#extract intersection
from collections import Counter

In [135]:
attribute=pd.read_csv("./data/filtered/business_attributes_on.csv")
hour=pd.read_csv("./data/filtered/business_hours_on.csv")
business_hours=pd.read_csv("./data/filtered/check_in_on.csv")
restaurant=pd.read_csv("./data/filtered/res_on.csv")
review=pd.read_csv("./data/filtered/review_res_on.csv")
tip=pd.read_csv("./data/filtered/tip_on.csv")
user=pd.read_csv("./data/filtered/user_res_on.csv")

In [136]:
def clean_df(review):
    # select only year 2015, 2016, 2017
    review.loc[:, ('date')] = pd.to_datetime(review['date'])
    review.loc[:, ('year')] = review['date'].dt.year
    review.loc[:, ('month')] = review['date'].dt.month
    review = review[review['year'] > 2014]
    review = review[['business_id','date','year','month','stars','text','useful','funny','cool']]
    
    # filter restaurants that have at least 20+ reviews in 2015 & 2016 & 2017
    reviews_2015 = review[review['year'] == 2015]
    reviews_2016 = review[review['year'] == 2016]
    reviews_2017 = review[review['year'] == 2017]
    
    over_20_reviews_2015 = reviews_2015['business_id'].value_counts()[reviews_2015['business_id'].value_counts() >= 20].index.tolist()
    over_20_reviews_2016 = reviews_2016['business_id'].value_counts()[reviews_2016['business_id'].value_counts() >= 20].index.tolist()
    over_20_reviews_2017 = reviews_2017['business_id'].value_counts()[reviews_2017['business_id'].value_counts() >= 20].index.tolist()

    in_2015 = set(over_20_reviews_2015)
    in_2016 = set(over_20_reviews_2016)
    in_2017 = set(over_20_reviews_2017)
    
    intersection = list(in_2015 & in_2016 & in_2017)

    review = review[review['business_id'].isin(intersection)]
    
    return review

In [137]:
def export_df(dataframe):
    dataframe = dataframe.drop(columns={'month','date'})
    names = ['2015_text','2015_rate','2016_text','2016_rate','2017_text','2017_rate']
    years = [2015, 2016, 2017]
    new_df = pd.DataFrame(columns=names)
    
    # extract business ids in the dataframe
    # (note: should be the same as 'in_all_3yrs')
    bus_id_list = dataframe['business_id'].unique().tolist()
    
    for res in bus_id_list:
        df_res = dataframe[dataframe.business_id == res]
        df_des = df_res.groupby('year')['stars'].mean().reset_index(drop=True)
        res_info = []

        for yr in years:
            df_res_temp = df_res[df_res['year']==yr]
            df_res_temp = df_res_temp.drop(columns={'business_id','year'})
            res_yr_rate = df_res[df_res['year']==yr]['stars'].mean()
            res_info.append(df_res_temp)
            res_info.append(res_yr_rate)

        new_df.loc[res] = res_info
        
    new_df.index.names = ['business_id']
    new_df = new_df.reset_index(drop=True)
        
    return new_df

In [138]:
review1 = clean_df(review)
review2 = export_df(review1)

In [139]:
len(review2)

648

In [140]:
review2['2015_text'][0].head()

Unnamed: 0,stars,text,useful,funny,cool
11587,3,"Very very cheap place, really. You got sausage...",0,0,0
22879,3,Very cheap banh mi. I didn't think it was that...,1,0,0
35709,5,My go to for take out comfort food. It can get...,0,0,0
76459,5,SO MUCH LOVE FOR THIS PLACE!!! It's only $2.75...,0,0,2
92386,4,They have the best sandwiches for the price. Y...,0,0,0


In [141]:
review2.isna().any()

business_id    False
2015_text      False
2015_rate      False
2016_text      False
2016_rate      False
2017_text      False
2017_rate      False
dtype: bool

## tfidf

In [142]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA, KernelPCA
from bs4 import BeautifulSoup
import re
import nltk
import string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from gensim.models import Word2Vec, FastText

In [143]:
# remove numbers
def remove_num(text):
    text_nonum = re.sub(r'\d+', '', text)
    text_nopunct = "".join([char.lower() for char in str(text_nonum) if char not in string.punctuation])
    # substitute multiple whitespace with single whitespace
    # Also, removes leading and trailing whitespaces
    text_no_doublespace = re.sub('\s+', ' ', text_nopunct).strip()
    return text_nonum

# remove_special_characters
def remove_special_characters(text):
    # define the pattern to keep
    pat = r'[^a-zA-z0-9.,!?/:;\"\'\s]' 
    return re.sub(pat, '', text)

# remove punctuation
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if str(i) not in string.punctuation])
    return punctuationfree

#  tokenization
def tokenization(text):
    tokens = re.split(' +',text)
    return tokens

# remove stopwords
nltk.download('stopwords')
my_stopwords = stopwords.words('english')
my_stopwords.remove('be') # BE -> Back Ends
def rm_stopwords(text):
  return [i for i in text if i not in my_stopwords]

# lemmatization
nltk.download('wordnet')
nltk.download('omw-1.4')
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatization(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text

# remove_extra_whitespace_tabs
def remove_extra_whitespace_tabs(text):
    #pattern = r'^\s+$|\s+$'
    pattern = r'^\s*|\s\s*'
    return re.sub(pattern, ' ', text).strip()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/qinwenw/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/qinwenw/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/qinwenw/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [178]:
def tfidf_pca(df, n): # n: number of component for PCA
    df.loc[:,('text_clean')] = df['text'].apply(
        lambda x: lemmatization(rm_stopwords(tokenization(remove_punctuation(remove_extra_whitespace_tabs(remove_num(remove_special_characters(x.lower())))))))
    )
    df.loc[:,('text_clean_str')] = df['text_clean'].apply(lambda x: ' '.join(x))
    df = df.sort_values(by='business_id').reset_index(drop=True)
    
    # TFIDF
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(df['text_clean_str'])
    feature_names = vectorizer.get_feature_names()
    dense = vectors.todense()
    denselist = dense.tolist()
    df_tfidf = pd.DataFrame(denselist, columns=feature_names)
    
    # PCA
    pca = PCA(n_components=n)
    text_pca = pca.fit_transform(df_tfidf)
    temp = pd.DataFrame(text_pca)
    
    temp.columns = ['text_pca'+str(i) for i in range(1,n+1)]
    df =  pd.concat([df, temp], axis=1)
    
    return df

In [182]:
review1_2015 = review1[review1['year']==2015].sort_values(by='business_id')

In [183]:
review1_2015_vec = tfidf_pca(review1_2015,5)

In [184]:
review1_2015_vec.head()

Unnamed: 0,business_id,date,year,month,stars,text,useful,funny,cool,text_clean,text_clean_str,text_pca1,text_pca2,text_pca3,text_pca4,text_pca5
0,-0NrB58jqKqJfuUCDupcsw,2015-04-21,2015,4,5,"There is hype I know, but it really IS THAT go...",1,0,0,"[hype, know, really, good, several, time, star...",hype know really good several time starting ov...,-0.008541,-0.013594,0.009846,0.005136,0.026575
1,-0NrB58jqKqJfuUCDupcsw,2015-08-18,2015,8,5,Very delicious food. Too much food for me haha...,0,0,0,"[delicious, food, much, food, hahaha, service,...",delicious food much food hahaha service good o...,0.057771,0.012714,0.020116,-0.024486,0.111824
2,-0NrB58jqKqJfuUCDupcsw,2015-07-01,2015,7,3,Best friend and I decided to hang out after ou...,2,0,0,"[best, friend, decided, hang, movie, found, li...",best friend decided hang movie found little sp...,0.029395,0.011791,-0.024404,0.032245,-0.040576
3,-0NrB58jqKqJfuUCDupcsw,2015-11-25,2015,11,4,Korean fried chicken can't go wrong. their bat...,0,0,0,"[korean, fried, chicken, cant, go, wrong, batt...",korean fried chicken cant go wrong batter thic...,-0.053239,0.134091,-0.065809,-0.111815,0.032089
4,-0NrB58jqKqJfuUCDupcsw,2015-10-11,2015,10,4,Came here on a Friday night and it was pretty ...,0,0,0,"[came, friday, night, pretty, packed, luckily,...",came friday night pretty packed luckily got em...,-0.123059,-0.0635,-0.084363,-0.024051,0.036432


In [186]:
review1_2016_jan = review1[(review1['year']==2016) & review1['month']==1].sort_values(by='business_id')

In [195]:
review1_2016_jan_vec = tfidf_pca(review1_2016_jan,5)

In [196]:
review1_2016_jan_vec.head()

Unnamed: 0,business_id,date,year,month,stars,text,useful,funny,cool,text_clean,text_clean_str,text_pca1,text_pca2,text_pca3,text_pca4,text_pca5
0,-0NrB58jqKqJfuUCDupcsw,2016-09-25,2016,9,2,The service was slow and not attentive. The wa...,1,0,0,"[service, slow, attentive, waitress, didnt, of...",service slow attentive waitress didnt offer br...,0.047904,-0.020134,-0.086337,-0.038422,-0.119714
1,-0NrB58jqKqJfuUCDupcsw,2016-07-06,2016,7,3,A good place to fulfill a fried chicken fix. I...,2,0,1,"[good, place, fulfill, fried, chicken, fix, iv...",good place fulfill fried chicken fix ive korea...,0.142944,-0.049254,-0.078695,-0.137127,-0.198242
2,-0NrB58jqKqJfuUCDupcsw,2016-03-27,2016,3,5,Great service especially if you bring a Korean...,1,0,1,"[great, service, especially, bring, koreanspea...",great service especially bring koreanspeaking ...,0.102288,-0.127845,-0.01631,-0.159451,-0.232646
3,-0NrB58jqKqJfuUCDupcsw,2016-11-23,2016,11,4,"Delicious food, okay service.\n\nI tried Korea...",1,0,0,"[delicious, food, okay, service, tried, korean...",delicious food okay service tried korean fried...,0.099417,0.014488,-0.084756,-0.025352,-0.116971
4,-0NrB58jqKqJfuUCDupcsw,2016-01-17,2016,1,4,The fried chicken is pretty damn good. Not muc...,0,0,0,"[fried, chicken, pretty, damn, good, much, way...",fried chicken pretty damn good much way vegeta...,0.126597,0.009243,-0.051635,-0.065157,-0.089583


In [197]:
len(review1_2016_jan_vec)

16567

In [198]:
len(review1_2015_vec)

30904

## Train & Test

Train on 2015's dataset and test on 2016.

### rf

In [203]:
X_train = review1_2015_vec.iloc[:,-5:]
y_train = review1_2015_vec['stars']
X_test = review1_2016_jan_vec.iloc[:,-5:]
y_test = review1_2016_jan_vec['stars']

In [204]:
y_train

0        5
1        5
2        3
3        4
4        4
        ..
30899    3
30900    3
30901    4
30902    4
30903    2
Name: stars, Length: 30904, dtype: int64

In [214]:
import sklearn
from sklearn.model_selection import train_test_split # for splitting the data into train and test samples
from sklearn.metrics import classification_report # for model evaluation metrics
from sklearn.ensemble import RandomForestClassifier # for random forest models

from sklearn.model_selection import RandomizedSearchCV # parameter-tune; Random Hyperparameter Grid
from sklearn.model_selection import GridSearchCV # parameter-tune; Grid Search with Cross Validation

from pprint import pprint # prettyprint

In [217]:
def eval_rf_performance(model, train_features, train_y, test_features, test_y, cv_method=""):
    print('***************** Random Forest Summay *****************')
    
    if cv_method == 'random':
        print(model.best_params_)
        importances = list(model.best_estimator_.feature_importances_)
    if cv_method == 'cv':
        print(model.best_params_)
        importances = list(model.best_estimator_.feature_importances_)
    if cv_method == '': # baseline model
        pprint(model.get_params())
        importances = list(model.feature_importances_)

    # List of tuples with variable and importance
    feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(X_train, importances)]
    # Sort the feature importances by most important first
    feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
    # Print out the feature and importances 
    [print('Variable: {:10} Importance: {}'.format(*pair)) for pair in feature_importances[1:20]];
    print("")

    print('--------------------------------------------------------')
    print("")

    # get predictions for train & test
    pred_labels_tr = model.predict(X_train)
    pred_labels_te = model.predict(X_test)

    print('*************** Evaluation on Test Data ***************')
    print(classification_report(y_test, pred_labels_te))
    print('--------------------------------------------------------')
    print("")

    print('*************** Evaluation on Train Data ***************')
    print(classification_report(y_train, pred_labels_tr))
    print('--------------------------------------------------------')
    print("")

    return pred_labels_tr, pred_labels_te

In [218]:
def rf(X_train, y_train, X_test, y_test, cv_method=""): # cv_method = ['random', 'cv']; default is none

    # gird for parameters tuning
    n_estimators = [int(x) for x in np.arange(20, 80, 20)] # # tree in random forest
    max_features = ['auto', 'sqrt'] # # feature to consider at every split
    max_depth = [int(x) for x in np.arange(20, 50, 10)] # max level in tree
    max_depth.append(None)
    min_samples_split = [2, 5, 10] # min #samples required to split a node
    min_samples_leaf = [1, 2, 4] # min #samples required at each leaf node
    bootstrap = [True, False] # Method of selecting samples for training each tree
    grid = {'n_estimators': n_estimators,
            'max_features': max_features,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'bootstrap': bootstrap}

    if cv_method == 'random':
        rf = RandomForestClassifier()

        ## Random Hyperparameter Grid
        # using 3 fold cross validation, search across 100 different combinations, and use all available cores
        rf_random = RandomizedSearchCV(estimator = rf, param_distributions = grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

        rf_random.fit(X_train, y_train)

        print('========================================================')
        print('============== Random Hyperparameter Grid ==============')
        print('========================================================')
        print("")
        pred_labels_tr, pred_labels_te = eval_rf_performance(rf_random, X_train, y_train, X_test, y_test, cv_method = cv_method)

        return rf_random, pred_labels_tr, pred_labels_te
 
    if cv_method == 'cv':
        rf = RandomForestClassifier()

        ## Grid Search with Cross Validation
        rf_grid_search = GridSearchCV(estimator = rf, param_grid = grid, cv = 3, n_jobs = -1, verbose = 2)
        rf_grid_search.fit(X_train, y_train)
      
        print('========================================================')
        print('========== Grid Search with Cross Validation ==========')
        print('========================================================')
        print("")
        pred_labels_tr, pred_labels_te = eval_rf_performance(rf_grid_search, X_train, y_train, X_test, y_test, cv_method = cv_method)

        return rf_grid_search, pred_labels_tr, pred_labels_te

    # baseline model

    rf_base = RandomForestClassifier(n_estimators = 10, random_state = 42)
    rf_base.fit(X_train, y_train)

    print('========================================================')
    print('============= Random Forest without Tuning =============')
    print('========================================================')
    print("")
    pred_labels_tr, pred_labels_te = eval_rf_performance(rf_base, X_train, y_train, X_test, y_test, cv_method = cv_method)

    return rf_base, pred_labels_tr, pred_labels_te

In [212]:
rf_base, pred_labels_tr_base, pred_labels_te_base = rf(X_train, y_train, X_test, y_test)


***************** Random Forest Summay *****************
{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}
Variable: text_pca2  Importance: 0.2
Variable: text_pca5  Importance: 0.2
Variable: text_pca3  Importance: 0.19
Variable: text_pca4  Importance: 0.19

--------------------------------------------------------

*************** Evaluation on Test Data ***************
              precision    recall  f1-score   support

           1       0.03      0.01      0.02      1274
           2       0.04      0.03      0.04      1501
           3       0.13      0.13      0.13      2851
           4       0.33 

In [219]:
rf_cv, pred_labels_tr_cv, pred_labels_te_cv = rf(X_train, y_train, X_test, y_test, cv_method="cv")

Fitting 3 folds for each of 432 candidates, totalling 1296 fits
[CV] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=20; total time=   1.0s
[CV] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=60; total time=   3.0s
[CV] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=20; total time=   1.0s
[CV] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=40; total time=   2.0s
[CV] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=20; total time=   0.9s
[CV] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=20; total time=   0.9s
[CV] END bootstrap=True, max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=40; total t

In [220]:
rf_rand, pred_labels_tr_rand, pred_labels_te_rand = rf(X_train, y_train, X_test, y_test, cv_method="random")

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END bootstrap=False, max_depth=40, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=20; total time=   1.5s
[CV] END bootstrap=False, max_depth=40, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=40; total time=   2.9s
[CV] END bootstrap=False, max_depth=40, max_features=sqrt, min_samples_leaf=4, min_samples_split=10, n_estimators=60; total time=   4.4s
[CV] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=40; total time=   3.4s
[CV] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=20; total time=   1.7s
[CV] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=20; total time=   1.7s
[CV] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimato

In [221]:
pred_labels_te_cv

array([5, 5, 5, ..., 5, 4, 4])

In [224]:
review1_2016_jan['pred_rf_cv'] = pred_labels_te_cv

In [230]:
stars_mean = review1_2016_jan.groupby('business_id')['stars'].mean()

In [231]:
pred_mean = review1_2016_jan.groupby('business_id')['pred_rf_cv'].mean()

In [237]:
diff = (stars_mean-pred_mean).tolist()

In [239]:
squared_diff = [i ** 2 for i in diff]

In [242]:
# MSE
sum(squared_diff) / len(squared_diff)

0.865101856192075