In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import sklearn.metrics as metrics
from sklearn.metrics import r2_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold
#import pydotplus
#import io

import seaborn.apionly as sns
sns.set_style("whitegrid", {'grid.color': '0.9','grid.linestyle': u'--'})
sns.set_context("poster")

%matplotlib inline
import json

from datetime import datetime 

import warnings
warnings.filterwarnings("ignore")

In [2]:
def chunkload(data_json, n_size):
    url_json = 'C:/Users/liche/Desktop/Document/Harvard Extension School/Project/dataset/%s.json' %(data_json)
    urlw_json = 'C:/Users/Public/Documents/Harvard/Project/dataset/%s.json' %(data_json)
    
    df_data = pd.DataFrame()
    
    for chunk in pd.read_json(url_json, lines = True, chunksize = n_size):
        df_data = df_data.append(chunk)
        
    return(df_data)

### Load business, review, user dataset

In [3]:
### load business.json
df_business = chunkload( 'business', 5000 )
print('business.json loaded up')

### load checkin.json
#df_checkin = chunkload( 'checkin', 5000 )
#print('checkin.json loaded up')

### load review.json
df_review = chunkload( 'review', 500000)
print('review.json loaded up')

### load tip.json
#df_tip = chunkload( 'tip', 500000)
#print('tip.json loaded up')

### load user.json
df_user = chunkload( 'user', 500000)
print('user.json loaded up')

business.json loaded up
review.json loaded up
user.json loaded up


### Merge three dataset together and only select restaurant

In [None]:
### Merge business data with review data

# Timer
print('time: {}'.format(datetime.now().strftime("%X")))
bus_col_list = ['business_id','review_count', 'stars','categories','attributes','city', 'state']

### Select restaurant
df_restaurant = df_business[bus_col_list][df_business['categories'].apply(lambda x: 'Restaurants' in x)]

review_list = ['review_id','user_id','stars','date','business_id']
df_bus_rev =pd.merge(df_review[review_list],df_restaurant, on =['business_id'])

df_bus_rev.columns = ['review_id', 'user_id', 'review_stars', 'date', 'business_id',
       'review_count', 'business_stars', 'categories', 'attributes','city', 'state']

#Timer
print('time: {}'.format(datetime.now().strftime("%X")))

df_bus_rev.head();

In [None]:
### Merger user data with review data
print('time: {}'.format(datetime.now().strftime("%X")))

user_col_list = ['user_id','average_stars','review_count','useful','yelping_since']
df_bus_rev_user =pd.merge(df_bus_rev, df_user[user_col_list], on =['user_id'] )
df_bus_rev_user.columns = ['review_id', 'user_id', 'review_stars', 'date', 'business_id',
       'business_review_count', 'business_stars', 'categories', 'attributes','city', 'state', 
       'user_average_stars', 'user_review_count', 'useful', 'yelping_since']

print('time: {}'.format(datetime.now().strftime("%X")))

## Drop nan
df_bus_rev_user = df_bus_rev_user.dropna(axis=1, how='any')

print('time: {}'.format(datetime.now().strftime("%X")))

In [6]:
#### Delete unused variable to free up memory
del df_user
del df_review

In [7]:
#### Add year column
df_bus_rev_user['Year'] =df_bus_rev_user['date'].dt.year

##### Add the years of yelping for user 
df_bus_rev_user['yelping_years'] = (2018 - pd.to_datetime(df_bus_rev_user['yelping_since']).dt.year)

#### Predictors are the difference of using rating and average and the difference of business rating and average
df_bus_rev_user['user_review_bias'] =  df_bus_rev_user['user_average_stars'] - df_bus_rev_user['user_average_stars'].mean()
df_bus_rev_user['business_review_bias'] =  df_bus_rev_user['business_stars'] - df_bus_rev_user['business_stars'].mean()

In [None]:
df_bus_rev_user.head();

### Split Train and Test

In [8]:
#function to split into training and test sets
def split_train_test(df):
    np.random.seed(9001)
    
    msk = np.random.rand(len(df)) < 0.5
    data_train = df[msk]
    data_test = df[~msk]
    return data_train, data_test

df_bus_rev_user_train ,df_bus_rev_user_test = split_train_test(df_bus_rev_user)

### Use data in Pittsburg to predict user reviews through both baseline model and KNN model.

## Baseline Model 

In [11]:
def baseline_city(train, test, city):
    train = train[train['city'] == city]
    test = test[test['city'] == city]
    
    #### Intercept of Baseline Model is the average of Business Rating
    intercept_train = train['business_stars'].mean() 
    intercept_test = test['business_stars'].mean()
    
    #### Baseline Model
    y_train_predict = intercept_train + train['user_review_bias'] + train['business_review_bias']

    y_test_predict = intercept_train + (test['user_average_stars'] - train['user_average_stars'].mean()) + (test['business_stars'] - intercept_train)

    #### Response variable are the review stars
    y_train = train['review_stars']
    y_test = test['review_stars']

    ##### Baseline Model

    score_train = r2_score(y_true=y_train.values.ravel(), y_pred=y_train_predict)

    score_test = r2_score(y_true=y_test.values.ravel(), y_pred=y_test_predict)
    
    print('Baseline Model Train Accuracy Score %0.6s\nBaseline Model Test Accuracy Score %0.6s' %(score_train, score_test))
    
    #### Show the prediction and real test rating of baseline model
    baseline_model = pd.DataFrame()

    baseline_model['prediction'] =  y_test_predict
    baseline_model['test_star'] =  y_test.values
   
    return(baseline_model)

baseline_city(train = df_bus_rev_user_train, test = df_bus_rev_user_test, city = 'Pittsburgh').head(10)

Baseline Model Train Accuracy Score 0.3423
Baseline Model Test Accuracy Score 0.3428


Unnamed: 0,prediction,test_star
1473,3.000774,2
2450,2.860774,5
2820,3.870774,4
3088,4.200774,4
3199,3.390774,5
3440,2.990774,3
3464,3.490774,3
3518,3.490774,4
3520,4.490774,3
3632,3.990774,4


## KNN Model

### Fucntion get_similiarity_coeff is to caculate the KNN distance

In [12]:
from scipy.stats.stats import pearsonr
### Calculate distance only if the number of common users are more than 2

def get_similiarity_coeff(x,y,common_users, reg):
    if common_users == 0:
        shrunk =0 
    if (common_users == 1):
        shrunk = np.nan 
    else:
        pcoeff = pearsonr(x, y)[0]
        shrunk = common_users*pearsonr(x, y)[0]/(common_users + reg)
    
    distance = (1-shrunk)/2
    return distance

### Function restaurant_city_percentile is to get top 10% restaurant in Pittsburg

In [23]:
def restaurant_city_percentile(train_test, city, percentile):
    city_data = train_test[train_test['city'] == city]
    #print('The number of row in the subset of Pittsburg: %i' %(len(city_data)))
    #print('The number of restaurant in the subset of Pittsburg: %i' %(len(city_data['business_id'].unique())))
    city_sort = city_data.groupby('business_id', as_index = False)['business_review_count'].mean().sort_values(
                                                                                'business_review_count', ascending = False)
    #Get top 10 percentile restaurant based on the amount of revews
    city_top = city_sort[city_sort['business_review_count'] >= city_sort[
                                                               'business_review_count'].quantile(percentile)]['business_id']
    
    city_top_list = city_sort[city_sort['business_review_count'] >= city_sort[
                                                               'business_review_count'].quantile(percentile)]
    
    city_top_data = pd.merge(city_top_list, city_data, on = ['business_id'])
    return(city_top_data,  city_top)

x,y= restaurant_city_percentile(train_test = df_bus_rev_user_train, city = 'Pittsburgh', percentile = 0.9)

print('The number of row in the top 10 percentage restaurant: %i' %(len(x)))
print('The number of top 10 percentage restaurant of Pittsburg: %i' %(len(y)))

The number of row in the subset of Pittsburg: 57516
The number of restaurant in the subset of Pittsburg: 2064
The number of row in the top 10 percentage restaurant: 27554
The number of top 10 percentage restaurant of Pittsburg: 207


### Function get_common_support is to get distance based on the number of common factor, pearson coefficient, and regulatization

In [26]:
def get_common_support(bus_id1, bus_id2, df_reviewlist, reg):
    common_users = -1
    shrunk_coeff = -1
    collist = ['business_id','user_id','user_average_stars','review_stars']
    df_users_bus1 = df_reviewlist[collist].loc[df_reviewlist['business_id'] == bus_id1]
    df_users_bus2 = df_reviewlist[collist].loc[df_reviewlist['business_id'] == bus_id2]  

    df_users_bus1.columns = ['business_id','user_id','bus1_user_average_stars','review1_stars'] 
    df_users_bus2.columns = ['business_id','user_id','bus2_user_average_stars','review2_stars']

    # Take out restaurant itself
    if bus_id1 != bus_id2:
        df_commonusers =  pd.merge(df_users_bus1,   df_users_bus2, on =['user_id'] )
        common_users = df_commonusers['user_id'].size
        
     #To spead up only consider common use number larger than three
        if common_users >= 3:
            df_commonusers['bus1_userrating_bias']= df_commonusers['review1_stars']-  df_commonusers['bus1_user_average_stars']
            df_commonusers['bus_2_userrating_bias']=  df_commonusers['review2_stars']-  df_commonusers['bus2_user_average_stars']
            x =  df_commonusers['bus1_userrating_bias'].values
            y=   df_commonusers['bus_2_userrating_bias'].values
            distance_coeff = get_similiarity_coeff(x,y, common_users, reg)
            
        else: 
            distance_coeff = 1
    if bus_id1 == bus_id2: 
        distance_coeff = 1
 
    return distance_coeff

#print('time: {}'.format(datetime.now().strftime("%X")))
x, y = restaurant_city_percentile(train_test = df_bus_rev_user_train, city = 'Pittsburgh', percentile = 0.9)

get_common_support('1M6tA3TqxcpptHW0_hP9Kw', 'oS96aJIHFWcFAlGHKKXjaw', x, 10)
#print('time: {}'.format(datetime.now().strftime("%X")))

0.66755852563095064

### Function knearest is to get nearest neigbour based on k (number of neribour), threshold (distance threshold), reg

In [27]:
def knearest(restaurant_id, set_of_restaurants, df_reviewlist, k, threshold, reg):
    sim_dict = []
    #### Take out the restaurant itself 
    for restaurant in set(set_of_restaurants) - set(restaurant_id): 
        coeff =get_common_support(restaurant_id, restaurant, df_reviewlist, reg)
        
    ### Set up a threshold to speed up
        if coeff < threshold :
            sim_dict.append({'restaurant_id':restaurant_id, 'neighbours':restaurant, 'distance': coeff})
    
    #### If no coeff less than threshold
    if len(sim_dict) >= 1:
        sim_pd = pd.DataFrame(sim_dict)
        sim_pd = sim_pd.groupby('restaurant_id').apply(lambda x: x.sort_values('distance'
                                                                               , ascending = True)).reset_index(drop=True)
    else:
        sim_dict = [{'restaurant_id':restaurant_id, 'neighbours':np.nan, 'distance': coeff}]
        sim_pd = pd.DataFrame(sim_dict)
        
    neighbour_k = sim_pd[:k]
    return(neighbour_k)

#print('time: {}'.format(datetime.now().strftime("%X")))

data_by_city, bus_by_city = restaurant_city_percentile(train_test = df_bus_rev_user_train
                                                       , city = 'Pittsburgh', percentile = 0.9)
bus_id1 = '1M6tA3TqxcpptHW0_hP9Kw'
temp = knearest(bus_id1, bus_by_city, data_by_city, k= 6, threshold = 0.5,  reg = 4)

#print('time: {}'.format(datetime.now().strftime("%X")))
temp

Unnamed: 0,distance,neighbours,restaurant_id
0,0.245174,kwMJ4KfhEcrk9jiMe-S6wQ,1M6tA3TqxcpptHW0_hP9Kw
1,0.269906,ejaUQ1hYo7Q7xCL1HdPINw,1M6tA3TqxcpptHW0_hP9Kw
2,0.276999,u4sTiCzVeIHZY8OlaL346Q,1M6tA3TqxcpptHW0_hP9Kw
3,0.282303,KTPRYqiFdLowAUEAnN7e3g,1M6tA3TqxcpptHW0_hP9Kw
4,0.285714,RvwZqjdkZ_pER0moPXLZAQ,1M6tA3TqxcpptHW0_hP9Kw
5,0.285714,4c19YWOjPmbFUK4-V2GEvg,1M6tA3TqxcpptHW0_hP9Kw


### Fucntion get_neighbours is to get the neighbour dataframe

In [28]:
#### Pittsburg has ~2000 restaurants
#### n is the restruant_id
#### m is the size of set_of_restaurant we want to use out of Pittsburg ~2000 restaurant

def get_neighbours(n, m, train_test, city, percentile, k, threshold, reg):
    neighbours = pd.DataFrame()
    data_city, bus_city_data = restaurant_city_percentile(train_test = train_test
                                                          ,city = city, percentile = percentile)  
    
    ### n, m is None to run through all top 10 percentile restaurant
    if n is None and m is None:
        ### time print
        i = 1 
        for restaurants_id in bus_by_city:
            ### time print at first i =1 and divisiable by 10
           #if i == 1 or i % 20 ==0:
           #     print('iteration {}: {}'.format(i, datetime.now().strftime("%X")))
                
            set_of_restaurants = bus_city_data

            neighbours = pd.concat([neighbours, knearest(restaurants_id 
                                                         ,set_of_restaurants, data_city, k= k, threshold = threshold,  reg = reg)])
            
            neighbours = neighbours.reset_index(drop = True)
            i = i +1
    ### Else can specify the number of restaurants to run through
    else:
        ### time print
        i = 1
        for restaurants_id in bus_by_city[:n]:
            ### time print at first i =1 and divisiable by 10
            #if i == 1 or i % 20 ==0:
            #    print('iteration {}: {}'.format(i, datetime.now().strftime("%X")))
                
            set_of_restaurants = bus_city_data[:m]
            
            ### dataframe of neighbours
            neighbours = pd.concat([neighbours, knearest(restaurants_id 
                                                         ,set_of_restaurants, data_city, k= k, threshold = threshold,  reg = reg)])
            neighbours = neighbours.reset_index(drop = True)
            i = i +1
            
    return(neighbours)

neighbours_test = get_neighbours(n = 50, m = 50, train_test = df_bus_rev_user_test
                                 , city = 'Pittsburgh', percentile = 0.9
                                 , k =3, threshold = 0.5, reg = 4)

neighbours_train = get_neighbours(n = 50, m = 50, train_test = df_bus_rev_user_train
                                 , city = 'Pittsburgh', percentile = 0.9
                                 , k =3, threshold = 0.5, reg = 4)

neighbours_train.head(10)

Unnamed: 0,distance,neighbours,restaurant_id
0,0.192579,oS96aJIHFWcFAlGHKKXjaw,JLbgvGM4FXh9zNP4O5ZWjQ
1,0.293648,FgTzITgrmvrZqoHvkTSDzA,JLbgvGM4FXh9zNP4O5ZWjQ
2,0.3023,K-SsrPH0nFExdpLrTo1X1w,JLbgvGM4FXh9zNP4O5ZWjQ
3,0.28802,i39--wZD6L9hm9Lg90Uziw,u4sTiCzVeIHZY8OlaL346Q
4,0.305355,CK-Gv3vqIlWOrKP4fhT8_g,u4sTiCzVeIHZY8OlaL346Q
5,0.323429,KTPRYqiFdLowAUEAnN7e3g,u4sTiCzVeIHZY8OlaL346Q
6,0.258463,CK-Gv3vqIlWOrKP4fhT8_g,lKom12WnYEjH5FFemK3M1Q
7,0.259572,dLc1d1zwd1Teu2QED5TmlA,lKom12WnYEjH5FFemK3M1Q
8,0.303344,xcmmTXhuMx2fZF2Bt69F4w,lKom12WnYEjH5FFemK3M1Q
9,0.249109,e2ng0CQ69anIawqIKzhtlg,ejaUQ1hYo7Q7xCL1HdPINw


### Function merger_neighbours is to merge top 10 percentile restaurant with the rest of restaurants

In [29]:
def merger_neighbours(n, m, train_test, city, percentile, k, threshold, reg):
       
    data = train_test[['business_id', 'business_stars']][train_test['city'] == city]
    neighbours = get_neighbours(n = n, m = m, train_test = train_test
                                 , city = city, percentile = percentile
                                 , k = k, threshold = threshold, reg = reg)
    
    ### Timer 
    #print('merge neighbours start: {}'.format(datetime.now().strftime("%X")))
        
    data_neighbour = pd.merge(neighbours,  data, left_on = 'neighbours', right_on ='business_id', how = 'left' )

    neighbour_average = data_neighbour.groupby('restaurant_id', as_index = False)['distance', 'business_stars'].mean()

    neighbour_average = neighbour_average.rename(columns = {'restaurant_id': 'restaurant_id'
                                                        , 'distance': 'distance'
                                                        , 'business_stars': 'neighbour_stars'})
    data_train = train_test[['business_id', 'user_id', 'user_review_bias', 
                                     'business_review_bias', 'business_stars']][train_test['city'] =='Pittsburgh']

    data_neighbour_average = pd.merge(data_train, neighbour_average, left_on = 'business_id', 
                                 right_on = 'restaurant_id', how = 'left')

    data_neighbour_average['business_stars_average'] = data_neighbour_average['business_stars'].mean()
    
    ### Use local neighbour average otherwise use global average if there is no neighbour
    data_neighbour_average['business_average_neighbour'] =data_neighbour_average['neighbour_stars'].fillna(
                                                    value= data_neighbour_average['business_stars_average'])

    
    return(data_neighbour_average)
merger_neighbours_train = merger_neighbours(n = 50, m = 50, train_test = df_bus_rev_user_test
                                 , city = 'Pittsburgh', percentile = 0.9
                                 , k =3, threshold = 0.5, reg = 4)
merger_neighbours_test = merger_neighbours(n = 50, m = 50, train_test = df_bus_rev_user_test
                                 , city = 'Pittsburgh', percentile = 0.9
                                 , k =3, threshold = 0.5, reg = 4)
merger_neighbours_test.head(15)


Unnamed: 0,business_id,user_id,user_review_bias,business_review_bias,business_stars,restaurant_id,distance,neighbour_stars,business_stars_average,business_average_neighbour
0,u7CxxEzx8hvjoJ8onN4zTg,faaOI6hU64h6SSaF0f11eg,-0.476715,-0.205125,3.5,u7CxxEzx8hvjoJ8onN4zTg,0.275478,4.308271,3.72116,4.308271
1,mt9mrG8wALTzD3YYGim3mQ,yt3Z_CVnx6-0vwyd-46LSA,-0.616715,-0.205125,3.5,,,,3.72116,3.72116
2,cbddL2q8uRA38RwycB0FJg,iIZhrDYOmcyGdiWSWAldmw,0.393285,-0.205125,3.5,,,,3.72116,3.72116
3,PdDpIGwBZoTYzOVasT-WuA,5z587IBRnjCbo51IaHNPzQ,0.223285,0.294875,4.0,,,,3.72116,3.72116
4,P1a2WApEoMb65s14RmqV2g,XeOCjJwfLlKpkSd_oYxyGQ,0.913285,-1.205125,2.5,,,,3.72116,3.72116
5,MYyJ9orpiRq0prjVq4ku7w,Xxvz5g67eaCr3emnkY5M6w,0.013285,-0.705125,3.0,,,,3.72116,3.72116
6,_IAxXD30S4ODGh92m8tLJw,Xxvz5g67eaCr3emnkY5M6w,0.013285,-0.205125,3.5,,,,3.72116,3.72116
7,VkeVbH5zcm0MBKuKtytGKw,Xxvz5g67eaCr3emnkY5M6w,0.013285,-0.205125,3.5,,,,3.72116,3.72116
8,OFdMlXxtkU_nUh8jJ4W6GA,Xxvz5g67eaCr3emnkY5M6w,0.013285,0.794875,4.5,,,,3.72116,3.72116
9,SvCjBtbN1cKElDKPTw9dOA,Xxvz5g67eaCr3emnkY5M6w,0.013285,0.294875,4.0,SvCjBtbN1cKElDKPTw9dOA,0.269231,3.846377,3.72116,3.846377


In [None]:
#### Predictors are the difference of using rating and average and the difference of business rating and average
df_merger_neighbours_train = merger_neighbours(neighbours_train, 'Pittsburgh', df_bus_rev_user_train)
df_merger_neighbours_test = merger_neighbours(neighbours_test, 'Pittsburgh', df_bus_rev_user_test)

### KNN Model

In [78]:
def knn_prediction(n, m, train, test, city, percentile, k, threshold, reg):
    
    df_merger_neighbours_train = merger_neighbours(n = n, m = m, train_test = train
                                 , city = city, percentile = percentile
                                 , k = k, threshold = threshold, reg = reg)
    
    df_merger_neighbours_test = merger_neighbours(n = n, m = m, train_test = test
                                 , city = city, percentile = percentile
                                 , k = k, threshold = threshold, reg = reg)

    ### Timer
    #print('knn prediction start: {}'.format(datetime.now().strftime("%X")))
    
    df_merger_neighbours_train['business_neighbour_bias'] =  df_merger_neighbours_train['business_stars'] - df_merger_neighbours_train['business_average_neighbour']

    df_merger_neighbours_test['business_neighbour_bias'] =  df_merger_neighbours_test['business_stars'] - df_merger_neighbours_test['business_average_neighbour']

    intercept_train = df_merger_neighbours_train['business_average_neighbour']
    intercept_test =  df_merger_neighbours_test['business_average_neighbour']

    y_train_city_predict = intercept_train + df_merger_neighbours_train['user_review_bias'] + df_merger_neighbours_train['business_neighbour_bias']

    y_test_city_predict = intercept_test + df_merger_neighbours_test['user_review_bias'] + df_merger_neighbours_test['business_neighbour_bias']

    #### Response variable are the review stars
    y_train_city = train[train['city'] == 'Pittsburgh']['review_stars']
    y_test_city = test[test['city'] == 'Pittsburgh']['review_stars']

    ##### Baseline Model

    score_train = r2_score(y_true=y_train_city.values.ravel(), y_pred=y_train_city_predict)

    score_test = r2_score(y_true=y_test_city.values.ravel(), y_pred=y_test_city_predict)

    #rint('KNN Train Accuracy Score %0.6s\nKNN Test Accuracy Score %0.6s' %(score_train, score_test))
    
    return(score_test)
    
knn_prediction(n = None, m = None, train = df_bus_rev_user_train, test = df_bus_rev_user_test
                                 , city = 'Pittsburgh', percentile = 0.9
                                 , k =3, threshold = 0.5, reg = 4)

### Cross Validation to get optimal K and Regulation 

In [79]:
kf = KFold(n_splits=3)
k_list = [3,5,7]
reg_list = [3,5,7]
max_score =0 

# i is for the iteration of splits
#i = 1

for k in k_list:
    for reg in reg_list:
            #print('split = %i, k = %i, reg = %i' %(i, k, reg))
        validation_accuracy_sqs = []
        for train_index, val_index in kf.split(df_bus_rev_user_train):   
            train, val = df_bus_rev_user_train.iloc[train_index], df_bus_rev_user_train.iloc[val_index]
            
            validation_accuracy_sqs.append(knn_prediction(n = 50, m = 50, train = train, test = val
                                 , city = 'Pittsburgh', percentile = 0.9
                                 , k = k , threshold = 0.5, reg =  reg))
        
        if max_score <= np.mean(validation_accuracy_sqs):
            max_score = np.mean(validation_accuracy_sqs)
            k_max = k
            reg_max = reg
    #i = i+1  

print('Optimal accuracy score: %0.4f\nOptimal K: %0.1f\nOptimal Regulation: %i' 
      %(max_score, k_max, reg_max))

KeyboardInterrupt: 

In [18]:
kf = KFold(n_splits=3)
k_list = [3,5,7]
reg_list = [3,5,7]
max_score =0 

# i is for the iteration of splits
i = 1
for train_index, val_index in kf.split(df_bus_rev_user_train):
    for k in k_list:
        for reg in reg_list:
            print('split = %i, k = %i, reg = %i' %(i, k, reg))
            validation_accuracy_sqs = []
    
            train, val = df_bus_rev_user_train.iloc[train_index], df_bus_rev_user_train.iloc[val_index]
            
            validation_accuracy_sqs.append(knn_prediction(n = 100, m = 100, train = train, test = val
                                 , city = 'Pittsburgh', percentile = 0.9
                                 , k = k , threshold = 0.5, reg =  reg))
        
        if max_score <= np.mean(validation_accuracy_sqs):
            max_score = np.mean(validation_accuracy_sqs)
            k_max = k
            reg_max = reg
    i = i+1  

print('Optimal accuracy score: %0.4f\nOptimal drop rate: %0.1f\nOptimal number of trees: %i' 
      %(max_score, k_max, reg_max))

split = 1, k = 3, reg = 3
merge neighbours start: 08:33:44
merge neighbours start: 08:35:08
knn prediction start: 08:35:08
KNN Train Accuracy Score 0.3891
KNN Test Accuracy Score 0.2316
split = 1, k = 3, reg = 5
merge neighbours start: 08:36:43
merge neighbours start: 08:38:07
knn prediction start: 08:38:07
KNN Train Accuracy Score 0.3891
KNN Test Accuracy Score 0.2316
split = 1, k = 3, reg = 7
merge neighbours start: 08:39:43
merge neighbours start: 08:41:07
knn prediction start: 08:41:07
KNN Train Accuracy Score 0.3891
KNN Test Accuracy Score 0.2316
split = 1, k = 5, reg = 3
merge neighbours start: 08:42:43
merge neighbours start: 08:44:07
knn prediction start: 08:44:07
KNN Train Accuracy Score 0.3891
KNN Test Accuracy Score 0.2316
split = 1, k = 5, reg = 5
merge neighbours start: 08:45:42
merge neighbours start: 08:47:05
knn prediction start: 08:47:05
KNN Train Accuracy Score 0.3891
KNN Test Accuracy Score 0.2316
split = 1, k = 5, reg = 7
merge neighbours start: 08:48:40
merge neigh

### User Recommender List

In [34]:
from scipy.stats.stats import pearsonr
def get_coeff(x,y,common_users):
    if common_users == 0:
        coeff =0 
    if (common_users == 1) or (common_users == 2):
        coeff = np.nan 
    else:
        coeff = pearsonr(x, y)[0]

    return coeff

In [75]:
def user_common_support(bus_id1, bus_id2, df_reviewlist):
    common_users = -1
    shrunk_coeff = -1
    collist = ['business_id','user_id','user_average_stars','review_stars']
    df_users_bus1 = df_reviewlist[collist].loc[df_reviewlist['business_id'] == bus_id1]
    df_users_bus2 = df_reviewlist[collist].loc[df_reviewlist['business_id'] == bus_id2]  

    df_users_bus1.columns = ['business_id','user_id','bus1_user_average_stars','review1_stars'] 
    df_users_bus2.columns = ['business_id','user_id','bus2_user_average_stars','review2_stars']

    if bus_id1 != bus_id2:
        df_commonusers =  pd.merge(df_users_bus1,   df_users_bus2, on =['user_id'] )
        common_users = df_commonusers['user_id'].size
         
        df_commonusers['bus1_userrating_bias']= df_commonusers['review1_stars']-  df_commonusers['bus1_user_average_stars']
        df_commonusers['bus_2_userrating_bias']=  df_commonusers['review2_stars']-  df_commonusers['bus2_user_average_stars']
        x =  df_commonusers['bus1_userrating_bias'].values
        y=   df_commonusers['bus_2_userrating_bias'].values
        coeff = get_coeff(x,y, common_users)
            
    if bus_id1 == bus_id2: 
        coeff = 1
        common_users = 1
        #if common_users > 1:
        #    print('business_id1: {}, bus1_user_bias: {}'.format(bus_id1, x))
        #    print('business_id2: {}, bus2_user_bias: {}' .format(bus_id2, y))
        #    print('shrunk similarities is %0.4f, common user %i\n' %(distance_coeff, common_users))
        
    return (coeff, common_users)

#print('time: {}'.format(datetime.now().strftime("%X")))
x, y = restaurant_city_percentile(train_test = df_bus_rev_user_train, city = 'Pittsburgh', percentile = 0.9)

print(user_common_support('1M6tA3TqxcpptHW0_hP9Kw', 'oS96aJIHFWcFAlGHKKXjaw', x))

#print('time: {}'.format(datetime.now().strftime("%X")))

(-0.89364547003173667, 6)


In [74]:
def user_knearest(restaurant_id, set_of_restaurants, df_reviewlist, k, threshold):
    sim_dict = []
    #### Take out the restaurant itself but not always works
    for restaurant in set(set_of_restaurants) - set(restaurant_id): 
        coeff, common_users =user_common_support(restaurant_id, restaurant, df_reviewlist)
        if coeff > threshold :
            sim_dict.append({'restaurant_id':restaurant_id, 'neighbours':restaurant
                             , 'coeff': coeff, 'common_users':common_users})
    
    #### If no coeff less than threshold
    if len(sim_dict) > 1:
        sim_pd = pd.DataFrame(sim_dict)
        sim_pd = sim_pd.groupby('restaurant_id').apply(lambda x: x.sort_values(['common_users','coeff']
                                                                               , ascending = False)).reset_index(drop=True)
    else:
        sim_dict = [{'restaurant_id':restaurant_id, 'neighbours':np.nan, 'coeff': coeff, 'common_users':common_users}]
        sim_pd = pd.DataFrame(sim_dict)
        
    neighbour_k = sim_pd[:k]
    return(neighbour_k)

#print('time: {}'.format(datetime.now().strftime("%X")))

data_by_city, bus_by_city = restaurant_city_percentile(train_test = df_bus_rev_user_train
                                                       , city = 'Pittsburgh', percentile = 0.9)
bus_id1 = '1M6tA3TqxcpptHW0_hP9Kw'
temp = user_knearest(bus_id1, bus_by_city, data_by_city, k= 3, threshold = 0)

#print('time: {}'.format(datetime.now().strftime("%X")))
temp

Unnamed: 0,coeff,common_users,neighbours,restaurant_id
0,0.644264,10,ejaUQ1hYo7Q7xCL1HdPINw,1M6tA3TqxcpptHW0_hP9Kw
1,0.134823,9,CK-Gv3vqIlWOrKP4fhT8_g,1M6tA3TqxcpptHW0_hP9Kw
2,0.669002,8,u4sTiCzVeIHZY8OlaL346Q,1M6tA3TqxcpptHW0_hP9Kw


In [76]:
def user_recommender(user_id, city, top, neighbours):
    user_city = df_bus_rev_user_train[df_bus_rev_user_train['city'] == city]
    user_data = user_city[['user_id', 'business_id', 'review_stars']][user_city['user_id'] == user_id]
    user_restaurant = user_data.groupby(['user_id','business_id'], as_index = False)['review_stars'].mean()
    
    user_restaurant_top = user_restaurant.sort_values('review_stars', ascending = False)[['business_id', 'review_stars']][:top]
    
    user_restaurant.sort_values('review_stars', ascending = False)[['business_id', 'review_stars']]
    data_by_city, bus_by_city = restaurant_city_percentile(train_test = df_bus_rev_user_train
                                                       , city = 'Pittsburgh', percentile = 0.9)
    neighbour = pd.DataFrame()
    for top_restaurant in user_restaurant_top['business_id']:
        neighbour = pd.concat([neighbour
                              , user_knearest(top_restaurant, bus_by_city, data_by_city, k= neighbours, threshold = 0)])
                           
    recommender = pd.merge(user_restaurant_top, neighbour
                           , left_on = 'business_id'
                           , right_on = 'restaurant_id'
                           , how = 'left')[['business_id', 'review_stars', 'coeff', 'common_users', 'neighbours']]
    
    recommender = recommender.sort_values(['common_users', 'coeff'], ascending = False)
    recommender.columns = ['user_reviewd_restaurant', 'review_stars','coeff'
                           , 'common_users','recommended_restaurant']

    recommender = recommender[['user_reviewd_restaurant'
                           ,'review_stars','recommended_restaurant'
                           ,'coeff' , 'common_users' ]]

    return(recommender)

user_recommender(user_id = '3ew6BEeK14K6x6Omt5gbig', city = 'Pittsburgh'
                , top = 3, neighbours =1)


Unnamed: 0,user_reviewd_restaurant,review_stars,recommended_restaurant,coeff,common_users
1,dLc1d1zwd1Teu2QED5TmlA,5,JLbgvGM4FXh9zNP4O5ZWjQ,0.24859,23
2,kkD0tv_e5E6a8kRpLYEcaA,5,xULATz2siGXOPia614mg2A,0.188059,13
0,_ucDskZqK5w1QHkoA_nlRw,5,dLc1d1zwd1Teu2QED5TmlA,0.208826,9


In [63]:
df_bus_rev_user_train[df_bus_rev_user_train['city'] == 'Pittsburgh']['user_id'].unique()[:15]
#print(df_bus_rev_user_train[df_bus_rev_user_train['state'] == 'PA']['city'].unique())

array(['faaOI6hU64h6SSaF0f11eg', '5z587IBRnjCbo51IaHNPzQ',
       'Xxvz5g67eaCr3emnkY5M6w', 'AhoxHm569hH_PRkoegDwcA',
       'mb_8jXannipO5T5V5kGXiQ', 'CxDOIDnH8gp9KXzpBHJYXw',
       '3KkT6SmPFLGvBS1pnDBr8g', 'TsBUWbhRhuiEMXb56kL0Cg',
       'Um2iec4NKMXVpJEME3PfKg', '135DbbQnr3BEkQbBzZ9T1A',
       '3ew6BEeK14K6x6Omt5gbig', 'g0EQGDEVFl4DMN6jfarJFg',
       'hcZqq-a16ZTjaM2p2MljTg', 'ZLS7cwa1UplSB8nRrwrHIQ',
       'f44In4p5PicSF4E4GaeTrw'], dtype=object)

In [None]:
user_city = df_bus_rev_user_train[df_bus_rev_user_train['city'] == 'Pittsburgh']
user_data = user_city[['user_id', 'business_id', 'review_stars']][user_city['user_id'] == '3ew6BEeK14K6x6Omt5gbig']
user_restaurant = user_data.groupby(['user_id','business_id'], as_index = False)['review_stars'].mean()

user_restaurant_top = user_restaurant.sort_values('review_stars', ascending = False)[['business_id', 'review_stars']][:3]

data_by_city, bus_by_city = restaurant_city_percentile(train_test = df_bus_rev_user_train
                                                       , city = 'Pittsburgh', percentile = 0.9)
neighbour = pd.DataFrame()
for top_restaurant in user_restaurant_top['business_id']:
    neighbour = pd.concat([neighbour
                           , user_knearest(top_restaurant, bus_by_city, data_by_city, k= 1, threshold = 0)]
                           )

pd.merge(user_restaurant_top, neighbour, left_on = 'business_id', right_on = 'restaurant_id', how = 'left')
recommender = pd.merge(user_restaurant_top, neighbour
                           , left_on = 'business_id'
                           , right_on = 'restaurant_id'
                           , how = 'left')[['business_id', 'review_stars', 'coeff', 'common_users', 'neighbours']]
    
recommender = recommender.sort_values(['common_users', 'coeff'], ascending = False)
recommender.columns = ['user_reviewd_restaurant', 'review_stars','coeff'
                           , 'common_users','recommended_restaurant']

recommender = recommender[['user_reviewd_restaurant'
                           ,'review_stars','recommended_restaurant'
                           ,'coeff' , 'common_users' ]]
#user_restaurant.sort_values(['review_stars'], ascending = False)[:10]

#user_restaurant = user_data.groupby(['user_id','business_id']).apply(lambda x: x.sort_values(['review_stars']
# user_restaurant                                                                               , ascending = True)).reset_index(drop=True)

#user_restaurant 