# Model Testing
### Import libraries

In [1]:
import os
import json
import pandas as pd

In [13]:
from surprise import Dataset
from surprise import Reader
from surprise import SVD, evaluate, accuracy
from surprise import dump
from surprise.model_selection import cross_validate, train_test_split
from surprise.model_selection import GridSearchCV


# Testing on Final Data

In [3]:
file_path = os.path.expanduser('finaldata.csv')

In [4]:
reader = Reader(line_format='user item rating', sep=',')

In [5]:
data = Dataset.load_from_file(file_path, reader=reader)

In [11]:
data.split(n_folds=5)

## Chosen Model: SVD (results of other models at the bottom)

In [16]:
svd = SVD()

In [17]:
evaluate(svd, data, measures=['RMSE', 'MAE'])



Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 1.0919
MAE:  0.8586
------------
Fold 2
RMSE: 1.0911
MAE:  0.8584
------------
Fold 3
RMSE: 1.0906
MAE:  0.8573
------------
Fold 4
RMSE: 1.0926
MAE:  0.8598
------------
Fold 5
RMSE: 1.0926
MAE:  0.8592
------------
------------
Mean RMSE: 1.0917
Mean MAE : 0.8587
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [1.0919205372150012,
                             1.0911238090890574,
                             1.0905641164574549,
                             1.0925744889239362,
                             1.0925621922072946],
                            'mae': [0.858605564047045,
                             0.8583596034838482,
                             0.8573090440636588,
                             0.8597598485326621,
                             0.8592491339068206]})

## Gridsearch 1 -- tuning bias and learning rate

In [23]:
param_grid = {'biased': [True, False], 'lr_all': [0.001, 0.005, 0.05]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
# 'n_epochs': [10, 20, 50],'reg_all': [0.02, 0.05, 0.08]

In [24]:
gs.fit(data)

In [25]:
results = gs.cv_results

In [26]:
results_df = pd.DataFrame.from_dict(gs.cv_results)
results_df.to_csv('results.csv')

In [27]:
results_df

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,mean_test_mae,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_biased,param_lr_all
0,1.121815,1.123022,1.122295,1.122377,0.000496,2,0.894917,0.89533,0.895943,0.895397,0.000421,3,101.287115,12.756166,55.179468,56.157928,"{'biased': True, 'lr_all': 0.001}",True,0.001
1,1.095546,1.096588,1.09628,1.096138,0.000437,1,0.863294,0.863555,0.863951,0.8636,0.00027,1,122.237027,4.096766,125.242564,10.203921,"{'biased': True, 'lr_all': 0.005}",True,0.005
2,1.145895,1.146561,1.146847,1.146434,0.000399,3,0.887526,0.887307,0.888239,0.88769,0.000398,2,415.258017,388.697796,2648.197713,2215.837836,"{'biased': True, 'lr_all': 0.05}",True,0.05
3,2.665475,2.674566,2.661799,2.66728,0.005366,6,2.331765,2.341412,2.32667,2.333283,0.006113,6,129.907277,11.421524,178.133461,49.572544,"{'biased': False, 'lr_all': 0.001}",False,0.001
4,1.397292,1.40031,1.397937,1.398513,0.001297,5,1.110692,1.112792,1.11097,1.111485,0.000931,5,107.601211,8.414771,120.542719,10.293368,"{'biased': False, 'lr_all': 0.005}",False,0.005
5,1.246154,1.24663,1.246168,1.246317,0.000221,4,0.982923,0.982756,0.983399,0.983026,0.000272,4,105.472106,6.496184,81.059657,12.616613,"{'biased': False, 'lr_all': 0.05}",False,0.05


#### Best LR = 0.005, Bias = True

## Gridsearch 2 -- tuning n_factors

In [47]:
param_grid = {'n_factors': [1, 5, 10], 'biased': [True], 'lr_all': [0.005],
              'n_epochs': [20]}
             
gs2 = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
# 'n_epochs': [10, 20, 50],'reg_all': [0.02, 0.05, 0.08]

In [48]:
gs2.fit(data)

In [49]:
results = gs2.cv_results

In [50]:
results_df = pd.DataFrame.from_dict(results)
results_df

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,mean_test_mae,...,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_factors,param_biased,param_lr_all,param_n_epochs
0,1.086359,1.085331,1.084574,1.085421,0.000731,1,0.855883,0.854242,0.853917,0.85468,...,1,30.64818,0.463524,11.101873,1.299885,"{'n_factors': 1, 'biased': True, 'lr_all': 0.0...",1,True,0.005,20
1,1.086826,1.085677,1.085088,1.085864,0.000722,2,0.856256,0.854531,0.85441,0.855066,...,2,32.52039,1.041553,10.938498,0.78254,"{'n_factors': 5, 'biased': True, 'lr_all': 0.0...",5,True,0.005,20
2,1.08733,1.086413,1.0856,1.086447,0.000707,3,0.856862,0.855194,0.854764,0.855607,...,3,35.638312,0.684767,9.675809,0.923847,"{'n_factors': 10, 'biased': True, 'lr_all': 0....",10,True,0.005,20


#### Best - 1 factor (also tested higher values)

# Gridsearch 3 -- tuning n_epochs

In [19]:
param_grid = {'n_epochs': [10, 20, 50], 'n_factors': [1], 'biased': [True], 'lr_all': [0.005]}
gs3 = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

In [20]:
gs3.fit(data)

In [21]:
results = gs3.cv_results
results_df = pd.DataFrame.from_dict(results)
results_df

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,mean_test_mae,...,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_epochs,param_n_factors,param_biased,param_lr_all
0,1.096411,1.095981,1.096711,1.096368,0.000299,2,0.869464,0.869122,0.869546,0.869377,...,2,32.706436,1.962642,11.108491,0.908315,"{'n_epochs': 10, 'n_factors': 50, 'biased': Tr...",10,50,True,0.005
1,1.091622,1.091048,1.091759,1.091476,0.000308,1,0.85999,0.859374,0.860053,0.859806,...,1,70.06996,8.85121,25.61329,14.807396,"{'n_epochs': 20, 'n_factors': 50, 'biased': Tr...",20,50,True,0.005
2,1.140297,1.13845,1.139194,1.139314,0.000759,3,0.885813,0.884742,0.884791,0.885116,...,3,196.621014,28.375468,20.432284,8.81673,"{'n_epochs': 50, 'n_factors': 50, 'biased': Tr...",50,50,True,0.005


#### Best n_epochs = 20

# Gridsearch 4: Tuning reg_all

In [63]:
param_grid = {'n_epochs': [20], 'n_factors': [1], 'biased': [True], 
              'lr_all': [0.005], 'reg_all': [0.05, 0.06, 0.07]}
gs4 = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)


In [64]:
gs4.fit(data)

In [65]:
results = gs4.cv_results
results_df = pd.DataFrame.from_dict(results)
results_df

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,mean_test_mae,...,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_epochs,param_n_factors,param_biased,param_lr_all,param_reg_all
0,1.084832,1.08587,1.085679,1.08546,0.000451,3,0.855846,0.856065,0.855736,0.855882,...,32.002656,1.522894,11.47689,0.891822,"{'n_epochs': 20, 'n_factors': 1, 'biased': Tru...",20,1,True,0.005,0.05
1,1.084791,1.085866,1.085683,1.085446,0.00047,1,0.856182,0.856403,0.856082,0.856223,...,30.851868,0.170873,11.295406,1.275715,"{'n_epochs': 20, 'n_factors': 1, 'biased': Tru...",20,1,True,0.005,0.06
2,1.084823,1.085885,1.085647,1.085451,0.000455,2,0.856534,0.856766,0.856393,0.856565,...,29.938706,1.564058,10.551808,0.588312,"{'n_epochs': 20, 'n_factors': 1, 'biased': Tru...",20,1,True,0.005,0.07


In [66]:
results_df[['rank_test_rmse', 'rank_test_mae']]

Unnamed: 0,rank_test_rmse,rank_test_mae
0,3,1
1,1,2
2,2,3


#### Best reg = 0.06

# Other Models (not used)

In [None]:
svdpp = SVDpp()

In [None]:
evaluate(svdpp, data, measures=['RMSE', 'MAE'])

In [18]:
nmf = NMF()

In [19]:
evaluate(nmf, data, measures=['RMSE', 'MAE'])

Evaluating RMSE, MAE of algorithm NMF.

------------
Fold 1
RMSE: 1.3794
MAE:  1.0783
------------
Fold 2
RMSE: 1.3801
MAE:  1.0787
------------
Fold 3
RMSE: 1.3803
MAE:  1.0797
------------
Fold 4
RMSE: 1.3790
MAE:  1.0774
------------
Fold 5
RMSE: 1.3789
MAE:  1.0773
------------
Fold 6
RMSE: 1.3810
MAE:  1.0798
------------
Fold 7
RMSE: 1.3789
MAE:  1.0777
------------
Fold 8
RMSE: 1.3796
MAE:  1.0786
------------
Fold 9
RMSE: 1.3799
MAE:  1.0785
------------
Fold 10
RMSE: 1.3790
MAE:  1.0779
------------
------------
Mean RMSE: 1.3796
Mean MAE : 1.0784
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [1.3794350253411565,
                             1.3800631753244952,
                             1.3803470449633781,
                             1.3790474183963373,
                             1.3788653317907658,
                             1.3809921876160387,
                             1.3788922356505084,
                             1.3796261813374224,
                             1.3799325331811927,
                             1.3790269179343493],
                            'mae': [1.0782924691081697,
                             1.078739278586975,
                             1.07971219559644,
                             1.077435373740628,
                             1.0773078568785115,
                             1.0797835618332265,
                             1.0776545359866114,
                             1.0785939513347529,
                             1.0785215794765843,
                        

# Best Model

In [6]:
final = SVD(n_epochs=20, n_factors=1, biased=True, 
              lr_all=0.005, reg_all=0.06)

In [7]:
data = Dataset.load_from_file(file_path, reader=reader)
trainset, testset = train_test_split(data, test_size=.2)

In [8]:
import time
start = time.time()

In [9]:
final.fit(trainset)
runtime = time.time() - start
print(runtime)

48.00040888786316


In [10]:
predictions = final.test(testset)

In [11]:
accuracy.rmse(predictions)

RMSE: 1.0780


1.0779943041910856

In [12]:
accuracy.mae(predictions)

MAE:  0.8495


0.8494536443316303

In [15]:
dump.dump('finalmodel', algo=final, predictions=predictions, verbose=1)

The dump has been saved as file finalmodel


In [31]:
preds, model = dump.load('finalmodel')

# Getting Predictions

number_of_businesses = 73100


number_of_users = 81416

In [121]:
model.predict('30678', '51871')

Prediction(uid='30678', iid='51871', r_ui=None, est=3.32747770110857, details={'was_impossible': False})

### Mapping restaurant information to item IDs

In [147]:
def find(f, seq):
    for item in seq:
        if f(item): 
            return item

In [157]:
def get_info(iid):
    return find(lambda b: iid == b['id'], businesses)

In [112]:
import json
with open('finalbusinessesindexed.json') as f:
    businesses = json.load(f)

In [182]:
def get_n_preds(uid, n):
    ratings = []
    for i in range(1, 73101):
        pred = model.predict(str(uid), str(i))
        ratings.append((int(pred.iid), pred.est))
    ratingsdesc = sorted(ratings, reverse=True, key=lambda x: x[1])[:n]
    namedratings = [(get_info(r[0])['name'], r[1]) for r in ratingsdesc]
    return namedratings

In [148]:
def get10(uid):
    top10 = get_all_preds(uid)[:10]
    restaurants = []
    for top in top10:
        restaurants.append(find(lambda b: top[0] == b['id'], businesses))
    return restaurants

In [183]:
get_all_preds(44, 10)

[('Les Canailles', 4.363268454858763),
 ('Shanghai Buffet', 4.355797642983137),
 ('China Express Chinese Restaurant', 4.352076144276189),
 ('Fiesta Margarita', 4.342375712785092),
 ('Melt Grilled Cheese', 4.335112018951794),
 ('Sorry Coffee', 4.332301364655802),
 ('Meltwich Food Co', 4.3314136777176),
 ('Bronte Restaurant', 4.328005253636554),
 ('Bar Freddo Caffe', 4.31712454289031),
 ('Texas Roadhouse', 4.315537423514774)]

In [152]:
import csv
with open('finaldata.csv', 'r') as f:
    reader = csv.reader(f)
    reviews = list(reader)

In [169]:
def get_reviewed_restaurants(uid, desc=True):
    userreviews = list(filter(lambda r: r[0] == str(uid), reviews))
    ratings = [r[2] for r in userreviews]
    restaurants = list(map(lambda r: get_info(int(r[1])), userreviews))
    names = [r['name'] for r in restaurants]
    if desc==True:
        return sorted(list(zip(names, ratings)), reverse=True, key=lambda x: x[1])
    return sorted(list(zip(names, ratings)), key=lambda x: x[1])

In [185]:
get_reviewed_restaurants(81415)

[('McDonalds', '5.0'),
 ('Restaurant Miracle', '4.0'),
 ("Lloydie's", '4.0'),
 ("Gino's Pizza", '4.0'),
 ('Cafe Rio Mexican Grill', '4.0'),
 ('Burger King', '4.0'),
 ('DS Cakes & Sweet Cafe', '4.0'),
 ('Teriyaki Stop', '4.0'),
 ('Dolce Yogurt', '4.0'),
 ('Switch Steak', '4.0'),
 ("Teresa's Pizza", '4.0'),
 ('El Carnicero', '3.0'),
 ('Mad Mexican', '3.0'),
 ('La Catrina Mexican Grill', '3.0'),
 ('El Puente', '3.0'),
 ('Rainforest Cafe', '3.0'),
 ("Lloydie's", '2.0'),
 ('Choco Churros', '2.0'),
 ('Burger King', '2.0')]