In [22]:
import numpy as np
import pandas as pd
import timeit
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from surprise import Reader
from surprise import Dataset
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split, GridSearchCV, cross_validate
from joblib import dump
import pickle

# Collaborative Filtering Model with Surprise Package

In [3]:
# Load the combined ratings table: one row per user ("Name"), one column per place.
df = pd.read_csv(filepath_or_buffer="df_Combined.csv")
df.head(n=10)

Unnamed: 0,Name,16_Avenue_Tiled_Steps,California_Academy_of_Sciences,Alcatraz_Island,Cable_Cars,Exploratorium,or5-Golden_Gate_Bridge,Golden_Gate_Park,Lands_End,Oracle_Park,...,Baker_Beach,Angel_Island_State_Park,Mission_Dolores_Park,Fisherman_s_Wharf,Japanese_Tea_Garden,Ghirardelli_Square,Chinatown,Union_Square,Painted_Ladies,Haight_Ashbury
0,Swimmy128,50.0,,,,,40.0,,,,...,,,,30.0,,,,,,
1,atriciaff,50.0,,,40.0,,50.0,,50.0,,...,,,,,,,,,,
2,Denise1236123,50.0,,50.0,,,,,,,...,,,,,,,,,30.0,
3,reise-tanta-di,50.0,,,,,,,,,...,,,,,,,,,,
4,ViorikaMontreal,50.0,,,,,,,,,...,,,,,,,,,,
5,beltonlim,50.0,,,50.0,50.0,,,,,...,,,,50.0,,,,,,
6,justweems,50.0,,,,,50.0,,,,...,,,,30.0,,40.0,,,,
7,KissXX,50.0,,,,,,50.0,,,...,,,,,,,,,,
8,rosievil,40.0,50.0,,50.0,,50.0,,,,...,,,,50.0,,,,,,
9,WaitForIttttt,50.0,,40.0,50.0,,50.0,,,,...,50.0,,,30.0,,,,,40.0,50.0


In [4]:
# Zero-filled copy of the raw table; `df` itself keeps its NaNs.
df_cluster = df.fillna(value=0)

In [5]:
# Rows whose user name was read as missing get the literal string 'NaN' so that
# every row has a usable identifier.
# NOTE(review): presumably a user name that read_csv parsed as a missing value — confirm.
nan_name_index = df.index[df['Name'].isna()]
df.loc[nan_name_index, 'Name'] = 'NaN'

# Per-column flag: does any missing value remain?
df.isnull().any()

Name                                         False
16_Avenue_Tiled_Steps                         True
California_Academy_of_Sciences                True
Alcatraz_Island                               True
Cable_Cars                                    True
Exploratorium                                 True
or5-Golden_Gate_Bridge                        True
Golden_Gate_Park                              True
Lands_End                                     True
Oracle_Park                                   True
Palace_of_Fine_Arts_Theatre                   True
Twin_Peaks                                    True
Walt_Disney_Family_Museum                     True
Ferry_Building_Marketplace                    True
Lombard_Street                                True
San_Francisco_Museum_of_Modern_Art_SFMOMA     True
Coit_Tower                                    True
Legion_of_Honor                               True
San_Francisco_Bay                             True
Pier_39                        

In [6]:
# Every column except the user identifier is a place. Select by label rather
# than by position so a reordered CSV cannot silently turn 'Name' into a place
# (and a missing 'Name' column fails loudly with a KeyError).
places = df.columns.drop('Name')

In [7]:
# Thresholds for the rating-count filters applied to the long-format table.
min_place_ratings = 5
min_user_ratings = 2

# Reshape wide -> long: one (Name, Place, Rating) row per user/place pair,
# which is the three-column layout later passed to Surprise.
cf_df = df.melt(id_vars=['Name'], value_vars=places, var_name='Place', value_name='Rating')

In [8]:
# Count the actual (non-null) ratings per place and per user.
place_group = cf_df.dropna(subset=['Rating']).groupby('Place').count()
name_group = cf_df.dropna(subset=['Rating']).groupby('Name').count()

# Places with fewer than min_place_ratings ratings. This is not the cell's last
# expression, so nothing is displayed; the original author noted the result as "None".
place_group[place_group['Name'] < min_place_ratings] #None

# Users to discard for having too few ratings.
# NOTE(review): this uses `<=` while the place check uses `<`, so users with
# exactly min_user_ratings ratings are dropped as well — confirm that is intended.
drop_names = name_group.loc[name_group['Place'].le(min_user_ratings)].index.tolist()

In [9]:
# Remove the sparse users, then turn every remaining missing value (an unrated
# place) into an explicit 0. Chaining `fillna` returns a fresh frame instead of
# mutating a boolean-filtered slice in place, which avoids pandas'
# chained-assignment (SettingWithCopy) hazard.
cf_df = cf_df[~cf_df['Name'].isin(drop_names)].fillna(0)
cf_df.shape

(160990, 3)

In [11]:
# Divide all ratings by 10 so the raw 0-50 values land on a 0-5 scale
# (0 is the fill value for unrated places, so the scale is 0-5, not 1-5).
# `assign` builds a new frame rather than setting a column on a filtered slice.
# NOTE: this cell is not idempotent — each re-run divides by 10 again.
cf_df = cf_df.assign(Rating=cf_df["Rating"].div(10))
cf_df.describe()

Unnamed: 0,Rating
count,160990.0
mean,0.55432
std,1.492297
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,5.0


In [63]:
# Back to a wide user x place matrix: one row per Name, one column per Place,
# cells holding the rating. 'Name' becomes an ordinary first column again.
neighbor_df = cf_df.pivot(index='Name', columns='Place', values='Rating').reset_index()

# Replace the column Index (which carries the axis name 'Place') with a plain
# list of labels: 'Name' followed by the place names.
neighbor_df.columns = list(neighbor_df.columns)

In [13]:
# Sanity check before building the Surprise dataset: row/column count, then a
# per-column flag showing whether any missing values remain.
print(cf_df.shape)
cf_df.isnull().any()

(160990, 3)


Name      False
Place     False
Rating    False
dtype: bool

In [14]:
# Build the Surprise dataset from the long-format (user, item, rating) table.
# The reader's scale is 0-5 because unrated places were zero-filled earlier.
# NOTE(review): those zero-filled rows are loaded as if they were observed
# ratings, so the model is fitted and evaluated mostly on zeros — confirm this
# is intended rather than loading only the rows with a real rating.
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(cf_df.loc[:, ['Name', 'Place', 'Rating']], reader)

# Use Nearest Neighbor

In [42]:
# Basic k-nearest-neighbours collaborative filter with Surprise's default settings,
# scored with 3-fold cross-validation on RMSE and MAE.
knn_algo = KNNBasic()
cross_validate(
    algo=knn_algo,
    data=data,
    measures=["rmse", "mae"],
    cv=3,
    verbose=True,
)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.4273  1.4356  1.4333  1.4321  0.0035  
MAE (testset)     0.9330  0.9283  0.9272  0.9295  0.0025  
Fit time          14.59   18.01   15.52   16.04   1.44    
Test time         102.62  95.63   99.39   99.21   2.86    


{'test_rmse': array([1.42725581, 1.43558894, 1.43331992]),
 'test_mae': array([0.93296781, 0.92829526, 0.92721622]),
 'fit_time': (14.591031789779663, 18.01096796989441, 15.524974584579468),
 'test_time': (102.62017345428467, 95.62899994850159, 99.38902378082275)}

In [43]:
# cross_validate only ever fits the estimator on per-fold training splits, so at
# this point knn_algo has not seen the full dataset. Refit on every available
# rating before persisting the model (and before using it for predictions).
knn_algo.fit(data.build_full_trainset())
dump(knn_algo, 'knn_algo.joblib')

['knn_algo.joblib']

In [78]:
def closest_neighbor(df, x):
    """Return the ``Name`` of the row in ``df`` closest to ``x`` (Euclidean distance).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Name`` column. The first column is skipped and every
        remaining column is treated as a numeric feature.
    x : array-like
        Query vector with one entry per feature column (same order as ``df``).

    Returns
    -------
    The ``Name`` value of the nearest row. On ties the earliest row wins.
    ``None`` is returned when ``df`` has no rows or when no row has a finite,
    non-NaN distance to ``x``.
    """
    if len(df) == 0:
        return None

    # One vectorised distance computation instead of a Python-level row loop.
    features = df.iloc[:, 1:].to_numpy(dtype=float)
    query = np.asarray(x, dtype=float)
    distances = np.linalg.norm(features - query, axis=1)

    # A NaN distance must never be selected (it never compared as "closer" in
    # the row-by-row formulation either), so push those rows to +inf.
    distances = np.where(np.isnan(distances), np.inf, distances)

    best = int(np.argmin(distances))  # argmin keeps the first minimum on ties
    if not distances[best] < np.inf:
        return None
    return df['Name'].iloc[best]
    

In [70]:
# All-zero query vector with one slot per place column (every column except 'Name').
test_x = np.zeros(len(neighbor_df.columns) - 1)

In [93]:
# Give the hypothetical user a rating of 3 for the places at positions 1-3
# (positions count place columns only, i.e. with 'Name' excluded).
test_x[1:4] = 3

In [94]:
# Existing user whose rating vector is nearest to the hypothetical one.
closest_neighbor(df=neighbor_df, x=test_x)

'leahp730'

In [92]:
# Inspect one user's full rating vector. Take the first matching row by
# position instead of the hard-coded row label 3559, which stops being valid
# as soon as the input data or the filtering thresholds change.
neighbor_df[neighbor_df['Name'] == 'jimnhelen'].iloc[0]

Name                                         jimnhelen
16_Avenue_Tiled_Steps                                0
Alcatraz_Island                                      0
Angel_Island_State_Park                              0
Baker_Beach                                          0
Cable_Car_Museum                                     0
Cable_Cars                                           0
California_Academy_of_Sciences                       0
Chinatown                                            0
Coit_Tower                                           0
De_Young_Museum                                      0
Exploratorium                                        0
Ferry_Building_Marketplace                           0
Fisherman_s_Wharf                                    0
Ghirardelli_Square                                   0
Golden_Gate_Park                                     0
Haight_Ashbury                                       5
Japanese_Tea_Garden                                  0
Lands_End 

In [65]:
# Persist the user x place matrix; the row index is written as the first CSV column.
neighbor_df.to_csv('neighbor_df.csv', index=True)

In [68]:
# Round-trip check: reload the saved matrix, using the first CSV column as the index.
test = pd.read_csv(filepath_or_buffer='neighbor_df.csv', index_col=0)

In [69]:
# First five rows of the reloaded matrix (5 is head()'s default).
test.head()

Unnamed: 0,Name,16_Avenue_Tiled_Steps,Alcatraz_Island,Angel_Island_State_Park,Baker_Beach,Cable_Car_Museum,Cable_Cars,California_Academy_of_Sciences,Chinatown,Coit_Tower,...,Pier_39,Presidio_of_San_Francisco,San_Francisco_Bay,San_Francisco_Botanical_Garden,San_Francisco_Museum_of_Modern_Art_SFMOMA,Twin_Peaks,Union_Square,Walt_Disney_Family_Museum,or1395-Museum_of_Ice_Cream,or5-Golden_Gate_Bridge
0,0ozlems,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0tammyc,5.0,0.0,0.0,5.0,0.0,0.0,5.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0
2,10022011m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,101mannyt,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
4,102audreyc,5.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [98]:
# Container for the per-place predictions.
recs = list()

In [99]:
# Predict this user's rating for every place column (all columns after 'Name').
# The list is rebuilt from scratch so that re-running the cell cannot append
# duplicate predictions to a list created in another cell.
recs = [
    knn_algo.predict('leahp730', item)
    for item in neighbor_df.columns[1:]
]

In [106]:
# Tabulate the predictions and rank them by the 'est' column, highest first.
rec_df = pd.DataFrame(recs)
rec_df = rec_df.sort_values(by='est', ascending=False)

In [110]:
# Item id ('iid') of the top-ranked row, i.e. the recommended place.
rec_df['iid'].iloc[0]

'Exploratorium'