In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import scipy as sp
import pandas as pd
# import geopandas as gpd
# import geoplot as gplt
from shapely.geometry import Point
import shapely
import reverse_geocoder as rg

import sklearn as sk
import matplotlib as mpl
import matplotlib.pylab as plt
import matplotlib.font_manager as fm
from mpl_toolkits.mplot3d import Axes3D

import seaborn as sns
sns.set(rc={'figure.figsize':(13.7,10.27)})
sns.set_style("whitegrid")
sns.set_color_codes()

In [92]:
from dask.distributed import Client
import dask.bag as db
import dask.dataframe as dd
import dask.array as da
import dask

from ast import literal_eval
from collections.abc import MutableMapping
from collections import Counter, defaultdict
# import h5py
import io
import os

In [49]:
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection.split import train_test_split
from surprise.model_selection import cross_validate, GridSearchCV
import pandas as pd
import numpy as np
import os, io
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise import SVDpp, SVD, NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise import accuracy
from sklearn.linear_model import Ridge, RidgeCV, SGDRegressor
from sklearn.metrics import mean_squared_error as mse
import math

In [52]:
BLENDER_RIDGE = "RidgeCV"
BLENDER_SGD = "SGDRegressor"

In [None]:
Client()

In [4]:
# client = Client(n_workers=8)
# client

0,1
Client  Scheduler: tcp://127.0.0.1:34469  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 8  Cores: 16  Memory: 33.65 GB


In [5]:
joined_df = dd.read_csv('data/joined_df.csv').compute()
joined_df = joined_df.loc[:, ~joined_df.columns.str.match('Unnamed')]
joined_df.columns

Index(['rating', 'reviewerName', 'categories', 'gPlusPlaceId', 'gPlusUserId',
       'user_lat', 'user_long', 'placeName', 'price', 'address', 'place_lat',
       'place_long'],
      dtype='object')

In [6]:
def places_filter(df, lat_l, lat_h, lon_l, lon_h):
    return df[(lat_l <= df.place_lat ) & (df.place_lat <= lat_h) & (lon_l <= df.place_long) & (df.place_long <= lon_h)]

In [7]:
%%time
usa_df = places_filter(joined_df, 19.50139, 64.85694, -161.75583, -68.01197)
usa_df.shape

CPU times: user 192 ms, sys: 14 ms, total: 206 ms
Wall time: 199 ms


(2285757, 12)

# Starting Stacking for Collaborative filtering RecSys

### Reading File

In [8]:
reader = Reader(rating_scale=(1,5))  #invoke reader instance of surprise library
data = Dataset.load_from_df(usa_df[['gPlusUserId','gPlusPlaceId','rating']], reader)

### Creating holdout set

In [9]:
#create training set
trainingSet, testSet = train_test_split(data, test_size=0.2, train_size=None, random_state=42, shuffle=True)

# Training first layer for Stacking!

In [10]:
pearson_item_sim_option = {
    'name': 'pearson',
    'user_based': False
}
pearson_user_sim_option = {
    'name': 'pearson',
    'user_based': True
}

In [15]:
def train_first_layer(algorithms, hparams, train_set, test_set):
    trained_algorithms = []
    performances = []
    
    for idx, algorithm in enumerate(algorithms):
        print("{} training started".format(idx+1))
        configured_algorithm = algorithm(n_factors=hparams[idx]["n_factors"], n_epochs=hparams[idx]["n_epochs"], reg_all= hparams[idx]["reg_all"])
        configured_algorithm.fit(train_set)
        trained_algorithms.append(configured_algorithm)
        performances.append(accuracy.rmse(configured_algorithm.test(test_set), verbose=True))
    
    return trained_algorithms, performances

In [16]:
def predict_first_layer(trained_algorithms, test_set):
    return [
        trained_algorithm.test(test_set)
        for trained_algorithm in trained_algorithms
    ]

In [87]:
def train_last_layer(trained_algorithms, train_set, blender_algorithm=BLENDER_RIDGE):
    actual_trainingSet = [each for each in train_set.all_ratings()]
    predictions_for_trainingSet = predict_first_layer(trained_algorithms, actual_trainingSet)
    print("Generating predictions Complete !")
    
    train_pred = [[each.est for each in prediction] for prediction in predictions_for_trainingSet]
    train_true = [each.r_ui for each in predictions_for_trainingSet[0]]
    
    blender_train_X = np.column_stack(train_pred)
    blender_train_y = train_true
    print("Organizing data for Blender Complete !")
    
    assert blender_train_X.shape[0] == len(blender_train_y), "There's problem in dimension for training set"
    
    blender = None
    if blender_algorithm is BLENDER_RIDGE: blender = RidgeCV(cv=5)
    elif blender_algorithm is BLENDER_SGD: blender = SGDRegressor(max_iter=5000)
    print("Determination for Blender Algorithm Complete !")    
    
    blender.fit(blender_train_X, blender_train_y)
    print("Blender Training Complete !")
    return blender

In [110]:
def predict_last_layer(trained_algorithms, blender, userID, iid):
    preds = [algo.predict(userID, iid) for algo in trained_algorithms]
    blender_X = np.column_stack((pred.est for pred in preds))
    return blender.predict(blender_X)

In [17]:
%%time
algorithms = [SVD, SVD, SVD]
hparams = [
    {
        "n_factors": 20,
        "n_epochs": 20,
        "reg_all": 0.02
    },
    {
        "n_factors": 30,
        "n_epochs": 30,
        "reg_all": 0.02
    },
    {
        "n_factors": 40,
        "n_epochs": 40,
        "reg_all": 0.02
    }
]
trained_algorithms, performances = train_first_layer(algorithms, hparams, trainingSet, testSet)

1 training started
RMSE: 1.1041
2 training started
RMSE: 1.1018
3 training started
RMSE: 1.1031
CPU times: user 3min 3s, sys: 2.03 s, total: 3min 5s
Wall time: 3min 1s


In [58]:
predictions = predict_first_layer(trained_algorithms, testSet)

In [83]:
test_true = [each.r_ui for each in predictions[0]]
test_pred = [[each.est for each in prediction] for prediction in predictions]
blender_test_X = np.column_stack(test_pred)
blender_test_y = test_true

In [84]:
# Sanity-Check for custom RMSE for Stacking
for idx in range(3):
    print(math.sqrt(mse(true, pred[idx]))) # RMSE

1.1041362280847715
1.101832985965809
1.1031233851381275


# Training Last layer, Blender, for Stacking !

In [85]:
blender = train_last_layer(trained_algorithms, trainingSet, BLENDER_SGD)
final_pred = blender.predict(blender_test_X)
math.sqrt(mse(final_pred, blender_test_y))



Generating predictions Complete !
Organizing data for Blender Complete !
Determination for Blender Algorithm Complete !
Blender Training Complete !


1.1009706442546292

In [86]:
blender = train_last_layer(trained_algorithms, trainingSet, BLENDER_RIDGE)
final_pred = blender.predict(blender_test_X)
math.sqrt(mse(final_pred, blender_test_y))



Generating predictions Complete !
Organizing data for Blender Complete !
Determination for Blender Algorithm Complete !
Blender Training Complete !


1.1736991631041538

### Find user-item pairs with no ratings



In [102]:
%%time
trainset = data.build_full_trainset()
algo=SVD(n_factors=10,reg_all=0.01)
algo.fit(trainset)

CPU times: user 2.05 s, sys: 25.4 ms, total: 2.08 s
Wall time: 2.06 s


In [114]:
from math import sin, cos, sqrt, atan2, radians
def calculate_distance(lat1, lon1, lat2, lon2):

    # approximate radius of earth in km
    R = 6373.0

    lat1_r = radians(lat1)
    lon1_r = radians(lon1)
    lat2_r = radians(lat1)
    lon2_r = radians(lon2)

    dlon = lon2_r - lon1_r
    dlat = lat2_r - lat1_r

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c

    return distance

In [118]:
def getRecommendations_from_df(df, userID='100000053212755369563', topN=3, lat=None, lon=None, distance_limit=100):
    top_recs = defaultdict(list)
    output_dict = {}
    dis = None
    for row in df.itertuples():
        iid = row.gPlusPlaceId
        predicted_value = algo.predict(userID, iid)
        if lat and lon:
            try:
                dis = calculate_distance(row.place_lat, row.place_long, lat, lon)
            except Exception as e:
                  pass
        if dis <= distance_limit:
            top_recs[iid].append({'PlaceName':row.placeName, 'Distance':dis, 'Category':row.categories, 'Prediction':predicted_value.est})
     
    for iid, item_ratings in sorted(top_recs.items(), key=lambda item: (item[1][0].get('Prediction'), item[1][0].get('Distance')), reverse=True):
        output_dict[iid]=item_ratings
     
    return output_dict 

In [119]:
%%time
recs= getRecommendations_from_df(usa_df[:10000], userID='118446742455312620560', lat=40.179159, lon=-122.236162, distance_limit= 0.5)
print(len(recs.items()))
recs

5
CPU times: user 109 ms, sys: 9.35 ms, total: 118 ms
Wall time: 91.6 ms


{'110040354606321397134': [{'PlaceName': 'New Asian Restaurant',
   'Distance': 0.1560081344773754,
   'Category': "['Asian Restaurant']",
   'Prediction': 3.437684978048763}],
 '103770440322838225806': [{'PlaceName': 'Franz Bakery Outlet Store',
   'Distance': 0.27501351859849166,
   'Category': "['Bakery']",
   'Prediction': 3.157654664834565}],
 '109420033090810328045': [{'PlaceName': 'Firehouse Pizza',
   'Distance': 0.08141167410460685,
   'Category': "['European Restaurant', 'Italian Restaurant', 'Pizza Restaurant']",
   'Prediction': 3.01128192116214}],
 '106591714648856494903': [{'PlaceName': 'Peking Chinese Restaurant',
   'Distance': 2.2331361724487817e-12,
   'Category': "['Asian Restaurant', 'Chinese Restaurant']",
   'Prediction': 2.971059380024141}],
 '101530031206675973002': [{'PlaceName': 'Kenmore Lanes',
   'Distance': 0.13456006148815874,
   'Category': "['Bowling Alley', 'Lounge', 'Restaurant']",
   'Prediction': 2.6280560689843084}]}

In [120]:
def getRecommendations_from_df_Blender(df, userID='100000053212755369563', topN=3, lat=None, lon=None, distance_limit=100):
    top_recs = defaultdict(list)
    output_dict = {}
    dis = None
    for row in df.itertuples():
        iid = row.gPlusPlaceId
#         predicted_value = algo.predict(userID, iid)
        predicted_value = predict_last_layer(trained_algorithms, blender, userID, iid) # For Blender
        if lat and lon:
            try:
                dis = calculate_distance(row.place_lat, row.place_long, lat, lon)
            except Exception as e:
                  pass
#                 print("Exception caught: {}".format(e))
        if dis <= distance_limit:
#             top_recs[iid].append({'PlaceName':row.placeName, 'Distance':dis, 'Category':row.categories, 'Prediction':predicted_value.est})
            top_recs[iid].append({'PlaceName':row.placeName, 'Distance':dis, 'Category':row.categories, 'Prediction':predicted_value}) # For Blender
     
    for iid, item_ratings in sorted(top_recs.items(), key=lambda item: (item[1][0].get('Prediction'), item[1][0].get('Distance')), reverse=True):
        output_dict[iid]=item_ratings
     
    return output_dict 

In [121]:
%%time
recs= getRecommendations_from_df_Blender(usa_df[:10000], userID='118446742455312620560', lat=40.179159, lon=-122.236162, distance_limit= 0.5)
print(len(recs.items()))
recs

5
CPU times: user 1.01 s, sys: 101 ms, total: 1.11 s
Wall time: 995 ms


{'103770440322838225806': [{'PlaceName': 'Franz Bakery Outlet Store',
   'Distance': 0.27501351859849166,
   'Category': "['Bakery']",
   'Prediction': array([3.96850495])}],
 '110040354606321397134': [{'PlaceName': 'New Asian Restaurant',
   'Distance': 0.1560081344773754,
   'Category': "['Asian Restaurant']",
   'Prediction': array([3.96850495])}],
 '101530031206675973002': [{'PlaceName': 'Kenmore Lanes',
   'Distance': 0.13456006148815874,
   'Category': "['Bowling Alley', 'Lounge', 'Restaurant']",
   'Prediction': array([3.96850495])}],
 '109420033090810328045': [{'PlaceName': 'Firehouse Pizza',
   'Distance': 0.08141167410460685,
   'Category': "['European Restaurant', 'Italian Restaurant', 'Pizza Restaurant']",
   'Prediction': array([3.96850495])}],
 '106591714648856494903': [{'PlaceName': 'Peking Chinese Restaurant',
   'Distance': 2.2331361724487817e-12,
   'Category': "['Asian Restaurant', 'Chinese Restaurant']",
   'Prediction': array([3.96850495])}]}