In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import scipy as sp
import pandas as pd
# import geopandas as gpd
# import geoplot as gplt
from shapely.geometry import Point
import shapely
import reverse_geocoder as rg

import sklearn as sk
import matplotlib as mpl
import matplotlib.pylab as plt
import matplotlib.font_manager as fm
from mpl_toolkits.mplot3d import Axes3D

import seaborn as sns
sns.set(rc={'figure.figsize':(13.7,10.27)})
sns.set_style("whitegrid")
sns.set_color_codes()

In [2]:
from dask.distributed import Client
import dask.bag as db
import dask.dataframe as dd
import dask.array as da
import dask

from ast import literal_eval
from collections.abc import MutableMapping
from collections import Counter, defaultdict
# import h5py
import io
import os

In [3]:
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection.split import train_test_split
from surprise.model_selection import cross_validate, GridSearchCV
import pandas as pd
import numpy as np
import os, io
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise import SVDpp, SVD, NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise import accuracy
from sklearn.linear_model import Ridge, RidgeCV, SGDRegressor
from sklearn.metrics import mean_squared_error as mse
import math

In [4]:
SVD_ALGO = "SVD"
BLENDER_RIDGE = "RidgeCV"
BLENDER_SGD = "SGDRegressor"

In [5]:
client = Client(n_workers=8)
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 39341 instead
  http_address["port"], self.http_server.port


0,1
Client  Scheduler: tcp://127.0.0.1:35549  Dashboard: http://127.0.0.1:39341/status,Cluster  Workers: 8  Cores: 16  Memory: 33.65 GB


In [6]:
joined_df = dd.read_csv('data/joined_df.csv').compute()
joined_df = joined_df.loc[:, ~joined_df.columns.str.match('Unnamed')]
joined_df.columns

Index(['rating', 'reviewerName', 'categories', 'gPlusPlaceId', 'gPlusUserId',
       'user_lat', 'user_long', 'placeName', 'price', 'address', 'place_lat',
       'place_long'],
      dtype='object')

In [7]:
def places_filter(df, lat_l, lat_h, lon_l, lon_h):
    return df[(lat_l <= df.place_lat ) & (df.place_lat <= lat_h) & (lon_l <= df.place_long) & (df.place_long <= lon_h)]

In [8]:
%%time
usa_df = places_filter(joined_df, 19.50139, 64.85694, -161.75583, -68.01197)
usa_df.shape

CPU times: user 170 ms, sys: 32.8 ms, total: 203 ms
Wall time: 200 ms


(2285757, 12)

# Starting Stacking for Collaborative filtering RecSys

### Reading File

In [9]:
reader = Reader(rating_scale=(1,5))  #invoke reader instance of surprise library
data = Dataset.load_from_df(usa_df[['gPlusUserId','gPlusPlaceId','rating']], reader)

### Creating holdout set

In [10]:
#create training set
trainingSet, testSet = train_test_split(data, test_size=0.2, train_size=None, random_state=42, shuffle=True)

# Training first layer for Stacking!

In [11]:
pearson_item_sim_option = {
    'name': 'pearson',
    'user_based': False
}
pearson_user_sim_option = {
    'name': 'pearson',
    'user_based': True
}

In [17]:
def train_first_layer(algorithms, hparams, train_set, test_set):
    trained_algorithms = []
    performances = []
    
    for idx, algorithm in enumerate(algorithms):
        print("{} training started".format(idx+1))
        print(hparams[idx])
        configured_algorithm = algorithm(n_factors=hparams[idx]["n_factors"], n_epochs=hparams[idx]["n_epochs"], reg_all= hparams[idx]["reg_all"])
        configured_algorithm.fit(train_set)
        trained_algorithms.append(configured_algorithm)
        performances.append(accuracy.rmse(configured_algorithm.test(test_set), verbose=True))
    
    return trained_algorithms, performances

In [13]:
def predict_first_layer(trained_algorithms, test_set):
    return [
        trained_algorithm.test(test_set)
        for trained_algorithm in trained_algorithms
    ]

In [14]:
def train_last_layer(trained_algorithms, train_set, blender_algorithm=BLENDER_RIDGE):
    actual_trainingSet = [each for each in train_set.all_ratings()]
    predictions_for_trainingSet = predict_first_layer(trained_algorithms, actual_trainingSet)
    print("Generating predictions Complete !")
    
    train_pred = [[each.est for each in prediction] for prediction in predictions_for_trainingSet]
    train_true = [each.r_ui for each in predictions_for_trainingSet[0]]
    
    blender_train_X = np.column_stack(train_pred)
    blender_train_y = train_true
    print("Organizing data for Blender Complete !")
    
    assert blender_train_X.shape[0] == len(blender_train_y), "There's problem in dimension for training set"
    
    blender = None
    if blender_algorithm is BLENDER_RIDGE: blender = RidgeCV(cv=5)
    elif blender_algorithm is BLENDER_SGD: blender = SGDRegressor(max_iter=5000)
    print("Determination for Blender Algorithm Complete !")    
    
    blender.fit(blender_train_X, blender_train_y)
    print("Blender Training Complete !")
    return blender

In [15]:
def predict_last_layer(trained_algorithms, blender, userID, iid):
    preds = [algo.predict(userID, iid) for algo in trained_algorithms]
    blender_X = np.column_stack((pred.est for pred in preds))
    return blender.predict(blender_X)

In [18]:
%%time
algorithms = [SVD, SVD, SVD]
hparams = [
    {
        "n_factors": 20,
        "n_epochs": 20,
        "reg_all": 0.02
    },
    {
        "n_factors": 30,
        "n_epochs": 30,
        "reg_all": 0.02
    },
    {
        "n_factors": 40,
        "n_epochs": 40,
        "reg_all": 0.02
    }
]
trained_algorithms, performances = train_first_layer(algorithms, hparams, trainingSet, testSet)

1 training started
{'n_factors': 20, 'n_epochs': 20, 'reg_all': 0.02}


TypeError: 'str' object is not callable

In [17]:
predictions = predict_first_layer(trained_algorithms, testSet)

In [18]:
test_true = [each.r_ui for each in predictions[0]]
test_pred = [[each.est for each in prediction] for prediction in predictions]
blender_test_X = np.column_stack(test_pred)
blender_test_y = test_true

In [19]:
# Sanity-Check for custom RMSE for Stacking
for idx in range(3):
    print(math.sqrt(mse(test_true, test_pred[idx]))) # RMSE

1.1041435036283367
1.1014235174780553
1.1023922315002417


# Training Last layer, Blender, for Stacking !

In [37]:
SGD_blender = train_last_layer(trained_algorithms, trainingSet, BLENDER_SGD)
final_pred = SGD_blender.predict(blender_test_X)
math.sqrt(mse(final_pred, blender_test_y))

Generating predictions Complete !
Organizing data for Blender Complete !
Determination for Blender Algorithm Complete !
Blender Training Complete !


1.1009622087229745

In [21]:
RIDGE_blender = train_last_layer(trained_algorithms, trainingSet, BLENDER_RIDGE)
final_pred = RIDGE_blender.predict(blender_test_X)
math.sqrt(mse(final_pred, blender_test_y))

Generating predictions Complete !
Organizing data for Blender Complete !
Determination for Blender Algorithm Complete !
Blender Training Complete !


1.1741878518117768

### Find user-item pairs with no ratings



In [22]:
%%time
trainset = data.build_full_trainset()
algo=SVD(n_factors=10,reg_all=0.01)
algo.fit(trainset)

CPU times: user 27.6 s, sys: 125 ms, total: 27.7 s
Wall time: 27.7 s


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f24481016d8>

In [38]:
def print_result(recs):
    for gPlusPlaceID, values in recs.items():
        dict_ = values[0]
        print(dict_["PlaceName"])
        print("\t Predicted Rating : {}".format(dict_["Prediction"]))
        print("\t Category : {}".format(dict_["Category"]))
        print("\t Distance : {}".format(dict_["Distance"]))

In [39]:
from math import sin, cos, sqrt, atan2, radians
def calculate_distance(lat1, lon1, lat2, lon2):

    # approximate radius of earth in km
    R = 6373.0

    lat1_r = radians(lat1)
    lon1_r = radians(lon1)
    lat2_r = radians(lat1)
    lon2_r = radians(lon2)

    dlon = lon2_r - lon1_r
    dlat = lat2_r - lat1_r

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c

    return distance

In [43]:
def get_prediction(trained_algorithms, userID, iid, algorithm):
    if algorithm is SVD_ALGO: return algo.predict(userID, iid)
    elif algorithm is BLENDER_RIDGE: return predict_last_layer(trained_algorithms, RIDGE_blender, userID, iid)
    elif algorithm is BLENDER_SGD: return predict_last_layer(trained_algorithms, SGD_blender, userID, iid)    

In [48]:
def getRecommendations_from_df(df, userID='100000053212755369563', topN=3, lat=None, lon=None, distance_limit=100, algorithm=SVD_ALGO):
    top_recs = defaultdict(list)
    output_dict = {}
    dis = None
    
    for row in df.itertuples():
        iid = row.gPlusPlaceId
        predicted_value = get_prediction(trained_algorithms, userID, iid, algorithm)
        
        if lat and lon:
            try:
                dis = calculate_distance(row.place_lat, row.place_long, lat, lon)
            except Exception as e:
                  pass
        if dis <= distance_limit:
            input_prediction = predicted_value.est if algorithm is SVD_ALGO else predicted_value[0]
            top_recs[iid].append({'PlaceName':row.placeName, 'Distance':dis, 'Category':row.categories, 'Prediction': input_prediction})
     
    for iid, item_ratings in sorted(top_recs.items(), key=lambda item: (item[1][0].get('Prediction'), item[1][0].get('Distance')), reverse=True):
        output_dict[iid]=item_ratings
     
    return output_dict 

## Single Algorithm Rec

In [49]:
%%time
recs= getRecommendations_from_df(usa_df[:10000], userID='118446742455312620560', lat=40.179159, lon=-122.236162, distance_limit= 0.5, algorithm=SVD_ALGO)
print(len(recs.items()))
print_result(recs)

5
New Asian Restaurant
	 Predicted Rating : 3.3635538441432797
	 Category : ['Asian Restaurant']
	 Distance : 0.1560081344773754
Firehouse Pizza
	 Predicted Rating : 3.1923502219609796
	 Category : ['European Restaurant', 'Italian Restaurant', 'Pizza Restaurant']
	 Distance : 0.08141167410460685
Franz Bakery Outlet Store
	 Predicted Rating : 3.033451393429345
	 Category : ['Bakery']
	 Distance : 0.27501351859849166
Peking Chinese Restaurant
	 Predicted Rating : 2.9408921255956106
	 Category : ['Asian Restaurant', 'Chinese Restaurant']
	 Distance : 2.2331361724487817e-12
Kenmore Lanes
	 Predicted Rating : 2.6797339185978095
	 Category : ['Bowling Alley', 'Lounge', 'Restaurant']
	 Distance : 0.13456006148815874
CPU times: user 96.1 ms, sys: 15.3 ms, total: 111 ms
Wall time: 85.8 ms


## SVC Stacking with SGDRegressor rec

In [51]:
%%time
recs= getRecommendations_from_df(usa_df[:10000], userID='118446742455312620560', lat=40.179159, lon=-122.236162, distance_limit= 0.5, algorithm=BLENDER_SGD)
print(len(recs.items()))
print_result(recs)

5
New Asian Restaurant
	 Predicted Rating : 3.4837443795601324
	 Category : ['Asian Restaurant']
	 Distance : 0.1560081344773754
Firehouse Pizza
	 Predicted Rating : 3.1774137665314655
	 Category : ['European Restaurant', 'Italian Restaurant', 'Pizza Restaurant']
	 Distance : 0.08141167410460685
Franz Bakery Outlet Store
	 Predicted Rating : 3.1609283744914234
	 Category : ['Bakery']
	 Distance : 0.27501351859849166
Kenmore Lanes
	 Predicted Rating : 2.983262149281949
	 Category : ['Bowling Alley', 'Lounge', 'Restaurant']
	 Distance : 0.13456006148815874
Peking Chinese Restaurant
	 Predicted Rating : 2.9794438227443143
	 Category : ['Asian Restaurant', 'Chinese Restaurant']
	 Distance : 2.2331361724487817e-12
CPU times: user 889 ms, sys: 96.7 ms, total: 986 ms
Wall time: 897 ms


## SVC Stacking with L2 Regression rec

In [52]:
%%time
recs= getRecommendations_from_df(usa_df[:10000], userID='118446742455312620560', lat=40.179159, lon=-122.236162, distance_limit= 0.5, algorithm=BLENDER_RIDGE)
print(len(recs.items()))
print_result(recs)

5
Franz Bakery Outlet Store
	 Predicted Rating : 3.969011350182243
	 Category : ['Bakery']
	 Distance : 0.27501351859849166
New Asian Restaurant
	 Predicted Rating : 3.969011350182243
	 Category : ['Asian Restaurant']
	 Distance : 0.1560081344773754
Kenmore Lanes
	 Predicted Rating : 3.969011350182243
	 Category : ['Bowling Alley', 'Lounge', 'Restaurant']
	 Distance : 0.13456006148815874
Firehouse Pizza
	 Predicted Rating : 3.969011350182243
	 Category : ['European Restaurant', 'Italian Restaurant', 'Pizza Restaurant']
	 Distance : 0.08141167410460685
Peking Chinese Restaurant
	 Predicted Rating : 3.969011350182243
	 Category : ['Asian Restaurant', 'Chinese Restaurant']
	 Distance : 2.2331361724487817e-12
CPU times: user 887 ms, sys: 76.6 ms, total: 964 ms
Wall time: 897 ms


## Note that L2's predictions hardly vary due to its regularzation