In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np
import scipy as sp
import pandas as pd
# import geopandas as gpd
# import geoplot as gplt
from shapely.geometry import Point
import shapely
import reverse_geocoder as rg

import sklearn as sk
import matplotlib as mpl
import matplotlib.pylab as plt
import matplotlib.font_manager as fm
from mpl_toolkits.mplot3d import Axes3D

import seaborn as sns
sns.set(rc={'figure.figsize':(13.7,10.27)})
sns.set_style("whitegrid")
sns.set_color_codes()

In [2]:
from dask.distributed import Client
import dask.bag as db
import dask.dataframe as dd
import dask.array as da
import dask

from ast import literal_eval
from collections.abc import MutableMapping
from collections import Counter, defaultdict
# import h5py
import io
import os

In [3]:
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection.split import train_test_split
from surprise.model_selection import cross_validate, GridSearchCV
import pandas as pd
import numpy as np
import os, io
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise import SVDpp, SVD, NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise import accuracy
from sklearn.linear_model import Ridge, RidgeCV, SGDRegressor
from sklearn.metrics import mean_squared_error as mse
import math

In [4]:
SVD_ALGO = "SVD"
BLENDER_RIDGE = "RidgeCV"
BLENDER_SGD = "SGDRegressor"

In [5]:
client = Client(n_workers=8)
client

0,1
Client  Scheduler: tcp://127.0.0.1:43009  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 8  Cores: 16  Memory: 33.65 GB


In [6]:
joined_df = dd.read_csv('data/joined_df.csv').compute()
joined_df = joined_df.loc[:, ~joined_df.columns.str.match('Unnamed')]
joined_df.columns

Index(['rating', 'reviewerName', 'categories', 'gPlusPlaceId', 'gPlusUserId',
       'user_lat', 'user_long', 'placeName', 'price', 'address', 'place_lat',
       'place_long'],
      dtype='object')

In [7]:
def places_filter(df, lat_l, lat_h, lon_l, lon_h):
    return df[(lat_l <= df.place_lat ) & (df.place_lat <= lat_h) & (lon_l <= df.place_long) & (df.place_long <= lon_h)]

In [8]:
%%time
usa_df = places_filter(joined_df, 19.50139, 64.85694, -161.75583, -68.01197)
usa_df.shape

CPU times: user 214 ms, sys: 17.5 ms, total: 232 ms
Wall time: 227 ms


(2285757, 12)

In [9]:
# Create training set
reader = Reader(rating_scale=(1,5))  #invoke reader instance of surprise library
data = Dataset.load_from_df(usa_df[['gPlusUserId','gPlusPlaceId','rating']], reader)

# Starting Stacking for Collaborative filtering RecSys

# Training first layer for Stacking!

In [21]:
def train_first_layer(algorithms, hparams, train_set, test_set):
    trained_algorithms = []
    performances = []
    
    for idx, algorithm in enumerate(algorithms):
        hparam = hparams[idx]
        print("\t\t{} training started: {}".format(idx+1, hparam))
        if algorithm is SVD or algorithms is SVDpp: configured_algorithm = algorithm(n_factors=hparam["n_factors"], n_epochs=hparam["n_epochs"], reg_all= hparam["reg_all"])
        elif algorithm is NMF: configured_algorithm = algorithm(n_factors=hparam["n_factors"], n_epochs=hparam["n_epochs"])
        elif algorithm is SlopeOne: configured_algorithm = algorithm()
        elif algorithm is CoClustering: configured_algorithm = algorithm(n_cltr_u=hparam["n_cltr_u"], n_cltr_i=hparam["n_cltr_i"], n_epochs=hparam["n_epochs"], random_state=42)
        configured_algorithm.fit(train_set)
        trained_algorithms.append(configured_algorithm)
        rmse = accuracy.rmse(configured_algorithm.test(test_set), verbose=False)
        print("\t\tRMSE= ", rmse)
        performances.append(rmse)
    
    return trained_algorithms, performances

In [22]:
def predict_first_layer(trained_algorithms, test_set):
    return [
        trained_algorithm.test(test_set)
        for trained_algorithm in trained_algorithms
    ]

In [23]:
def train_last_layer(trained_algorithms, train_set, blender_algorithm=BLENDER_RIDGE):
    actual_trainingSet = [each for each in train_set.all_ratings()]
    predictions_for_trainingSet = predict_first_layer(trained_algorithms, actual_trainingSet)
    print("\t\tGenerating predictions Complete !")
    
    train_pred = [[each.est for each in prediction] for prediction in predictions_for_trainingSet]
    train_true = [each.r_ui for each in predictions_for_trainingSet[0]]
    
    blender_train_X = np.column_stack(train_pred)
    blender_train_y = train_true
    print("\t\tOrganizing data for Blender Complete !")
    
    assert blender_train_X.shape[0] == len(blender_train_y), "There's problem in dimension for training set"
    
    blender = None
    if blender_algorithm is BLENDER_RIDGE: blender = RidgeCV(cv=5)
    elif blender_algorithm is BLENDER_SGD: blender = SGDRegressor(max_iter=5000)
    print("\t\tDetermination for Blender Algorithm Complete !")    
    
    blender.fit(blender_train_X, blender_train_y)
    print("\t\tBlender Training Complete !")
    return blender

In [24]:
def predict_last_layer(trained_algorithms, blender, userID, iid):
    preds = [algo.predict(userID, iid) for algo in trained_algorithms]
    blender_X = np.column_stack((pred.est for pred in preds))
    return blender.predict(blender_X)

In [29]:
def CV(cv, algorithms, hparams):
    individual_performances = []
    sgd_performances = []
    ridge_performances = []
    
    for idx in range(cv):
        print("{} Cross Validation started".format(idx+1))
        trainingSet, testSet = train_test_split(data, test_size=0.2, train_size=None, random_state=42, shuffle=True)
        print("\tData Perparation Completed")
        
        trained_algorithms, performances = train_first_layer(algorithms, hparams, trainingSet, testSet)
        individual_performances.append(performances)
        
        predictions = predict_first_layer(trained_algorithms, testSet)
        test_true = [each.r_ui for each in predictions[0]]
        test_pred = [[each.est for each in prediction] for prediction in predictions]
        blender_test_X = np.column_stack(test_pred)
        blender_test_y = test_true
        print("\tPreparing data for Blender Completed")
        
        SGD_blender = train_last_layer(trained_algorithms, trainingSet, BLENDER_SGD)
        final_pred = SGD_blender.predict(blender_test_X)
        sgd_performance = math.sqrt(mse(final_pred, blender_test_y))
        sgd_performances.append(sgd_performance)
        print("\tSGD Performance: ", sgd_performance)
        
        RIDGE_blender = train_last_layer(trained_algorithms, trainingSet, BLENDER_RIDGE)
        final_pred = RIDGE_blender.predict(blender_test_X)
        ridge_performance = math.sqrt(mse(final_pred, blender_test_y))
        ridge_performances.append(ridge_performance)
        print("\tRidge Performance: ", ridge_performance)
    
    return individual_performances, sgd_performances, ridge_performances

In [30]:
# Experiment 1
algorithms = [SVD, SVD, SVD]
hparams = [
    {
        "n_factors": 20,
        "n_epochs": 20,
        "reg_all": 0.02
    },
    {
        "n_factors": 10,
        "n_epochs": 10,
        "reg_all": 0.02
    },
    {
        "n_factors": 40,
        "n_epochs": 40,
        "reg_all": 0.02
    }
]
individual_performances1, sgd_performances1, ridge_performances1 = CV(5, algorithms, hparams)

1 Cross Validation started
	Data Perparation Completed
		1 training started: {'n_factors': 20, 'n_epochs': 20, 'reg_all': 0.02}
		RMSE=  1.103935517116222
		2 training started: {'n_factors': 10, 'n_epochs': 10, 'reg_all': 0.02}
		RMSE=  1.1162911604502943
		3 training started: {'n_factors': 40, 'n_epochs': 40, 'reg_all': 0.02}
		RMSE=  1.1023788340225846
	Preparing data for Blender Completed
		Generating predictions Complete !
		Organizing data for Blender Complete !
		Determination for Blender Algorithm Complete !
		Blender Training Complete !
	SGD Performance:  1.1035889526123572
		Generating predictions Complete !
		Organizing data for Blender Complete !
		Determination for Blender Algorithm Complete !
		Blender Training Complete !
	Ridge Performance:  1.1741878518117768
2 Cross Validation started
	Data Perparation Completed
		1 training started: {'n_factors': 20, 'n_epochs': 20, 'reg_all': 0.02}
		RMSE=  1.104043343065962
		2 training started: {'n_factors': 10, 'n_epochs': 10, 'reg

In [31]:
# Experiment 2
algorithms = [SVD, SVDpp, NMF]
hparams = [
    {
        "n_factors": 20,
        "n_epochs": 20,
        "reg_all": 0.02
    },
    {
        "n_factors": 10,
        "n_epochs": 10,
        "reg_all": 0.02
    },
    {
        "n_factors": 10,
        "n_epochs": 10
    }
]
individual_performances2, sgd_performances2, ridge_performances2 = CV(5, algorithms, hparams)

1 Cross Validation started
	Data Perparation Completed
		1 training started: {'n_factors': 20, 'n_epochs': 20, 'reg_all': 0.02}
		RMSE=  1.1040837470495315
		2 training started: {'n_factors': 10, 'n_epochs': 10, 'reg_all': 0.02}
		RMSE=  1.1039870059684254
		3 training started: {'n_factors': 10, 'n_epochs': 10}
		RMSE=  1.5189232189030826
	Preparing data for Blender Completed
		Generating predictions Complete !
		Organizing data for Blender Complete !
		Determination for Blender Algorithm Complete !
		Blender Training Complete !
	SGD Performance:  1.1384661803840035
		Generating predictions Complete !
		Organizing data for Blender Complete !
		Determination for Blender Algorithm Complete !
		Blender Training Complete !
	Ridge Performance:  1.1741878518117768
2 Cross Validation started
	Data Perparation Completed
		1 training started: {'n_factors': 20, 'n_epochs': 20, 'reg_all': 0.02}
		RMSE=  1.1040754400070263
		2 training started: {'n_factors': 10, 'n_epochs': 10, 'reg_all': 0.02}
		

In [34]:
# Experiment 3
algorithms = [SVD, NMF, CoClustering]
hparams = [
    {
        "n_factors": 20,
        "n_epochs": 20,
        "reg_all": 0.02
    },
    {
        "n_factors": 10,
        "n_epochs": 10
    },
    {
        "n_cltr_u": 4,
        "n_cltr_i": 4,
        "n_epochs": 30,
    }
]
individual_performances3, sgd_performances3, ridge_performances3 = CV(5, algorithms, hparams)

1 Cross Validation started
	Data Perparation Completed
		1 training started: {'n_factors': 20, 'n_epochs': 20, 'reg_all': 0.02}
		RMSE=  1.1040032417561487
		2 training started: {'n_factors': 10, 'n_epochs': 10}
		RMSE=  1.5188572619521732
		3 training started: {'n_cltr_u': 4, 'n_cltr_i': 4, 'n_epochs': 30}
		RMSE=  1.2169442432729836
	Preparing data for Blender Completed
		Generating predictions Complete !
		Organizing data for Blender Complete !
		Determination for Blender Algorithm Complete !
		Blender Training Complete !
	SGD Performance:  1.1950134748239858
		Generating predictions Complete !
		Organizing data for Blender Complete !
		Determination for Blender Algorithm Complete !
		Blender Training Complete !
	Ridge Performance:  1.1741878518117768
2 Cross Validation started
	Data Perparation Completed
		1 training started: {'n_factors': 20, 'n_epochs': 20, 'reg_all': 0.02}
		RMSE=  1.1039200101903337
		2 training started: {'n_factors': 10, 'n_epochs': 10}
		RMSE=  1.518166835614

In [None]:
individual_performance

In [50]:
# First Experiment Result for SGD Blender
np.mean(sgd_performances1), np.std(sgd_performances1)

(1.104737047921786, 0.0017710920843745522)

In [51]:
# Second Experminet Result for SGD Blender
np.mean(sgd_performances2), np.std(sgd_performances2)

(1.147773036033205, 0.010666261632265889)

In [52]:
# Third Experiment result for SGD Blender
np.mean(sgd_performances3), np.std(sgd_performances3)

(1.1756741825093209, 0.012421941611743524)

In [54]:
individual_performances1

[[1.103935517116222, 1.1162911604502943, 1.1023788340225846],
 [1.104043343065962, 1.1163710342646314, 1.1025087656896562],
 [1.104043995284319, 1.116348791662053, 1.1025061993535208],
 [1.1041353847726503, 1.1163741327858927, 1.1026240140853532],
 [1.1040856522785134, 1.1163543681852452, 1.1024711432758518]]

In [55]:
sgd_performances1

[1.1035889526123572,
 1.1034479310930017,
 1.1080566650088652,
 1.1034912604713671,
 1.1051004304233378]