In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from datetime import datetime
import random
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Code Courtesy: AAIC

def get_error_metrics(y_true, y_pred):
    rmse = np.sqrt(np.mean([(y_true[i] - y_pred[i])**2 for i in range(len(y_pred))]))
    mape = np.mean(np.abs( (y_true - y_pred)/y_true )) * 100
    return rmse, mape

In [3]:
## Preparing the training data
x_train = pd.read_csv('./dataset/train_10000_1000.csv',
                     names = ['user', 'movie',
                                    'sur1', 'sur2', 'sur3', 'sur4', 'sur5',
                                    'sur6', 'sur7', 'sur8', 'sur9', 'sur10',
                                    'smr1', 'smr2', 'smr3', 'smr4', 'smr5',
                                    'smr6', 'smr7', 'smr8', 'smr9', 'smr10',
                                    'MAvg', 'UAvg', 'GAvg', 'rating'], header=None)

x_train.drop(['user', 'movie'], axis=1, inplace=True)
y_train = x_train.rating
x_train.drop(['rating'], axis=1, inplace=True)

In [4]:
## Preparing the testing data
x_test = pd.read_csv('./dataset/test_10000_1000.csv', 
                    names = ['user', 'movie',
                                    'sur1', 'sur2', 'sur3', 'sur4', 'sur5',
                                    'sur6', 'sur7', 'sur8', 'sur9', 'sur10',
                                    'smr1', 'smr2', 'smr3', 'smr4', 'smr5',
                                    'smr6', 'smr7', 'smr8', 'smr9', 'smr10',
                                    'MAvg', 'UAvg', 'GAvg', 'rating'], header=None)
x_test.drop(['user', 'movie'], axis=1, inplace=True)
y_test = x_test.rating
x_test.drop(['rating'], axis=1, inplace=True)

In [5]:
x_train.shape

(129286, 23)

In [6]:
x_test.shape

(36017, 23)

In [7]:
def run_xgb(train_frames, test_frames, eval_metric, tuning=False, var_imp=False, **kwargs):
    """
    It runs XGBoost with the `training_frames` and reports the `eval_metric`
    on the `test_frame`. It can also plot the variable importance if `var_imp` is true.
    It also performs hyperparam tuning if `tuning` is true.
    """
    x_train, y_train = train_frames
    x_test, y_test = test_frames
    jobs = kwargs.get('n_jobs', -1)
    rs = kwargs.get('rs', 42)
    model = kwargs.get('model', None)
    if model is None:
        model = xgb.XGBRegressor(n_jobs=jobs, random_state=rs)
    if tuning:
        prams = {
             'learning_rate': [0.01, 0.03, 0.05, 0.1, 0.15, 0.2],
             'n_estimators': [100, 200, 500, 1000, 2000],
             'max_depth': [3, 5, 10],
             'colsample_bytree': [0.1, 0.3, 0.5, 1],
             'subsample': [0.1, 0.3, 0.5, 1]
        }
        cv = kwargs.get('cv', 2)
        random_cfl = RandomizedSearchCV(model, param_distributions=prams, 
                                        verbose=2, n_jobs=-1, cv=cv,
                                       scoring=['neg_mean_squared_error', 'neg_mean_absolute_error',
                                               'explained_variance', 'r2'],
                                       refit='neg_mean_squared_error')
        model = random_cfl
    if isinstance(model, xgb.sklearn.XGBRegressor):
        model.fit(x_train, y_train, eval_metric='rmse', verbose=True)
        # Get the training results
        y_train_pred = model.predict(x_train)
        rmse_train, mape_train = eval_metric(y_train.values, y_train_pred)
        train_results = {'rmse': rmse_train,
                        'mape' : mape_train}

        # Get the testing results
        y_test_pred = model.predict(x_test) 
        rmse_test, mape_test = eval_metric(y_test.values, y_test_pred)
        test_results = {'rmse': rmse_test,
                        'mape' : mape_test}

    elif isinstance(model, RandomizedSearchCV):
        model.fit(x_train, y_train)
        best_model = model.best_estimator_
        # Get the training results
        y_train_pred = best_model.predict(x_train)
        rmse_train, mape_train = eval_metric(y_train.values, y_train_pred)
        train_results = {'rmse': rmse_train,
                        'mape' : mape_train}

        # Get the testing results
        y_test_pred = best_model.predict(x_test) 
        rmse_test, mape_test = eval_metric(y_test.values, y_test_pred)
        test_results = {'rmse': rmse_test,
                        'mape' : mape_test}
    return train_results, test_results, model

In [76]:
%%time
trr, tsr, _ = run_xgb(train_frames=(x_train, y_train), test_frames=(x_test, y_test), 
        eval_metric=get_error_metrics, tuning=False, n_jobs=16)

CPU times: user 21.6 s, sys: 44 ms, total: 21.7 s
Wall time: 1.95 s


In [77]:
trr

{'mape': 24.95936775643008, 'rmse': 0.8367030215239278}

In [78]:
tsr

{'mape': 35.91318104810172, 'rmse': 1.1030815541160461}

In [79]:
%%time
trr, tsr, model = run_xgb(train_frames=(x_train, y_train), test_frames=(x_test, y_test), 
        eval_metric=get_error_metrics, tuning=True, n_jobs=1)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  20 | elapsed:   56.4s remaining:   46.1s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 15.1min finished


CPU times: user 48.5 s, sys: 360 ms, total: 48.9 s
Wall time: 15min 55s


In [80]:
trr

{'mape': 24.364184919431214, 'rmse': 0.822279872042983}

In [81]:
tsr

{'mape': 34.187791961329296, 'rmse': 1.1550337863181954}

In [83]:
model.best_params_

{'colsample_bytree': 0.1,
 'learning_rate': 0.15,
 'max_depth': 3,
 'n_estimators': 1000,
 'subsample': 0.5}

In [86]:
x_train_less = x_train.drop(['sur6', 'sur7', 'sur8', 'sur9', 'sur10',
             'smr6', 'smr7', 'smr8', 'smr9', 'smr10'], axis=1)
x_test_less = x_test.drop(['sur6', 'sur7', 'sur8', 'sur9', 'sur10',
             'smr6', 'smr7', 'smr8', 'smr9', 'smr10'], axis=1)

In [88]:
%%time
trr, tsr, model = run_xgb(train_frames=(x_train_less, y_train), test_frames=(x_test_less, y_test), 
        eval_metric=get_error_metrics, tuning=True, n_jobs=1)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  20 | elapsed:   44.1s remaining:   36.1s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 11.7min finished


CPU times: user 38.9 s, sys: 416 ms, total: 39.4 s
Wall time: 12min 18s


In [89]:
trr

{'mape': 25.19446355233596, 'rmse': 0.844005927313099}

In [90]:
tsr

{'mape': 35.63025169406275, 'rmse': 1.1082421026611329}

In [121]:
prams = {
             'reg_alpha': [0.001, 0.003, 0.005, 0.008, 0.01]
        }

In [122]:
from sklearn.model_selection import GridSearchCV

In [123]:
grid_cfl = GridSearchCV(model.best_estimator_, param_grid=prams, 
                                        verbose=2, n_jobs=-1, cv=2,
                                       scoring=['neg_mean_squared_error', 'neg_mean_absolute_error',
                                               'explained_variance', 'r2'],
                                       refit='neg_mean_squared_error')

In [124]:
grid_cfl.fit(x_train_less, y_train)

Fitting 2 folds for each of 5 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:   34.1s remaining:  1.3min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   40.9s finished


GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.5, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=500,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=42,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'reg_alpha': [0.001, 0.003, 0.005, 0.008, 0.01]},
       pre_dispatch='2*n_jobs', refit='neg_mean_squared_error',
       return_train_score='warn',
       scoring=['neg_mean_squared_error', 'neg_mean_absolute_error', 'explained_variance', 'r2'],
       verbose=2)

In [125]:
best_model = grid_cfl.best_estimator_
y_train_pred = best_model.predict(x_train_less)
rmse_train, mape_train = get_error_metrics(y_train.values, y_train_pred)

In [126]:
rmse_train

0.8440662455906432

In [127]:
mape_train

25.197463780748354

In [128]:
y_test_pred = best_model.predict(x_test_less)
rmse_test, mape_test = get_error_metrics(y_test.values, y_test_pred)

In [129]:
rmse_test

1.1088560249897377

In [130]:
mape_test

35.595289820630754

In [131]:
grid_cfl.best_params_

{'reg_alpha': 0.01}

In [62]:
## Code Courtesy: AAIC

# it is just to makesure that all of our algorithms should produce same results
# everytime they run...

my_seed = 15
random.seed(my_seed)
np.random.seed(my_seed)

##########################################################
# get  (actual_list , predicted_list) ratings given list 
# of predictions (prediction is a class in Surprise).    
##########################################################
def get_ratings(predictions):
    actual = np.array([pred.r_ui for pred in predictions])
    pred = np.array([pred.est for pred in predictions])
    
    return actual, pred

################################################################
# get ''rmse'' and ''mape'' , given list of prediction objecs 
################################################################
def get_errors(predictions, print_them=False):

    actual, pred = get_ratings(predictions)
    rmse = np.sqrt(np.mean((pred - actual)**2))
    mape = np.mean(np.abs(pred - actual)/actual)

    return rmse, mape*100

##################################################################################
# It will return predicted ratings, rmse and mape of both train and test data   #
##################################################################################
def run_surprise(algo, trainset, testset, verbose=True): 
    '''
        return train_dict, test_dict
    
        It returns two dictionaries, one for train and the other is for test
        Each of them have 3 key-value pairs, which specify ''rmse'', ''mape'', and ''predicted ratings''.
    '''
    start = datetime.now()
    # dictionaries that stores metrics for train and test..
    train = dict()
    test = dict()
    
    # train the algorithm with the trainset
    st = datetime.now()
    if verbose:
        print('Training the model...')
    algo.fit(trainset)
    if verbose:
        print('Done. time taken : {} \n'.format(datetime.now()-st))
    
    # ---------------- Evaluating train data--------------------#
    st = datetime.now()
    if verbose:
        print('Evaluating the model with train data..')
    # get the train predictions (list of prediction class inside Surprise)
    train_preds = algo.test(trainset.build_testset())
    # get predicted ratings from the train predictions..
    train_actual_ratings, train_pred_ratings = get_ratings(train_preds)
    # get ''rmse'' and ''mape'' from the train predictions.
    train_rmse, train_mape = get_errors(train_preds)
    if verbose:
        print('time taken : {}'.format(datetime.now()-st))
    
    if verbose:
        print('-'*15)
        print('Train Data')
        print('-'*15)
        print("RMSE : {}\n\nMAPE : {}\n".format(train_rmse, train_mape))
    
    #store them in the train dictionary
    if verbose:
        print('adding train results in the dictionary..')
    train['rmse'] = train_rmse
    train['mape'] = train_mape
    train['predictions'] = train_pred_ratings
    
    #------------ Evaluating Test data---------------#
    st = datetime.now()
    if verbose:
        print('\nEvaluating for test data...')
    # get the predictions( list of prediction classes) of test data
    test_preds = algo.test(testset)
    # get the predicted ratings from the list of predictions
    test_actual_ratings, test_pred_ratings = get_ratings(test_preds)
    # get error metrics from the predicted and actual ratings
    test_rmse, test_mape = get_errors(test_preds)
    if verbose:
        print('time taken : {}'.format(datetime.now()-st))
    
    if verbose:
        print('-'*15)
        print('Test Data')
        print('-'*15)
        print("RMSE : {}\n\nMAPE : {}\n".format(test_rmse, test_mape))
    # store them in test dictionary
    if verbose:
        print('storing the test results in test dictionary...')
    test['rmse'] = test_rmse
    test['mape'] = test_mape
    test['predictions'] = test_pred_ratings
    
    if verbose:
        print('\n'+'-'*45)
        print('Total time taken to run this algorithm :', datetime.now() - start)
    
    # return two dictionaries train and test
    return train, test

In [63]:
from surprise import BaselineOnly, Reader, Dataset

In [9]:
reg_train = pd.read_csv('./dataset/train_10000_1000.csv',
                     names = ['user', 'movie',
                                    'sur1', 'sur2', 'sur3', 'sur4', 'sur5',
                                    'sur6', 'sur7', 'sur8', 'sur9', 'sur10',
                                    'smr1', 'smr2', 'smr3', 'smr4', 'smr5',
                                    'smr6', 'smr7', 'smr8', 'smr9', 'smr10',
                                    'MAvg', 'UAvg', 'GAvg', 'rating'], header=None)
reg_train.head()

Unnamed: 0,user,movie,sur1,sur2,sur3,sur4,sur5,sur6,sur7,sur8,...,smr5,smr6,smr7,smr8,smr9,smr10,MAvg,UAvg,GAvg,rating
0,2562859,4356,4.0,3.0,4.0,5.0,3.0,4.0,3.0,3.0,...,5.0,5.0,2.0,3.0,4.0,5.0,3.684725,3.333333,3.581679,3
1,769577,6673,4.0,4.0,3.0,4.0,3.0,2.0,3.0,3.0,...,3.0,3.333333,3.333333,3.333333,3.333333,3.333333,2.976744,3.333333,3.581679,2
2,1134808,8301,1.0,4.0,1.0,4.0,4.0,4.0,4.0,2.0,...,4.0,3.0,4.0,4.0,3.0,4.0,3.076835,3.25,3.581679,2
3,1677588,3196,1.0,4.0,3.0,3.0,2.0,3.0,3.0,3.0,...,3.0,3.0,2.0,3.0,2.0,2.0,2.730769,2.66,3.581679,1
4,319758,7635,5.0,1.0,2.0,3.0,3.0,2.0,4.0,3.0,...,4.4,4.4,4.4,4.4,4.4,4.4,2.90839,4.4,3.581679,2


In [10]:
reg_test = pd.read_csv('./dataset/test_10000_1000.csv', 
                    names = ['user', 'movie',
                                    'sur1', 'sur2', 'sur3', 'sur4', 'sur5',
                                    'sur6', 'sur7', 'sur8', 'sur9', 'sur10',
                                    'smr1', 'smr2', 'smr3', 'smr4', 'smr5',
                                    'smr6', 'smr7', 'smr8', 'smr9', 'smr10',
                                    'MAvg', 'UAvg', 'GAvg', 'rating'], header=None)
reg_test.head()

Unnamed: 0,user,movie,sur1,sur2,sur3,sur4,sur5,sur6,sur7,sur8,...,smr5,smr6,smr7,smr8,smr9,smr10,MAvg,UAvg,GAvg,rating
0,1371302,1962,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,...,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3
1,1592891,4931,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,...,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,1
2,808635,71,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,...,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,5
3,427178,5226,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,...,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,5
4,942784,5768,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,...,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,4


In [11]:
reader = Reader(rating_scale=(1,5))
train_data = Dataset.load_from_df(reg_train[['user', 'movie', 'rating']], reader)
trainset = train_data.build_full_trainset()

In [12]:
testset = list(zip(reg_test.user.values, reg_test.movie.values, reg_test.rating.values))
testset[:3]

[(1371302, 1962, 3), (1592891, 4931, 1), (808635, 71, 5)]

In [31]:
# options are to specify.., how to compute those user and item biases
scores = {}
for lr in [10, 1, 0.1, 0.01, 0.03, 0.05, 0.07, 
           0.001, 0.003, 0.005, 0.007, 0.0001]:
    bsl_options = {'method': 'sgd',
                   'learning_rate': lr}
    bsl_algo = BaselineOnly(bsl_options=bsl_options, verbose=False)
    # run this algorithm.., It will return the train and test results..
    bsl_train_results, bsl_test_results = run_surprise(bsl_algo, trainset, testset, verbose=False)
    scores[lr] = bsl_test_results['rmse']

In [33]:
import operator

In [38]:
sorted(scores.items(), key=operator.itemgetter(1))

[(0.003, 1.0995129545071682),
 (0.005, 1.099571571436182),
 (0.001, 1.0995924446993357),
 (0.007, 1.0996405000418235),
 (0.01, 1.0997239026072736),
 (0.0001, 1.100100052985354),
 (0.03, 1.100166435301005),
 (0.05, 1.1008505276616793),
 (0.07, 1.1016934711260267),
 (0.1, 1.1029808636311582),
 (10, 1.1748347151296556),
 (1, 1.244444193747045)]

In [15]:
bsl_options = {'method': 'sgd',
                   'learning_rate': 0.003}
bsl_algo = BaselineOnly(bsl_options=bsl_options, verbose=False)
# run this algorithm.., It will return the train and test results..
bsl_train_results, bsl_test_results = run_surprise(bsl_algo, trainset, testset, verbose=False)

In [16]:
bsl_test_results

{'mape': 36.30015233426039,
 'predictions': array([3.58167938, 3.58167938, 3.58167938, ..., 4.02737251, 3.58167938,
        4.02737251]),
 'rmse': 1.0995129545071682}

In [17]:
x_train['bslpr'] = bsl_train_results['predictions']
x_test['bslpr'] = bsl_test_results['predictions']

In [18]:
%%time
model = xgb.XGBRegressor(n_jobs=16, random_state=42, n_estimators=1000, colsample_bytree=0.1, learning_rate=0.15,
                        max_depth=3, subsample=0.5, reg_alpha=0.01)
model.fit(x_train, y_train, eval_metric='rmse', verbose=True)

CPU times: user 3min 20s, sys: 64 ms, total: 3min 20s
Wall time: 12.7 s


In [19]:
y_train_pred = model.predict(x_train)
rmse_train, mape_train = get_error_metrics(y_train.values, y_train_pred)
print(rmse_train, mape_train)
y_test_pred = model.predict(x_test)
rmse_test, mape_test = get_error_metrics(y_test.values, y_test_pred)
print(rmse_test, mape_test)

0.8216051866429652 24.316092284899497
1.1089055660379303 35.67581325824863


In [19]:
from surprise import KNNBaseline

In [21]:
%%time
# we specify , how to compute similarities and what to consider with sim_options to our algorithm
sim_options = {'user_based' : True,
               'name': 'pearson_baseline',
               'shrinkage': 100,
               'min_support': 2
              } 
# we keep other parameters like regularization parameter and learning_rate as default values.
bsl_options = {'method': 'sgd'} 
scores = {}
for k in range(10, 100, 5):
    knn_bsl_u = KNNBaseline(k=k, sim_options = sim_options, bsl_options = bsl_options, verbose=False)
    knn_bsl_u_train_results, knn_bsl_u_test_results = run_surprise(knn_bsl_u, trainset, testset, verbose=False)
    scores[k] = knn_bsl_u_test_results['rmse']

CPU times: user 57min 5s, sys: 49.4 s, total: 57min 55s
Wall time: 57min 54s


In [22]:
sorted(scores.items(), key=lambda x: x[1])

[(20, 1.0995512454225425),
 (40, 1.0995532873955698),
 (85, 1.0995537021654327),
 (80, 1.0995539465826385),
 (45, 1.0995540106627826),
 (90, 1.0995550723491567),
 (35, 1.0995553398547928),
 (25, 1.0995564006444158),
 (50, 1.09955666951909),
 (95, 1.0995569944680081),
 (65, 1.0995571268377984),
 (60, 1.099557129081337),
 (75, 1.0995573024293464),
 (55, 1.099557450682997),
 (70, 1.0995576968458285),
 (30, 1.0995578984311523),
 (15, 1.0995620654788676),
 (10, 1.0995761058471643)]

In [22]:
sim_options = {'user_based' : True,
               'name': 'pearson_baseline',
               'shrinkage': 100,
               'min_support': 2
              } 
knn_bsl_u = KNNBaseline(k=20, sim_options = sim_options, bsl_options = bsl_options, verbose=False)
knn_bsl_u_train_results, knn_bsl_u_test_results = run_surprise(knn_bsl_u, trainset, testset, verbose=False)

In [23]:
x_train['knn_bsl_u'] = knn_bsl_u_train_results['predictions']
x_test['knn_bsl_u'] = knn_bsl_u_test_results['predictions']

In [29]:
sim_options = {'user_based' : False,
               'name': 'pearson_baseline',
               'shrinkage': 100,
               'min_support': 2
              } 
# we keep other parameters like regularization parameter and learning_rate as default values.
bsl_options = {'method': 'sgd'}

scores = {}
for k in range(10, 100, 5):
    knn_bsl_m = KNNBaseline(k=k, sim_options = sim_options, bsl_options = bsl_options)
    knn_bsl_m_train_results, knn_bsl_m_test_results = run_surprise(knn_bsl_m, trainset, testset, verbose=True)
    scores[k] = knn_bsl_m_test_results['rmse']

Training the model...
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Done. time taken : 0:00:01.146945 

Evaluating the model with train data..
time taken : 0:00:07.647337
---------------
Train Data
---------------
RMSE : 0.28482130664126204

MAPE : 7.436259813129569

adding train results in the dictionary..

Evaluating for test data...
time taken : 0:00:00.395857
---------------
Test Data
---------------
RMSE : 1.0996065799406272

MAPE : 36.29755507802492

storing the test results in test dictionary...

---------------------------------------------
Total time taken to run this algorithm : 0:00:09.190812
Training the model...
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Done. time taken : 0:00:01.175288 

Evaluating the model with train data..
time taken : 0:00:08.734281
---------------
Train Data
---------------
RMSE : 0.30607904299008365

MAPE 

Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Done. time taken : 0:00:01.142706 

Evaluating the model with train data..
time taken : 0:00:10.276441
---------------
Train Data
---------------
RMSE : 0.32567726036421146

MAPE : 8.44026975449212

adding train results in the dictionary..

Evaluating for test data...
time taken : 0:00:00.397476
---------------
Test Data
---------------
RMSE : 1.0995973016217113

MAPE : 36.29720602764387

storing the test results in test dictionary...

---------------------------------------------
Total time taken to run this algorithm : 0:00:11.818090
Training the model...
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Done. time taken : 0:00:01.230374 

Evaluating the model with train data..
time taken : 0:00:10.310268
---------------
Train Data
---------------
RMSE : 0.32568823768423055

MAPE : 8.440549500773969

adding train results in the dicti

In [30]:
sorted(scores.items(), key=lambda x: x[1])

[(35, 1.0995971972957603),
 (75, 1.0995972539873902),
 (80, 1.0995972638066436),
 (85, 1.0995972734440311),
 (65, 1.0995972742348685),
 (90, 1.0995972839920938),
 (95, 1.0995972840829953),
 (70, 1.0995973016217113),
 (60, 1.0995973196377822),
 (45, 1.0995973204214702),
 (55, 1.099597382954084),
 (30, 1.0995973897288962),
 (50, 1.0995974008957787),
 (40, 1.0995974272735307),
 (25, 1.0995976295274),
 (20, 1.0995987348277094),
 (15, 1.099601644867118),
 (10, 1.0996065799406272)]

In [25]:
sim_options = {'user_based' : False,
               'name': 'pearson_baseline',
               'shrinkage': 100,
               'min_support': 2
              } 
knn_bsl_m = KNNBaseline(k=35, sim_options = sim_options, bsl_options = bsl_options)
knn_bsl_m_train_results, knn_bsl_m_test_results = run_surprise(knn_bsl_m, trainset, testset, verbose=True)

Training the model...
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Done. time taken : 0:00:01.179727 

Evaluating the model with train data..
time taken : 0:00:10.419539
---------------
Train Data
---------------
RMSE : 0.3393620367717291

MAPE : 8.890488960757937

adding train results in the dictionary..

Evaluating for test data...
time taken : 0:00:00.513133
---------------
Test Data
---------------
RMSE : 1.0995388120825549

MAPE : 36.30006107870331

storing the test results in test dictionary...

---------------------------------------------
Total time taken to run this algorithm : 0:00:12.113478


In [26]:
x_train['knn_bsl_m'] = knn_bsl_m_train_results['predictions']
x_test['knn_bsl_m'] = knn_bsl_m_test_results['predictions']

In [27]:
x_train.head()

Unnamed: 0,sur1,sur2,sur3,sur4,sur5,sur6,sur7,sur8,sur9,sur10,...,smr7,smr8,smr9,smr10,MAvg,UAvg,GAvg,bslpr,knn_bsl_u,knn_bsl_m
0,4.0,3.0,4.0,5.0,3.0,4.0,3.0,3.0,4.0,4.0,...,2.0,3.0,4.0,5.0,3.684725,3.333333,3.581679,3.510595,2.94153,3.14287
1,4.0,4.0,3.0,4.0,3.0,2.0,3.0,3.0,5.0,3.0,...,3.333333,3.333333,3.333333,3.333333,2.976744,3.333333,3.581679,3.078008,2.280042,2.402025
2,1.0,4.0,1.0,4.0,4.0,4.0,4.0,2.0,4.0,3.0,...,4.0,4.0,3.0,4.0,3.076835,3.25,3.581679,3.36652,4.331044,4.315673
3,1.0,4.0,3.0,3.0,2.0,3.0,3.0,3.0,4.0,2.0,...,2.0,3.0,2.0,2.0,2.730769,2.66,3.581679,2.717042,2.419183,2.27827
4,5.0,1.0,2.0,3.0,3.0,2.0,4.0,3.0,3.0,4.0,...,4.4,4.4,4.4,4.4,2.90839,4.4,3.581679,3.681138,2.5977,2.463155


In [35]:
%%time
model = xgb.XGBRegressor(n_jobs=16, random_state=42, n_estimators=1000, colsample_bytree=0.1, learning_rate=0.15,
                        max_depth=3, subsample=0.5, reg_alpha=0.01)
model.fit(x_train, y_train, eval_metric='rmse', verbose=True)

CPU times: user 3min 44s, sys: 168 ms, total: 3min 45s
Wall time: 14.2 s


In [37]:
y_train_pred = model.predict(x_train)
rmse_train, mape_train = get_error_metrics(y_train.values, y_train_pred)
print(rmse_train, mape_train)
y_test_pred = model.predict(x_test)
rmse_test, mape_test = get_error_metrics(y_test.values, y_test_pred)
print(rmse_test, mape_test)

0.8200225730633288 24.298012499069984
1.1264115365292955 34.91394168030328


In [38]:
%%time
trr, tsr, model = run_xgb(train_frames=(x_train, y_train), test_frames=(x_test, y_test), 
        eval_metric=get_error_metrics, tuning=True, n_jobs=1)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  20 | elapsed:   36.1s remaining:   29.5s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  2.6min finished


CPU times: user 13.5 s, sys: 488 ms, total: 14 s
Wall time: 2min 50s


In [40]:
model.best_params_

{'colsample_bytree': 0.3,
 'learning_rate': 0.2,
 'max_depth': 3,
 'n_estimators': 200,
 'subsample': 1}

In [41]:
y_train_pred = model.best_estimator_.predict(x_train)
rmse_train, mape_train = get_error_metrics(y_train.values, y_train_pred)
print(rmse_train, mape_train)
y_test_pred = model.best_estimator_.predict(x_test)
rmse_test, mape_test = get_error_metrics(y_test.values, y_test_pred)
print(rmse_test, mape_test)

0.8266462829646578 24.492552927342455
1.1206661498408128 35.201180931377756


In [29]:
from surprise import SVD, SVDpp

In [45]:
scores = {}
for f in [50, 100, 150, 200, 250, 300, 350, 400, 450, 500]:
    svd = SVD(n_factors=f, biased=True, random_state=15, verbose=False)
    svd_train_results, svd_test_results = run_surprise(svd, trainset, testset, verbose=False)
    scores[f] = svd_test_results['rmse']

In [46]:
sorted(scores.items(), key=lambda x: x[1])

[(150, 1.0995032392632291),
 (400, 1.0995109689989067),
 (450, 1.099522111259098),
 (100, 1.0995231661001237),
 (200, 1.0995259544926652),
 (500, 1.0995287041765225),
 (300, 1.099534729728022),
 (50, 1.0995477347957328),
 (250, 1.0995729404306134),
 (350, 1.099580094229309)]

In [30]:
svd = SVD(n_factors=150, biased=True, random_state=15, verbose=True, n_epochs=50)
svd_train_results, svd_test_results = run_surprise(svd, trainset, testset, verbose=True)

Training the model...
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Processing epoch 30
Processing epoch 31
Processing epoch 32
Processing epoch 33
Processing epoch 34
Processing epoch 35
Processing epoch 36
Processing epoch 37
Processing epoch 38
Processing epoch 39
Processing epoch 40
Processing epoch 41
Processing epoch 42
Processing epoch 43
Processing epoch 44
Processing epoch 45
Processing epoch 46
Processing epoch 47
Processing epoch 48
Processi

In [31]:
svd_train_results

{'mape': 7.570615715767446,
 'predictions': array([2.91835994, 2.11119893, 4.68831639, ..., 3.89304097, 3.79625456,
        2.05792307]),
 'rmse': 0.26271072084651853}

In [32]:
x_train['svd'] = svd_train_results['rmse'] 
x_test['svd'] = svd_test_results['rmse']

In [33]:
x_train.head()

Unnamed: 0,sur1,sur2,sur3,sur4,sur5,sur6,sur7,sur8,sur9,sur10,...,smr8,smr9,smr10,MAvg,UAvg,GAvg,bslpr,knn_bsl_u,knn_bsl_m,svd
0,4.0,3.0,4.0,5.0,3.0,4.0,3.0,3.0,4.0,4.0,...,3.0,4.0,5.0,3.684725,3.333333,3.581679,3.510595,2.94153,3.14287,0.262711
1,4.0,4.0,3.0,4.0,3.0,2.0,3.0,3.0,5.0,3.0,...,3.333333,3.333333,3.333333,2.976744,3.333333,3.581679,3.078008,2.280042,2.402025,0.262711
2,1.0,4.0,1.0,4.0,4.0,4.0,4.0,2.0,4.0,3.0,...,4.0,3.0,4.0,3.076835,3.25,3.581679,3.36652,4.331044,4.315673,0.262711
3,1.0,4.0,3.0,3.0,2.0,3.0,3.0,3.0,4.0,2.0,...,3.0,2.0,2.0,2.730769,2.66,3.581679,2.717042,2.419183,2.27827,0.262711
4,5.0,1.0,2.0,3.0,3.0,2.0,4.0,3.0,3.0,4.0,...,4.4,4.4,4.4,2.90839,4.4,3.581679,3.681138,2.5977,2.463155,0.262711


In [54]:
scores = {}
for f in [50, 100, 150, 200, 250, 300, 350, 400, 450, 500]:
    svdpp = SVDpp(n_factors=f, random_state=15, verbose=True)
    svdpp_train_results, svdpp_test_results = run_surprise(svdpp, trainset, testset, verbose=True)
    scores[f] = svdpp_test_results['rmse']

Training the model...
 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
Done. time taken : 0:02:08.392582 

Evaluating the model with train data..
time taken : 0:00:06.524472
---------------
Train Data
---------------
RMSE : 0.6010341577012868

MAPE : 17.52519704951758

adding train results in the dictionary..

Evaluating for test data...
time taken : 0:00:00.547750
---------------
Test Data
---------------
RMSE : 1.099877282671265

MAPE : 36.291039253452595

storing the test results in test dictionary...

---------------------------------------------
Total time taken to run this algorithm : 0:02:15.466163
Training the model...
 pro

 processing epoch 19
Done. time taken : 0:17:32.425124 

Evaluating the model with train data..
time taken : 0:00:08.958921
---------------
Train Data
---------------
RMSE : 0.29907720659526016

MAPE : 8.601186721311834

adding train results in the dictionary..

Evaluating for test data...
time taken : 0:00:00.384797
---------------
Test Data
---------------
RMSE : 1.09969339886896

MAPE : 36.29900525753309

storing the test results in test dictionary...

---------------------------------------------
Total time taken to run this algorithm : 0:17:41.769824
Training the model...
 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
Done. 

In [55]:
sorted(scores.items(), key=lambda x: x[1])

[(450, 1.09969339886896),
 (150, 1.0997128239715912),
 (400, 1.0997181018480295),
 (500, 1.0997444202673292),
 (300, 1.0997653153101905),
 (200, 1.0997756953280993),
 (100, 1.099799558877154),
 (350, 1.0998112494446333),
 (50, 1.099877282671265),
 (250, 1.0998891821067107)]

In [34]:
svdpp = SVDpp(n_factors=450, random_state=15, verbose=True)
svdpp_train_results, svdpp_test_results = run_surprise(svdpp, trainset, testset, verbose=True)

Training the model...
 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
Done. time taken : 0:17:20.478503 

Evaluating the model with train data..
time taken : 0:00:09.224921
---------------
Train Data
---------------
RMSE : 0.29907720659526016

MAPE : 8.601186721311834

adding train results in the dictionary..

Evaluating for test data...
time taken : 0:00:00.384511
---------------
Test Data
---------------
RMSE : 1.09969339886896

MAPE : 36.29900525753309

storing the test results in test dictionary...

---------------------------------------------
Total time taken to run this algorithm : 0:17:30.089927


In [35]:
x_train['svdpp'] = svdpp_train_results['rmse'] 
x_test['svdpp'] = svdpp_test_results['rmse']

In [36]:
x_train['rating'] = y_train
x_test['rating'] = y_test
x_train.to_csv('./dataset/x_train_10000_1000.csv', index=False)
x_test.to_csv('./dataset/x_test_10000_1000.csv', index=False)

In [41]:
x_train.drop('rating', axis=1, inplace=True)
x_test.drop('rating', axis=1, inplace=True)

In [44]:
%%time
trr, tsr, model = run_xgb(train_frames=(x_train, y_train), test_frames=(x_test, y_test), 
        eval_metric=get_error_metrics, tuning=True, n_jobs=1)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  20 | elapsed:  1.7min remaining:  1.4min
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 11.5min finished


CPU times: user 13min 8s, sys: 568 ms, total: 13min 8s
Wall time: 24min 35s


In [45]:
y_train_pred = model.best_estimator_.predict(x_train)
rmse_train, mape_train = get_error_metrics(y_train.values, y_train_pred)
print(rmse_train, mape_train)
y_test_pred = model.best_estimator_.predict(x_test)
rmse_test, mape_test = get_error_metrics(y_test.values, y_test_pred)
print(rmse_test, mape_test)

0.5886822832579691 16.591248704157714
1.1530419672171563 34.253363173622105


In [46]:
model.best_params_

{'colsample_bytree': 0.5,
 'learning_rate': 0.01,
 'max_depth': 10,
 'n_estimators': 2000,
 'subsample': 0.5}

#### On the dataset of 25000 x 3000

In [47]:
x_train = pd.read_csv('./dataset/train_25000_3000.csv',
                     names = ['user', 'movie',
                                    'sur1', 'sur2', 'sur3', 'sur4', 'sur5',
                                    'sur6', 'sur7', 'sur8', 'sur9', 'sur10',
                                    'smr1', 'smr2', 'smr3', 'smr4', 'smr5',
                                    'smr6', 'smr7', 'smr8', 'smr9', 'smr10',
                                    'MAvg', 'UAvg', 'GAvg', 'rating'], header=None)

x_train.drop(['user', 'movie'], axis=1, inplace=True)
y_train = x_train.rating
x_train.drop(['rating'], axis=1, inplace=True)

In [48]:
x_test = pd.read_csv('./dataset/test_25000_3000.csv', 
                    names = ['user', 'movie',
                                    'sur1', 'sur2', 'sur3', 'sur4', 'sur5',
                                    'sur6', 'sur7', 'sur8', 'sur9', 'sur10',
                                    'smr1', 'smr2', 'smr3', 'smr4', 'smr5',
                                    'smr6', 'smr7', 'smr8', 'smr9', 'smr10',
                                    'MAvg', 'UAvg', 'GAvg', 'rating'], header=None)
x_test.drop(['user', 'movie'], axis=1, inplace=True)
y_test = x_test.rating
x_test.drop(['rating'], axis=1, inplace=True)

In [49]:
x_train.shape

(856986, 23)

In [50]:
x_test.shape

(261693, 23)

In [59]:
%%time
trr, tsr, model = run_xgb(train_frames=(x_train, y_train), test_frames=(x_test, y_test), 
        eval_metric=get_error_metrics, tuning=False, n_jobs=16)

CPU times: user 3min 6s, sys: 1.82 s, total: 3min 7s
Wall time: 18.1 s


In [60]:
trr

{'mape': 25.60607327683226, 'rmse': 0.854572352919758}

In [61]:
tsr

{'mape': 33.6849388024272, 'rmse': 1.096860802799888}

In [None]:
%%time
trr, tsr, model = run_xgb(train_frames=(x_train, y_train), test_frames=(x_test, y_test), 
        eval_metric=get_error_metrics, tuning=True, n_jobs=1)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  20 | elapsed: 13.1min remaining: 10.7min


In [55]:
trr

{'mape': 24.617162352130688, 'rmse': 0.8322098236809364}

In [56]:
tsr

{'mape': 32.96068961518468, 'rmse': 1.1460588022296534}

In [65]:
reg_train = pd.read_csv('./dataset/train_25000_3000.csv',
                     names = ['user', 'movie',
                                    'sur1', 'sur2', 'sur3', 'sur4', 'sur5',
                                    'sur6', 'sur7', 'sur8', 'sur9', 'sur10',
                                    'smr1', 'smr2', 'smr3', 'smr4', 'smr5',
                                    'smr6', 'smr7', 'smr8', 'smr9', 'smr10',
                                    'MAvg', 'UAvg', 'GAvg', 'rating'], header=None)
reader = Reader(rating_scale=(1,5))
train_data = Dataset.load_from_df(reg_train[['user', 'movie', 'rating']], reader)
trainset = train_data.build_full_trainset()

In [66]:
reg_test = pd.read_csv('./dataset/test_25000_3000.csv',
                     names = ['user', 'movie',
                                    'sur1', 'sur2', 'sur3', 'sur4', 'sur5',
                                    'sur6', 'sur7', 'sur8', 'sur9', 'sur10',
                                    'smr1', 'smr2', 'smr3', 'smr4', 'smr5',
                                    'smr6', 'smr7', 'smr8', 'smr9', 'smr10',
                                    'MAvg', 'UAvg', 'GAvg', 'rating'], header=None)
testset = list(zip(reg_test.user.values, reg_test.movie.values, reg_test.rating.values))
testset[:3]

[(1129620, 2, 3), (2407458, 5582, 4), (668855, 6677, 3)]

In [67]:
scores = {}
for lr in [10, 1, 0.1, 0.01, 0.001, 0.003, 0.005, 0.007, 0.0001]:
    bsl_options = {'method': 'sgd',
                   'learning_rate': lr}
    bsl_algo = BaselineOnly(bsl_options=bsl_options, verbose=False)
    # run this algorithm.., It will return the train and test results..
    bsl_train_results, bsl_test_results = run_surprise(bsl_algo, trainset, testset, verbose=False)
    scores[lr] = bsl_test_results['rmse']

In [68]:
sorted(scores.items(), key=lambda x: x[1])

[(0.001, 1.0816719448109384),
 (0.003, 1.0817022917347872),
 (0.005, 1.0819683403064757),
 (0.007, 1.0822509061747585),
 (0.01, 1.0826597487620286),
 (0.0001, 1.0832897290093344),
 (0.1, 1.0896225635772798),
 (10, 1.238800386453538),
 (1, 1.4150266238135194)]

In [69]:
bsl_options = {'method': 'sgd',
                   'learning_rate': 0.001}
bsl_algo = BaselineOnly(bsl_options=bsl_options, verbose=True)
bsl_train_results, bsl_test_results = run_surprise(bsl_algo, trainset, testset, verbose=True)

Training the model...
Estimating biases using sgd...
Done. time taken : 0:00:06.861425 

Evaluating the model with train data..
time taken : 0:00:08.343935
---------------
Train Data
---------------
RMSE : 0.9221340991490828

MAPE : 28.642423887479605

adding train results in the dictionary..

Evaluating for test data...
time taken : 0:00:03.231457
---------------
Test Data
---------------
RMSE : 1.0816719448109384

MAPE : 34.043085698379784

storing the test results in test dictionary...

---------------------------------------------
Total time taken to run this algorithm : 0:00:18.438229


In [70]:
x_train['bslpr'] = bsl_train_results['predictions']
x_test['bslpr'] = bsl_test_results['predictions']

In [None]:
%%time
# we specify , how to compute similarities and what to consider with sim_options to our algorithm
sim_options = {'user_based' : True,
               'name': 'pearson_baseline',
               'shrinkage': 100,
               'min_support': 2
              } 
# we keep other parameters like regularization parameter and learning_rate as default values.
bsl_options = {'method': 'sgd'} 
scores = {}
for k in range(10, 100, 5):
    knn_bsl_u = KNNBaseline(k=k, sim_options = sim_options, bsl_options = bsl_options, verbose=False)
    knn_bsl_u_train_results, knn_bsl_u_test_results = run_surprise(knn_bsl_u, trainset, testset, verbose=False)
    scores[k] = knn_bsl_u_test_results['rmse']

In [75]:
sorted(scores.items(), key=lambda x: x[1])

[(70, 1.081922181172459),
 (60, 1.0819259906791825),
 (40, 1.0819267112455815),
 (65, 1.081927291816672),
 (55, 1.0819288467283283),
 (50, 1.0819293322338353),
 (45, 1.0819322182617994),
 (30, 1.0819324281789666),
 (35, 1.0819339165590731),
 (25, 1.0819466434445217),
 (20, 1.081960174781031),
 (15, 1.0819770600416754),
 (10, 1.0820212062220682)]

In [None]:
knn_bsl_u = KNNBaseline(k=70, sim_options = sim_options, bsl_options = bsl_options, verbose=True)
knn_bsl_u_train_results, knn_bsl_u_test_results = run_surprise(knn_bsl_u, trainset, testset, verbose=True)

Training the model...
Estimating biases using sgd...


In [77]:
knn_bsl_u_test_results

{'mape': 34.04040310973702,
 'predictions': array([3.58758136, 3.58758136, 3.39395582, ..., 3.99276318, 3.99276318,
        3.99276318]),
 'rmse': 1.081922181172459}

In [78]:
x_train['knn_bsl_u'] = knn_bsl_u_train_results['predictions']
x_test['knn_bsl_u'] = knn_bsl_u_test_results['predictions']

In [79]:
x_train.head()

Unnamed: 0,sur1,sur2,sur3,sur4,sur5,sur6,sur7,sur8,sur9,sur10,...,smr6,smr7,smr8,smr9,smr10,MAvg,UAvg,GAvg,bslpr,knn_bsl_u
0,5.0,5.0,3.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,...,4.0,3.0,4.0,2.0,5.0,3.611111,3.882353,3.587581,3.679824,4.982729
1,3.0,5.0,5.0,4.0,2.0,3.0,3.0,3.0,4.0,4.0,...,4.0,3.0,5.0,4.0,3.0,3.163265,3.714286,3.587581,2.968929,3.167275
2,4.0,3.0,2.0,3.0,3.0,4.0,2.0,3.0,2.0,3.0,...,4.0,3.0,4.0,4.0,4.0,2.774065,2.945946,3.587581,3.447917,3.249298
3,2.0,4.0,4.0,4.0,3.0,4.0,3.0,4.0,3.0,2.0,...,3.0,4.0,4.0,3.0,4.0,3.5152,3.850467,3.587581,4.142028,4.782319
4,4.0,4.0,4.0,4.0,3.0,2.0,4.0,4.0,3.0,3.0,...,4.0,3.0,4.0,4.0,5.0,3.386404,3.666667,3.587581,4.323936,4.942732


In [81]:
sim_options = {'user_based' : False,
               'name': 'pearson_baseline',
               'shrinkage': 100,
               'min_support': 2
              } 
bsl_options = {'method': 'sgd'}
knn_bsl_m = KNNBaseline(k=70, sim_options = sim_options, bsl_options = bsl_options, verbose=True)
knn_bsl_m_train_results, knn_bsl_m_test_results = run_surprise(knn_bsl_m, trainset, testset, verbose=True)

Training the model...
Estimating biases using sgd...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Done. time taken : 0:00:19.124201 

Evaluating the model with train data..
time taken : 0:02:48.211354
---------------
Train Data
---------------
RMSE : 0.5184554371223534

MAPE : 14.564513589620564

adding train results in the dictionary..

Evaluating for test data...
time taken : 0:00:03.717117
---------------
Test Data
---------------
RMSE : 1.0820863958149332

MAPE : 34.04254851269998

storing the test results in test dictionary...

---------------------------------------------
Total time taken to run this algorithm : 0:03:11.054712


In [82]:
x_train['knn_bsl_m'] = knn_bsl_m_train_results['predictions']
x_test['knn_bsl_m'] = knn_bsl_m_test_results['predictions']

In [83]:
x_train.head()

Unnamed: 0,sur1,sur2,sur3,sur4,sur5,sur6,sur7,sur8,sur9,sur10,...,smr7,smr8,smr9,smr10,MAvg,UAvg,GAvg,bslpr,knn_bsl_u,knn_bsl_m
0,5.0,5.0,3.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,...,3.0,4.0,2.0,5.0,3.611111,3.882353,3.587581,3.679824,4.982729,4.886922
1,3.0,5.0,5.0,4.0,2.0,3.0,3.0,3.0,4.0,4.0,...,3.0,5.0,4.0,3.0,3.163265,3.714286,3.587581,2.968929,3.167275,3.122327
2,4.0,3.0,2.0,3.0,3.0,4.0,2.0,3.0,2.0,3.0,...,3.0,4.0,4.0,4.0,2.774065,2.945946,3.587581,3.447917,3.249298,3.413867
3,2.0,4.0,4.0,4.0,3.0,4.0,3.0,4.0,3.0,2.0,...,4.0,4.0,3.0,4.0,3.5152,3.850467,3.587581,4.142028,4.782319,4.512588
4,4.0,4.0,4.0,4.0,3.0,2.0,4.0,4.0,3.0,3.0,...,3.0,4.0,4.0,5.0,3.386404,3.666667,3.587581,4.323936,4.942732,4.93701


In [86]:
from surprise import SVD, SVDpp
from tqdm import tqdm

In [87]:
scores = {}
for f in tqdm([50, 100, 150, 200, 250, 300, 350, 400, 450, 500]):
    svd = SVD(n_factors=f, biased=True, random_state=15, verbose=False)
    svd_train_results, svd_test_results = run_surprise(svd, trainset, testset, verbose=False)
    scores[f] = svd_test_results['rmse']

100%|██████████| 10/10 [26:09<00:00, 223.43s/it]


In [89]:
sorted(scores.items(), key=lambda x: x[1])

[(450, 1.0817583429460766),
 (500, 1.0817590126894747),
 (400, 1.0817847522902897),
 (150, 1.0817918964076993),
 (300, 1.0818230447552493),
 (250, 1.0818553202819787),
 (350, 1.0818567076090326),
 (200, 1.0818630851340787),
 (100, 1.081876187303056),
 (50, 1.0818879815173739)]

In [91]:
svd = SVD(n_factors=450, biased=True, random_state=15, verbose=True)
svd_train_results, svd_test_results = run_surprise(svd, trainset, testset, verbose=True)

Training the model...
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Done. time taken : 0:04:07.661219 

Evaluating the model with train data..
time taken : 0:00:11.108487
---------------
Train Data
---------------
RMSE : 0.4706261759813944

MAPE : 13.739027251917435

adding train results in the dictionary..

Evaluating for test data...
time taken : 0:00:03.262811
---------------
Test Data
---------------
RMSE : 1.0817583429460766

MAPE : 33.99540588009756

storing the test results in test dictionary...

---------------------------------------------
Total time taken to run this algorithm : 0:04:22.033287


In [92]:
x_train['svd'] = svd_train_results['predictions']
x_test['svd'] = svd_test_results['predictions']

In [100]:
x_train.head()

Unnamed: 0,sur1,sur2,sur3,sur4,sur5,sur6,sur7,sur8,sur9,sur10,...,smr8,smr9,smr10,MAvg,UAvg,GAvg,bslpr,knn_bsl_u,knn_bsl_m,svd
0,5.0,5.0,3.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,...,4.0,2.0,5.0,3.611111,3.882353,3.587581,3.679824,4.982729,4.886922,4.631519
1,3.0,5.0,5.0,4.0,2.0,3.0,3.0,3.0,4.0,4.0,...,5.0,4.0,3.0,3.163265,3.714286,3.587581,2.968929,3.167275,3.122327,3.097232
2,4.0,3.0,2.0,3.0,3.0,4.0,2.0,3.0,2.0,3.0,...,4.0,4.0,4.0,2.774065,2.945946,3.587581,3.447917,3.249298,3.413867,3.151565
3,2.0,4.0,4.0,4.0,3.0,4.0,3.0,4.0,3.0,2.0,...,4.0,3.0,4.0,3.5152,3.850467,3.587581,4.142028,4.782319,4.512588,4.513377
4,4.0,4.0,4.0,4.0,3.0,2.0,4.0,4.0,3.0,3.0,...,4.0,4.0,5.0,3.386404,3.666667,3.587581,4.323936,4.942732,4.93701,4.758461


In [101]:
trr, tsr, _ = run_xgb(train_frames=(x_train, y_train), test_frames=(x_test, y_test), 
        eval_metric=get_error_metrics, tuning=False, n_jobs=16)

In [102]:
trr

{'mape': 25.60601509179718, 'rmse': 0.8545724040845634}

In [103]:
tsr

{'mape': 33.68493102838425, 'rmse': 1.096862228380405}

In [None]:
trr, tsr, model = run_xgb(train_frames=(x_train, y_train), test_frames=(x_test, y_test), 
        eval_metric=get_error_metrics, tuning=True, n_jobs=1)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  11 out of  20 | elapsed:  4.3min remaining:  3.5min
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 17.4min finished


In [105]:
trr

{'mape': 25.087553813471818, 'rmse': 0.8429942025454135}

In [106]:
tsr

{'mape': 33.7782628644331, 'rmse': 1.1020239157292728}

In [2]:
!ls dataset/

combined_data_1.txt		      sample_sparse_matrix.npz
combined_data_2.txt		      sample_sparse_test_matrix_10000_1000.npz
combined_data_3.txt		      sample_sparse_test_matrix_25000_3000.npz
combined_data_4.txt		      sample_sparse_test_matrix.npz
movie_titles.csv		      test_10000_1000.csv
probe.txt			      test_25000_3000.csv
qualifying.txt			      test.csv
README				      train_10000_1000.csv
reviews_test_sparse.npz		      train_25000_3000.csv
reviews_train_sparse.npz	      train.csv
sample_sparse_matrix_10000_1000.npz   x_test_10000_1000.csv
sample_sparse_matrix_25000_3000.npz   x_train_10000_1000.csv
sample_sparse_matrix_80000_10000.npz


In [5]:
from lightfm import LightFM
from lightfm.datasets import fetch_movielens

In [6]:
data = fetch_movielens(min_rating=5.0)

In [10]:
data['test']

<943x1682 sparse matrix of type '<class 'numpy.int32'>'
	with 2153 stored elements in COOrdinate format>

In [11]:
import helpers

ImportError: No module named 'helpers'