In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
# Optional pandas display overrides, left disabled; uncomment to widen DataFrame output.
# pd.set_option('display.max_columns', 1000)
# pd.set_option('display.max_rows', 400)
# Apply seaborn's default styling to the matplotlib figures drawn below.
sns.set()

# Move one level up from the directory the kernel started in, so that the
# `project.*` imports and the relative `results/` paths used below resolve.
# The starting directory is remembered on first execution: a bare
# `os.chdir('..')` climbs one more level every time this cell is re-run.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.dirname(NOTEBOOK_DIR))

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from project.ranker.ranker import RankingPredictor

In [25]:
%%time
# Import kept so `train_test_split` stays available to any later cell that uses it.
from sklearn.model_selection import train_test_split
# The pipeline is only used here to reach the estimator's `get_data()`;
# it is not fitted in this cell.
rp = Pipeline([
    ('scale', StandardScaler()),
    ('estimator', RankingPredictor("ma_100", n_neighbors=15)),
])
df_mf, df_rank, df_scores = rp.named_steps['estimator'].get_data()

# The three frames must describe the same rows, in the same order.
assert len(df_mf) == len(df_rank) == len(df_scores), "row counts differ between frames"

# Shuffle all three arrays with one shared, seeded permutation.
# This replaces `train_test_split(..., test_size=0, random_state=42)`, which was
# used purely as a shuffle and raises ValueError on newer scikit-learn versions.
# NOTE(review): intended to give the same row order the seeded split produced;
# confirm if exact order matters for comparing against previously saved results.
shuffle_idx = np.random.RandomState(42).permutation(len(df_mf))
X = df_mf.values[shuffle_idx]
y = df_rank.values[shuffle_idx]
y_scores = df_scores.values[shuffle_idx]
print(X.shape, y.shape, y_scores.shape)

(60, 39) (60, 13) (60, 13)
CPU times: user 38.4 s, sys: 3.5 s, total: 41.9 s
Wall time: 42.2 s


## LGBM

In [26]:
%%time
import lightgbm
from project.ranker.lambdarank import cv_lgbm
from sklearn.model_selection import RepeatedKFold

# 10-fold cross-validation repeated 10 times (100 fits), seeded for repeatability.
# `n_splits` is passed by keyword: newer scikit-learn releases make the
# RepeatedKFold arguments keyword-only and reject the positional form.
kfolds = RepeatedKFold(n_splits=10, n_repeats=10, random_state=42)

# LambdaRank objective evaluated with NDCG at cutoff 13; the cutoff here must
# stay in sync with the `metric` name handed to cv_lgbm below.
params = {'objective': 'lambdarank', 'metric': 'ndcg', 'ndcg_at': 13}

# cv_lgbm returns a (results, models) pair; one log line is printed per fold.
lgbm_results, lgbm_models = cv_lgbm(
    lightgbm, X, y, y_scores, kfolds, params,
    num_rounds=1000,
    early_stopping_rounds=50,
    verbose_eval=False,
    metric='ndcg@13',
)

Fold   1 | #Est:   5 | Trn_Spearman:  0.3801 | Val_Spearman:  0.1941 | Trn_ACCLoss:  0.0275 | Val_ACCLoss:  0.1114 | Trn_NDCG:  0.8194 | Val_NDCG:  0.6747
Fold   2 | #Est:   2 | Trn_Spearman:  0.2944 | Val_Spearman: -0.0165 | Trn_ACCLoss:  0.0394 | Val_ACCLoss:  0.2130 | Trn_NDCG:  0.6937 | Val_NDCG:  0.6000
Fold   3 | #Est:   4 | Trn_Spearman:  0.4338 | Val_Spearman:  0.1255 | Trn_ACCLoss:  0.0135 | Val_ACCLoss:  0.0674 | Trn_NDCG:  0.8427 | Val_NDCG:  0.6548
Fold   4 | #Est:  11 | Trn_Spearman:  0.4053 | Val_Spearman:  0.1795 | Trn_ACCLoss:  0.0045 | Val_ACCLoss:  0.0300 | Trn_NDCG:  0.8701 | Val_NDCG:  0.7601
Fold   5 | #Est:  11 | Trn_Spearman:  0.4359 | Val_Spearman:  0.0513 | Trn_ACCLoss:  0.0020 | Val_ACCLoss:  0.0325 | Trn_NDCG:  0.8951 | Val_NDCG:  0.6174
Fold   6 | #Est:  24 | Trn_Spearman:  0.4648 | Val_Spearman:  0.0714 | Trn_ACCLoss:  0.0017 | Val_ACCLoss:  0.0890 | Trn_NDCG:  0.9163 | Val_NDCG:  0.6074
Fold   7 | #Est:   9 | Trn_Spearman:  0.4545 | Val_Spearman:  0.1465 |

In [8]:
%%time
import lightgbm
from project.ranker.lambdarank import cv_lgbm
from sklearn.model_selection import RepeatedKFold

# Comparison run with the NDCG cutoff lowered from 13 to 12.
# `n_splits` is passed by keyword: newer scikit-learn releases make the
# RepeatedKFold arguments keyword-only and reject the positional form.
kfolds = RepeatedKFold(n_splits=10, n_repeats=10, random_state=42)
params = {'objective': 'lambdarank', 'metric': 'ndcg', 'ndcg_at': 12}

# Results go into their own names. Writing to `lgbm_results` / `lgbm_models`
# here would overwrite the ndcg@13 run on a top-to-bottom execution, and the
# table and CSV built from `lgbm_results` further down would then hold the
# ndcg@12 numbers instead.
lgbm_at12_results, lgbm_at12_models = cv_lgbm(
    lightgbm, X, y, y_scores, kfolds, params,
    num_rounds=1000,
    early_stopping_rounds=50,
    verbose_eval=False,
    metric='ndcg@12',
)

Fold   1 | #Est:   4 | Trn_Spearman:  0.4042 | Val_Spearman:  0.2716 | Trn_ACCLoss:  0.0195 | Val_ACCLoss:  0.0405 | Trn_NDCG:  0.8061 | Val_NDCG:  0.6379
Fold   2 | #Est:   2 | Trn_Spearman:  0.3250 | Val_Spearman:  0.0198 | Trn_ACCLoss:  0.0367 | Val_ACCLoss:  0.1955 | Trn_NDCG:  0.7299 | Val_NDCG:  0.6790
Fold   3 | #Est:  26 | Trn_Spearman:  0.5263 | Val_Spearman:  0.1399 | Trn_ACCLoss:  0.0005 | Val_ACCLoss:  0.0520 | Trn_NDCG:  0.9319 | Val_NDCG:  0.5767
Fold   4 | #Est:  67 | Trn_Spearman:  0.5650 | Val_Spearman:  0.3030 | Trn_ACCLoss:  0.0000 | Val_ACCLoss:  0.0409 | Trn_NDCG:  0.9610 | Val_NDCG:  0.7087
Fold   5 | #Est:  17 | Trn_Spearman:  0.4899 | Val_Spearman:  0.1958 | Trn_ACCLoss:  0.0068 | Val_ACCLoss:  0.0328 | Trn_NDCG:  0.9012 | Val_NDCG:  0.6582
Fold   6 | #Est:   1 | Trn_Spearman:  0.1668 | Val_Spearman:  0.0117 | Trn_ACCLoss:  0.0710 | Val_ACCLoss:  0.1074 | Trn_NDCG:  0.6654 | Val_NDCG:  0.6222
Fold   7 | #Est:  10 | Trn_Spearman:  0.5022 | Val_Spearman:  0.2576 |

In [27]:
# Column labels for the per-fold LGBM metrics, as train/validation pairs.
# The "PLC" columns hold the values the fold log prints as "ACCLoss".
cols = ['TrnSpearman', 'ValSpearman', 'TrnPLC', 'ValPLC', 'TrnNDCG', 'ValNDCG']

In [28]:
# One row per CV fold, labelled with the six metric names held in `cols`.
lgbm_df = pd.DataFrame(lgbm_results, columns=cols)
lgbm_df.head()

Unnamed: 0,TrnSpearman,ValSpearman,TrnPLC,ValPLC,TrnNDCG,ValNDCG
0,0.380138,0.194139,0.027466,0.111352,0.819394,0.674718
1,0.294363,-0.016484,0.039372,0.213012,0.693704,0.600015
2,0.433761,0.125458,0.013526,0.06737,0.842719,0.654842
3,0.405271,0.179487,0.004454,0.029956,0.870064,0.760146
4,0.435897,0.051282,0.001998,0.032465,0.89506,0.617362


In [29]:
# Persist the per-fold LGBM metrics; the path is relative to the current working directory.
lgbm_df.to_csv('results/Original_Scores_LGBM.csv', index=False)

## Tuned LGBM

In [30]:
%%time
import lightgbm
from project.ranker.lambdarank import cv_lgbm
from sklearn.model_selection import RepeatedKFold

# Same 10x10 repeated k-fold scheme and seed as the untuned LGBM run.
# `n_splits` is passed by keyword: newer scikit-learn releases make the
# RepeatedKFold arguments keyword-only and reject the positional form.
kfolds = RepeatedKFold(n_splits=10, n_repeats=10, random_state=42)

# LambdaRank with NDCG@13 plus tuned tree / regularisation settings.
# NOTE(review): the search that produced these values is not shown in this notebook.
params = {'objective': 'lambdarank', 'metric': 'ndcg', 'ndcg_at': 13, 'max_depth': 50, 
          'num_leaves': 37, 'min_sum_hessian_in_leaf': 2.838955470674485, 'min_data_in_leaf': 20, 
          'bagging_fraction': 0.41761591951124705, 'bagging_freq': 4, 'feature_fraction': 0.6258773979549109, 
          'scale_pos_weight': 1.1049055615452428, 'learning_rate': 0.02887716145050293, 'lambda_l2': 29.235934351290478}

# cv_lgbm returns a (results, models) pair; one log line is printed per fold.
tuned_results, tuned_models = cv_lgbm(
    lightgbm, X, y, y_scores, kfolds, params,
    num_rounds=1000,
    early_stopping_rounds=50,
    verbose_eval=False,
    metric='ndcg@13',
)

Fold   1 | #Est:   3 | Trn_Spearman:  0.0566 | Val_Spearman:  0.0714 | Trn_ACCLoss:  0.1050 | Val_ACCLoss:  0.1117 | Trn_NDCG:  0.5992 | Val_NDCG:  0.6365
Fold   2 | #Est:   1 | Trn_Spearman:  0.0504 | Val_Spearman: -0.2253 | Trn_ACCLoss:  0.0766 | Val_ACCLoss:  0.2542 | Trn_NDCG:  0.5634 | Val_NDCG:  0.4855
Fold   3 | #Est:  71 | Trn_Spearman:  0.3190 | Val_Spearman:  0.0980 | Trn_ACCLoss:  0.0300 | Val_ACCLoss:  0.0364 | Trn_NDCG:  0.7252 | Val_NDCG:  0.5762
Fold   4 | #Est: 133 | Trn_Spearman:  0.2843 | Val_Spearman:  0.1731 | Trn_ACCLoss:  0.0295 | Val_ACCLoss:  0.0522 | Trn_NDCG:  0.7675 | Val_NDCG:  0.7152
Fold   5 | #Est:   4 | Trn_Spearman:  0.0965 | Val_Spearman:  0.1209 | Trn_ACCLoss:  0.0855 | Val_ACCLoss:  0.0352 | Trn_NDCG:  0.6085 | Val_NDCG:  0.6427
Fold   6 | #Est:  26 | Trn_Spearman:  0.2451 | Val_Spearman:  0.2326 | Trn_ACCLoss:  0.0713 | Val_ACCLoss:  0.0558 | Trn_NDCG:  0.6623 | Val_NDCG:  0.6998
Fold   7 | #Est:  56 | Trn_Spearman:  0.2513 | Val_Spearman:  0.0348 |

In [None]:
# The tuned run comes from the same cv_lgbm routine as the untuned one and
# reports the same six per-fold metrics, so label the columns the same way.
# Without names the frame (and the CSV written from it) gets 0..5 as headers.
tuned_cols = ['TrnSpearman', 'ValSpearman', 'TrnPLC', 'ValPLC', 'TrnNDCG', 'ValNDCG']
tuned_df = pd.DataFrame(tuned_results, columns=tuned_cols)
tuned_df.head()

In [None]:
# Persist the per-fold tuned-LGBM metrics; the path is relative to the current working directory.
# NOTE(review): this file name lacks the "Original_" prefix used by the other score files — confirm that is intended.
tuned_df.to_csv('results/Scores_TunedLGBM.csv', index=False)

## Neural Net

In [31]:
%%time
import lightgbm
from sklearn.model_selection import RepeatedKFold
from project.ranker.neuralnet import cv_neuralnet

# Same 10x10 repeated k-fold scheme and seed as the LGBM runs.
# `n_splits` is passed by keyword: newer scikit-learn releases make the
# RepeatedKFold arguments keyword-only and reject the positional form.
kfolds = RepeatedKFold(n_splits=10, n_repeats=10, random_state=42)

# Network / training hyperparameters handed to cv_neuralnet.
params = {'latent_sz': 30, 'batch_sz': 40, 'epochs': 29, 
          'learning_rate': 0.0023670978180025}

# cv_neuralnet returns a (results, models) pair; verbose_folds=True prints one line per fold.
# NOTE(review): no seed is set for the network itself in this cell, and the two
# recorded runs of this code print different numbers — expect run-to-run variation.
neuralnet_results, neuralnet_models = cv_neuralnet(
    X, y, y_scores, kfolds, params,
    verbose_folds=True,
)

Fold   1 | Trn_Spearman:  0.7081 | Val_Spearman:  0.3636 | Trn_ACCLoss:  0.0303 | Val_ACCLoss:  0.0529
Fold   2 | Trn_Spearman:  0.7126 | Val_Spearman:  0.0742 | Trn_ACCLoss:  0.0168 | Val_ACCLoss:  0.1856
Fold   3 | Trn_Spearman:  0.7434 | Val_Spearman: -0.0568 | Trn_ACCLoss:  0.0318 | Val_ACCLoss:  0.0721
Fold   4 | Trn_Spearman:  0.7127 | Val_Spearman:  0.2628 | Trn_ACCLoss:  0.0292 | Val_ACCLoss:  0.1037
Fold   5 | Trn_Spearman:  0.7208 | Val_Spearman:  0.2674 | Trn_ACCLoss:  0.0342 | Val_ACCLoss:  0.0429
Fold   6 | Trn_Spearman:  0.7107 | Val_Spearman:  0.3901 | Trn_ACCLoss:  0.0276 | Val_ACCLoss:  0.1034
Fold   7 | Trn_Spearman:  0.7221 | Val_Spearman:  0.1905 | Trn_ACCLoss:  0.0305 | Val_ACCLoss:  0.1667
Fold   8 | Trn_Spearman:  0.7178 | Val_Spearman:  0.2930 | Trn_ACCLoss:  0.0332 | Val_ACCLoss:  0.0662
Fold   9 | Trn_Spearman:  0.7204 | Val_Spearman:  0.1804 | Trn_ACCLoss:  0.0276 | Val_ACCLoss:  0.0776
Fold  10 | Trn_Spearman:  0.7315 | Val_Spearman:  0.1566 | Trn_ACCLoss:  

In [15]:
%%time
import lightgbm
from sklearn.model_selection import RepeatedKFold
from project.ranker.neuralnet import cv_neuralnet

# Repeat of the neural-net cross-validation with identical settings.
# `n_splits` is passed by keyword: newer scikit-learn releases make the
# RepeatedKFold arguments keyword-only and reject the positional form.
kfolds = RepeatedKFold(n_splits=10, n_repeats=10, random_state=42)
params = {'latent_sz': 30, 'batch_sz': 40, 'epochs': 29, 
          'learning_rate': 0.0023670978180025}

# Results go into their own names. Writing to `neuralnet_results` /
# `neuralnet_models` here would overwrite the previous run on a top-to-bottom
# execution, and the table and CSV built from `neuralnet_results` further down
# would then hold this run's numbers instead.
neuralnet_rerun_results, neuralnet_rerun_models = cv_neuralnet(
    X, y, y_scores, kfolds, params,
    verbose_folds=True,
)

Fold   1 | Trn_Spearman:  0.6983 | Val_Spearman:  0.3485 | Trn_ACCLoss:  0.0345 | Val_ACCLoss:  0.0550
Fold   2 | Trn_Spearman:  0.7099 | Val_Spearman:  0.0664 | Trn_ACCLoss:  0.0164 | Val_ACCLoss:  0.1989
Fold   3 | Trn_Spearman:  0.7433 | Val_Spearman: -0.0023 | Trn_ACCLoss:  0.0280 | Val_ACCLoss:  0.0538
Fold   4 | Trn_Spearman:  0.6977 | Val_Spearman:  0.2249 | Trn_ACCLoss:  0.0335 | Val_ACCLoss:  0.0962
Fold   5 | Trn_Spearman:  0.6966 | Val_Spearman:  0.3030 | Trn_ACCLoss:  0.0354 | Val_ACCLoss:  0.0451
Fold   6 | Trn_Spearman:  0.7058 | Val_Spearman:  0.3275 | Trn_ACCLoss:  0.0258 | Val_ACCLoss:  0.1345
Fold   7 | Trn_Spearman:  0.7133 | Val_Spearman:  0.2436 | Trn_ACCLoss:  0.0287 | Val_ACCLoss:  0.1367
Fold   8 | Trn_Spearman:  0.7068 | Val_Spearman:  0.2121 | Trn_ACCLoss:  0.0297 | Val_ACCLoss:  0.0452
Fold   9 | Trn_Spearman:  0.6971 | Val_Spearman:  0.1270 | Trn_ACCLoss:  0.0294 | Val_ACCLoss:  0.0886
Fold  10 | Trn_Spearman:  0.7116 | Val_Spearman:  0.2168 | Trn_ACCLoss:  

In [32]:
# The neural-net run reports four metrics per fold (no NDCG columns).
# Note that this rebinds `cols`, which earlier held the six LGBM column names.
cols = ['TrnSpearman', 'ValSpearman', 'TrnPLC', 'ValPLC']
# One row per CV fold.
neural_df = pd.DataFrame(neuralnet_results, columns=cols)
neural_df.head()

Unnamed: 0,TrnSpearman,ValSpearman,TrnPLC,ValPLC
0,0.708079,0.363553,0.03028,0.052934
1,0.712556,0.074176,0.016758,0.185613
2,0.743386,-0.056777,0.031779,0.07208
3,0.712658,0.262821,0.029151,0.103654
4,0.720798,0.267399,0.034239,0.04285


In [33]:
# Persist the per-fold neural-net metrics; the path is relative to the current working directory.
neural_df.to_csv('results/Original_Scores_NeuralNet.csv', index=False)

## Read results

In [None]:
# Collect one column of validation PLC values per ranking method.
scores = pd.DataFrame()
# `lgbm_df` has no column named '' (that lookup raises KeyError); the comparison
# below plots PLC, so take the validation PLC column.
scores['LGBM'] = lgbm_df['ValPLC']
# NOTE(review): the next cell also selects 'Random', 'AR', 'KNN', 'KNN Manhattan',
# 'Tuned LGBM' and 'Neural Net'; none of those columns are filled in here.

In [None]:
# (column in `scores`, label shown on the plot axis) in display order.
method_labels = [
    ('Random', 'Random\nRanking'),
    ('AR', 'Average\nRanking'),
    ('KNN', 'KNN\n(Euclidean)'),
    ('KNN Manhattan', 'KNN\n(Manhattan)'),
    ('LGBM', 'LGBM'),
    ('Tuned LGBM', 'Optimized\nLGBM'),
    ('Neural Net', 'Neural Net'),
]
source_names = [source for source, label in method_labels]
cols = [label for source, label in method_labels]

# Keep only the compared methods, in display order, then swap in the plot labels.
scores = scores[source_names]
scores.columns = cols

# Reshape wide -> long: one (method, value) row per observation, as the boxplot expects.
scores = pd.melt(
    scores,
    value_vars=cols,
    var_name='Ranking Method',
    value_name='Mean PLC Value',
)

In [None]:
# Box plot of the PLC distribution for each ranking method.
fig, ax = plt.subplots(figsize=(10,5))
# Extra padding keeps the axis titles clear of the multi-line tick labels.
ax.xaxis.labelpad = 30
ax.yaxis.labelpad = 10
# Plot the long-format `scores` frame built above; the previous `data=acc2`
# referred to a name that is never defined in this notebook (NameError).
# `ax=ax` ties the plot to the axes configured here; the trailing `;` hides the repr.
sns.boxplot(x='Ranking Method', y='Mean PLC Value', data=scores, ax=ax);