In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
# pd.set_option('display.max_columns', 1000)
# pd.set_option('display.max_rows', 400)
sns.set()

os.chdir('..')

In [2]:
from project.ranker.ranker import RankingPredictor
from project.ranker.ranker import RandomRankingPredictor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import RepeatedKFold
from project.feature_selection.feature_selector import FeatureSelection

In [3]:
rr = RandomRankingPredictor(rank_size=8)
rp = Pipeline([
    ('scale', StandardScaler()),
    ('estimator', RankingPredictor("ma_100", n_neighbors=15)),
])

In [4]:
from project.meta_feature.extractor import MFRCollection

In [5]:
%%time
df_mf, df_rank, _, df_fold_scores = rp.named_steps['estimator'].get_data()

CPU times: user 33.1 s, sys: 2.41 s, total: 35.5 s
Wall time: 36.6 s


In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test, y_scores_train, y_scores_test = train_test_split(df_mf.values,
                                                                                   df_rank.values,
                                                                                   df_fold_scores.values,
                                                                                   test_size=0)

In [10]:
X_train.shape, X_test.shape

((60, 39), (0, 39))

In [11]:
from project.utils.metrics import evaluate_metric

In [24]:
def cv_random(rr, X, y, y_scores, kfolds):
    results = []
    for idx, (trn_idx, val_idx) in enumerate(kfolds.split(X, y)):
        X_val, y_val = X[val_idx], y[val_idx]
        X_trn, y_trn = X[trn_idx], y[trn_idx]
        y_scores_trn = y_scores[trn_idx]
        y_scores_val = y_scores[val_idx]
        
        y_pred_trn = rr.predict(y_trn)
        y_pred_val = rr.predict(y_val)
        
        trn_spearman = evaluate_metric("spearman", y_trn, y_pred_trn)
        trn_acc_loss = evaluate_metric("acc_loss", y_scores_trn, y_pred_trn)
        val_spearman = evaluate_metric("spearman", y_val, y_pred_val)
        val_acc_loss = evaluate_metric("acc_loss", y_scores_val, y_pred_val)
        print(f'Fold {idx + 1} | '
              f'Train Spearman: {trn_spearman:.4f} | '
              f'Valid Spearman: {val_spearman:.4f} | '
              f'Train ACC Loss: {trn_acc_loss:.4f} | '
              f'Valid ACC Loss: {val_acc_loss:.4f}')
        
        results.append((trn_spearman, val_spearman, trn_acc_loss, val_acc_loss))
          
    results = np.array(results)
    print()
    print(f'Train Spearman: {results[:,0].mean():.4f} +/-{results[:,0].std()*2:.4f} | '
          f'Valid Spearman: {results[:,1].mean():.4f} +/-{results[:,1].std()*2:.4f}\n'
          f'Train ACC Loss: {results[:,2].mean():.4f} +/-{results[:,2].std()*2:.4f} | '
          f'Valid ACC Loss: {results[:,3].mean():.4f} +/-{results[:,3].std()*2:.4f}')
    print()
    return results

In [25]:
from sklearn.model_selection import KFold
kfolds = KFold(10, shuffle=True, random_state=42)
results = cv_random(rr, X_train, y_train, y_scores_train, kfolds)

Fold 1 | Train Spearman: 0.0335 | Valid Spearman: 0.0595 | Train ACC Loss: 0.1331 | Valid ACC Loss: 0.1389
Fold 2 | Train Spearman: -0.0013 | Valid Spearman: -0.0952 | Train ACC Loss: 0.1407 | Valid ACC Loss: 0.2029
Fold 3 | Train Spearman: -0.0847 | Valid Spearman: 0.0357 | Train ACC Loss: 0.1560 | Valid ACC Loss: 0.0649
Fold 4 | Train Spearman: -0.0304 | Valid Spearman: -0.0675 | Train ACC Loss: 0.1320 | Valid ACC Loss: 0.2734
Fold 5 | Train Spearman: 0.0057 | Valid Spearman: 0.4286 | Train ACC Loss: 0.1340 | Valid ACC Loss: 0.1071
Fold 6 | Train Spearman: 0.0026 | Valid Spearman: 0.1032 | Train ACC Loss: 0.1441 | Valid ACC Loss: 0.1769
Fold 7 | Train Spearman: 0.0106 | Valid Spearman: -0.1310 | Train ACC Loss: 0.1474 | Valid ACC Loss: 0.1369
Fold 8 | Train Spearman: 0.0340 | Valid Spearman: -0.1746 | Train ACC Loss: 0.1397 | Valid ACC Loss: 0.1043
Fold 9 | Train Spearman: -0.0071 | Valid Spearman: 0.1349 | Train ACC Loss: 0.1229 | Valid ACC Loss: 0.1283
Fold 10 | Train Spearman: -0.

In [26]:
def wide2long(X, y):
    n_samples, n_classes = y.shape
    X_cont = np.repeat(X, n_classes, axis=0)
    X_cats = np.array(list(range(n_classes)) * n_samples)[:, None]
    return np.concatenate([X_cont, X_cats], axis=1), y.reshape(-1)

In [51]:
def cv_lgbm(lightgbm, X, y, y_scores, kfolds, params, num_rounds=1000):
    results = []
    for idx, (trn_idx, val_idx) in enumerate(kfolds.split(X, y)):
        X_val, y_val = X[val_idx], y[val_idx]
        X_trn, y_trn = X[trn_idx], y[trn_idx]
        y_scores_trn = y_scores[trn_idx]
        y_scores_val = y_scores[val_idx]
        
        X_trn_long, y_trn_long = wide2long(X_trn, y_trn)
        X_val_long, y_val_long = wide2long(X_val, y_val)
        
        dtrn = lightgbm.Dataset(X_trn_long, y_trn_long, group=[y_trn.shape[1]] * y_trn.shape[0])
        dval = lightgbm.Dataset(X_val_long, y_val_long, group=[y_val.shape[1]] * y_val.shape[0])
        
        bst = lightgbm.train(params, dtrn, num_rounds, [dval],
                  early_stopping_rounds=10, verbose_eval=1)
        y_pred_trn = np.argsort(bst.predict(X_trn_long).reshape(*y_trn.shape))
        y_pred_val = np.argsort(bst.predict(X_val_long).reshape(*y_val.shape))
        
        trn_spearman = evaluate_metric("spearman", y_trn, y_pred_trn)
        trn_acc_loss = evaluate_metric("acc_loss", y_scores_trn, y_pred_trn)
        val_spearman = evaluate_metric("spearman", y_val, y_pred_val)
        val_acc_loss = evaluate_metric("acc_loss", y_scores_val, y_pred_val)
        print(f'Fold {idx + 1} | '
              f'No. estimators: {bst.best_iteration} | '
              f'Train Spearman: {trn_spearman:.4f} | '
              f'Valid Spearman: {val_spearman:.4f} | '
              f'Train ACC Loss: {trn_acc_loss:.4f} | '
              f'Valid ACC Loss: {val_acc_loss:.4f}')
        print()
        
        results.append((trn_spearman, val_spearman, trn_acc_loss, val_acc_loss))
          
    results = np.array(results)
    print()
    print(f'Train Spearman: {results[:,0].mean():.4f} +/-{results[:,0].std()*2:.4f} | '
          f'Valid Spearman: {results[:,1].mean():.4f} +/-{results[:,1].std()*2:.4f}\n'
          f'Train ACC Loss: {results[:,2].mean():.4f} +/-{results[:,2].std()*2:.4f} | '
          f'Valid ACC Loss: {results[:,3].mean():.4f} +/-{results[:,3].std()*2:.4f}')
    print()
    return results

In [50]:
import lightgbm
from sklearn.model_selection import KFold
kfolds = KFold(10, shuffle=True, random_state=42)
params = {'objective': 'lambdarank', 'metric': 'ndcg', 'ndcg_at': y_train.shape[1]}
results = cv_lgbm(lightgbm, X_train, y_train, y_scores_train, kfolds, params)

Fold 1 | No. estimators: 4 | Train Spearman: 0.2509 | Valid Spearman: 0.1944 | Train ACC Loss: 0.1166 | Valid ACC Loss: 0.2202
Fold 2 | No. estimators: 1 | Train Spearman: 0.0939 | Valid Spearman: 0.0238 | Train ACC Loss: 0.1496 | Valid ACC Loss: 0.0773
Fold 3 | No. estimators: 1 | Train Spearman: 0.1283 | Valid Spearman: 0.0000 | Train ACC Loss: 0.1413 | Valid ACC Loss: 0.1009
Fold 4 | No. estimators: 12 | Train Spearman: 0.2628 | Valid Spearman: 0.1111 | Train ACC Loss: 0.1259 | Valid ACC Loss: 0.0340
Fold 5 | No. estimators: 5 | Train Spearman: 0.3108 | Valid Spearman: -0.0833 | Train ACC Loss: 0.1120 | Valid ACC Loss: 0.2802
Fold 6 | No. estimators: 3 | Train Spearman: 0.1310 | Valid Spearman: 0.2460 | Train ACC Loss: 0.1162 | Valid ACC Loss: 0.1689
Fold 7 | No. estimators: 1 | Train Spearman: 0.1477 | Valid Spearman: -0.1389 | Train ACC Loss: 0.1376 | Valid ACC Loss: 0.1628
Fold 8 | No. estimators: 3 | Train Spearman: 0.2571 | Valid Spearman: -0.0595 | Train ACC Loss: 0.1110 | Val

In [52]:
import lightgbm
from sklearn.model_selection import KFold
kfolds = KFold(10, shuffle=True, random_state=42)
params = {'objective': 'lambdarank', 'metric': 'ndcg'}
results = cv_lgbm(lightgbm, X_train, y_train, y_scores_train, kfolds, params)

[1]	valid_0's ndcg@1: 0.133333	valid_0's ndcg@2: 0.159816	valid_0's ndcg@3: 0.39202	valid_0's ndcg@4: 0.446629	valid_0's ndcg@5: 0.506428
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's ndcg@1: 0.069281	valid_0's ndcg@2: 0.0797009	valid_0's ndcg@3: 0.198321	valid_0's ndcg@4: 0.359477	valid_0's ndcg@5: 0.431284
[3]	valid_0's ndcg@1: 0.220915	valid_0's ndcg@2: 0.2616	valid_0's ndcg@3: 0.434599	valid_0's ndcg@4: 0.489223	valid_0's ndcg@5: 0.543702
[4]	valid_0's ndcg@1: 0.299346	valid_0's ndcg@2: 0.470636	valid_0's ndcg@3: 0.596581	valid_0's ndcg@4: 0.662137	valid_0's ndcg@5: 0.663208
[5]	valid_0's ndcg@1: 0.11634	valid_0's ndcg@2: 0.281181	valid_0's ndcg@3: 0.392037	valid_0's ndcg@4: 0.516918	valid_0's ndcg@5: 0.558278
[6]	valid_0's ndcg@1: 0.150327	valid_0's ndcg@2: 0.263114	valid_0's ndcg@3: 0.386886	valid_0's ndcg@4: 0.462078	valid_0's ndcg@5: 0.534681
[7]	valid_0's ndcg@1: 0.210458	valid_0's ndcg@2: 0.352796	valid_0's ndcg@3: 0.416131	valid_0's ndcg@4: 0.52

In [81]:
pd.DataFrame(y_trn_pred.reshape(*y_train.shape)).head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,-0.016975,0.178613,0.51551,0.25787,-0.648984,-0.75525,-0.680263,-0.358872
1,-0.287769,-0.265789,-0.271825,-0.615658,-1.049278,-0.220794,0.603262,0.393828
2,-0.808988,-0.971056,-0.74045,-0.896691,-0.80469,-0.16952,0.085088,0.239131
3,-0.569542,-0.031412,-0.225745,0.506981,0.345174,0.838387,0.154664,0.238592
4,0.59626,0.208392,-0.052182,-0.461827,-0.954403,-0.535102,-0.396171,-0.162349


In [82]:
pd.DataFrame(y_train).head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,2,3,8,6,4,5,1,7
1,3,5,1,7,2,4,8,6
2,3,5,1,2,4,6,7,8
3,1,3,4,7,6,8,2,5
4,8,3,7,1,5,2,6,4


In [83]:
pd.DataFrame(y_tst_pred.reshape(*y_test.shape)).head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,-0.178293,0.340332,-0.253706,-0.309642,-1.07608,-0.450771,-0.164009,-0.154185
1,-0.715041,-0.314437,-0.339207,-0.262516,-0.855816,-0.325488,-0.160882,0.114249
2,-0.086199,-0.543968,-0.617068,-1.022921,-1.285527,-1.071132,-0.896273,-1.00778
3,-0.700709,-0.884632,-0.690102,-0.666368,-0.61132,-0.281327,-0.201452,0.07511
4,-0.613858,-0.316625,-0.629935,-0.534206,-0.985201,-0.339851,-0.057919,0.229094


In [84]:
pd.DataFrame(y_test).head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,8,2,4,1,5,3,6,7
1,3,4,2,5,8,7,6,1
2,1,8,2,4,5,6,3,7
3,7,1,2,6,5,4,8,3
4,3,6,1,8,2,5,4,7
