In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
# pd.set_option('display.max_columns', 1000)
# pd.set_option('display.max_rows', 400)
sns.set()

os.chdir('..')

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from project.ranker.ranker import RankingPredictor

In [3]:
%%time
from sklearn.model_selection import train_test_split
rp = Pipeline([
    ('scale', StandardScaler()),
    ('estimator', RankingPredictor("ma_100", n_neighbors=15)),
])
df_mf, df_rank, df_scores = rp.named_steps['estimator'].get_data()

X, _, y, _, y_scores, _ = train_test_split(df_mf.values,
                                           df_rank.values,
                                           df_scores.values,
                                           test_size=0,
                                           random_state=42)
print(X.shape, y.shape, y_scores.shape)

(60, 39) (60, 13) (60, 13)
CPU times: user 45 s, sys: 5.07 s, total: 50.1 s
Wall time: 1min 5s


In [17]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler

class RankDataset(Dataset):
    def __init__(self, X, y, y_scores):
        super().__init__()
        self.X = X.astype(np.float32)
        self.y = y.astype(np.int64)
        self.y_scores = y_scores.astype(np.float32)
        
        self.mf_sz = X.shape[1]
        self.fs_sz = y.shape[1]
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return [self.X[idx], self.y[idx], self.y_scores[idx]]

In [18]:
def wide2long(X, y):
    n_samples, n_classes = y.shape
    X_cont = np.repeat(X, n_classes, axis=0)
    X_cats = np.array(list(range(n_classes)) * n_samples)
    return X_cont, X_cats.astype(int), y.reshape(-1)

In [19]:
import torch.nn as nn
from torch import optim
from project.ranker.neural_ranker import RankNet
from scipy.stats import rankdata

class NeuralNetwork():
    def __init__(self, mf_sz, fs_sz, params):
        self.mf_sz, self.fs_sz = mf_sz, fs_sz
        self.latent_sz = params['latent_sz']
        self.epochs = params['epochs']
        self.lr = params['learning_rate']
        self.num_negative_samples = params['num_negative_samples']
        
        self.model = RankNet(mf_sz, fs_sz, self.latent_sz)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        self.criterion = nn.MSELoss()
        
    def train(self, dl):
        train_losses = []
        for epoch in range(self.epochs):
            train_loss = 0
            for X, y, y_scores in dl:
                # for each dataset
                X_cont, X_cats, y_long = wide2long(X, y)
                X_cats = torch.LongTensor(X_cats)

                positive_pred = self.model(X_cont, X_cats)
                negative_pred = self.get_multiple_negative_preds(X_cont, n=self.num_negative_samples)
                
                train_loss += self.train_step(positive_pred, negative_pred)

            train_losses.append(train_loss) 
        return train_losses
    
    def get_negative_preds(self, X_cont):
        negative_items = np.random.randint(0, self.fs_sz, len(X_cont), dtype=np.int64)
        X_cats = torch.from_numpy(negative_items)
        return self.model(X_cont, X_cats)
    
    def get_multiple_negative_preds(self, X_cont, n=10):
        negative_preds = self.get_negative_preds(X_cont[None, ...] 
                                                 .expand(n, *X_cont.shape)
                                                 .reshape(-1, X_cont.shape[-1]))
        return negative_preds.view(n, len(X_cont))
    
    def train_step(self, positive_preds, negative_preds):
        self.model.train()
        self.optimizer.zero_grad()
        
        highest_negative_preds, _ = torch.max(negative_preds, 0)
        loss = torch.clamp(highest_negative_preds - positive_preds + 1.0, 0.0).mean()
        
        loss.backward()
        self.optimizer.step()
        return loss.item()
    
    def predict(self, dl):
        preds = []
        self.model.eval()
        for X, y, y_scores in dl:
            X_cont, X_cats, y_long = wide2long(X, y)
            X_cats = torch.LongTensor(X_cats)
            X_cont.requires_grad_(False)
            X_cats.requires_grad_(False)
            
            pred = self.model(X_cont, X_cats).cpu().detach().numpy()
            
            pred = np.array([rankdata(x, method='ordinal') for x in \
                             np.reshape(pred, y.shape)]).astype(int)
            preds.extend(pred)
        return np.array(preds)

In [20]:
from project.utils.metrics import evaluate_metric

def cv_neuralnet(X, y, y_scores, kfolds, params, verbose_folds=False):
    results = []
    models = []
    X = StandardScaler().fit_transform(X)
    for idx, (trn_idx, val_idx) in enumerate(kfolds.split(X, y)):
        X_trn, y_trn, y_scores_trn = X[trn_idx], y[trn_idx], y_scores[trn_idx]
        X_val, y_val, y_scores_val = X[val_idx], y[val_idx], y_scores[val_idx]
        
        trn_ds = RankDataset(X_trn, y_trn, y_scores_trn)
        val_ds = RankDataset(X_val, y_val, y_scores_val)
        
        neuralnet = NeuralNetwork(trn_ds.mf_sz, trn_ds.fs_sz, params)
        trn_dl = DataLoader(trn_ds, batch_size=params['batch_sz'], shuffle=True)
        neuralnet.train(trn_dl)
        
        trn_dl = DataLoader(trn_ds, batch_size=params['batch_sz'], shuffle=False)
        val_dl = DataLoader(val_ds, batch_size=params['batch_sz'], shuffle=False)
        
        y_pred_trn = neuralnet.predict(trn_dl)
        y_pred_val = neuralnet.predict(val_dl)
        
        trn_spearman = evaluate_metric("spearman", y_trn, y_pred_trn)
        trn_acc_loss = evaluate_metric("mean_acc_loss", y_scores_trn, y_pred_trn)
        val_spearman = evaluate_metric("spearman", y_val, y_pred_val)
        val_acc_loss = evaluate_metric("mean_acc_loss", y_scores_val, y_pred_val)
        
        if verbose_folds:
            print(f'Fold {idx + 1:>3} | '
                  f'Trn_Spearman: {trn_spearman: .4f} | '
                  f'Val_Spearman: {val_spearman: .4f} | '
                  f'Trn_ACCLoss: {trn_acc_loss: .4f} | '
                  f'Val_ACCLoss: {val_acc_loss: .4f}')
            
        results.append((trn_spearman, val_spearman, 
                        trn_acc_loss, val_acc_loss))
        models.append(neuralnet)
        
#         break # 1-fold
          
    results = np.array(results)
    print()
    print(f'Trn_Spearman: {results[:,0].mean(): .4f} +/-{results[:,0].std():.4f} | '
          f'Val_Spearman: {results[:,1].mean(): .4f} +/-{results[:,1].std():.4f}\n'
          f'Trn_ACCLoss:  {results[:,2].mean(): .4f} +/-{results[:,2].std():.4f} | '
          f'Val_ACCLoss:  {results[:,3].mean(): .4f} +/-{results[:,3].std():.4f}')
    print()
    return results, models

In [23]:
%%time
import lightgbm
from sklearn.model_selection import RepeatedKFold
kfolds = RepeatedKFold(10, n_repeats=2, random_state=42)
params = {
    'latent_sz': 8,
    'learning_rate': 1e-3,
    'batch_sz': 16,
    'epochs': 20,
    'num_negative_samples': 8
}
results, models = cv_neuralnet(X, y, y_scores, kfolds, params, 
                               verbose_folds=True)

Fold   1 | Trn_Spearman:  0.0451 | Val_Spearman: -0.0128 | Trn_ACCLoss:  0.1146 | Val_ACCLoss:  0.1505
Fold   2 | Trn_Spearman: -0.0223 | Val_Spearman:  0.0147 | Trn_ACCLoss:  0.1271 | Val_ACCLoss:  0.2006
Fold   3 | Trn_Spearman:  0.0256 | Val_Spearman: -0.0485 | Trn_ACCLoss:  0.1427 | Val_ACCLoss:  0.0413
Fold   4 | Trn_Spearman:  0.0052 | Val_Spearman: -0.0778 | Trn_ACCLoss:  0.1516 | Val_ACCLoss:  0.2452
Fold   5 | Trn_Spearman: -0.0134 | Val_Spearman: -0.0760 | Trn_ACCLoss:  0.1548 | Val_ACCLoss:  0.0621
Fold   6 | Trn_Spearman:  0.0038 | Val_Spearman: -0.0055 | Trn_ACCLoss:  0.1330 | Val_ACCLoss:  0.2291
Fold   7 | Trn_Spearman: -0.0151 | Val_Spearman:  0.0998 | Trn_ACCLoss:  0.1205 | Val_ACCLoss:  0.0958
Fold   8 | Trn_Spearman:  0.0119 | Val_Spearman: -0.0275 | Trn_ACCLoss:  0.1504 | Val_ACCLoss:  0.0493
Fold   9 | Trn_Spearman:  0.0041 | Val_Spearman: -0.1685 | Trn_ACCLoss:  0.1111 | Val_ACCLoss:  0.1847
Fold  10 | Trn_Spearman:  0.0629 | Val_Spearman:  0.0559 | Trn_ACCLoss:  

In [252]:
%%time
import lightgbm
from sklearn.model_selection import RepeatedKFold
kfolds = RepeatedKFold(10, n_repeats=10, random_state=42)
params = {
    'latent_sz': 8,
    'learning_rate': 1e-2,
    'batch_sz': 16,
    'epochs': 100
}
results, models = cv_neuralnet(X, y, y_scores, kfolds, params, 
                               verbose_folds=True)

Fold   1 | Trn_Spearman:  0.0024 | Val_Spearman: -0.2500 | Trn_ACCLoss:  0.1543 | Val_ACCLoss:  0.2222
Fold   2 | Trn_Spearman: -0.0315 | Val_Spearman:  0.0815 | Trn_ACCLoss:  0.1131 | Val_ACCLoss:  0.1618
Fold   3 | Trn_Spearman:  0.0130 | Val_Spearman: -0.0247 | Trn_ACCLoss:  0.1500 | Val_ACCLoss:  0.0497
Fold   4 | Trn_Spearman:  0.0619 | Val_Spearman:  0.2683 | Trn_ACCLoss:  0.0998 | Val_ACCLoss:  0.1075
Fold   5 | Trn_Spearman:  0.0096 | Val_Spearman:  0.0714 | Trn_ACCLoss:  0.1498 | Val_ACCLoss:  0.0450
Fold   6 | Trn_Spearman: -0.0506 | Val_Spearman: -0.1767 | Trn_ACCLoss:  0.1598 | Val_ACCLoss:  0.2332
Fold   7 | Trn_Spearman:  0.0260 | Val_Spearman:  0.0366 | Trn_ACCLoss:  0.1209 | Val_ACCLoss:  0.1183
Fold   8 | Trn_Spearman:  0.0694 | Val_Spearman:  0.2537 | Trn_ACCLoss:  0.1149 | Val_ACCLoss:  0.0140
Fold   9 | Trn_Spearman: -0.0380 | Val_Spearman:  0.1429 | Trn_ACCLoss:  0.1254 | Val_ACCLoss:  0.0880
Fold  10 | Trn_Spearman:  0.0038 | Val_Spearman: -0.1557 | Trn_ACCLoss:  

In [26]:
ds = RankDataset(X, y, y_scores)
dl = DataLoader(ds, batch_size=params['batch_sz'], shuffle=False)
y_pred = models[0].predict(dl)

In [27]:
y_pred.shape, y.shape, y_scores.shape

((60, 13), (60, 13), (60, 13))

In [28]:
evaluate_metric('mean_acc_loss', y_scores, y_pred)

0.15265806292078446

In [29]:
y_pred[10]

array([ 5,  2,  1,  8, 12,  6,  4,  9,  7, 13, 10, 11,  3])

In [30]:
y[10]

array([11,  5,  4, 12,  7,  8, 13,  9, 10,  3,  2,  1,  6])

In [31]:
y_pred[5]

array([12, 13,  6, 10,  5,  1,  3,  4,  9,  7, 11,  2,  8])

In [32]:
y[5]

array([ 2,  4, 12,  5,  1, 13,  8,  3, 11,  7,  6, 10,  9])