In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
# pd.set_option('display.max_columns', 1000)
# pd.set_option('display.max_rows', 400)
sns.set()

os.chdir('..')

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from project.ranker.ranker import RankingPredictor

In [3]:
%%time
from sklearn.model_selection import train_test_split
rp = Pipeline([
    ('scale', StandardScaler()),
    ('estimator', RankingPredictor("ma_100", n_neighbors=15)),
])
df_mf, df_rank, df_scores = rp.named_steps['estimator'].get_data()

X, _, y, _, y_scores, _ = train_test_split(df_mf.values,
                                           df_rank.values,
                                           df_scores.values,
                                           test_size=0,
                                           random_state=42)
print(X.shape, y.shape, y_scores.shape)

(60, 39) (60, 13) (60, 13)
CPU times: user 1min 9s, sys: 5.95 s, total: 1min 15s
Wall time: 1min 26s


In [141]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler

class RankDataset(Dataset):
    def __init__(self, X, y, y_scores):
        super().__init__()
        self.X = X.astype(np.float32)
        self.y = y.astype(np.int64)
        self.y_scores = y_scores.astype(np.float32)
        
        self.mf_sz = X.shape[1]
        self.fs_sz = y.shape[1]
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return [self.X[idx], self.y[idx], self.y_scores[idx]]

In [142]:
def wide2long(X, y):
    n_samples, n_classes = y.shape
    X_cont = np.repeat(X, n_classes, axis=0)
    X_cats = np.array(list(range(n_classes)) * n_samples)
    return X_cont, X_cats.astype(int), y.reshape(-1)

In [172]:
import torch.nn as nn
from torch import optim
from project.ranker.neural_ranker import RankNet
from scipy.stats import rankdata

class NeuralNetwork():
    def __init__(self, mf_sz, fs_sz, params):
        self.mf_sz, self.fs_sz = mf_sz, fs_sz
        self.latent_sz = params['latent_sz']
        self.epochs = params['epochs']
        self.lr = params['learning_rate']
        
        self.model = RankNet(mf_sz, fs_sz, self.latent_sz)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        self.criterion = nn.MSELoss()
        
    def train(self, dl):
        train_losses = []
        for epoch in range(self.epochs):
            train_loss = 0
            for X, y, y_scores in dl:
                # for each dataset
                for X_, y_, y_scores_ in zip(X, y, y_scores):
                    X_cont, X_cats, y_long = wide2long(X_[None,:],
                                                       y_[None,:])
                    X_cats = torch.LongTensor(X_cats)
                    # precalculate preds
                    pred = self.model(X_cont, X_cats)
                    
                    # pick random fs method
                    i = np.random.randint(0, len(X_cont))
                    n, j = 0, 0
                    while True:
                        j = np.random.randint(0, len(X_cont))
                        if i == j:
                            continue
                            
                        # if there is discrepancy in ranking    
                        if pred[i] >= pred[j] and y_long[i] < y_long[j]:
                            break                            
                        elif pred[i] <= pred[j] and y_long[i] > y_long[j]:
                            i, j = j, i
                            break
                        
                        n += 1
                        if n == len(X_cont):
                            break
                    
                    # if r[i] >= r[j] wrongly
                    if n < len(X_cont):
                        train_loss += self.train_step(X_cont[[i,j]], X_cats[[i,j]], pred[[i,j]], n)

            train_losses.append(train_loss) 
        return train_losses
    
    def train_step(self, X_cont, X_cats, preds, n):
        self.model.train()
        self.optimizer.zero_grad()
        preds = self.model(X_cont, X_cats)
        
        loss = preds[1] - preds[0] + 1
        
#         loss = F.relu(preds[1] - preds[0] + 1)
#         loss = 2 * (1 / (1 + torch.exp(-loss))) + 1
#         loss = torch.log(loss + 1) / (n + 1)
        
        print(n, preds, loss)
        
        loss.backward()
        self.optimizer.step()
        return loss.item()
    
    def predict(self, dl):
        preds = []
        self.model.eval()
        for X, y, y_scores in dl:
            X_cont, X_cats, y_long = wide2long(X, y)
            X_cats = torch.LongTensor(X_cats)
            X_cont.requires_grad_(False)
            X_cats.requires_grad_(False)
            
            pred = self.model(X_cont, X_cats).cpu().detach().numpy()
            
            pred = np.array([rankdata(x, method='ordinal') for x in \
                             np.reshape(pred, y.shape)]).astype(int)
            preds.extend(pred)
        return np.array(preds)

In [173]:
from project.utils.metrics import evaluate_metric

def cv_neuralnet(X, y, y_scores, kfolds, params, verbose_folds=False):
    results = []
    models = []
    X = StandardScaler().fit_transform(X)
    for idx, (trn_idx, val_idx) in enumerate(kfolds.split(X, y)):
        X_trn, y_trn, y_scores_trn = X[trn_idx], y[trn_idx], y_scores[trn_idx]
        X_val, y_val, y_scores_val = X[val_idx], y[val_idx], y_scores[val_idx]
        
        trn_ds = RankerDataset(X_trn, y_trn, y_scores_trn)
        val_ds = RankerDataset(X_val, y_val, y_scores_val)
        
        neuralnet = NeuralNetwork(trn_ds.mf_sz, trn_ds.fs_sz, params)
        trn_dl = DataLoader(trn_ds, batch_size=params['batch_sz'], shuffle=True)
        neuralnet.train(trn_dl)
        
        trn_dl = DataLoader(trn_ds, batch_size=params['batch_sz'], shuffle=False)
        val_dl = DataLoader(val_ds, batch_size=params['batch_sz'], shuffle=False)
        
        y_pred_trn = neuralnet.predict(trn_dl)
        y_pred_val = neuralnet.predict(val_dl)
        
        trn_spearman = evaluate_metric("spearman", y_trn, y_pred_trn)
        trn_acc_loss = evaluate_metric("mean_acc_loss", y_scores_trn, y_pred_trn)
        val_spearman = evaluate_metric("spearman", y_val, y_pred_val)
        val_acc_loss = evaluate_metric("mean_acc_loss", y_scores_val, y_pred_val)
        
        if verbose_folds:
            print(f'Fold {idx + 1:>3} | '
                  f'Trn_Spearman: {trn_spearman: .4f} | '
                  f'Val_Spearman: {val_spearman: .4f} | '
                  f'Trn_ACCLoss: {trn_acc_loss: .4f} | '
                  f'Val_ACCLoss: {val_acc_loss: .4f}')
            
        results.append((trn_spearman, val_spearman, 
                        trn_acc_loss, val_acc_loss))
        models.append(neuralnet)
        
        break # 1-fold
          
    results = np.array(results)
    print()
    print(f'Trn_Spearman: {results[:,0].mean(): .4f} +/-{results[:,0].std():.4f} | '
          f'Val_Spearman: {results[:,1].mean(): .4f} +/-{results[:,1].std():.4f}\n'
          f'Trn_ACCLoss:  {results[:,2].mean(): .4f} +/-{results[:,2].std():.4f} | '
          f'Val_ACCLoss:  {results[:,3].mean(): .4f} +/-{results[:,3].std():.4f}')
    print()
    return results, models

In [174]:
%%time
import lightgbm
from sklearn.model_selection import RepeatedKFold
kfolds = RepeatedKFold(10, n_repeats=1, random_state=42)
params = {
    'latent_sz': 6,
    'learning_rate': 3e-4,
    'batch_sz': 16,
    'epochs': 10
}
results, models = cv_neuralnet(X, y, y_scores, kfolds, params, 
                               verbose_folds=True)

4 tensor([0.6660, 0.6113], grad_fn=<SigmoidBackward>) tensor(0.9453, grad_fn=<AddBackward>)
0 tensor([0.5646, 0.4154], grad_fn=<SigmoidBackward>) tensor(0.8508, grad_fn=<AddBackward>)
2 tensor([0.5331, 0.4821], grad_fn=<SigmoidBackward>) tensor(0.9490, grad_fn=<AddBackward>)
1 tensor([0.8793, 0.6405], grad_fn=<SigmoidBackward>) tensor(0.7613, grad_fn=<AddBackward>)
0 tensor([0.9102, 0.9012], grad_fn=<SigmoidBackward>) tensor(0.9910, grad_fn=<AddBackward>)
0 tensor([0.4956, 0.4582], grad_fn=<SigmoidBackward>) tensor(0.9627, grad_fn=<AddBackward>)
0 tensor([0.5619, 0.3906], grad_fn=<SigmoidBackward>) tensor(0.8288, grad_fn=<AddBackward>)
0 tensor([0.5918, 0.5714], grad_fn=<SigmoidBackward>) tensor(0.9796, grad_fn=<AddBackward>)
7 tensor([0.4886, 0.4643], grad_fn=<SigmoidBackward>) tensor(0.9757, grad_fn=<AddBackward>)
0 tensor([0.5130, 0.5042], grad_fn=<SigmoidBackward>) tensor(0.9911, grad_fn=<AddBackward>)
0 tensor([0.2764, 0.2451], grad_fn=<SigmoidBackward>) tensor(0.9687, grad_fn=<Ad

In [175]:
ds = RankerDataset(X, y, y_scores)
dl = DataLoader(ds, batch_size=params['batch_sz'], shuffle=False)
y_pred = models[0].predict(dl)

In [176]:
y_pred.shape, y.shape, y_scores.shape

((60, 13), (60, 13), (60, 13))

In [177]:
evaluate_metric('mean_acc_loss', y_scores, y_pred)

0.11048071534140318

In [178]:
y_pred[10]

array([ 3, 10,  4,  5,  6,  7,  9,  1, 11,  2, 12, 13,  8])

In [179]:
y[10]

array([11,  5,  4, 12,  7,  8, 13,  9, 10,  3,  2,  1,  6])

In [180]:
y_pred[5]

array([ 5,  2,  6,  7,  1,  8,  9, 10, 11,  3,  4, 12, 13])

In [181]:
y[5]

array([ 2,  4, 12,  5,  1, 13,  8,  3, 11,  7,  6, 10,  9])