In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import os
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm, tqdm_notebook
from pathlib import Path
# pd.set_option('display.max_columns', 1000)
# pd.set_option('display.max_rows', 400)
sns.set()

os.chdir('..')

In [35]:
features = pd.read_csv('meta_features.csv')
ranking  = pd.read_csv('ranking_fs.csv')

In [3]:
features.head()

Unnamed: 0,dataset,X_correlation_max,X_correlation_mean,X_correlation_min,X_covariance_max,X_covariance_mean,X_covariance_min,X_exp_var_max,X_exp_var_n_t80_cumsum,X_kurtosis_max,...,X_stand_dev_min,X_std_covariance_max,X_std_covariance_mean,X_std_covariance_min,X_std_exp_var_max,X_var_coef_max,X_var_coef_mean,X_var_coef_min,y_norm_class_entropy_none,y_num_classes_none
0,dataset_prostate_singh,0.994905,0.001368,-0.98984,6501921.0,29.654376,-1662059.0,0.538233,3,96.809515,...,45.069954,0.994905,0.001368,-0.98984,0.635789,24.64494,0.968532,0.393882,0.999723,2
1,dataset_glioma_phillips,0.993589,0.026853,-0.690733,432343500.0,245013.502247,-65924950.0,0.311808,16,94.998,...,849.046524,0.993589,0.026853,-0.690733,0.144017,6.143025,0.497883,0.196919,0.79504,2
2,dataset_leukemia_armstrong,0.99428,0.084879,-0.916524,115480600.0,556524.637475,-73486290.0,0.249095,19,64.783987,...,1480.763026,0.99428,0.084879,-0.916524,0.241996,2.250262,0.511498,0.239257,0.991532,3
3,dataset_breast_sotiriou,0.99731,0.703431,-0.803858,6.244864,1.45108,-5.063628,0.721969,3,10.362369,...,1.308462,0.99731,0.703431,-0.803858,0.745195,0.389483,0.202296,0.182322,0.993717,3
4,dataset_lymphoma_dave_2,0.985847,0.019387,-0.739594,5.893252,0.027503,-3.396945,0.124228,93,35.139636,...,0.610939,0.985847,0.019387,-0.739594,0.140919,0.525447,0.158073,0.08656,0.882052,4


In [4]:
ranking.head()

Unnamed: 0,dataset,minmax_chi_square_naiveBayes,minmax_fisher_naiveBayes,minmax_reliefF_naiveBayes
0,dataset_prostate_singh,2,3,1
1,dataset_glioma_phillips,3,2,1
2,dataset_leukemia_armstrong,2,3,1
3,dataset_breast_sotiriou,3,1,2
4,dataset_lymphoma_dave_2,2,1,3


In [5]:
features.shape, ranking.shape

((60, 40), (60, 4))

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

In [78]:
class RankerNet(nn.Module):
    def __init__(self, dataset_sz, ranker_sz, latent_sz):
        super().__init__()
        self.linear = nn.Linear(dataset_sz, latent_sz)
        self.embedding = nn.Embedding(ranker_sz, latent_sz)
#         self.linear.weight.data.uniform_(0, 0.05)
#         self.embedding.weight.data.uniform_(0, 0.05)
    
    def forward(self, dataset_features, ranker_index):
        latent_dataset = self.linear(dataset_features)
        latent_ranker  = self.embedding(ranker_index)
        output = (latent_dataset * latent_ranker).sum(1)
        return output

In [79]:
model = RankerNet(39, 3, 10)

In [73]:
x_dataset = torch.FloatTensor(features.iloc[:2, 1:].values)
x_ranker  = torch.LongTensor([0,1])

In [74]:
model(x_dataset, x_ranker)

tensor([ 6.1817e+07,  3.6183e+09])

In [46]:
dataset = features.iloc[:, 1:].values.repeat(3, axis=0)
ranker  = np.concatenate([np.zeros(len(ranking)),
                          np.ones(len(ranking)),
                          np.ones(len(ranking)) + 1]).astype(int)
target  = np.concatenate([ranking.minmax_chi_square_naiveBayes.values,
                          ranking.minmax_fisher_naiveBayes,
                          ranking.minmax_reliefF_naiveBayes])

In [47]:
dataset.shape, ranker.shape, target.shape

((180, 39), (180,), (180,))

In [48]:
from torch.utils.data import DataLoader, Dataset

In [49]:
class RankerDataset(Dataset):
    def __init__(self, dataset, ranker, target):
        self.dataset = dataset.astype(np.float32)
        self.ranker = ranker.astype(np.int64)
        self.target = target.astype(np.float32) \
                        if target is not None else \
                        np.zeros(len(dataset)).astype(np.float32)
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        return [self.dataset[idx], self.ranker[idx], self.target[idx]]

In [84]:
ds = RankerDataset(dataset, ranker, target)

In [85]:
dl = DataLoader(ds, batch_size=32, shuffle=True)

In [52]:
def train_step(model, dataset, ranker, target, optimizer, criterion):
    model.train()
    optimizer.zero_grad()
    preds = model(dataset, ranker)
    loss = criterion(preds.view(-1), target)
    loss.backward()
    optimizer.step()
    return loss.item()

In [69]:
def train_model(model, train_loader, optimizer, criterion,
                n_epochs, print_every=1, USE_CUDA=False):
    train_losses = []
    for epoch in range(n_epochs):
        train_loss = 0
        for batch_idx, (dataset, ranker, target) in enumerate(train_loader):
            train_loss += train_step(model, dataset, ranker, target, optimizer, criterion)
            if batch_idx > 0 and batch_idx % print_every == 0:
                train_loss /= print_every
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                        epoch + 1, batch_idx * len(dataset), len(train_loader.dataset),
                        100. * batch_idx / len(train_loader), train_loss))
                train_losses.append(train_loss)
                train_loss = 0
        
        print()
    return model, train_losses

In [59]:
USE_CUDA = False

In [133]:
pd.DataFrame(dataset).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,29,30,31,32,33,34,35,36,37,38
count,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,...,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0,180.0
mean,0.992072,0.091662,-0.784183,2039197000.0,555248.1,-418348800.0,0.307231,20.433333,67.791047,5.315244,...,408.621286,0.992072,0.091662,-0.784183,0.218465,9.269826,-12.284393,-44.985478,0.923358,2.516667
std,0.014959,0.173092,0.163485,10928350000.0,1872173.0,2286736000.0,0.198924,19.677313,52.806781,4.324071,...,699.357013,0.014959,0.173092,0.163485,0.16428,58.89412,85.27618,289.088442,0.098877,0.887373
min,0.915535,0.000663,-0.996293,0.06122095,-2411.734,-17654600000.0,0.034849,0.0,10.362369,-0.156562,...,0.041068,0.915535,0.000663,-0.996293,0.03377,-254.020889,-658.725753,-2243.7455,0.629249,2.0
25%,0.993146,0.00991,-0.889751,6.712167,0.03004735,-66260370.0,0.176287,7.0,34.342161,1.674326,...,0.758121,0.993146,0.00991,-0.889751,0.130432,0.518238,0.193711,0.066518,0.909347,2.0
50%,0.996141,0.021898,-0.808215,7046733.0,210.2329,-1388524.0,0.246706,18.0,49.015076,4.068749,...,52.980775,0.996141,0.021898,-0.808215,0.162345,5.833418,0.497587,0.153729,0.957392,2.0
75%,0.998936,0.078478,-0.714148,234294200.0,187414.4,-3.40364,0.363671,26.0,80.569768,8.561525,...,541.321099,0.998936,0.078478,-0.714148,0.240794,12.504444,0.794166,0.235056,0.997111,3.0
max,1.0,0.80247,-0.135336,83865240000.0,11497210.0,-0.03401543,0.909037,108.0,279.030686,21.257351,...,3237.989744,1.0,0.80247,-0.135336,0.816744,359.257296,26.876461,22.07547,0.999723,6.0


In [134]:
from sklearn.preprocessing import StandardScaler

In [136]:
scaler = StandardScaler().fit(dataset)

In [137]:
dataset_norm = scaler.transform(dataset)

In [142]:
x_dataset = torch.FloatTensor(dataset_norm[:1])
x_ranker  = torch.LongTensor(ranker[:1])
x_target  = torch.FloatTensor(target[:1])

In [143]:
model(x_dataset, x_ranker), x_target

(tensor([ 4.9510]), tensor([ 2.]))

In [343]:
ds = RankerDataset(dataset_norm[3:], ranker[3:], target[3:])
dl = DataLoader(ds, batch_size=32, shuffle=True)

In [340]:
model = RankerNet(dataset_sz=39, ranker_sz=3, latent_sz=100)

In [341]:
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

In [342]:
model, train_losses = train_model(model, dl, optimizer, criterion, n_epochs=100)







































































































In [95]:
from lightgbm import LGBMRegressor

In [96]:
lgbm = LGBMRegressor()

In [97]:
lgbm.fit(dataset, target)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
       n_jobs=-1, num_leaves=31, objective=None, random_state=None,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)

In [98]:
from sklearn.metrics import mean_squared_error

In [99]:
preds = lgbm.predict(dataset)

In [100]:
mean_squared_error(target, preds)

0.4121804313016847