In [None]:
# Installing a our main torch network - Tabnet
!pip install pytorch-tabnet
!pip install iterative-stratification

In [3]:
### General
import os
import sys
import copy
import tqdm
import pickle
import random
import warnings
warnings.filterwarnings("ignore")
sys.path.append("../input/rank-gauss")
os.environ["CUDA_LAUNCH_BLOCKING"] = '1'

### Data Processing
import numpy as np
import pandas as pd
from scipy import stats
import gauss_rank_scaler
### Data Viz
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use("fivethirtyeight")

### Machine Learning 
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.preprocessing import QuantileTransformer
from sklearn.feature_selection import VarianceThreshold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

### Deep Learning 
import torch
from torch import nn
import torch.optim as optim
from torch.nn import functional as F
from torch.nn.modules.loss import _WeightedLoss
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
# Tabnet 
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor


In [12]:
# Setting our seed
seed = 42


def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU), and records the value
    in the ``PYTHONHASHSEED`` environment variable. When CUDA is available the
    CUDA generators are seeded too and cuDNN is switched to deterministic,
    non-benchmarking mode.

    Parameters
    ----------
    seed : int
        Seed value applied to all generators.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    # Nothing GPU-related to configure on a CPU-only machine.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(seed)

In [6]:
# Importing our data: the train/test feature tables, the scored targets,
# and the sample submission whose columns define the output layout.
train = pd.read_csv("New Features Datasets/train_feats_new.csv")
test = pd.read_csv("New Features Datasets/test_feats_new.csv")

targets = pd.read_csv("train_targets_scored.csv")
submission = pd.read_csv("sample_submission.csv")

In [7]:
# Opening the data_all pickle we previously saved in our data processing notebook
# NOTE(review): pickle.load can execute arbitrary code while deserialising --
# only load a file that was produced by our own pipeline.
with open("data_all.pickle", "rb") as f:
    data_all = pickle.load(f)

In [8]:
# train_df and test_df
features_to_drop = ["sig_id", "cp_type"]

# errors="ignore" keeps this cell re-runnable: a second execution no longer
# raises KeyError because the columns are already gone, and we no longer need
# a bare `except: pass` that would also hide unrelated failures.
data_all = data_all.drop(columns = features_to_drop, errors = "ignore")
targets = targets.drop(columns = "sig_id", errors = "ignore")

# data_all holds the train rows first, followed by the test rows, so a
# positional split at len(train) recovers both parts. Building each frame with
# a non-inplace reset_index avoids mutating a slice of data_all.
train_df = data_all.iloc[: train.shape[0]].reset_index(drop = True)
test_df = data_all.iloc[train_df.shape[0]: ].reset_index(drop = True)

In [9]:
# ANSI colour prefixes used by the progress prints in this and later cells.
# They were not defined anywhere in the notebook, so a fresh
# "Restart & Run All" failed with NameError. The blue/red/green/cyan codes
# match the escape sequences in the saved outputs; y_ never appears in a saved
# output, so yellow is assumed.
b_, r_, g_, c_, y_ = "\033[34m", "\033[31m", "\033[32m", "\033[36m", "\033[33m"

# Just checking to make sure we've got the same number of features for train and test
print(f"{b_}train_df.shape: {r_}{train_df.shape}")
print(f"{b_}test_df.shape: {r_}{test_df.shape}")

[34mtrain_df.shape: [31m(23814, 1123)
[34mtest_df.shape: [31m(3982, 1123)


In [10]:
# Same with our x-test: take the test features as a plain NumPy array and
# print its shape, which should have the same column count as train_df above.
# b_ / r_ are colour-prefix strings expected to be defined by an earlier cell.
X_test = test_df.values
print(f"{b_}X_test.shape: {r_}{X_test.shape}")

[34mX_test.shape: [31m(3982, 1123)


In [14]:
# Hyperparameters for our Tabnet Model
MAX_EPOCH = 200  # upper bound on training epochs per fold (passed to fit as max_epochs)

# Keyword arguments forwarded verbatim to TabNetRegressor(**tabnet_params).
tabnet_params = {
    # Network architecture
    "n_d": 32,
    "n_a": 32,
    "n_steps": 1,
    "gamma": 1.3,
    "lambda_sparse": 0,
    # Optimiser
    "optimizer_fn": optim.Adam,
    "optimizer_params": {"lr": 2e-2, "weight_decay": 1e-5},
    "mask_type": "entmax",
    # Learning-rate schedule
    "scheduler_params": {
        "mode": "min",
        "patience": 5,
        "min_lr": 1e-5,
        "factor": 0.9,
    },
    "scheduler_fn": ReduceLROnPlateau,
    # Reproducibility and logging
    "seed": seed,
    "verbose": 10,
}

In [15]:
# This is the custom loss function the competition was using
class LogitsLogLoss(Metric):
    """
    Mean binary log loss evaluated on raw model outputs.

    The predictions are passed through a sigmoid inside ``__call__``, so the
    metric must be given un-activated scores rather than probabilities.
    """

    def __init__(self):
        # Lower is better; "logits_ll" is the name used to select this metric.
        self._maximize = False
        self._name = "logits_ll"

    def __call__(self, y_true, y_pred):
        """
        Compute the mean log loss of the predictions.

        Parameters
        ----------
        y_true: np.ndarray
            Target matrix or vector
        y_pred: np.ndarray
            Raw (pre-sigmoid) score matrix or vector

        Returns
        -------
            float
            Mean log loss of the sigmoid-activated scores vs the targets.
        """
        # Sigmoid turns the raw scores into probabilities.
        probs = 1 / (1 + np.exp(-y_pred))
        # The 1e-15 offset keeps log() away from zero.
        negative_term = (1 - y_true) * np.log(1 - probs + 1e-15)
        positive_term = y_true * np.log(probs + 1e-15)
        return np.mean(-(negative_term + positive_term))

In [None]:
# Here's the guts of our TabNet neural Network
scores_auc_all = []
test_cv_preds = []

NB_SPLITS = 10 # 7
mskf = MultilabelStratifiedKFold(n_splits = NB_SPLITS, random_state = 0, shuffle = True)

oof_preds = []
oof_targets = []
scores = []
scores_auc = []


def _sigmoid(raw_scores):
    """Map raw model outputs (logits) to probabilities in (0, 1)."""
    return 1 / (1 + np.exp(-raw_scores))


# The array views do not change between folds, so convert them only once.
train_values = train_df.values
target_values = targets.values

for fold_nb, (train_idx, val_idx) in enumerate(mskf.split(train_df, targets)):
    print(b_,"FOLDS: ", r_, fold_nb + 1)
    print(g_, '*' * 60, c_)
    
    X_train, y_train = train_values[train_idx, :], target_values[train_idx, :]
    X_val, y_val = train_values[val_idx, :], target_values[val_idx, :]
    ### Model 
    model = TabNetRegressor(**tabnet_params)
        
    ### Fit 
    model.fit(
        X_train = X_train,
        y_train = y_train,
        eval_set = [(X_val, y_val)],
        eval_name = ["val"],
        eval_metric = ["logits_ll"],
        max_epochs = MAX_EPOCH,
        patience = 20,
        batch_size = 1024, 
        virtual_batch_size = 32,
        num_workers = 1,
        drop_last = False,
        # To use binary cross entropy because this is not a regression problem
        loss_fn = F.binary_cross_entropy_with_logits
    )
    print(y_, '-' * 60)
    
    ### Predicting on our validation test set
    # The model is trained with a *_with_logits loss, so predict() returns raw scores.
    preds_val = model.predict(X_val)
    # Applying our sigmoid function
    preds = _sigmoid(preds_val)
    # Best (lowest) validation log loss seen during this fold's training.
    score = np.min(model.history["val_logits_ll"])
    
    ### Saving oof for cross validation
    # Store probabilities (not raw logits) so the out-of-fold predictions are on
    # the same scale as the test predictions collected below. Previously the
    # sigmoid output was computed and then discarded.
    oof_preds.append(preds)
    oof_targets.append(y_val)
    scores.append(score)
    
    ### Finally, modeling the prediction on the test dataset 
    preds_test = model.predict(X_test)
    test_cv_preds.append(_sigmoid(preds_test))

# Out-of-fold rows are concatenated in fold order; test predictions are
# stacked along a new leading "fold" axis so they can be averaged later.
oof_preds_all = np.concatenate(oof_preds)
oof_targets_all = np.concatenate(oof_targets)
test_preds_all = np.stack(test_cv_preds)

[34m FOLDS:  [31m 1
[32m ************************************************************ [36m
Device used : cuda
epoch 0  | loss: 0.3356  | val_logits_ll: 0.03163 |  0:00:02s
epoch 10 | loss: 0.01914 | val_logits_ll: 0.01884 |  0:00:24s
epoch 20 | loss: 0.0176  | val_logits_ll: 0.01775 |  0:00:47s
epoch 30 | loss: 0.01721 | val_logits_ll: 0.01875 |  0:01:10s
epoch 40 | loss: 0.01681 | val_logits_ll: 0.01705 |  0:01:32s
epoch 50 | loss: 0.01656 | val_logits_ll: 0.01681 |  0:01:55s
epoch 60 | loss: 0.01634 | val_logits_ll: 0.01683 |  0:02:18s
epoch 70 | loss: 0.01616 | val_logits_ll: 0.01662 |  0:02:41s
epoch 80 | loss: 0.01595 | val_logits_ll: 0.0168  |  0:03:04s
epoch 90 | loss: 0.01577 | val_logits_ll: 0.01664 |  0:03:26s
epoch 100| loss: 0.01563 | val_logits_ll: 0.01669 |  0:03:49s
epoch 110| loss: 0.01517 | val_logits_ll: 0.01661 |  0:04:12s
epoch 120| loss: 0.01481 | val_logits_ll: 0.01669 |  0:04:35s

Early stopping occured at epoch 125 with best_epoch = 105 and best_val_logits_l

In [16]:
# I ran this on Colab's environment and saved the resultant model, here we're just loading it.
# NOTE(review): torch.load unpickles arbitrary Python objects -- only load a
# checkpoint that we created ourselves.
# NOTE(review): the saved repr of this model reports input_dim=975, while the
# shape printed for X_test earlier is (3982, 1123) -- confirm the checkpoint
# matches the current feature set before using it for prediction.
# This rebinds `model`, replacing the last fold's model from the training cell.
model = torch.load('tabnet_model.pt')
model

TabNetRegressor(n_d=32, n_a=32, n_steps=1, gamma=1.3, cat_idxs=[], cat_dims=[], cat_emb_dim=1, n_independent=2, n_shared=2, epsilon=1e-15, momentum=0.02, lambda_sparse=0, seed=42, clip_value=1, verbose=10, optimizer_fn=<class 'torch.optim.adam.Adam'>, optimizer_params={'lr': 0.02, 'weight_decay': 1e-05}, scheduler_fn=<class 'torch.optim.lr_scheduler.ReduceLROnPlateau'>, scheduler_params={'mode': 'min', 'patience': 5, 'min_lr': 1e-05, 'factor': 0.9}, mask_type='entmax', input_dim=975, output_dim=206, device_name='auto')

In [None]:
# And finally, save the submission we've created from modeling.
all_feat = [col for col in submission.columns if col not in ["sig_id"]]

# Reuse the `test` frame loaded at the top of the notebook instead of
# re-reading it from a second, differently located copy of the CSV.
# Average the per-fold predictions: (n_folds, n_rows, n_targets) -> (n_rows, n_targets).
mean_test_preds = test_preds_all.mean(axis = 0)

# Control-vehicle rows must end up with all-zero predictions.
is_treated = (test["cp_type"] != "ctl_vehicle").to_numpy()

if len(mean_test_preds) == len(test):
    # Predictions exist for every test row (X_test is built from all rows), so
    # pair them with every sig_id in order and zero out the control rows
    # explicitly. Pairing them with the filtered id list would shift
    # predictions onto the wrong samples.
    sig_id = test["sig_id"].reset_index(drop = True)
    mean_test_preds = np.where(is_treated[:, None], mean_test_preds, 0.0)
elif len(mean_test_preds) == int(is_treated.sum()):
    # Predictions exist only for treated rows; control rows are filled with 0
    # by the left merge + fillna below.
    sig_id = test.loc[is_treated, "sig_id"].reset_index(drop = True)
else:
    raise ValueError(
        f"Got {len(mean_test_preds)} prediction rows, expected {len(test)} "
        f"(all test rows) or {int(is_treated.sum())} (non-control rows)."
    )

tmp = pd.DataFrame(mean_test_preds, columns = all_feat)
tmp["sig_id"] = sig_id

# Left merge keeps the original test row order and one row per sig_id.
submission = pd.merge(test[["sig_id"]], tmp, on = "sig_id", how = "left").fillna(0)

# NOTE(review): absolute Colab/Drive path -- this only works with Drive mounted.
submission.to_csv("/content/gdrive/My Drive/submission.csv", index = False)
submission.head()