## <font color = "green">Installing Libraries</font>

In [1]:
# TabNet
!pip install --no-index --find-links /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl pytorch-tabnet
# Iterative Stratification
!pip install /kaggle/input/iterative-stratification/iterative-stratification-master/

Looking in links: /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl
Processing /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-2.0.0
Processing /kaggle/input/iterative-stratification/iterative-stratification-master
Building wheels for collected packages: iterative-stratification
  Building wheel for iterative-stratification (setup.py) ... [?25l- \ done
[?25h  Created wheel for iterative-stratification: filename=iterative_stratification-0.1.6-py3-none-any.whl size=8401 sha256=24820373b51c58fd34ba153b41319d41f8fffd227bde3d7817760a4c4d458448
  Stored in directory: /root/.cache/pip/wheels/b8/47/3f/eb4af42d124f37d23d6f13a4c8bbc32c1d70140e6e1cecb4aa
Successfully built iterative-stratification
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.6


## <font color = "green">Loading Libraries</font>

In [2]:
### General ###
import os
import sys
import copy
import tqdm
import pickle
import random
import warnings
warnings.filterwarnings("ignore")
sys.path.append("../input/rank-gauss")
os.environ["CUDA_LAUNCH_BLOCKING"] = '1'

### Data Wrangling ###
import numpy as np
import pandas as pd
from scipy import stats
from gauss_rank_scaler import GaussRankScaler

### Data Visualization ###
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use("fivethirtyeight")

### Machine Learning ###
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.preprocessing import QuantileTransformer
from sklearn.feature_selection import VarianceThreshold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

### Deep Learning ###
import torch
from torch import nn
import torch.optim as optim
from torch.nn import functional as F
from torch.nn.modules.loss import _WeightedLoss
from torch.utils.data import DataLoader, Dataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
# Tabnet 
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor

### Make prettier the prints ###
from colorama import Fore
c_ = Fore.CYAN
m_ = Fore.MAGENTA
r_ = Fore.RED
b_ = Fore.BLUE
y_ = Fore.YELLOW
g_ = Fore.GREEN

## <font color = "green">Reproducibility</font>

In [3]:
seed = 42

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
set_seed(seed)

## <font color = "green">Configuration</font>

In [4]:
# Parameters
data_path = ".../input/univai/"
no_ctl = True
scale = "rankgauss"
variance_threshould = 0.5
decompo = "PCA"
ncompo = 80

## <font color = "green">Loading the Data</font>

In [5]:
def get_dummies(df,dum_cols,val = 0):
    if val == 0:
        return df
    return pd.get_dummies(df, prefix=None, prefix_sep='_', dummy_na=False, columns=dum_cols)

def normalize(df):
    return (df - df.mean(0) )/df.std(0)

In [6]:
train_df = pd.read_csv('../input/univai/train_encoded.csv')
test_df  = pd.read_csv('../input/univai/test_encoded.csv')

train_features = train_df.drop(['risk_flag'],axis=1)
targets = train_df['risk_flag']


test_features = test_df.drop(['risk_flag'],axis=1)
submission = pd.read_csv('../input/univai/Sample Prediction Dataset.csv')

concat = pd.concat([train_features,test_features],axis=0)

create_dummy = 1                #0 if dont want dummies
dum_cols = ['age', 'experience', 'married', 'house_ownership',
       'car_ownership', 'profession', 'city', 'state', 'current_job_years',
       'current_house_years']

concat = get_dummies(concat,dum_cols,create_dummy)
norm_concat = normalize(concat)

train = norm_concat[:len(train_features)]
test  = norm_concat[len(train_features):]

## <font color = "green">Distributions Before Rank Gauss and PCA</font>

### <font color = "green">Distributions of the Train Set</font>

## <font color = "green">Rank Gauss Process</font>

In [7]:
data_all = pd.concat([train, test], ignore_index = True)
cols_numeric = [feat for feat in list(data_all.columns)]
mask = (data_all[cols_numeric].var() >= variance_threshould).values
tmp = data_all[cols_numeric].loc[:, mask]
data_all = pd.DataFrame(tmp,columns = cols_numeric)
data_all

Unnamed: 0,income,age_21,age_22,age_23,age_24,age_25,age_26,age_27,age_28,age_29,...,current_job_years_10,current_job_years_11,current_job_years_12,current_job_years_13,current_job_years_14,current_house_years_10,current_house_years_11,current_house_years_12,current_house_years_13,current_house_years_14
0,-1.284860,-0.130193,-0.134248,7.547901,-0.131745,-0.135306,-0.129368,-0.145464,-0.125695,-0.128208,...,-0.258505,-0.239086,-0.208974,-0.193823,-0.167845,-0.489485,-0.508868,-0.509218,1.995099,-0.491127
1,0.894740,-0.130193,-0.134248,-0.132487,-0.131745,-0.135306,-0.129368,-0.145464,-0.125695,-0.128208,...,-0.258505,-0.239086,-0.208974,-0.193823,-0.167845,-0.489485,-0.508868,-0.509218,1.995099,-0.491127
2,-0.350556,-0.130193,-0.134248,-0.132487,-0.131745,-0.135306,-0.129368,-0.145464,-0.125695,-0.128208,...,-0.258505,-0.239086,-0.208974,-0.193823,-0.167845,2.042957,-0.508868,-0.509218,-0.501226,-0.491127
3,0.436599,-0.130193,-0.134248,-0.132487,-0.131745,-0.135306,-0.129368,-0.145464,-0.125695,-0.128208,...,-0.258505,-0.239086,-0.208974,-0.193823,-0.167845,-0.489485,-0.508868,1.963787,-0.501226,-0.491127
4,0.267123,-0.130193,-0.134248,-0.132487,-0.131745,-0.135306,-0.129368,-0.145464,-0.125695,-0.128208,...,-0.258505,-0.239086,-0.208974,-0.193823,-0.167845,-0.489485,-0.508868,-0.509218,-0.501226,2.036127
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279995,1.722329,-0.130193,-0.134248,-0.132487,-0.131745,-0.135306,-0.129368,-0.145464,-0.125695,-0.128208,...,-0.258505,-0.239086,-0.208974,-0.193823,-0.167845,2.042957,-0.508868,-0.509218,-0.501226,-0.491127
279996,-0.723881,-0.130193,-0.134248,-0.132487,-0.131745,-0.135306,-0.129368,-0.145464,-0.125695,-0.128208,...,-0.258505,-0.239086,-0.208974,-0.193823,-0.167845,-0.489485,-0.508868,-0.509218,-0.501226,2.036127
279997,1.071278,-0.130193,-0.134248,-0.132487,7.590388,-0.135306,-0.129368,-0.145464,-0.125695,-0.128208,...,-0.258505,-0.239086,-0.208974,-0.193823,-0.167845,-0.489485,-0.508868,-0.509218,1.995099,-0.491127
279998,1.555036,-0.130193,-0.134248,-0.132487,-0.131745,-0.135306,-0.129368,-0.145464,-0.125695,-0.128208,...,-0.258505,-0.239086,-0.208974,5.159338,-0.167845,-0.489485,-0.508868,-0.509218,-0.501226,2.036127


In [8]:
def scale_minmax(col):
    return (col - col.min()) / (col.max() - col.min())

def scale_norm(col):
    return (col - col.mean()) / col.std()

if scale == "boxcox":
    print(b_, "boxcox")
    data_all[cols_numeric] = data_all[cols_numeric].apply(scale_minmax, axis = 0)
    trans = []
    for feat in cols_numeric:
        trans_var, lambda_var = stats.boxcox(data_all[feat].dropna() + 1)
        trans.append(scale_minmax(trans_var))
    data_all[cols_numeric] = np.asarray(trans).T
    
elif scale == "norm":
    print(b_, "norm")
    data_all[cols_numeric] = data_all[cols_numeric].apply(scale_norm, axis = 0)
    
elif scale == "minmax":
    print(b_, "minmax")
    data_all[cols_numeric] = data_all[cols_numeric].apply(scale_minmax, axis = 0)
    
elif scale == "rankgauss":
    ### Rank Gauss ###
    print(b_, "Rank Gauss")
    scaler = GaussRankScaler()
    data_all[cols_numeric] = scaler.fit_transform(data_all[cols_numeric])
    
else:
    pass

[34m Rank Gauss


## <font color = "green">Principal Component Analysis</font>

In [9]:
# PCA
if decompo == "PCA":
    print(b_, "PCA")
    
    pca_genes = PCA(n_components = ncompo,
                    random_state = seed).fit_transform(data_all)
    
    pca_genes = pd.DataFrame(pca_genes, columns = [f"pca_g-{i}" for i in range(ncompo)])
    data_all = pd.concat((data_all, pca_genes), axis=1)
else:
    pass

[34m PCA


In [10]:
data_all

Unnamed: 0,income,age_21,age_22,age_23,age_24,age_25,age_26,age_27,age_28,age_29,...,pca_g-70,pca_g-71,pca_g-72,pca_g-73,pca_g-74,pca_g-75,pca_g-76,pca_g-77,pca_g-78,pca_g-79
0,-0.803407,-2.751064,-2.751064,2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,...,-0.233235,-0.229491,0.926818,1.075861,0.313440,0.471901,-0.203552,0.937039,1.077593,0.878844
1,0.498481,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,...,-0.429634,-0.730794,0.156615,-0.628841,-0.782229,-0.333093,0.116770,0.367585,-0.751161,1.512482
2,-0.178755,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,...,0.170511,0.940855,0.354057,-1.175569,-0.845901,-0.837412,-1.298231,0.841301,0.652347,-0.315038
3,0.228206,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,...,-1.133942,0.218419,-0.290282,-0.020977,-1.377551,0.507345,-0.128129,0.179552,-0.587353,1.702142
4,0.138792,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,...,-0.723791,-0.889688,0.663794,0.022248,0.915554,1.626079,1.238566,-0.252754,-0.503776,0.977225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279995,1.878502,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,...,0.785566,0.417115,-0.109288,-0.220912,1.122295,0.192071,-0.156663,-0.651637,-0.356651,0.401281
279996,-0.383637,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,...,-0.071508,-0.531957,0.559303,-0.618249,0.399950,0.598856,0.713734,-0.160126,0.003642,0.981389
279997,0.620838,-2.751064,-2.751064,-2.751064,2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,...,-0.306562,-0.777144,0.420654,0.155388,0.314216,-1.108431,0.924523,-0.300422,1.473142,-0.899735
279998,1.142869,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,-2.751064,...,0.069656,0.884330,0.024342,0.121181,1.353447,0.518808,-0.472315,0.502064,-1.083616,0.665906


## <font color = "green">One Hot</font>

We can confirme that the shapes of data got close to the normal distribution.

In [11]:
with open("data_all.pickle", "wb") as f:
    pickle.dump(data_all, f)

In [12]:
with open("data_all.pickle", "rb") as f:
    data_all = pickle.load(f)

In [13]:
# train_df and test_df

train_df = data_all[: train.shape[0]]
train_df.reset_index(drop = True, inplace = True)
# The following line it's a bad practice in my opinion, targets on train set
#train_df = pd.concat([train_df, targets], axis = 1)
test_df = data_all[train_df.shape[0]: ]
test_df.reset_index(drop = True, inplace = True)

In [14]:
print(f"{b_}train_df.shape: {r_}{train_df.shape}")
print(f"{b_}test_df.shape: {r_}{test_df.shape}")

[34mtrain_df.shape: [31m(252000, 599)
[34mtest_df.shape: [31m(28000, 599)


In [15]:
X_test = test_df.values
print(f"{b_}X_test.shape: {r_}{X_test.shape}")

[34mX_test.shape: [31m(28000, 599)


# <font color = "seagreen">Modeling</font>

## <font color = "green">Model Parameters</font>

In [16]:
MAX_EPOCH = 150
# n_d and n_a are different from the original work, 32 instead of 24
# This is the first change in the code from the original
tabnet_params = dict(
    n_d = 32,
    n_a = 32,
    n_steps = 1,
    gamma = 1.3,
    lambda_sparse = 0,
    optimizer_fn = optim.Adam,
    optimizer_params = dict(lr = 2e-3, weight_decay = 1e-5),
    mask_type = "entmax",
    scheduler_params = dict(
        mode = "max", patience = 3, min_lr = 1e-6, factor = 0.33, verbose = True),
    scheduler_fn = ReduceLROnPlateau,
    seed = seed,
    verbose = 1
)

## <font color = "green">Custom Metric</font>

In [17]:
class LogitsLogLoss(Metric):
    """
    LogLoss with sigmoid applied
    """

    def __init__(self):
        self._name = "logits_ll"
        self._maximize = False

    def __call__(self, y_true, y_pred):
        """
        Compute LogLoss of predictions.

        Parameters
        ----------
        y_true: np.ndarray
            Target matrix or vector
        y_score: np.ndarray
            Score matrix or vector

        Returns
        -------
            float
            LogLoss of predictions vs targets.
        """
        logits = 1 / (1 + np.exp(-y_pred))
        aux = (1 - y_true) * np.log(1 - logits + 1e-15) + y_true * np.log(logits + 1e-15)
        return np.mean(-aux)

In [18]:
class roc_auc_c(Metric):
    def __init__(self):
        self._name = "roc_auc_c"
        self._maximize = True

    def __call__(self, y_true, y_score):
        return roc_auc_score(y_true, np.array(y_score).round())

# <font color = "seagreen">Training</font>

In [19]:
scores_auc_all = []
test_cv_preds = []

NB_SPLITS = 8 # 7
mskf = MultilabelStratifiedKFold(n_splits = NB_SPLITS, random_state = 0, shuffle = True)

oof_preds = []
oof_targets = []
scores = []
scores_auc = []

# for mskf
ms_tar = np.hstack((np.array(train_df),np.array(targets).reshape(-1,1)))
#####

for fold_nb, (train_idx, val_idx) in enumerate(mskf.split(train_df, ms_tar)):
    print(b_,"FOLDS: ", r_, fold_nb + 1)
    print(g_, '*' * 60, c_)
    
    X_train, y_train = train_df.values[train_idx, :], np.array(targets).reshape(-1,1)[train_idx]
    X_val, y_val = train_df.values[val_idx, :], np.array(targets).reshape(-1,1)[val_idx]
    ### Model ###
    model = TabNetRegressor(**tabnet_params)
        
    ### Fit ###
    # Another change to the original code
    # virtual_batch_size of 32 instead of 128
    model.fit(
        X_train = X_train,
        y_train = y_train,
        eval_set = [(X_val, y_val)],
        eval_name = ["val"],
        eval_metric = ["roc_auc_c"],
        max_epochs = MAX_EPOCH,
        patience = 20,
        batch_size = 1024, 
        virtual_batch_size = 32,
        num_workers = 4,
        drop_last = False,
        # To use binary cross entropy because this is not a regression problem
        loss_fn = F.binary_cross_entropy_with_logits
    )
    print(y_, '-' * 60)
    
    ### Predict on validation ###
    preds_val = model.predict(X_val)
    # Apply sigmoid to the predictions
    preds = 1 / (1 + np.exp(-preds_val))
    score = np.min(model.history["val_roc_auc_c"])
    
    ### Save OOF for CV ###
    oof_preds.append(preds)
    oof_targets.append(y_val)
    scores.append(score)
    
    ### Predict on test ###
    preds_test = model.predict(X_test)
    test_cv_preds.append(1 / (1 + np.exp(-preds_test)))

oof_preds_all = np.concatenate(oof_preds)
oof_targets_all = np.concatenate(oof_targets)
test_preds_all = np.stack(test_cv_preds)

[34m FOLDS:  [31m 1
[32m ************************************************************ [36m
Device used : cuda
epoch 0  | loss: 0.40267 | val_roc_auc_c: 0.53772 |  0:00:20s
epoch 1  | loss: 0.36035 | val_roc_auc_c: 0.69718 |  0:00:40s
epoch 2  | loss: 0.33749 | val_roc_auc_c: 0.80516 |  0:01:00s
epoch 3  | loss: 0.30817 | val_roc_auc_c: 0.85604 |  0:01:20s
epoch 4  | loss: 0.28203 | val_roc_auc_c: 0.87984 |  0:01:40s
epoch 5  | loss: 0.2629  | val_roc_auc_c: 0.88736 |  0:02:00s
epoch 6  | loss: 0.25155 | val_roc_auc_c: 0.8923  |  0:02:20s
epoch 7  | loss: 0.24172 | val_roc_auc_c: 0.89486 |  0:02:40s
epoch 8  | loss: 0.23432 | val_roc_auc_c: 0.89863 |  0:03:01s
epoch 9  | loss: 0.22975 | val_roc_auc_c: 0.89959 |  0:03:22s
epoch 10 | loss: 0.22446 | val_roc_auc_c: 0.89968 |  0:03:42s
epoch 11 | loss: 0.21942 | val_roc_auc_c: 0.9009  |  0:04:03s
epoch 12 | loss: 0.21476 | val_roc_auc_c: 0.90426 |  0:04:24s
epoch 13 | loss: 0.212   | val_roc_auc_c: 0.90566 |  0:04:44s
epoch 14 | loss: 0

In [20]:
aucs = []
for task_id in range(oof_preds_all.shape[1]):
    aucs.append(roc_auc_score(y_true = oof_targets_all[:, task_id],
                              y_score = oof_preds_all[:, task_id].round()
                             ))
print(f"{b_}Overall AUC: {r_}{np.mean(aucs)}")
print(f"{b_}Average CV: {r_}{np.mean(scores)}")

[34mOverall AUC: [31m0.8072572868857321
[34mAverage CV: [31m0.5369531970289722


**The worst CV value that i achive**

# <font color = "seagreen">Conclusion (NOT AVAILABLE UNTIL I SEE THE LB Score)</font> 

# <font color = "seagreen">Submission</font>

In [21]:
tempo = np.zeros((test_preds_all.shape[1],1))
for i in range(test_preds_all.shape[0]):
    tempo += test_preds_all[i]

tempo /= test_preds_all.shape[0]

In [22]:
#submission[all_feat] = tmp.mean(axis = 0)

# Set control to 0
#submission.loc[test["cp_type"] == 0, submission.columns[1:]] = 0
submission['risk_flag'] = tempo.round().reshape(-1,).astype(np.int64)
submission.to_csv("submission.csv", index = False)
submission.head()

Unnamed: 0,id,risk_flag
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0


In [23]:
print(f"{b_}submission.shape: {r_}{submission.shape}")

[34msubmission.shape: [31m(28000, 2)


<div class = "alert alert-block alert-info">
    <h3><font color = "red">NOTE: </font></h3>
    <p>If you want to comment please tag me with '@' to answer more quickly.</p>
</div>