# Bank dataset example

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ogunlao/saint/blob/main/notebooks/Bank_Dataset.ipynb)

This notebook example is made to run seamlessly on colab. You may need to change the paths to run on local.

In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
# clone the repository

!git clone https://github.com/ogunlao/saint.git

In [None]:
!pip install -r saint/requirements.txt

**warning: restart the runtime on Colab**

## General setup configuration parameters from paper

In [2]:
import os

import torch
import pandas as pd
import numpy as np

In [3]:
# --- Config Information ---#
try:
    from ruamel.yaml import YAML
except ModuleNotFoundError:
    from ruamel_yaml import YAML

config_path = 'configs/config.yaml'

yaml = YAML(typ='safe')
with open(config_path) as f:
    args = yaml.load(f)

print(args)

{'defaults': ['_self_', {'experiment': 'self-supervised'}, {'data': 'bank_ssl'}], 'seed': 1234, 'transformer': {'num_layers': 6, 'num_heads': 8, 'dropout': 0.1, 'dropout_ff': 0.1, 'embed_dim': 32, 'd_ff': 32, 'cls_token_idx': 0}, 'augmentation': {'prob_cutmix': 0.3, 'alpha': 0.2, 'lambda_pt': 10}, 'optimizer': {'temperature': 0.7, 'proj_head_dim': 128, 'beta_1': 0.9, 'beta_2': 0.99, 'lr': 0.0001, 'weight_decay': 0.01, 'optim': 'adamw', 'metric': 'auroc'}, 'preproc': {'data_folder': None, 'train_split': 0.65, 'validation_split': 0.15, 'test_split': 0.2, 'num_supervised_train_data': None}, 'callback': {'monitor': 'val_loss', 'mode': 'min', 'auto_insert_metric_name': False}, 'trainer': {'max_epochs': 100, 'deterministic': True, 'default_root_dir': None}, 'dataloader': {'shuffle_val': False, 'train_bs': 32, 'val_bs': 32, 'test_bs': 16, 'num_workers': 2, 'pin_memory': False}, 'metric': '${optimizer.metric}', 'print_config': False}


## Dataset preprocessing

###### Bank dataset [Link](https://archive.ics.uci.edu/ml/datasets/bank+marketing)

In [4]:
data_folder = "data/ukb_test"

In [36]:
ukb_df = pd.read_csv(os.path.join(data_folder, 'X.csv'),index_col="eid")
y = pd.read_csv(os.path.join(data_folder, 'y.csv'),index_col="eid")
y.columns = ["time", "event"]

# if event column is boolean, convert to int
if y["event"].dtype == "bool":
    y["event"] = y["event"].astype(int)

ukb_df.head()

Unnamed: 0_level_0,protein_0,protein_1,protein_2,protein_3,protein_4,blood_cont_0,blood_cont_1,blood_cont_2,blood_cont_3,blood_cont_4,blood_cat_1,blood_cat_2,lifestyle_cont_0,lifestyle_cont_1,lifestyle_cont_2,lifestyle_cont_3,lifestyle_cont_4,lifestyle_cat_1,lifestyle_cat_2
eid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
sample_341,-0.999947,-0.362368,2.647675,-2.05083,0.695265,-1.503765,0.040051,-0.676371,1.574718,-0.165621,A,Low,-2.504596,-0.260175,-0.606295,-1.146541,-0.136848,L19,No
sample_358,0.094597,-0.292266,1.06866,-0.785074,-0.178323,-0.075421,0.332536,0.573553,0.167925,0.282252,B,High,-0.427264,1.242474,1.290035,-0.466768,-0.696482,L11,Yes
sample_564,0.979274,-0.676494,0.502028,0.510853,1.52655,-0.931003,0.820592,0.027506,-1.653179,-0.680078,B,Medium,-0.276156,-1.747656,0.935946,-0.75711,-0.416817,L10,Yes
sample_965,0.167213,-0.029956,0.821229,-0.91024,-0.179577,0.15502,0.673715,-0.295771,-0.816866,0.524858,A,Low,1.492023,-0.605616,0.503926,1.079585,0.775233,L4,No
sample_776,0.758431,1.398153,-2.39134,-1.178071,0.488712,0.939588,-0.831314,0.447403,-0.45087,1.144676,A,High,-0.259334,-0.515172,0.145227,-0.416769,1.255462,L3,Unknown


In [37]:
len(y) # Data size should be 45211

1000

In [38]:
y.value_counts() # Data is imbalanced, will require some balanced sampling

time         event
3757.996790  1        1
0.158492     1        1
1.039157     0        1
1.687757     1        1
4.592309     1        1
                     ..
9.679507     0        1
9.801794     0        1
9.977149     1        1
10.586484    0        1
11.333060    1        1
Name: count, Length: 1000, dtype: int64

In [39]:
# check if there are any missing values
ukb_df.isna().sum()

protein_0           0
protein_1           0
protein_2           0
protein_3           0
protein_4           0
blood_cont_0        0
blood_cont_1        0
blood_cont_2        0
blood_cont_3        0
blood_cont_4        0
blood_cat_1         0
blood_cat_2         0
lifestyle_cont_0    0
lifestyle_cont_1    0
lifestyle_cont_2    0
lifestyle_cont_3    0
lifestyle_cont_4    0
lifestyle_cat_1     0
lifestyle_cat_2     0
dtype: int64

In [40]:
from src.dataset import generate_splits, preprocess

In [41]:
# separate 200 samples for supervised training, all others are used in ssl training
num_supervised_train_data = "all" #200 

# get split indices
sup_train_indices, val_indices, test_indices, ssl_train_indices = generate_splits(len(ukb_df), 
                                                                            num_supervised_train_data,
                                                                            args['preproc']['validation_split'],
                                                                            args['preproc']['test_split'],
                                                                            args['seed'],)

In [42]:
df_proc, y_proc, no_num, no_cat, cats  = preprocess(ukb_df, y, args['transformer']['cls_token_idx'], survival_format=True)

In [43]:
# This variables will need to be added to the config files in "configs/data/bank_*" before training

print('no of numerical columns: ', no_num)
print('no of categorical columns: ', no_cat)

print('list of categories in each categorical column: ', cats)

no of numerical columns:  15
no of categorical columns:  5
list of categories in each categorical column:  [1, 4, 3, 20, 3]


In [44]:
#### args.num_features = args.no_num + args.no_cat

# split data into train, val and test using generated indices

train_df, train_y   = df_proc.iloc[sup_train_indices], y_proc.iloc[sup_train_indices]
val_df, val_y       = df_proc.iloc[val_indices], y_proc.iloc[val_indices]
test_df, test_y     = df_proc.iloc[test_indices], y_proc.iloc[test_indices]

In [45]:
# Generate data for self-supervised training if specified

train_ssl, train_ssl_y = None, None

if num_supervised_train_data != 'all':
    train_ssl, train_ssl_y = df_proc.iloc[ssl_train_indices], y_proc.iloc[ssl_train_indices]

In [46]:
# save dataframes in the data directory

train_df.to_csv('data/train.csv' , index=False)
train_y.to_csv('data/train_y.csv' , index=False)
val_df.to_csv('data/val.csv' , index=False)
val_y.to_csv('data/val_y.csv' , index=False)
test_df.to_csv('data/test.csv' , index=False)
test_y.to_csv('data/test_y.csv' , index=False)

if train_ssl is not None:
    train_ssl.to_csv('data/train_ssl.csv' , index=False)

if train_ssl_y is not None:
    train_ssl_y.to_csv('data/train_ssl_y.csv' , index=False)

### Self-supervised training

In [21]:
# give the number of gpus available if any
num_gpus = 1

In [None]:
# Train saint model in self-supervised settings. 
# To use gpus, add trainer.gpus=1 where "1" is the total no of gpus to the command

!python saint/main.py experiment=self-supervised \
    experiment.model=saint \
    data.data_folder=/content/saint/data \
    data=bank_ssl

In [None]:
# Retrieve the location of self-supervised pretrained model from logs

# for example
best_ssl_model_ckpt = "/content/outputs/2021-11-01/10-09-16/lightning_logs/version_0/checkpoints/0-916.ckpt"

In [25]:
# Train a supervised model initialized from the ssl model

!python main.py experiment=supervised \
    experiment.model=saint \
    data.data_folder=/content/saint/data \
    data=bank_sup \
    experiment.pretrained_checkpoint={best_ssl_model_ckpt}


The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  @hydra.main(config_path="configs", config_name="config")
no viable alternative at input '{best_ssl_model_ckpt}'
See https://hydra.cc/docs/1.2/advanced/override_grammar/basic for details

Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace.


In [35]:
# You can also train a supervised model without initializing with a pretrained model i.e. randomly initialized

!python main.py experiment=supervised \
    experiment.model=saint \
    data.data_folder=/workspaces/saint/data \
    data=bank_sup

Traceback (most recent call last):
  File "/workspaces/saint/main.py", line 7, in <module>
    from src.train import setup_experiment
  File "/workspaces/saint/src/train.py", line 5, in <module>
    from src.trainer import SaintSemiSupLightningModule, SaintSupLightningModule
  File "/workspaces/saint/src/trainer.py", line 6, in <module>
    from torchsurv.loss.cox import neg_partial_log_likelihood
ModuleNotFoundError: No module named 'torchsurv'


In [None]:
# To predict on a test dataset  # check saint/experiment/predict for params

pretrained_checkpoint = "/content/outputs/2021-11-01/13-30-49/lightning_logs/version_0/checkpoints/2-20.ckpt"

!python saint/predict.py experiment=predict \
    experiment.model=saint \
    data=bank_sup \
    data.data_folder=/content/saint/data \
    experiment.pretrained_checkpoint={pretrained_checkpoint} \
    experiment.pred_sav_path=/content/predict.csv

## View Tensorboard

In [None]:
# View plots and hyperparameters

%load_ext tensorboard
%tensorboard --logdir /content/outputs/2021-11-01/10-14-14/lightning_logs # change lightning log path

## The End