# Bank dataset example

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ogunlao/saint/blob/main/notebooks/Bank_Dataset.ipynb)

This example notebook is designed to run seamlessly on Colab. You may need to change the paths to run it locally.

In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules are
# reloaded before each cell executes and edits to project source files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Clone the repository into ./saint (relative to the current working directory).

!git clone https://github.com/ogunlao/saint.git

In [None]:
!pip install -r saint/requirements.txt

**Warning: restart the runtime on Colab after installing the requirements.**

## General setup: configuration parameters from the paper

In [2]:
import os

import torch
import pandas as pd
import numpy as np

In [4]:
# --- Config Information ---#
# Import the YAML parser, falling back to the alternative module name
# `ruamel_yaml` when `ruamel.yaml` is not importable.
try:
    from ruamel.yaml import YAML
except ModuleNotFoundError:
    from ruamel_yaml import YAML

# Path is relative to the current working directory.
config_path = 'configs/config.yaml'

# Parse the file with the 'safe' loader type into plain Python containers.
# Note: this reads the raw YAML only; `${...}` interpolations (e.g. the
# top-level `metric` entry) are left as literal strings, as the printed
# output shows.
yaml = YAML(typ='safe')
with open(config_path) as f:
    args = yaml.load(f)

print(args)

{'defaults': ['_self_', {'experiment': 'self-supervised'}, {'data': 'bank_ssl'}], 'seed': 1234, 'transformer': {'num_layers': 6, 'num_heads': 8, 'dropout': 0.1, 'dropout_ff': 0.1, 'embed_dim': 32, 'd_ff': 32, 'cls_token_idx': 0}, 'augmentation': {'prob_cutmix': 0.3, 'alpha': 0.2, 'lambda_pt': 10}, 'optimizer': {'temperature': 0.7, 'proj_head_dim': 128, 'beta_1': 0.9, 'beta_2': 0.99, 'lr': 0.0001, 'weight_decay': 0.01, 'optim': 'adamw', 'metric': 'auroc'}, 'preproc': {'data_folder': None, 'train_split': 0.65, 'validation_split': 0.15, 'test_split': 0.2, 'num_supervised_train_data': None}, 'callback': {'monitor': 'val_loss', 'mode': 'min', 'auto_insert_metric_name': False}, 'trainer': {'max_epochs': 100, 'deterministic': True, 'default_root_dir': None}, 'dataloader': {'shuffle_val': False, 'train_bs': 32, 'val_bs': 32, 'test_bs': 16, 'num_workers': 2, 'pin_memory': False}, 'metric': '${optimizer.metric}', 'print_config': False}


## Dataset preprocessing

###### Bank dataset [Link](https://archive.ics.uci.edu/ml/datasets/bank+marketing)

In [7]:
data_folder = "data/bank"

if False:
    # download the data into a data folder
    URL_LINK = "https://archive.ics.uci.edu/ml/machine-learning-databases/00222/bank.zip"

    !wget -P {data_folder} {URL_LINK}
    !unzip {data_folder}/bank.zip -d {data_folder}

In [8]:
# Load the semicolon-separated table, keep the target column `y` as its own
# single-column frame, and leave only the feature columns in `bank_df`.
csv_path = os.path.join(data_folder, 'bank-full.csv')
bank_df = pd.read_csv(csv_path, sep=';')

y = bank_df[['y']].copy()
bank_df = bank_df.drop(columns=['y'])

bank_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown


In [9]:
# Sanity check on the number of rows loaded.
len(y) # Data size should be 45211

45211

In [10]:
# Count of rows per target class.
y.value_counts() # Data is imbalanced, will require some balanced sampling

y  
no     39922
yes     5289
Name: count, dtype: int64

In [11]:
# Check for missing values: per-column count of NaN entries.
bank_df.isna().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
dtype: int64

In [13]:
from src.dataset import generate_splits, preprocess

In [14]:
# Reserve 200 samples for supervised training; all others are used in ssl training.
num_supervised_train_data = 200

# Get the split indices. The validation/test fractions and the seed come from
# the loaded config.
preproc_cfg = args['preproc']
sup_train_indices, val_indices, test_indices, ssl_train_indices = generate_splits(
    len(bank_df),
    num_supervised_train_data,
    preproc_cfg['validation_split'],
    preproc_cfg['test_split'],
    args['seed'],
)

In [15]:
# Run the project's preprocessing on the features and target. Besides the
# processed frame and labels, it returns the numerical/categorical column counts
# and the per-column category sizes printed in the next cell.
# NOTE(review): the `column_or_1d` warning in the output presumably arises
# because `y` is passed as a single-column DataFrame rather than a 1-D array —
# confirm against src.dataset.preprocess.
df_proc, y_proc, no_num, no_cat, cats  = preprocess(bank_df, y, args['transformer']['cls_token_idx'])

  y = column_or_1d(y, warn=True)


In [16]:
# These values will need to be added to the config files in "configs/data/bank_*" before training.

print('no of numerical columns: ', no_num)
print('no of categorical columns: ', no_cat)

print('list of categories in each categorical column: ', cats)

no of numerical columns:  7
no of categorical columns:  10
list of categories in each categorical column:  [1, 12, 3, 4, 2, 2, 2, 3, 12, 4]


In [17]:
# Note: the total number of features is presumably no_num + no_cat
# (formerly a commented-out `args.num_features` assignment).

# Split the processed data into train, val and test using the generated indices.

train_df = df_proc.iloc[sup_train_indices]
train_y = y_proc.iloc[sup_train_indices]

val_df = df_proc.iloc[val_indices]
val_y = y_proc.iloc[val_indices]

test_df = df_proc.iloc[test_indices]
test_y = y_proc.iloc[test_indices]

In [18]:
# Generate data for self-supervised training if specified.
# When every sample is used for supervised training ('all'), there is no
# separate ssl subset and both names are set to None.

if num_supervised_train_data != 'all':
    train_ssl = df_proc.iloc[ssl_train_indices]
    train_ssl_y = y_proc.iloc[ssl_train_indices]
else:
    train_ssl = train_ssl_y = None

In [20]:
# Save dataframes in the data directory.

# Splits that always exist.
required_outputs = (
    ('data/train.csv', train_df),
    ('data/train_y.csv', train_y),
    ('data/val.csv', val_df),
    ('data/val_y.csv', val_y),
    ('data/test.csv', test_df),
    ('data/test_y.csv', test_y),
)
for csv_path, frame in required_outputs:
    frame.to_csv(csv_path, index=False)

# The ssl splits are written only when they were generated.
optional_outputs = (
    ('data/train_ssl.csv', train_ssl),
    ('data/train_ssl_y.csv', train_ssl_y),
)
for csv_path, frame in optional_outputs:
    if frame is not None:
        frame.to_csv(csv_path, index=False)

### Self-supervised training

In [21]:
# Give the number of gpus available, if any.
# NOTE(review): `num_gpus` is not referenced by the training commands in the
# cells visible here; the gpu count has to be passed explicitly
# (trainer.gpus=...) — confirm whether this variable is still needed.
num_gpus = 1

In [None]:
# Train the saint model in the self-supervised setting.
# To use gpus, add trainer.gpus=1 to the command, where "1" is the total number of gpus.
# NOTE(review): this command runs saint/main.py while the supervised cells run
# main.py — adjust the script path to your working directory.

!python saint/main.py experiment=self-supervised \
    experiment.model=saint \
    data.data_folder=/content/saint/data \
    data=bank_ssl

In [None]:
# Retrieve the location of the self-supervised pretrained model from the training logs
# and assign it here; the supervised training cell interpolates this variable.

# The path below is only an example; replace it with your own checkpoint.
best_ssl_model_ckpt = "/content/outputs/2021-11-01/10-09-16/lightning_logs/version_0/checkpoints/0-916.ckpt"

In [25]:
# Train a supervised model initialized from the ssl model

!python main.py experiment=supervised \
    experiment.model=saint \
    data.data_folder=/content/saint/data \
    data=bank_sup \
    experiment.pretrained_checkpoint={best_ssl_model_ckpt}


The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  @hydra.main(config_path="configs", config_name="config")
no viable alternative at input '{best_ssl_model_ckpt}'
See https://hydra.cc/docs/1.2/advanced/override_grammar/basic for details

Set the environment variable HYDRA_FULL_ERROR=1 for a complete stack trace.


In [29]:
# You can also train a supervised model without a pretrained checkpoint, i.e. randomly initialized.
# NOTE(review): data.data_folder here is /workspaces/saint/data while the other
# training cells use /content/saint/data — adjust to your environment.

!python main.py experiment=supervised \
    experiment.model=saint \
    data.data_folder=/workspaces/saint/data \
    data=bank_sup

The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  @hydra.main(config_path="configs", config_name="config")
See https://hydra.cc/docs/1.2/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.
  ret = run_job(
Seed set to 1234
SaintSupLightningModule(
  (transformer): Encoder(
    (layers): ModuleList(
      (0-5): 6 x SaintLayer(
        (msa): EncoderLayer(
          (self_attn): MultiHeadedAttention(
            (linears): ModuleList(
              (0-3): 4 x Linear(in_features=32, out_features=32, bias=True)
            )
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (feed_forward): PositionwiseFeedForward(
            (w_1): Linear(in_features=32, out_features=32, bias=True)
            (w_2): Linear(in_features=32, out_features=32, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
            (activation): GELU(approximate='none')
     

In [None]:
# Predict on a test dataset; see saint/experiment/predict for the available parameters.

# Example checkpoint path; replace it with the supervised checkpoint from your own run.
pretrained_checkpoint = "/content/outputs/2021-11-01/13-30-49/lightning_logs/version_0/checkpoints/2-20.ckpt"

!python saint/predict.py experiment=predict \
    experiment.model=saint \
    data=bank_sup \
    data.data_folder=/content/saint/data \
    experiment.pretrained_checkpoint={pretrained_checkpoint} \
    experiment.pred_sav_path=/content/predict.csv

## View Tensorboard

In [None]:
# View plots and hyperparameters.
# The --logdir below is an example path from one run; point it at your own
# lightning_logs directory.

%load_ext tensorboard
%tensorboard --logdir /content/outputs/2021-11-01/10-14-14/lightning_logs # change lightning log path

## The End