<a href="https://colab.research.google.com/github/LiangShuLing/ML-000/blob/main/Week13/HomeWork_chap10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PyTorch Training Short:


---
First, let us check that the runtime environment is working.





In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change b type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

In [None]:
# !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# !python pytorch-xla-env-setup.py --version nightly --apt-packages libomp5 libopenblas-dev

In [None]:
!pip install pytorch-lightning sklearn

In [None]:
# Load the TensorBoard notebook extension (provides the %tensorboard magic used by later cells)
%load_ext tensorboard

In [None]:
from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this notebook uses to call a runtime "high-RAM".
if ram_gb >= 20:
  print('You are using a high-RAM runtime!')
else:
  print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')

In [None]:
# Print the current working directory of the runtime.
!pwd

### Create a simple dataset to test the correctness of our approach

In [None]:
import numpy as np
import torch
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fix the RNG seed so that re-running the notebook reproduces the same dataset.
torch.manual_seed(0)

# Two Gaussian features; the label is the sign of a noisy linear combination of them.
x = torch.randn(100000, 2)
noise = torch.randn(100000)
y = ((1.0 * x[:, 0] + 2.0 * x[:, 1] + noise) > 0).type(torch.int64)

In [None]:
# Convert to NumPy and hold out the second half of the samples for testing.
y_np = y.numpy()
x_np = x.numpy()
x_train, y_train = x_np[:50000, :], y_np[:50000]
x_test, y_test = x_np[50000:, :], y_np[50000:]

# Baseline: plain logistic regression on the two raw features.
log_reg = LogisticRegression()
log_reg.fit(x_train, y_train)
y_pred = log_reg.predict(x_test)
print(accuracy_score(y_test, y_pred))

### Now create an evil data set

In [None]:
# Fix the RNG seed so that re-running the notebook reproduces the same dataset.
torch.manual_seed(1)

# Two latent drivers; the label depends on them only through the non-linear x_useful.
x_1 = torch.randn(100000)
x_2 = torch.randn(100000)
x_useful = torch.cos(1.5*x_1)*(x_2**2)
# 4 x 15 distractor columns: Gaussian noise plus a small (0.01) or larger (0.1)
# multiple of x_1 / x_2.
x_1_rest_small = torch.randn(100000, 15) + 0.01*x_1.unsqueeze(1)
x_1_rest_large = torch.randn(100000, 15) + 0.1*x_1.unsqueeze(1)
x_2_rest_small = torch.randn(100000, 15) + 0.01*x_2.unsqueeze(1)
x_2_rest_large = torch.randn(100000, 15) + 0.1*x_2.unsqueeze(1)
# Final design matrix: x_1, x_2, then the 60 distractors -> 62 columns.
x = torch.cat([x_1[:, None], x_2[:, None], x_1_rest_small, x_1_rest_large, x_2_rest_small, x_2_rest_large], dim=1)
y = ((10*x_useful) + 5*torch.randn(100000) > 0.0).type(torch.int64)

### Now let us test if we have an oracle.

In [None]:
# Same 50/50 split as before, now also splitting the "oracle" feature x_useful.
x_all_np, y_all_np, oracle_np = x.numpy(), y.numpy(), x_useful.numpy()
y_train, y_test = y_all_np[:50000], y_all_np[50000:]
x_train, x_test = x_all_np[:50000, :], x_all_np[50000:, :]
oracle_train, oracle_test = oracle_np[:50000], oracle_np[50000:]

# Fit on the single oracle column only ([:, None] turns it into an (n, 1) matrix).
log_reg_2 = LogisticRegression()
log_reg_2.fit(oracle_train[:, None], y_train)
y_pred = log_reg_2.predict(oracle_test[:, None])
print(accuracy_score(y_pred, y_test))

### What if the oracle is not here?

In [None]:
# 50/50 split of the raw 62-column design matrix (no oracle feature).
x_train, x_test = x.numpy()[:50000, :], x.numpy()[50000:, :]
y_train, y_test = y.numpy()[:50000], y.numpy()[50000:]

log_reg_3 = LogisticRegression()
log_reg_3.fit(x_train, y_train)
y_pred = log_reg_3.predict(x_test)
# Last expression of the cell: displayed as the cell output.
accuracy_score(y_pred, y_test)

### Let us run a basic PyTorch example to test whether the code is correct

In [None]:
# Fix the RNG seed so that re-running this cell reproduces the same dataset.
torch.manual_seed(0)

# Two Gaussian features; the label is the sign of a noisy linear combination of them.
x = torch.randn(100000, 2)
noise = torch.randn(100000)
y = ((1.0 * x[:, 0] + 2.0 * x[:, 1] + noise) > 0).type(torch.int64)
# First half for training, second half for testing (kept as torch tensors).
x_train, x_test = x[:50000, :], x[50000:, :]
y_train, y_test = y[:50000], y[50000:]

In [None]:
from torch.utils.data import Dataset, DataLoader

class MyDataSet(Dataset):
    """Map-style dataset that pairs row ``idx`` of ``x`` with element ``idx`` of ``y``."""

    def __init__(self, x, y):
        """Store the features ``x`` and targets ``y`` and cache the sample count."""
        super().__init__()
        self.x, self.y = x, y
        # Number of samples = size of the first axis of x.
        self.len = x.shape[0]

    def __len__(self):
        """Return the number of samples."""
        return self.len

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        features = self.x[idx, :]
        target = self.y[idx]
        return features, target

In [None]:
# Wrap the train/test tensors so they can be fed to DataLoaders.
train_dataset, test_dataset = MyDataSet(x_train, y_train), MyDataSet(x_test, y_test)

# Training batches are reshuffled; the evaluation loader keeps the original order.
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=6)
test_dataloader = DataLoader(test_dataset, batch_size=128, num_workers=6)

In [None]:
import torch.nn as nn
import torch
import torch.nn.functional as F


def mish(input):
    """Apply the Mish activation, ``input * tanh(softplus(input))``, element-wise."""
    gate = torch.tanh(F.softplus(input))
    return input * gate

class Mish(nn.Module):
    """Parameter-free module form of the Mish activation."""

    def __init__(self):
        """Initialise the module; it holds no parameters or buffers."""
        super().__init__()

    def forward(self, input):
        """Return ``input * tanh(softplus(input))``, computed element-wise."""
        return input * torch.tanh(F.softplus(input))

class MLPLayer(nn.Module):
    """Linear -> Mish -> Dropout block with an optional scaled skip connection and LayerNorm.

    Args:
        dim_in: input feature size of the linear layer.
        dim_out: output feature size of the linear layer (also the LayerNorm size).
        res_coef: weight of the skip connection; ``0`` disables it.
        dropout_p: dropout probability applied after the activation.
    """

    def __init__(self, dim_in, dim_out, res_coef = 0, dropout_p = 0.1):
        super().__init__()
        self.linear = nn.Linear(dim_in, dim_out)
        self.res_coef = res_coef
        self.activation = Mish()
        self.dropout = nn.Dropout(dropout_p)
        self.ln = nn.LayerNorm(dim_out)

    def forward(self, x):
        """Return ``LayerNorm(res_coef * x + Dropout(Mish(Linear(x))))``.

        When ``res_coef`` is 0 the skip term is left out entirely; otherwise ``x``
        is added to the linear branch, so the two must have compatible shapes.
        """
        hidden = self.dropout(self.activation(self.linear(x)))
        if self.res_coef != 0:
            hidden = self.res_coef*x + hidden
        return self.ln(hidden)

       
class MyNetwork(nn.Module):
    """MLP binary classifier: an input MLPLayer, ``n_layers`` residual MLPLayers, a sigmoid head.

    Args:
        dim_in: number of input features.
        dim: hidden width used by every layer.
        res_coef: skip-connection weight of the hidden layers.
        dropout_p: dropout probability used by every layer.
        n_layers: number of hidden ``dim -> dim`` layers.
    """

    def __init__(self, dim_in, dim, res_coef=0.5, dropout_p = 0.1, n_layers = 10):
        super().__init__()
        self.mlp = nn.ModuleList()
        # The input layer changes the feature size, so it has no skip connection
        # (res_coef stays at its default of 0). It does honour the caller's
        # dropout_p; previously it silently fell back to the MLPLayer default.
        self.first_linear = MLPLayer(dim_in, dim, dropout_p=dropout_p)
        self.n_layers = n_layers
        for _ in range(n_layers):
            self.mlp.append(MLPLayer(dim, dim, res_coef, dropout_p))
        self.final = nn.Linear(dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid output with the trailing size-1 feature axis removed."""
        x = self.first_linear(x)
        for layer in self.mlp:
            x = layer(x)
        x = self.sigmoid(self.final(x))
        # Squeeze only the last axis: a bare squeeze() would also drop the batch
        # axis for a batch of one and no longer match the target's shape.
        return x.squeeze(-1)

In [None]:
import pytorch_lightning as pl
from pytorch_lightning.metrics import Accuracy
class TrainingModule(pl.LightningModule):
    """Lightning wrapper that trains ``MyNetwork`` with BCE loss and logs loss/accuracy."""

    def __init__(self, dim_in, dim, res_coef=0, dropout_p=0, n_layers=10):
        super().__init__()
        self.backbone = MyNetwork(dim_in, dim, res_coef, dropout_p, n_layers)
        self.loss = nn.BCELoss()
        self.accuracy = Accuracy()

    def forward(self, x):
        """Run the backbone on ``x``."""
        return self.backbone(x)

    def _loss_and_accuracy(self, batch):
        """Run the backbone on one ``(x, y)`` batch and return ``(loss, accuracy)``."""
        features, target = batch
        prob = self.backbone(features)
        # BCELoss is given float targets; the accuracy metric gets the original labels.
        loss = self.loss(prob, target.type(torch.float32))
        acc = self.accuracy(prob, target)
        return loss, acc

    def validation_step(self, batch, batch_idx):
        """Log and return the validation loss and accuracy of one batch."""
        loss, acc = self._loss_and_accuracy(batch)
        self.log("Validation loss", loss)
        self.log("Validation acc", acc)
        return loss, acc

    def training_step(self, batch, batch_idx):
        """Log the training loss and accuracy of one batch and return the loss."""
        loss, acc = self._loss_and_accuracy(batch)
        self.log("Training loss", loss)
        self.log("Training acc", acc)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with learning rate 1e-3."""
        return torch.optim.Adam(self.parameters(), lr=1e-3)

import os
class CheckpointEveryNSteps(pl.Callback):
    """Callback that saves a checkpoint whenever the global step is a multiple of a period."""

    def __init__(self, save_step_frequency):
        """Remember the period (in global steps) between checkpoints."""
        self.save_step_frequency = save_step_frequency

    def on_batch_end(self, trainer: pl.Trainer, _):
        """Save ``epoch=<e>_step=<s>.ckpt`` in the checkpoint callback's directory when due."""
        epoch = trainer.current_epoch
        global_step = trainer.global_step
        if global_step % self.save_step_frequency != 0:
            return
        filename = f"epoch={epoch}_step={global_step}.ckpt"
        ckpt_path = os.path.join(trainer.checkpoint_callback.dirpath, filename)
        trainer.save_checkpoint(ckpt_path)

In [None]:
from pytorch_lightning import loggers as pl_loggers

tb_logger = pl_loggers.TensorBoardLogger('logs/')
# NOTE(review): save_by_steps is created here but is not passed to the Trainer
# below — confirm whether it was meant to be registered as a callback.
save_by_steps = CheckpointEveryNSteps(100)
# Positional arguments are: dim_in, dim, res_coef, dropout_p, n_layers.
training_module = TrainingModule(2, 10, 0.5, 0.1, 2)
trainer = pl.Trainer(
    max_epochs=5,
    gpus=1,
    progress_bar_refresh_rate=100,
    val_check_interval=0.25,
    logger=tb_logger,
)
trainer.fit(training_module, train_dataloader, test_dataloader)

In [None]:
# Launch TensorBoard on the `logs` directory.
%tensorboard --logdir logs


### Now let us use the evil dataset

In [None]:
import torch
# Fix the RNG seed so that re-running this cell reproduces the same dataset.
torch.manual_seed(1)

# Two latent drivers; the label depends on them only through the non-linear x_useful.
x_1 = torch.randn(100000)
x_2 = torch.randn(100000)
x_useful = torch.cos(1.5*x_1)*(x_2**2)
# 4 x 15 distractor columns: Gaussian noise plus a small (0.01) or larger (0.1)
# multiple of x_1 / x_2.
x_1_rest_small = torch.randn(100000, 15) + 0.01*x_1.unsqueeze(1)
x_1_rest_large = torch.randn(100000, 15) + 0.1*x_1.unsqueeze(1)
x_2_rest_small = torch.randn(100000, 15) + 0.01*x_2.unsqueeze(1)
x_2_rest_large = torch.randn(100000, 15) + 0.1*x_2.unsqueeze(1)
# Final design matrix: x_1, x_2, then the 60 distractors -> 62 columns.
x = torch.cat([x_1[:, None], x_2[:, None], x_1_rest_small, x_1_rest_large, x_2_rest_small, x_2_rest_large], dim=1)
y = ((10*x_useful) + 5*torch.randn(100000) > 0.0).type(torch.int64)

# First half for training, second half for testing.
x_train, x_test = x[:50000, :], x[50000:, :]
y_train, y_test = y[:50000], y[50000:]
train_dataset = MyDataSet(x_train, y_train)
test_dataset = MyDataSet(x_test, y_test)
# Shuffle the training samples every epoch, as the first training DataLoader in this
# notebook does; the evaluation loader keeps the original order.
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=6)
test_dataloader = DataLoader(test_dataset, batch_size=128, num_workers=6)

In [None]:
from pytorch_lightning import loggers as pl_loggers

tb_logger = pl_loggers.TensorBoardLogger('logs/')

# Positional arguments are: dim_in (62 input columns), dim, res_coef, dropout_p, n_layers.
training_module = TrainingModule(62, 32, 0.5, 0.1, 20)
trainer = pl.Trainer(
    max_epochs=3,
    gpus=1,
    progress_bar_refresh_rate=100,
    val_check_interval=0.5,
    logger=tb_logger,
)
trainer.fit(training_module, train_dataloader, test_dataloader)

In [None]:
# Launch TensorBoard on the `logs` directory.
%tensorboard --logdir logs

Now let us see how LightGBM works.

In [None]:
import lightgbm as lgb

# LightGBM consumes NumPy arrays, so convert the torch tensors first.
x_train_np, y_train_np = x_train.numpy(), y_train.numpy()
x_test_np, y_test_np = x_test.numpy(), y_test.numpy()

# NOTE: these names rebind train_dataset / test_dataset to LightGBM datasets.
train_dataset = lgb.Dataset(x_train_np, label=y_train_np)
test_dataset = lgb.Dataset(x_test_np, label=y_test_np)

In [None]:
params = {
    'num_leaves': 31,
    'objective': 'binary',
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    # bagging_fraction only takes effect when bagging_freq is non-zero;
    # 1 re-samples the rows at every iteration.
    'bagging_freq': 1,
    'metric': 'binary_error',
}
num_round = 2000
# Report the metric on both the training and the held-out data.
eval_list = [train_dataset, test_dataset]
lgb_model = lgb.train(params, train_dataset, num_round, valid_sets=eval_list)


How can we improve (save) our deep learning model?
1. Discretize
2. Variable selection

In [None]:
# coding = 'utf-8'
import numpy as np
import pandas as pd
import tqdm

def encode_label(x):
    """Map every value of ``x`` to an integer code.

    Values are compared by their ``str()`` form: the distinct strings are sorted
    lexicographically and numbered from 0, so e.g. ``10`` sorts before ``2``.
    Returns an array of codes with the same shape as ``x``.
    """
    labels = sorted({str(item) for item in np.unique(x)})
    code_of = {label: code for code, label in enumerate(labels)}
    to_code = np.vectorize(lambda value: code_of[str(value)])
    return to_code(x)

def encode_label_mat(x):
    """Label-encode a 2-D array column by column with ``encode_label``.

    Returns an integer array of the same shape as ``x``; every column is encoded
    independently of the others.
    """
    _, n_cols = x.shape
    encoded = np.empty_like(x, dtype=int)
    for j in range(n_cols):
        encoded[:, j] = encode_label(x[:, j])
    return encoded

def impute_nan(x, method='median'):
    """Replace the null entries of each column of a 2-D array by that column's median.

    The median is computed over entries that are neither null nor +/-inf; only
    null entries are replaced (infinite values are kept as they are).

    Raises:
        NotImplementedError: for any ``method`` other than ``'median'``.
    """
    _, n_cols = x.shape
    filled = np.empty_like(x)

    for j in range(n_cols):
        if method != 'median':
            raise NotImplementedError()
        column = x[:, j]
        # Entries that take part in the median: not null and not infinite.
        usable = ~(pd.isnull(column) | (column == np.inf) | (column == -np.inf))
        impute_value = np.median(column[usable])

        fill_missing = np.vectorize(lambda value: impute_value if pd.isnull(value) else value)
        filled[:, j] = fill_missing(column)
    return filled


def get_uniform_interval(minimum, maximum, nbins):
    """Return the ``nbins + 1`` edges that split ``[minimum, maximum]`` into equal bins.

    The first and last edges are exactly ``minimum`` and ``maximum``; the inner
    edges are ``minimum + k * step`` for ``k = 1 .. nbins - 1``.
    """
    step_size = float(maximum - minimum) / nbins
    inner_edges = [minimum + step_size * k for k in range(1, nbins)]
    return [minimum, *inner_edges, maximum]


def get_interval_v2(x, sorted_intervals):
    """Return the index of the interval of ``sorted_intervals`` that contains ``x``.

    Interval ``i`` is ``[sorted_intervals[i], sorted_intervals[i + 1])``; the last
    edge is treated as the start of an open-ended interval ``[last, +inf)``.

    Args:
        x: scalar value to locate.
        sorted_intervals: ascending sequence of interval edges. It is not modified.

    Returns:
        ``-1`` if ``x`` is null, ``-2`` if it is ``+inf``, ``-3`` if it is ``-inf``,
        the interval index otherwise, or ``None`` when ``x`` lies below the first
        edge (or there are no edges).
    """
    if pd.isnull(x):
        return -1
    if x == np.inf:
        return -2
    if x == -np.inf:
        return -3
    # Extend a private copy with +inf. Appending to the caller's list would grow
    # it by one element per call, which adds up quickly under np.vectorize.
    edges = list(sorted_intervals) + [np.inf]
    for interval in range(len(edges) - 1):
        if edges[interval] <= x < edges[interval + 1]:
            return interval
    return None


def get_quantile_interval(data, nbins):
    """Return the ``nbins + 1`` quantile edges of the finite, non-null entries of ``data``.

    The quantile levels are the uniform grid on ``[0, 1]`` with ``nbins`` bins.
    """
    levels = get_uniform_interval(0, 1, nbins)
    finite = data[(~pd.isnull(data)) & (data != np.inf) & (data != -np.inf)]
    return list(np.quantile(finite, levels))


def discretize(x, nbins=20):
    """Quantile-discretise every column of a 2-D array.

    For each column the distinct quantile edges are computed, every value is
    mapped to its interval index with ``get_interval_v2``, and the indices are
    then re-encoded with ``encode_label``.

    Returns:
        A pair ``(codes, interval_list)``: ``codes`` is an ``int64`` array shaped
        like ``x``; ``interval_list[col]`` holds the midpoints of consecutive
        edges of column ``col``.
    """
    _, ncol = x.shape
    codes = np.empty_like(x)
    interval_list = []
    for col in range(ncol):
        column = x[:, col]
        # Duplicate quantile values collapse into a single edge.
        intervals = sorted(set(get_quantile_interval(column, nbins)))
        centroids = [0.5 * (low + high) for low, high in zip(intervals[:-1], intervals[1:])]

        to_bin = np.vectorize(lambda value: get_interval_v2(value, intervals))
        codes[:, col] = encode_label(to_bin(column))
        interval_list.append(centroids)
    return codes.astype(np.int64), interval_list

def get_var_type(df):
    """Group the column names of ``df`` by their name prefix.

    Returns a dict with the keys ``'continuous'`` (names starting with
    ``continuous_``), ``'discrete'`` (names starting with ``discrete_``) and
    ``'other'`` (everything else), each in original column order.
    """
    continuous_vars, discrete_vars, other_vars = [], [], []
    for column in df.columns:
        if column.startswith('continuous_'):
            continuous_vars.append(column)
        elif column.startswith('discrete_'):
            discrete_vars.append(column)
        else:
            other_vars.append(column)
    return {'continuous': continuous_vars,
            'discrete': discrete_vars,
            'other': other_vars}


def get_cont_var(df):
    """Return the column names of ``df`` that ``get_var_type`` classifies as continuous."""
    return get_var_type(df)['continuous']


def get_dis_var(df):
    """Return the column names of ``df`` that ``get_var_type`` classifies as discrete."""
    return get_var_type(df)['discrete']

def drop_const_var(data):
    """Return a deep copy of ``data`` without its constant columns.

    A column counts as constant when it has at most one distinct non-null value
    (which includes all-null columns). ``data`` itself is left untouched.
    """
    constant_cols = [
        col for col in data.columns
        if len(data.loc[~pd.isnull(data[col]), col].unique()) <= 1
    ]
    return data.copy(deep=True).drop(columns=constant_cols)

In [None]:
x_train_np, y_train_np = x_train.numpy(), y_train.numpy()
x_test_np, y_test_np = x_test.numpy(), y_test.numpy()

# Discretise train and test rows in one pass, then split them back at row 50000.
x = np.concatenate([x_train_np, x_test_np])
x_dis, centroid = discretize(x)
x_dis_train, x_dis_test = x_dis[:50000, :], x_dis[50000:, :]

In [None]:
class TabDataset(Dataset):
    """Map-style dataset over NumPy arrays, stored as ``int32`` tensors."""

    def __init__(self, x, y):
        """Convert the NumPy arrays ``x`` (features) and ``y`` (targets) to int32 tensors."""
        super().__init__()
        features = torch.from_numpy(x)
        targets = torch.from_numpy(y)
        self.x = features.type(torch.int32)
        self.y = targets.type(torch.int32)

    def __len__(self):
        """Return the number of rows of the feature tensor."""
        return self.x.shape[0]

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        return self.x[idx, :], self.y[idx]

### Create Embedding Factories

In [None]:
!pip install einops

In [None]:

class EmbeddingFactory(nn.Module):
    """One ``nn.Embedding`` table per column of an integer-coded feature matrix.

    Args:
        x: 2-D array of non-negative integer codes, used only to size the tables.
        dim_out: embedding dimension shared by all columns.
    """

    def __init__(self, x, dim_out):
        super().__init__()
        self.dim_out = dim_out
        self.module_list = nn.ModuleList(
            [nn.Embedding(self._table_size(x[:, col]), dim_out) for col in range(x.shape[1])])

    @staticmethod
    def _table_size(column):
        """Return a table size large enough for every code that occurs in ``column``."""
        n_unique = len(np.unique(column))
        if n_unique == 0:
            return 0
        # For contiguous codes 0..k-1 both terms equal k. Sizing by the largest
        # code as well keeps non-contiguous codes from indexing out of range.
        return max(n_unique, int(column.max()) + 1)

    def forward(self, x):
        """Embed every column of ``x`` and return a ``(batch, dim_out, n_columns)`` tensor."""
        result = [self.module_list[col](x[:, col]).unsqueeze(2) for col in range(x.shape[1])]
        return torch.cat(result, dim=2)

In [None]:
from einops import rearrange, reduce, repeat
import pytorch_lightning as pl
from pytorch_lightning.metrics import Accuracy
# Shuffle the training samples every epoch, as the first training DataLoader in this
# notebook does; the evaluation loader keeps the original order.
train_dataloader = DataLoader(TabDataset(x_dis_train, y_train_np), batch_size=32, shuffle=True, num_workers=6)
test_dataloader = DataLoader(TabDataset(x_dis_test, y_test_np), batch_size=128, num_workers=6)

class TrainingModuleV2(pl.LightningModule):
    """Lightning module: per-column embeddings of discretised features feeding ``MyNetwork``.

    Args:
        x: 2-D array of integer codes, used to size the embedding tables and the
            input width of the backbone.
        dim_emb: embedding dimension of every column.
        dim_mlp: hidden width of the backbone.
        res_coef, dropout_p, n_layers: forwarded to ``MyNetwork``.
    """

    def __init__(self, x, dim_emb, dim_mlp, res_coef=0, dropout_p=0, n_layers=10):
        super().__init__()
        self.embedding = EmbeddingFactory(x, dim_emb)
        self.backbone = MyNetwork(x.shape[1]*dim_emb, dim_mlp, res_coef, dropout_p, n_layers)
        self.loss = nn.BCELoss()
        self.accuracy = Accuracy()

    def forward(self, x):
        """Embed the integer codes, flatten the embeddings per sample and run the backbone."""
        x = self.embedding(x)                 # (batch, dim_emb, n_columns)
        # The backbone expects a flat feature vector of size n_columns * dim_emb.
        x = rearrange(x, "b h e -> b (h e)")  # (batch, dim_emb * n_columns)
        return self.backbone(x)

    def _loss_and_accuracy(self, batch):
        """Run the model on one ``(x, y)`` batch and return ``(loss, accuracy)``."""
        x, y = batch
        pred = self(x)
        # BCELoss is given float targets; the accuracy metric gets the original labels.
        loss = self.loss(pred, y.type(torch.float32))
        acc = self.accuracy(pred, y)
        return loss, acc

    def validation_step(self, batch, batch_idx):
        """Log and return the validation loss and accuracy of one batch."""
        loss, acc = self._loss_and_accuracy(batch)
        self.log("Validation loss", loss)
        self.log("Validation acc", acc)
        return loss, acc

    def training_step(self, batch, batch_idx):
        """Log the training loss and accuracy of one batch and return the loss."""
        loss, acc = self._loss_and_accuracy(batch)
        self.log("Training loss", loss)
        self.log("Training acc", acc)
        return loss

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with learning rate 1e-3."""
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer


In [None]:
from pytorch_lightning import loggers as pl_loggers

tb_logger = pl_loggers.TensorBoardLogger('logs/')
# Positional arguments after x_dis are: dim_emb, dim_mlp, res_coef, dropout_p, n_layers.
training_module = TrainingModuleV2(x_dis, 16, 64, 0.5, 0.1, 10)
trainer = pl.Trainer(
    max_epochs=3,
    gpus=1,
    progress_bar_refresh_rate=100,
    val_check_interval=0.5,
    logger=tb_logger,
)
trainer.fit(training_module, train_dataloader, test_dataloader)

In [None]:
# Launch TensorBoard on the `logs` directory.
%tensorboard --logdir logs

### Let us try TabNet

In [None]:
!pip install pytorch_tabnet


In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor

# Baseline TabNet with default hyper-parameters.
tabnet_clf = TabNetClassifier()
# The evaluation set is the training data itself, reported under the name 'train'.
tabnet_clf.fit(
    x_dis_train,
    y_train_np,
    max_epochs=30,
    eval_set=[(x_dis_train, y_train_np)],
    eval_name=['train'],
    eval_metric=['auc'],
)


The train AUC (the metric reported by `eval_metric=['auc']`) is 0.727 now.

In [None]:
# Same setup as the baseline TabNet, but with mask_type='entmax'.
tabnet_clf2 = TabNetClassifier(mask_type='entmax')
# The evaluation set is the training data itself, reported under the name 'train'.
tabnet_clf2.fit(
    x_dis_train,
    y_train_np,
    max_epochs=30,
    eval_set=[(x_dis_train, y_train_np)],
    eval_name=['train'],
    eval_metric=['auc'],
)



We can see that the train AUC is up to 0.802.


**How can we add pretraining to TabNet?**

In [None]:

# entmax masks plus lambda_sparse=0.001 and n_a=10.
tabnet_clf3 = TabNetClassifier(mask_type='entmax', lambda_sparse=0.001, n_a=10)
# The evaluation set is the training data itself, reported under the name 'train'.
tabnet_clf3.fit(
    x_dis_train,
    y_train_np,
    max_epochs=30,
    eval_set=[(x_dis_train, y_train_np)],
    eval_name=['train'],
    eval_metric=['auc'],
)

The train AUC goes up to 0.816.