In [1]:
!pip install einops

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting einops
  Downloading einops-0.6.0-py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.6/41.6 KB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.6.0


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as opt
from torch.distributions.bernoulli import Bernoulli
from torch.distributions.categorical import Categorical
from torch.distributions.mixture_same_family import MixtureSameFamily
from einops import rearrange
import pandas as pd
import numpy as np
import scipy.stats as ss
import missingno as mno
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix, auc, roc_curve
from copy import copy, deepcopy
import zipfile
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
torch.cuda.set_device(0)
torch.backends.cudnn.benchmark = True

Mounted at /content/drive


In [3]:
def categorical_encoding(train, valid, test, categorical):
    new_cat, OHEncoders = [], {}
    for cat in categorical:
        LE = LabelEncoder()
        train[cat] = LE.fit_transform(train[cat])
        valid[cat] = LE.transform(valid[cat])
        test[cat] = LE.transform(test[cat])
        OHEncoders[cat] = LE
    return train, valid, test, OHEncoders


def numerical_scaling(train, valid, test, numerical):
    MS = MinMaxScaler(feature_range=(0, 1))
    scaled_train = MS.fit_transform(train[numerical])
    scaled_valid = MS.transform(valid[numerical])
    scaled_test = MS.transform(test[numerical])
    train[numerical] = scaled_train
    valid[numerical] = scaled_valid
    test[numerical] = scaled_test
    return train, valid, test, MS


def output_scaling(train, valid, test, output_col):
    YScaler = MinMaxScaler(feature_range=(0.5, 1.5))
    Y_train = YScaler.fit_transform(train[[output_col]]).ravel() + 1e-15
    Y_valid = YScaler.transform(valid[[output_col]]).ravel() + 1e-15
    Y_test = test[output_col]
    return Y_train, Y_valid, Y_test, YScaler


def final_features(X_train, categorical, numerical, binary):
    features = categorical + numerical + binary
    unique_col = [col for col in features if len(pd.unique(X_train[col])) == 1]
    return [col for col in features if col not in unique_col]

In [4]:
df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/classification/hotel_cancellation/cleaned_hotel_bookings.csv')
df

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_week_number,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,...,required_car_parking_spaces,total_of_special_requests,Date_sin_dayofweek,Date_cos_dayofweek,Date_sin_dayofyear,Date_cos_dayofyear,reservation_status_date_sin_dayofweek,reservation_status_date_cos_dayofweek,reservation_status_date_sin_dayofyear,reservation_status_date_cos_dayofyear
0,Resort Hotel,0,342,27,0,0,2,0.0,0,BB,...,0,0,8.660254e-01,-0.5,0.017166,-0.999853,8.660254e-01,-0.5,1.716633e-02,-0.999853
1,Resort Hotel,0,737,27,0,0,2,0.0,0,BB,...,0,0,8.660254e-01,-0.5,0.017166,-0.999853,8.660254e-01,-0.5,1.716633e-02,-0.999853
2,Resort Hotel,0,7,27,0,1,1,0.0,0,BB,...,0,0,8.660254e-01,-0.5,0.017166,-0.999853,1.224647e-16,-1.0,1.224647e-16,-1.000000
3,Resort Hotel,0,13,27,0,1,1,0.0,0,BB,...,0,0,8.660254e-01,-0.5,0.017166,-0.999853,1.224647e-16,-1.0,1.224647e-16,-1.000000
4,Resort Hotel,0,14,27,0,2,2,0.0,0,BB,...,0,1,8.660254e-01,-0.5,0.017166,-0.999853,-8.660254e-01,-0.5,-1.716633e-02,-0.999853
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119201,City Hotel,0,23,35,2,5,2,0.0,0,BB,...,0,0,8.660254e-01,-0.5,-0.848351,-0.529434,8.660254e-01,-0.5,-9.057023e-01,-0.423914
119202,City Hotel,0,102,35,2,5,3,0.0,0,BB,...,0,2,1.224647e-16,-1.0,-0.857315,-0.514793,1.224647e-16,-1.0,-9.128459e-01,-0.408304
119203,City Hotel,0,34,35,2,5,2,0.0,0,BB,...,0,4,1.224647e-16,-1.0,-0.857315,-0.514793,1.224647e-16,-1.0,-9.128459e-01,-0.408304
119204,City Hotel,0,109,35,2,5,2,0.0,0,BB,...,0,0,1.224647e-16,-1.0,-0.857315,-0.514793,1.224647e-16,-1.0,-9.128459e-01,-0.408304


In [5]:
pd.DataFrame({'Dtype': df.dtypes, 'Nunique': df.nunique(), 'Isnull': df.isnull().sum()}, index=df.columns)

Unnamed: 0,Dtype,Nunique,Isnull
hotel,object,2,0
is_canceled,int64,2,0
lead_time,int64,479,0
arrival_date_week_number,int64,53,0
stays_in_weekend_nights,int64,17,0
stays_in_week_nights,int64,33,0
adults,int64,14,0
children,float64,5,0
babies,int64,5,0
meal,object,4,0


In [6]:
out_column = 'is_canceled'
to_remove = []
features = [c for c in df.columns if (c != out_column) and (c not in to_remove)]
categorical = [c for c in features if (df[c].dtype=='object') and (df[c].nunique() > 2)]
binary = [c for c in features if df[c].nunique() == 2]
numerical = [col for col in features if col not in categorical + binary]
df[numerical] = df[numerical].apply(pd.to_numeric,1)
fig = go.Figure(data=go.Heatmap(z=df[numerical].corr(),x=numerical,y=numerical))
fig.show()

In [7]:
from sklearn.model_selection import train_test_split

index_train, index_test = train_test_split(range(df.shape[0]), test_size=0.2, random_state=42, stratify=df[out_column])
tmp_data_train = df.loc[index_train].reset_index( drop = True )
data_test  = df.loc[index_test].reset_index( drop = True )

index_train, index_valid = train_test_split(tmp_data_train.index, test_size=0.1, random_state=0, stratify=tmp_data_train[out_column])
data_train = tmp_data_train.loc[index_train].reset_index( drop = True )
data_valid = tmp_data_train.loc[index_valid].reset_index( drop = True )

# Creating the X, T and E inputs
X_train, X_valid, X_test = data_train[features], data_valid[features], data_test[features]
Y_train, Y_valid, Y_test = data_train[out_column], data_valid[out_column], data_test[out_column]
pos_weight = len(Y_train) / (Y_train.nunique() * np.bincount(Y_train))

cat_enc_d = {}
for cat in categorical+binary:
    print(cat)
    LE = LabelEncoder()
    X_train[cat] = LE.fit_transform(X_train[cat])
    X_valid[cat] = LE.transform(X_valid[cat])
    X_test[cat] = LE.transform(X_test[cat])
    cat_enc_d[cat] = LE
 
MS = MinMaxScaler(feature_range=(0, 1))
scaled_train = MS.fit_transform(X_train[numerical])
scaled_valid = MS.transform(X_valid[numerical])
scaled_test = MS.transform(X_test[numerical])
X_train[numerical], X_valid[numerical], X_test[numerical] = scaled_train, scaled_valid, scaled_test

X_train.shape, X_valid.shape, X_test.shape

meal
market_segment
distribution_channel
reserved_room_type
deposit_type
customer_type
hotel
is_repeated_guest




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

((85827, 28), (9537, 28), (23842, 28))

In [None]:
X_train

Unnamed: 0,hotel,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,market_segment,distribution_channel,...,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,deposit_type,customer_type,adr,required_car_parking_spaces,total_of_special_requests,sin_dayofyear,cos_dayofyear
0,1,0.006784,0.105263,0.00,0.036364,0.0,0.0,0,6,3,...,0.000000,0.0,0,0,2,0.012037,0.000,0.2,0.848985,0.858076
1,1,0.047490,0.052632,0.06,0.036364,0.0,0.0,0,6,3,...,0.000000,0.0,3,0,2,0.045000,0.125,0.2,0.196460,0.102671
2,0,0.075984,0.105263,0.08,0.036364,0.0,0.0,0,6,3,...,0.000000,0.0,3,0,2,0.013587,0.000,0.0,0.996909,0.555678
3,0,0.369064,0.000000,0.04,0.036364,0.0,0.0,0,4,3,...,0.038462,0.0,0,0,3,0.011630,0.000,0.0,0.380979,0.014372
4,1,0.279512,0.105263,0.12,0.036364,0.0,0.0,0,5,3,...,0.000000,0.0,0,0,2,0.006926,0.000,0.0,0.937257,0.257470
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86121,0,0.080054,0.105263,0.00,0.018182,0.0,0.0,0,5,3,...,0.000000,0.0,0,0,3,0.001111,0.000,0.0,0.127267,0.166712
86122,0,0.154681,0.105263,0.04,0.036364,0.0,0.0,0,6,3,...,0.000000,0.0,0,0,2,0.013939,0.000,0.0,0.905027,0.206802
86123,1,0.389417,0.052632,0.06,0.036364,0.0,0.0,0,4,0,...,0.000000,0.0,0,0,3,0.009352,0.125,0.0,0.013350,0.614844
86124,0,0.145183,0.000000,0.06,0.036364,0.0,0.0,0,3,1,...,0.000000,0.0,0,0,2,0.022000,0.000,0.0,0.062743,0.257470


In [8]:
from torch import autograd
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
class ClsLoader(Dataset):
    def __init__(self, X, Y, numerical_col, categorical_col):
        self.X1, self.X2, self.Y = X[numerical_col].values.astype(np.float32), X[categorical_col].values, Y.astype(np.float32)
 
    def __len__(self):
        return len(self.Y)
 
    def __getitem__(self, idx):
        return self.X1[idx], self.X2[idx], self.Y[idx]

class Embedder(nn.Module):
    def __init__(self, vocab_size, dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, dim)

    def forward(self, x):
        return self.embeddings(x)

class NNModel(nn.Module):
    def __init__(self, input_shape, units=None, factors=None, activ=True, norm=True, dropout=False, slops=None):
        super().__init__()
        self.input_shape = input_shape
        self.units = units
        self.factors = factors
        self.activ, self.norm = activ, norm
        self.network = nn.ModuleList()
        if self.factors:
            self.units = np.round(self.input_shape * np.asarray(self.factors)).astype(int)
        if self.units is not None:
            self.dropout = np.zeros_like(self.units) if not dropout else dropout
            self.slops = np.full(len(self.units), 1) if slops is None else slops
            for i, j, k in zip(self.units, self.dropout, self.slops):
                if i >= 1:
                    block = self.__build_block__(input_shape, i, p=j, slop=k)
                    self.network.extend(block)
                    input_shape = i
        self.output_shape = input_shape
        self.reset_parameters()
    
    def __build_block__(self, input_shape, units, p, slop):
        block = []
        block.append(nn.Linear(input_shape, units, bias=not self.norm))
        if self.norm:
            #block.append(nn.BatchNorm1d(units))
            block.append(nn.LayerNorm(units, eps=1e-5))
        if self.activ:
            #block.append(nn.ELU(slop))
            block.append(nn.GELU())
        if p > 0:
            block.append(nn.Dropout(p))
        return block
 
    def forward(self, x):
        for layer in self.network:
          tmp = layer(x)
          x = tmp
        return x
 
    def reset_parameters(self):
        for layer in self.network:
            if isinstance(layer, nn.Linear):
                nn.init.xavier_normal_(layer.weight)
                #layer.bias.data.fill_(0.1)
 
 
class DenseBlock(nn.Module):
    def __init__(self, input_shape, nb_block, growth_rate, units, factors=None, dropout=0., slops=1.):
        super(DenseBlock, self).__init__()
        self.dropout = [dropout for i in range(nb_block)]
        self.slops = [slops for i in range(nb_block)]
        self.growth_rate = growth_rate
        self.network = nn.ModuleList()
        for j, k, l in zip(self.dropout, self.slops, range(nb_block)):
            block = NNModel(input_shape, units=[units], factors=factors, dropout=[j], slops=[k])
            self.network.extend([block])
            input_shape += units
            units += growth_rate
        self.input_shape = input_shape
        self.output_shape = units
 
    def forward(self, input):
        for block in self.network:
            x = block(input)
            input = torch.cat((input, x), 1)
        return input
 
class DenseNet(nn.Module):
    def __init__(self, input_layer, dense_blocks):
        super(DenseNet, self).__init__()
        self.input_layer = input_layer
        self.network = nn.ModuleList()
        self.network.extend(dense_blocks)
        self.output_shape = dense_blocks[-1].output_shape
 
    def forward(self, input):
        input = self.input_layer(input) if self.input_layer else input
        for i in self.network:
            input = i(input)
        return input

class PositionalEncoder(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
        self.pe = torch.zeros(1, max_len, d_model).to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
        self.pe[0,:, 0::2] = torch.sin(position * div_term)
        self.pe[0,:, 1::2] = torch.cos(position * div_term)
    
    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return x

class MultiHeadSelfAttention(nn.Module):
    def __init__(self, dim, heads=8, dim_head=None):
        """
        Implementation of multi-head attention layer of the original transformer model.
        einsum and einops.rearrange is used whenever possible
        Args:
            dim: token's dimension, i.e. word embedding vector size
            heads: the number of distinct representations to learn
            dim_head: the dim of the head. In general dim_head<dim.
            However, it may not necessary be (dim/heads)
        """
        super().__init__()
        self.dim_head = (int(dim / heads)) if dim_head is None else dim_head
        _dim = self.dim_head * heads
        self.heads = heads
        self.to_qvk = nn.Linear(dim, _dim * 3, bias=False)
        self.W_0 = nn.Linear( _dim, dim, bias=False)
        self.scale_factor = self.dim_head ** -0.5
    def forward(self, x, mask=None):
        assert x.dim() == 3
        # Step 1
        qkv = self.to_qvk(x)  # [batch, tokens, dim*3*heads ]
        # Step 2
        # decomposition to q,v,k and cast to tuple
        # the resulted shape before casting to tuple will be:
        # [3, batch, heads, tokens, dim_head]
        q, k, v = tuple(rearrange(qkv, 'b t (d k h) -> k b h t d ', k=3, h=self.heads))
        # Step 3
        # resulted shape will be: [batch, heads, tokens, tokens]
        scaled_dot_prod = torch.einsum('b h i d , b h j d -> b h i j', q, k) * self.scale_factor
        if mask is not None:
            assert mask.shape == scaled_dot_prod.shape[2:]
            scaled_dot_prod = scaled_dot_prod.masked_fill(mask, -np.inf)
        attention = torch.softmax(scaled_dot_prod, dim=-1)
        # Step 4. Calc result per batch and per head h
        out = torch.einsum('b h i j , b h j d -> b h i d', attention, v)
        # Step 5. Re-compose: merge heads with dim_head d
        out = rearrange(out, "b h t d -> b t (h d)")
        # Step 6. Apply final linear transformation layer
        return self.W_0(out)

class TransformerBlock(nn.Module):
   """
   Vanilla transformer block from the original paper "Attention is all you need"
   Detailed analysis: https://theaisummer.com/transformer/
   """
   def __init__(self, dim, heads=8, dim_head=None, dim_linear_block=1024, dropout=0.1):
       """
       Args:
           dim: token's vector length
           heads: number of heads
           dim_head: if none dim/heads is used
           dim_linear_block: the inner projection dim
           dropout: probability of droppping values
       """
       super().__init__()
       self.mhsa = MultiHeadSelfAttention(dim=dim, heads=heads, dim_head=dim_head)
       self.drop = nn.Dropout(dropout)
       self.norm_1 = nn.LayerNorm(dim)
       self.norm_2 = nn.LayerNorm(dim)
       #self.linear = NNModel(dim, units=[dim_linear_block, dim], factors=None, dropout=[dropout, dropout], norm=False)
       self.linear = nn.Sequential(
           nn.Linear(dim, dim_linear_block), nn.ReLU(), nn.Dropout(dropout),
           nn.Linear(dim_linear_block, dim), nn.Dropout(dropout)
       )
   def forward(self, x, mask=None):
       y = self.norm_1(self.drop(self.mhsa(x, mask)) + x)
       return self.norm_2(self.linear(y) + y)

class Transformer(nn.Module):
    def __init__(self, blocks, dim, heads=8, dim_head=None, dim_linear_block=1024, dropout=0.1):
       super().__init__()
       self.block_list = [TransformerBlock(dim, heads, dim_head, dim_linear_block=dim_linear_block, dropout=dropout) for _ in range(blocks)]
       self.layers = nn.ModuleList(self.block_list)

    def forward(self, x, mask=None):
       for layer in self.layers:
           x = layer(x, mask)
       return x

class TabTransformer(nn.Module):
    def __init__(self, categories, numerical_nb, blocks, dim, mlp_units, heads=8, dim_head=None, dim_linear_block=1024, dropout=0.1, mlp_dropout=0.00001, apply_pos=False):
        """
        categories: tuple containing the number of unique values within each category
        """
        super().__init__()
        self.embed = Embedder(sum(categories), dim)
        #self.cont_embed = nn.ModuleList([NNModel(1, units=[dim], factors=None, norm=False) for i in range(numerical_nb)])
        self.cont_embed = NNModel(numerical_nb, units=[dim], factors=None, norm=False, activ=False)
        self.pe = PositionalEncoder(dim)
        self.transformer = Transformer(blocks, dim, heads, dim_head, dim_linear_block, dropout)
        #self.input_size = dim * (len(categories) + numerical_nb) + numerical_nb
        self.input_size = dim * (len(categories) + 1) + numerical_nb
        self.mlp = NNModel(self.input_size, units=mlp_units, factors=None, dropout=[mlp_dropout]*len(mlp_units))
        self.norm = nn.LayerNorm(numerical_nb)
        self.output_shape = mlp_units[-1]
        self.apply_pos = apply_pos

    def forward(self, x_cont, x_cat):
        #x = torch.stack([e(x_cont[:,[i]]) for i,e in enumerate(self.cont_embed)], 1)#[batch_shape, num_cont, dim]
        x1_ = self.cont_embed(x_cont)#[batch_shape, dim]
        x = torch.unsqueeze(x1_, 1)
        if x_cat.nelement() != 0: #skipped if there's no categorical feature
            x2 = self.embed(x_cat)#[batch_shape, num_cat, dim]
            #x = torch.cat((x, x2), 1)#[batch_shape, num_cat+num_cont, dim]
            x = torch.cat((x, x2), 1)#[batch_shape, num_cat+1, dim]
        x = self.pe(x) if self.apply_pos else x
        x = self.transformer(x)
        x = x.flatten(1)
        x = torch.cat((x, self.norm(x_cont)), dim = -1)
        return self.mlp(x)


class BinClsN(nn.Module):
    def __init__(self, shared):
        super(BinClsN, self).__init__()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.shared = shared
        self.pi = nn.Linear(self.shared.output_shape, 1)
 
    def forward(self, x_cont, x_cat):
        z = self.shared(x_cont, x_cat)
        pred = self.pi(z)
        return pred

class MBNModel(nn.Module):#Mixture Binomial Network
    def __init__(self, shared, clf_nn, rt_nn, n_comp):
        super(MBNModel, self).__init__()
        self.shared = shared
        self.clf_nn = clf_nn
        self.rt_nn = rt_nn
        self.n_comp = n_comp
        self.pi = nn.Linear(self.clf_nn.output_shape, self.n_comp) if self.n_comp > 1 else None
        self.ai = nn.Linear(self.rt_nn.output_shape, self.n_comp)
 
    def forward(self, x_cont, x_cat):
        x = self.shared(x_cont, x_cat)
        proba = self.proba_model(x) if self.n_comp > 1 else torch.ones((len(x),1)).to(device)
        rt = self.rt_model(x)
        return proba, rt
 
    def proba_model(self, x):
        model = self.clf_nn(x)
        model = self.pi(model)
        model = nn.Softmax(dim=-1)(model)
        return model
 
    def rt_model(self, x):
        model = self.rt_nn(x)
        model = self.ai(model)
        model = nn.Softmax(dim=-1)(model)
        return model
 
class BaseParametric:
    def __init__(self, model, numerical_col, categorical_col, resume=None):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = model.to(self.device)
        self.losses = {'Epoch': [], 'Train': [], 'Test': [], 'BState': [], 'LState': [], 'LR': []}
        self.numerical_col, self.categorical_col = numerical_col, categorical_col
 
    def train_model(self, optim, train_loader, grad_clip, l2_reg):
          total_loss = 0
          self.model = self.model.train()
        #with autograd.detect_anomaly():
          for i, (X1, X2, Y) in enumerate(train_loader):
              X1, X2, Y = X1.to(self.device), X2.to(self.device), Y.to(self.device)
              #self.model.get_weight()
              optim.zero_grad()
              loss = self.loss_function(X1, X2, Y, l2_reg)
              loss.backward()
              torch.nn.utils.clip_grad_norm_(self.model.parameters(), grad_clip)
              optim.step()
              '''self.model.get_grad()
              print('_'*50)'''
              total_loss += loss.item()
          return total_loss/(i+1)
        
 
    def eval_model(self, test_loader):
        self.model = self.model.eval()
        total_loss = 0
        for i, (X1, X2, Y) in enumerate(test_loader):
            X1, X2, Y = X1.to(self.device), X2.to(self.device), Y.to(self.device)
            loss = self.loss_function(X1, X2, Y, l2_reg=0)
            total_loss += loss.item()
        return total_loss/(i+1)#np.abs(-100. - total_loss)
 
 
    def fit(self, X_train, Y_train, epoch, lr, opt_kwarg, batch_size=None,  grad_clip=100, momentum=0.9,
            X_test=None, Y_test=None, l2_reg=0, eval=True, verbose=True, save=True):
        
 
        batch_size = len(X_train) if batch_size is None else batch_size
        #X_train_uncens, Y_train_uncens, T_train_uncens, X_train_cens, Y_train_cens, T_train_cens = self.process_data(X_train, T_train, E_train)
        train_load = DataLoader(ClsLoader(X_train, Y_train, self.numerical_col, self.categorical_col), batch_size=batch_size, shuffle=True)  # DATALOADER obj
        if X_test is not None:
            #X_test_uncens, Y_test_uncens, T_test_uncens, X_test_cens, Y_test_cens, T_test_cens = self.process_data(X_test, T_test, E_test)
            test_load = DataLoader(ClsLoader(X_test, Y_test, self.numerical_col, self.categorical_col), batch_size=batch_size, shuffle=True)  # DATALOADER obj
 
        best_loss = 1e100
        #optim = opt.Adam(self.model.parameters(), lr=lr)
        optim = opt.SGD(self.model.parameters(), lr=lr, momentum=momentum, nesterov=True)

        #scheduler = None
        scheduler = opt.lr_scheduler.CyclicLR(optim, **opt_kwarg)
        #scheduler = opt.lr_scheduler.ReduceLROnPlateau(optim, **opt_kwarg)
        #scheduler = opt.lr_scheduler.MultiStepLR(optim, milestones=[28, 120], gamma=0.1)

        eval_score = ''
        for i in range(epoch):
            if verbose:
                print('##### EPOCH ' + str(i) + ' #####')
               
            train_loss = self.train_model(optim, train_load, grad_clip, l2_reg)
            self.losses['LState'] = deepcopy(self.model.state_dict())
    
            if verbose:
                print('train loss : ', train_loss)
            self.losses['Epoch'].append(i), self.losses['Train'].append(train_loss)
    
            if X_test is not None:
                valid_loss = self.eval_model(test_load)

                if verbose:
                    print('test loss : ', valid_loss)
                self.losses['Test'].append(valid_loss)
    
                if scheduler is not None:
                    '''scheduler.step(valid_loss)
                    self.losses['LR'].append(optim.param_groups[0]['lr'])'''
                    scheduler.step()
                    self.losses['LR'].append(scheduler.get_last_lr()[0])
    
                if valid_loss < best_loss:
                    self.losses['BState'] = deepcopy(self.model.state_dict())
                    best_loss = valid_loss
                    print('===========SAVE===========')
            

    def feature_importance(self, rep, X_test, Y_test, batch_size=None):
        res = np.zeros((rep, X_test.shape[1]))
        batch_size = len(X_train) if batch_size is None else batch_size
        test_load = DataLoader(ClsLoader(X_test, Y_test, self.numerical_col, self.categorical_col), batch_size=batch_size, shuffle=True)  # DATALOADER obj
        base_loss = self.eval_model(test_load)
        origin = X_test.copy()
        for i, col in enumerate(X_test.columns):
            for j in range(rep):
                X_test.loc[:,col] = np.random.permutation(X_test.loc[:, col])
                test_load = DataLoader(ClsLoader(X_test, Y_test, self.numerical_col, self.categorical_col), batch_size=batch_size, shuffle=True)  # DATALOADER obj
                loss = self.eval_model(test_load)
                res[j, i] = base_loss - loss
                X_test = origin
        res = np.abs(res)
        return {'importances': res, 'importances_mean': np.mean(res, 0), 'importances_std': np.std(res, 0)}

class BCEModel(BaseParametric):
    def __init__(self, model, numerical_col, categorical_col,resume=None):
        super(BCEModel, self).__init__(model, numerical_col, categorical_col,)
        self.pos_weight = torch.from_numpy(pos_weight[[0]]).to(device)

    def loss_function(self, X1, X2, Y, l2_reg):
        pred = self.model(X1, X2)
        bce_loss = nn.BCEWithLogitsLoss(pos_weight=self.pos_weight)
        loss = bce_loss(pred, Y)
        return loss

    def prdict(self, X):
        self.model.eval()
        X1 = torch.tensor(X[self.numerical_col].values.astype(np.float32)).to(self.device)
        X2 = torch.tensor(X[self.categorical_col].values).to(self.device)
        pred = nn.Sigmoid()(self.model(X1, X2))
        return pred

class MixtureBinomialModel(BaseParametric):
    def __init__(self, model, numerical_col, categorical_col,resume=None):
        super(MixtureBinomialModel, self).__init__(model, numerical_col, categorical_col,)

    def loss_function(self, X1, X2, Y, l2_reg):
        pi, rt = self.model(X1, X2)
        mix = Categorical(pi)
        comp = Bernoulli(rt, validate_args=None)
        pdf = MixtureSameFamily(mix, comp).log_prob(Y)
        ll1 = -torch.mean(pdf)
        return ll1

    def predict(self, X):
        self.model.eval()
        X1 = torch.tensor(X[self.numerical_col].values.astype(np.float32)).to(self.device)
        X2 = torch.tensor(X[self.categorical_col].values).to(self.device)
        pi, rt = self.model(X1, X2)
        mix = Categorical(pi)
        comp = Bernoulli(rt, validate_args=None)
        msf = MixtureSameFamily(mix, comp)
        pred = msf.mean
        return pred, pi, rt
 
def gradient_clipper(model: nn.Module, val: float) -> nn.Module:
    def process_grad(grad):
        grad[grad != grad] = 1e-10
        return torch.clamp(grad, -val, val)
    for parameter in model.parameters():
        parameter.register_hook(lambda grad: process_grad(grad))
    
    return model

In [9]:
epoch, lr, batch_size, d, mlp_d = 50000, 1e-4, 4096, 0.000001, 1e-6
cyclic_kwarg = {'base_lr': lr, 'max_lr': 1e-2, 'step_size_up':200, 'step_size_down':200}
plateau_kwarg = {'factor':0.5, 'patience':200, 'verbose':True, 'min_lr':1e-7, 'mode':'min'}

categories = list(X_train[categorical+binary].nunique())
shared_nn = TabTransformer(categories, len(numerical), blocks=4, dim=64, mlp_units=[512], heads=8, dim_head=None, dim_linear_block=1024, dropout=0.1, mlp_dropout=mlp_d)

nn_model = gradient_clipper(BinClsN(shared_nn), 10)
#nn_model.load_state_dict(best_state)
print(nn_model)
print(sum(p.numel() for p in nn_model.parameters() if p.requires_grad))
dws = BCEModel(nn_model, numerical, categorical+binary)
dws.fit(X_train, Y_train.values[:,None], epoch, lr, cyclic_kwarg, batch_size=batch_size, grad_clip=10, momentum=0.9,
        X_test=X_valid, Y_test=Y_valid.values[:,None], l2_reg=0, eval=False, verbose=True)

BinClsN(
  (shared): TabTransformer(
    (embed): Embedder(
      (embeddings): Embedding(36, 64)
    )
    (cont_embed): NNModel(
      (network): ModuleList(
        (0): Linear(in_features=20, out_features=64, bias=True)
      )
    )
    (pe): PositionalEncoder()
    (transformer): Transformer(
      (layers): ModuleList(
        (0): TransformerBlock(
          (mhsa): MultiHeadSelfAttention(
            (to_qvk): Linear(in_features=64, out_features=192, bias=False)
            (W_0): Linear(in_features=64, out_features=64, bias=False)
          )
          (drop): Dropout(p=0.1, inplace=False)
          (norm_1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (norm_2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (linear): Sequential(
            (0): Linear(in_features=64, out_features=1024, bias=True)
            (1): ReLU()
            (2): Dropout(p=0.1, inplace=False)
            (3): Linear(in_features=1024, out_features=64, bias=True)
    

KeyboardInterrupt: ignored

In [10]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

best_state = deepcopy(dws.losses['BState'])
dws.model.load_state_dict(best_state)
print(np.min(dws.losses['Test']))

fig = make_subplots(rows=3, cols=1)
s = 0
model = dws
fig.append_trace(go.Scatter(x=model.losses['Epoch'][s:], y=model.losses['Train'][s:],mode='lines',name='Train'), row=1, col=1)
fig.append_trace(go.Scatter(x=model.losses['Epoch'][s:], y=model.losses['Test'][s:],mode='lines',name='Test'), row=2, col=1)
fig.append_trace(go.Scatter(x=model.losses['Epoch'][s:], y=model.losses['LR'][s:],mode='lines',name='LR'), row=3, col=1)
fig.update_layout(height=1000, width=1500, title_text="Stacked Subplots")
fig.show()

0.003754886177678903


In [11]:
pred = []
for i in range(0, len(X_valid), 1000):
    pred_ = dws.prdict(X_valid.iloc[i:i+1000])
    pred_ = pred_.cpu().data.numpy().ravel()
    pred.extend(pred_)
pred = np.asarray(pred).ravel()

auc = roc_auc_score(Y_valid, pred)
pd.DataFrame({'AUC': auc, 'ACC': accuracy_score(Y_valid, np.round(pred)), 'PRE': precision_score(Y_valid, np.round(pred)), 'REC': recall_score(Y_valid, np.round(pred)), 'F1':f1_score(Y_valid, np.round(pred))}, index=[0])

Unnamed: 0,AUC,ACC,PRE,REC,F1
0,0.999958,0.998217,0.998583,0.996606,0.997594


In [12]:
pred = []
for i in range(0, len(X_test), 1000):
    pred_ = dws.prdict(X_test.iloc[i:i+1000])
    pred_ = pred_.cpu().data.numpy().ravel()
    pred.extend(pred_)
pred = np.asarray(pred).ravel()

auc = roc_auc_score(Y_test, pred)
pd.DataFrame({'AUC': auc, 'ACC': accuracy_score(Y_test, np.round(pred)), 'PRE': precision_score(Y_test, np.round(pred)), 'REC': recall_score(Y_test, np.round(pred)), 'F1':f1_score(Y_test, np.round(pred))}, index=[0])

Unnamed: 0,AUC,ACC,PRE,REC,F1
0,0.99999,0.998658,0.998754,0.997624,0.998189


In [13]:
confusion_matrix(Y_test, np.round(pred))

array([[14992,    11],
       [   21,  8818]])

In [14]:
imp = dws.feature_importance(10, X_test, Y_test.values[:,None], batch_size=None)
fig = go.Figure()
for i in range(X_test.shape[1]):
    fig.add_trace(go.Box(x=imp['importances'][:, i], name=X_test.columns[i]))
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

