## Confidentiality

The programmatic cases in this notebook are adapted from various internet resources (primarily kaggle.com) and are for demonstration purposes only.

Please do not copy or distribute this notebook.


## Table of contents

Santander Bank Customer Transaction

1. Programmatic case 1 
2. Programmatic case 2
3. Programmatic case 3

## Previous knowledge

To follow this notebook, you should have a few years of data-science and programming experience and have worked through the advanced programming notebooks.


#### Programmatic case 1

In [None]:
import fastai
from fastai.tabular import *
from fastai.text import *
import feather
import gc
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from fastai.callbacks import SaveModelCallback
import logging

#logger
def get_logger():
    """Configure basic logging and return the 'main' logger at DEBUG level.

    `logging.basicConfig` installs the shared record format; the returned
    logger is the process-wide object registered under the name 'main'.
    """
    logging.basicConfig(format='[%(levelname)s]%(asctime)s:%(name)s:%(message)s')
    main_logger = logging.getLogger('main')
    main_logger.setLevel(logging.DEBUG)
    return main_logger


# Module-wide logger used by the rest of this cell.
logger = get_logger()

def auroc_score(input, target):
    """Return the ROC AUC of the column-1 scores in `input` against `target`.

    Both arguments are moved to the CPU and converted to NumPy arrays before
    being handed to `roc_auc_score`; only column 1 of `input` is used.
    """
    scores = input.cpu().numpy()[:, 1]
    labels = target.cpu().numpy()
    return roc_auc_score(labels, scores)

# Callback to calculate AUC at the end of each epoch
class AUROC(Callback):
    """Callback that reports ROC AUC over the non-training batches of each epoch.

    Outputs and targets of every batch seen with `train=False` are collected
    during the epoch; at epoch end they are concatenated, soft-maxed and scored
    with `auroc_score`, and the result is appended to the epoch's metrics.
    """
    _order = -20 #Needs to run before the recorder

    def __init__(self, learn, **kwargs):
        self.learn = learn

    def on_train_begin(self, **kwargs):
        # Register the column name under which the metric is reported.
        self.learn.recorder.add_metric_names(['AUROC'])

    def on_epoch_begin(self, **kwargs):
        # Start every epoch with empty buffers.
        self.output, self.target = [], []

    def on_batch_end(self, last_target, last_output, train, **kwargs):
        if train:
            return
        self.output.append(last_output)
        self.target.append(last_target)

    def on_epoch_end(self, last_metrics, **kwargs):
        # Nothing was collected (no non-training batch this epoch): add no metric.
        if len(self.output) == 0:
            return None
        probabilities = F.softmax(torch.cat(self.output), dim=1)
        labels = torch.cat(self.target)
        return add_metrics(last_metrics, [auroc_score(probabilities, labels)])

# Callback that does the shuffle augmentation
class AugShuffCallback(LearnerCallback):
    """Shuffle-augment every training batch, separately inside each target class.

    For each of the first `n_feat` features, the rows of the positive (target == 1)
    group are permuted with one random permutation and the rows of the negative
    (target == 0) group with another. One permutation is shared by the three
    columns that belong to a feature: categorical column `f`, continuous column
    `f` and continuous column `f + n_feat`. Columns beyond those stay untouched.

    The batch handed back is reordered (all positives first, then all negatives);
    rows whose target is neither 0 nor 1 are not part of the returned batch.

    Args:
        learn: the Learner this callback is attached to.
        n_feat: number of features to shuffle. The continuous input is expected to
            hold at least `2 * n_feat` columns and the categorical input at least
            `n_feat` columns. Defaults to 200, the value previously hard-coded.
    """
    def __init__(self, learn:Learner, n_feat:int=200):
        super().__init__(learn)
        self.n_feat = n_feat

    def _shuffle_feature(self, cat, cont, f):
        "Permute, in place, the three columns of feature `f` with one shared row permutation."
        perm = torch.randperm(cat.size(0)).to(cat.device)
        cat[:, f] = cat[perm, f]
        cont[:, f] = cont[perm, f]
        cont[:, f + self.n_feat] = cont[perm, f + self.n_feat]

    def on_batch_begin(self, last_input, last_target, train, **kwargs):
        if not train: return
        m_pos = last_target==1
        m_neg = last_target==0

        # Boolean-mask indexing returns copies, so the in-place shuffles below
        # never modify the tensors of the incoming batch.
        pos_cat, pos_cont = last_input[0][m_pos], last_input[1][m_pos]
        neg_cat, neg_cont = last_input[0][m_neg], last_input[1][m_neg]

        # Positive group first, then negative group, for every feature: this keeps
        # the order in which random permutations are drawn unchanged.
        for f in range(self.n_feat):
            self._shuffle_feature(pos_cat, pos_cont, f)
            self._shuffle_feature(neg_cat, neg_cont, f)

        new_input = [torch.cat([pos_cat, neg_cat]), torch.cat([pos_cont, neg_cont])]
        new_target = torch.cat([last_target[m_pos], last_target[m_neg]])

        return {'last_input': new_input, 'last_target': new_target}
        
# Just a longer version of the random sampler: each sample is given "mult" times.
class LongerRandomSampler(Sampler):
    """Random sampler whose epoch is `mult` times longer than the data source.

    Without replacement, one random permutation of the indices is drawn per
    iteration and repeated `mult` times back to back, so every sample appears
    exactly `mult` times.

    With replacement, `num_samples` indices are drawn uniformly at random.
    When `num_samples` is not given it defaults to `len(data_source) * mult`.

    Args:
        data_source: sized collection to sample indices for.
        replacement: draw with replacement if True.
        num_samples: number of draws; only allowed with `replacement=True`.
        mult: how many times longer than `data_source` the default epoch is.

    Raises:
        ValueError: if `num_samples` is given without replacement, if it is not
            a positive int, or if `replacement` is not a bool.
    """
    def __init__(self, data_source, replacement=False, num_samples=None, mult=3):
        self.data_source = data_source
        self.replacement = replacement
        self.num_samples = num_samples
        self.mult = mult

        if self.num_samples is not None and replacement is False:
            raise ValueError("With replacement=False, num_samples should not be specified, "
                             "since a random permute will be performed.")

        if self.num_samples is None:
            self.num_samples = len(self.data_source) * self.mult

        if not isinstance(self.num_samples, int) or self.num_samples <= 0:
            raise ValueError("num_samples should be a positive integeral "
                             "value, but got num_samples={}".format(self.num_samples))
        if not isinstance(self.replacement, bool):
            raise ValueError("replacement should be a boolean value, but got "
                             "replacement={}".format(self.replacement))

    def __iter__(self):
        n = len(self.data_source)
        if self.replacement:
            # `num_samples` already includes the `mult` factor when it was defaulted,
            # so it must not be multiplied by `mult` a second time here.
            return iter(torch.randint(high=n, size=(self.num_samples,), dtype=torch.int64).tolist())
        return iter(torch.randperm(n).tolist()*self.mult)

    def __len__(self):
        # Must equal the number of indices __iter__ yields in the same mode.
        if self.replacement:
            return self.num_samples
        return len(self.data_source)*self.mult
        
# This is the NN structure, starting from fast.ai TabularModel.
class my_TabularModel(nn.Module):
    """Tabular model that embeds each of 200 features and pools them with learned weights.

    For every feature the model builds three representations: an embedding of
    its categorical "has one" flag, a two-layer "continuous embedding" of its raw
    value and a two-layer "continuous embedding" of its "not unique" value. A
    small network turns these (plus a second intercept embedding) into one score
    per feature; a softmax over the 200 scores gives the weights of a weighted
    average over features. The three averaged representations are concatenated,
    batch-normalised and passed through the fully connected head `self.layers`.

    The feature count (200) and the width of the intercept embeddings (2) are
    hard-coded throughout.
    """
    def __init__(self, emb_szs:ListSizes, n_cont:int, out_sz:int, layers:Collection[int], ps:Collection[float]=None,
                 emb_drop:float=0., y_range:OptRange=None, use_bn:bool=True, bn_final:bool=False, 
                 cont_emb=2, cont_emb_notu=2):
        """Build all sub-modules.

        Args:
            emb_szs: only `emb_szs[0]` is read, as the two arguments passed to
                `embedding(...)` for the "has one" categorical embedding.
            n_cont: stored on the instance; not otherwise used in this class.
            out_sz: width of the final linear layer.
            layers: hidden layer widths of the head.
            ps: dropout setting for the head, passed through `ifnone`/`listify`.
            emb_drop: dropout probability applied to the three per-feature embeddings.
            y_range: stored on the instance; not applied in `forward`.
            use_bn: enables batch norm in every head block except the first.
            bn_final: accepted but not used in this class.
            cont_emb: indexed as `cont_emb[0]` (hidden width) and `cont_emb[1]`
                (output width) of the raw-value embedding network.
            cont_emb_notu: same pair for the "not unique" embedding network.

        NOTE(review): the defaults `cont_emb=2` / `cont_emb_notu=2` are ints and
        cannot be indexed, so the model can only be built when both are passed
        as 2-element sequences - confirm and fix the defaults.
        """
        
        super().__init__()
        # Two-layer "continuous embedding" network for the raw feature values.
        # Its input has width 1 + 2: one scalar value concatenated with the
        # 2-wide intercept embedding (see `forward`).
        self.cont_emb = cont_emb[1]
        self.cont_emb_l = torch.nn.Linear(1 + 2, cont_emb[0])
        self.cont_emb_l2 = torch.nn.Linear(cont_emb[0], cont_emb[1])
        
        # Same structure for the "not unique" features. cf #1 solution post
        self.cont_emb_notu_l = torch.nn.Linear(1 + 2, cont_emb_notu[0])
        self.cont_emb_notu_l2 = torch.nn.Linear(cont_emb_notu[0], cont_emb_notu[1])
        self.cont_emb_notu = cont_emb_notu[1]
            
        # presumably expands `ps` to one dropout value per hidden layer - verify
        # against the fastai `ifnone` / `listify` helpers
        ps = ifnone(ps, [0]*len(layers))
        ps = listify(ps, layers)
        
        # One embedding table shared by all "has one" categorical features, cf #1 solution post
        self.embeds = embedding(emb_szs[0][0], emb_szs[0][1])
        
        # At first information was included about the variable being processed (to extract feature importance). 
        # It works better using a constant feat (kind of intercept)
        self.embeds_feat = embedding(201, 2)
        self.embeds_feat_w = embedding(201, 2)
        
        self.emb_drop = nn.Dropout(emb_drop)
        
        n_emb = self.embeds.embedding_dim
        n_emb_feat = self.embeds_feat.embedding_dim
        n_emb_feat_w = self.embeds_feat_w.embedding_dim
        
        self.n_emb, self.n_emb_feat, self.n_emb_feat_w, self.n_cont,self.y_range = n_emb, n_emb_feat, n_emb_feat_w, n_cont, y_range
        
        # Head: one bn_drop_lin block per consecutive pair of sizes. The first block
        # gets no dropout ([0.] is prepended to ps) and no batch norm (i == 0);
        # `forward` applies `self.bn` to the head's input instead. The last block
        # has no activation.
        sizes = self.get_sizes(layers, out_sz)
        actns = [nn.ReLU(inplace=True)] * (len(sizes)-2) + [None]
        layers = []
        for i,(n_in,n_out,dp,act) in enumerate(zip(sizes[:-1],sizes[1:],[0.]+ps,actns)):
            layers += bn_drop_lin(n_in, n_out, bn=use_bn and i!=0, p=dp, actn=act)
            
        self.layers = nn.Sequential(*layers)
        # Empty Sequential: passes its input through unchanged in `forward`.
        self.seq = nn.Sequential()
        
        # Input size for the NN that predicts weights
        inp_w = self.n_emb + self.n_emb_feat_w + self.cont_emb + self.cont_emb_notu
        # Input size for the final NN that predicts output
        inp_x = self.n_emb + self.cont_emb + self.cont_emb_notu
        
        # NN that predicts the weights: inp_w -> 5 -> 1 score per (sample, feature)
        self.weight = nn.Linear(inp_w, 5)
        self.weight2 = nn.Linear(5,1)
        
        # The BatchNorm1d(200) layers receive (batch, 200, width) tensors in `forward`,
        # so the 200 features are the normalised channel dimension.
        mom = 0.1
        self.bn_cat = nn.BatchNorm1d(200, momentum=mom)
        self.bn_feat_emb = nn.BatchNorm1d(200, momentum=mom)
        self.bn_feat_w = nn.BatchNorm1d(200, momentum=mom)
        self.bn_raw = nn.BatchNorm1d(200, momentum=mom)
        self.bn_notu = nn.BatchNorm1d(200, momentum=mom)
        self.bn_w = nn.BatchNorm1d(inp_w, momentum=mom)
        self.bn = nn.BatchNorm1d(inp_x, momentum=mom)
        
    def get_sizes(self, layers, out_sz):
        "Return the head's layer widths: pooled input width, the hidden `layers`, then `out_sz`."
        return [self.n_emb + self.cont_emb_notu + self.cont_emb] + layers + [out_sz]

    def forward(self, x_cat:Tensor, x_cont:Tensor) -> Tensor:
        """Compute the head output for one batch.

        `x_cat` is read at columns 0..199 (per-feature categorical codes) and at
        column 200 (presumably the constant 'intercept' category - verify against
        the caller). `x_cont` is split into columns [:200] (raw values) and
        [200:] ("not unique" values); the second slice must also hold 200 columns
        for the reshapes below to succeed.
        """
        b_size = x_cont.size(0)
        
        # embedding of has one feat: 200 per-feature embeddings stacked along dim 1
        x = [self.embeds(x_cat[:,i]) for i in range(200)]
        x = torch.stack(x, dim=1)
        
        # embedding of intercept. It was embedding of feature id before.
        # The same embedding is repeated for all 200 features.
        x_feat_emb = self.embeds_feat(x_cat[:,200])
        x_feat_emb = torch.stack([x_feat_emb]*200, 1)
        x_feat_emb = self.bn_feat_emb(x_feat_emb)
        x_feat_w = self.embeds_feat_w(x_cat[:,200])
        x_feat_w = torch.stack([x_feat_w]*200, 1)
        
        # "continuous embedding" of raw features: every (sample, feature) value becomes
        # its own row, is concatenated with the intercept embedding, goes through the
        # shared two-layer network and is reshaped back to (batch, 200, width)
        x_cont_raw = x_cont[:,:200].contiguous().view(-1, 1)
        x_cont_raw = torch.cat([x_cont_raw, x_feat_emb.view(-1, self.n_emb_feat)], 1)
        x_cont_raw = F.relu(self.cont_emb_l(x_cont_raw))
        x_cont_raw = self.cont_emb_l2(x_cont_raw)
        x_cont_raw = x_cont_raw.view(b_size, 200, self.cont_emb)
        
        # "continuous embedding" of not unique features (same scheme, separate weights)
        x_cont_notu = x_cont[:,200:].contiguous().view(-1, 1)
        x_cont_notu = torch.cat([x_cont_notu, x_feat_emb.view(-1,self.n_emb_feat)], 1)
        x_cont_notu = F.relu(self.cont_emb_notu_l(x_cont_notu))
        x_cont_notu = self.cont_emb_notu_l2(x_cont_notu)
        x_cont_notu = x_cont_notu.view(b_size, 200, self.cont_emb_notu)

        # Batch-normalise the three per-feature representations
        x_cont_notu = self.bn_notu(x_cont_notu)
        x = self.bn_cat(x)
        x_cont_raw = self.bn_raw(x_cont_raw)

        # One shared dropout module is applied to all three representations
        x = self.emb_drop(x)
        x_cont_raw = self.emb_drop(x_cont_raw)
        x_cont_notu = self.emb_drop(x_cont_notu)
        x_feat_w = self.bn_feat_w(x_feat_w)
        
        # Predict a weight for each of the previous embeddings
        # (one row per (sample, feature) pair, width inp_w)
        x_w = torch.cat([x.view(-1,self.n_emb),
                         x_feat_w.view(-1,self.n_emb_feat_w),
                         x_cont_raw.view(-1, self.cont_emb), 
                         x_cont_notu.view(-1, self.cont_emb_notu)], 1)

        x_w = self.bn_w(x_w)

        # One score per (sample, feature); the softmax over each sample's scores
        # makes the weights of a sample sum to 1
        w = F.relu(self.weight(x_w))
        w = self.weight2(w).view(b_size, -1)
        w = torch.nn.functional.softmax(w, dim=-1).unsqueeze(-1)

        # weighted average of the differents embeddings using weights given by NN
        x = (w * x).sum(dim=1)
        x_cont_raw = (w * x_cont_raw).sum(dim=1)
        x_cont_notu = (w * x_cont_notu).sum(dim=1)
        
        # Use NN on the weighted average to predict final output
        # NOTE(review): when n_emb == 0 the raw x_cont is used instead; its width
        # would not match `self.bn` - confirm this branch is ever intended to run.
        x = torch.cat([x, x_cont_raw, x_cont_notu], 1) if self.n_emb != 0 else x_cont
        x = self.bn(x)
            
        x = self.seq(x)
        x = self.layers(x)
        return x
    
def set_seed(seed=42):
    """Seed the Python, PyTorch and NumPy random number generators.

    Also switches cuDNN to deterministic mode and, when CUDA is available,
    seeds the generators of all CUDA devices.

    Args:
        seed: integer seed shared by every generator.
    """
    # All three modules are imported locally so the function does not rely on
    # `random` happening to be in scope through a star import.
    import random
    import numpy as np
    import torch

    # python RNG
    random.seed(seed)

    # pytorch RNGs
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)

    # numpy RNG
    np.random.seed(seed)
    
# Scaler for the continuous columns; fitted on train and test together below.
ss = StandardScaler()

logger.info('Input data')

data = pd.read_csv('../content/train.csv')
data = data.set_index('ID_code')

etd = pd.read_csv('../content/test.csv')
etd = etd.set_index('ID_code')

# Column-name groups, one name per feature index 0..199
has_one = [f'var_{i}_has_one' for i in range(200)]
orig = [f'var_{i}' for i in range(200)]
not_u = [f'var_{i}_not_unique' for i in range(200)]

cont_vars = orig + not_u
# Copy, so that appending 'intercept' further down does not also grow `has_one`.
cat_vars = list(has_one)
target = 'target'
path = './'

logger.info('cat treatment')

# Give the test columns exactly the categories (and order) seen in train.
for f in cat_vars:
    data[f] = data[f].astype('category').cat.as_ordered()
    etd[f] = pd.Categorical(etd[f], categories=data[f].cat.categories, ordered=True)

# constant feature to replace feature index information
feat = ['intercept']
data['intercept'] = 1
data['intercept'] = data['intercept'].astype('category')
etd['intercept'] = 1
etd['intercept'] = etd['intercept'].astype('category')

cat_vars += feat

# Remember where the train rows end so the stacked frame can be split again
# without hard-coding the number of training rows.
n_train = len(data)

ref = pd.concat([data[cont_vars + cat_vars + ['target']], etd[cont_vars + cat_vars]])
ref[cont_vars] = ss.fit_transform(ref[cont_vars].values)

# .copy() makes both halves independent frames, so the column assignment below
# does not write into a slice of `ref`.
data = ref.iloc[:n_train].copy()
etd = ref.iloc[n_train:].copy()

data[target] = data[target].astype('int')

del ref; gc.collect()

fold_seed = 42
# NOTE: from here on `ss` is the cross-validation splitter, no longer the scaler.
ss = StratifiedKFold(n_splits=10, random_state=fold_seed, shuffle=True)

# Each fold is a [train_indices, validation_indices] pair as produced by `split`.
# The target column is passed for both arguments of `split`, as before.
folds = [[train, test] for train, test in ss.split(data[target], data[target])]


# Hyper-parameters handed to my_TabularModel below.
layers=[32]
ps=0.2
emb_drop=0.08
# (hidden width, output width) of the two "continuous embedding" networks
cont_emb=(50,10)
cont_emb_notu=(50,10)
emb_szs = [[6,12]]
use_bn = True
# `joined` is not referenced anywhere in the visible code.
joined=False
# Code modified to produce the submission with a single seed
seeds = [42] #, 1337, 666]

# Best AUROC of every (fold, seed) run
results = []
# Test predictions, one column per fold
sub_preds = pd.DataFrame(columns=range(10), index=etd.index)
for num_fold, (train, test) in enumerate(folds):
    procs=[]
    # Data bunch for this fold: rows in `test` are the validation split,
    # `etd` is attached as the unlabelled test set.
    df = (TabularList.from_df(data, path=path, cat_names=cat_vars, cont_names=cont_vars, procs=procs)
                .split_by_idx(test)
                .label_from_df(cols=target)
            .add_test(TabularList.from_df(etd, path=path, cat_names=cat_vars, cont_names=cont_vars, procs=procs))
            .databunch(num_workers=0, bs=1024))
            
    # Replace the first data loader's sampler so that each pass over it yields
    # every training sample twice (mult=2).
    df.dls[0].dl = df.dls[0].new(sampler=LongerRandomSampler(data_source=df.train_ds, mult=2), shuffle=False).dl
    for num_seed, seed in enumerate(seeds):
        logger.info(f'Model {num_fold} seed {num_seed}')
        # Seed before building the model so weight initialisation is reproducible.
        set_seed(seed)
        model = my_TabularModel(emb_szs, len(df.cont_names), out_sz=df.c, layers=layers, ps=ps, emb_drop=emb_drop,
                                 y_range=None, use_bn=use_bn, cont_emb=cont_emb, cont_emb_notu=cont_emb_notu)

        learn = Learner(df, model, metrics=None, callback_fns=AUROC, wd=0.1)
        # SaveModelCallback monitors the 'AUROC' metric and saves under a name that
        # encodes fold seed, fold number and model seed; AugShuffCallback applies
        # the shuffle augmentation to training batches.
        learn.fit_one_cycle(15, max_lr=1e-2, callbacks=[SaveModelCallback(learn, every='improvement', monitor='AUROC', name=f'fold{fold_seed}_{num_fold}_seed_{seed}'), AugShuffCallback(learn)])
        # presumably predictions on this fold's validation split (get_preds default) - TODO confirm
        pred, _ = learn.get_preds()
        pred = pred[:,1]
        
        pred_test, _ = learn.get_preds(DatasetType.Test)
        pred_test = pred_test[:,1]
        
        # NOTE(review): the column is keyed by fold only, so with more than one seed
        # each seed overwrites the previous seed's test predictions for this fold.
        sub_preds.loc[:, num_fold] = pred_test
        results.append(np.max(learn.recorder.metrics))
        logger.info('result ' + str(results[-1]))
        
        np.save(f'oof_fold{fold_seed}_{num_fold}_seed_{seed}.npy', pred)
        np.save(f'test_fold{fold_seed}_{num_fold}_seed_{seed}.npy', pred_test)
        
        # Drop references and collect before the next run
        del learn, pred, model, pred_test; gc.collect()
    del df; gc.collect()
print(results)
print(np.mean(results))

# Blend the folds by averaging the per-fold ranks of the test predictions.
sub_preds[target] = sub_preds.rank().mean(axis=1)
sub_preds[[target]].to_csv('submission_NN_wo_pseudo_seed42.csv', index_label='ID_code')

### Programmatic case 2

In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb

from sklearn.metrics import roc_auc_score, log_loss
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold,KFold
from scipy.stats import norm, skew

from tqdm import tqdm_notebook as tqdm
from copy import copy
from multiprocessing import Pool

from tqdm import tqdm_notebook as tqdm

# Input data files are available in the "../content/" directory.
import json, math, os, sys
# Seed NumPy's global random number generator.
np.random.seed(42)
# Any results you write to the current directory are saved as output.
# Print the names of the entries in the data directory.
print(os.listdir("../content"))
import warnings
# NOTE(review): this suppresses every warning from here on, including
# deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')


test_path = '/content/test.csv'

# Load the test set and keep only the value matrix (ID column dropped).
df_test = pd.read_csv(test_path).drop(columns=['ID_code']).values

unique_samples = []
# unique_count[r, c] becomes 1 when the value at row r is the only occurrence
# of that value in column c.
unique_count = np.zeros_like(df_test)
for feature in tqdm(range(df_test.shape[1])):
    column = df_test[:, feature]
    _, first_index, occurrences = np.unique(column, return_index=True, return_counts=True)
    unique_count[first_index[occurrences == 1], feature] += 1

# Samples which have unique values are good/real and others are bad/fake
uniques_per_row = unique_count.sum(axis=1)
real_samples_indexes = np.flatnonzero(uniques_per_row > 0)
synthetic_samples_indexes = np.flatnonzero(uniques_per_row == 0)

print(len(real_samples_indexes))
print(len(synthetic_samples_indexes))


df_test_real = df_test[real_samples_indexes].copy()

# For every examined synthetic row: the set of real-row indexes verified as
# its generators.
generator_for_each_synthetic_sample = []
# Using 20,000 samples should be enough.
# You can use all of the 100,000 and get the same results (but 5 times slower)
for cur_sample_index in tqdm(synthetic_samples_indexes[:20000]):
    # Element-wise comparison of every real row with this synthetic row
    matches = df_test_real == df_test[cur_sample_index]

    # A generator counts as verified only through a feature whose synthetic
    # value appears exactly once in the entire real samples set
    single_match_features = matches.sum(axis=0) == 1
    is_verified_generator = matches[:, single_match_features].any(axis=1)
    generator_for_each_synthetic_sample.append(set(real_samples_indexes[is_verified_generator]))



# Grow one group from the generators of synthetic sample 0: in a single pass,
# every generator set that shares an element with the group is merged into it.
public_LB = generator_for_each_synthetic_sample[0]
for generators in tqdm(generator_for_each_synthetic_sample):
    if not public_LB.isdisjoint(generators):
        public_LB = public_LB | generators

# Same procedure, started from the generators of synthetic sample 1.
private_LB = generator_for_each_synthetic_sample[1]
for generators in tqdm(generator_for_each_synthetic_sample):
    if not private_LB.isdisjoint(generators):
        private_LB = private_LB | generators

print(len(public_LB))
print(len(private_LB))


# Persist the three index lists (np.save appends the .npy extension).
np.save('public_LB', list(public_LB))
np.save('private_LB', list(private_LB))
np.save('synthetic_samples_indexes', list(synthetic_samples_indexes))



# When True, part of the training data is held out and used in place of the
# real and synthetic test sets (see the branch below).
use_experimental = False

train_df = pd.read_csv('../content/train.csv')
test_df = pd.read_csv('../content/test.csv')

# Row indexes of the synthetic test samples and of the two groups of real ones
indices_fake = np.load('../content/synthetic_samples_indexes.npy')
indices_pub = np.load('../content/public_LB.npy')
indices_pri = np.load('../content/private_LB.npy')
indices_real = np.concatenate([indices_pub, indices_pri])

features = [c for c in train_df.columns if c not in ['ID_code', 'target']]
target_train = train_df['target']
X_train = train_df
# .copy() so the placeholder target column is added to an independent frame
X_test = test_df.loc[indices_real,:].copy()
X_test['target'] = np.zeros(X_test.shape[0])
X_fake = test_df.loc[indices_fake,:].copy()
# Sized from X_fake itself: using X_test's row count here only works when both
# frames happen to have the same number of rows.
X_fake['target'] = np.zeros(X_fake.shape[0])
train_length = X_train.shape[0]
target_test = X_test['target']
target_fake = X_fake['target']

if use_experimental:
    np.random.seed(42)    
    indices = np.arange(train_length)
    train_length = 150000
    np.random.shuffle(indices)
    indices_train = indices[:train_length]
    indices_test = indices[train_length:]
    # Swapped order to not overwrite X_train too soon
    X_test = X_train.iloc[indices_test,:]
    X_fake = X_train.iloc[indices_test,:]
    target_fake = X_fake['target']
    X_train = X_train.iloc[indices_train,:]
    target_train = X_train['target']
    target_test = X_test['target']

# Train rows first, then test rows; later code splits again at `train_length`.
X_all = pd.concat([X_train, X_test])
print(X_all.shape)



import scipy.ndimage

# Gaussian smoothing settings read by get_count(): sigma_fac scales with the
# length of the count array, sigma_base is the fixed part.
sigma_fac = 1e-3
sigma_base = 4

# Added to the smoothed counts before dividing by them in get_count().
eps = 1e-8

def get_count(X_all, X_fake):
    """Append per-feature count, density and deviation columns to both frames.

    For every name in the module-level `features` list, the values of `X_all`
    are multiplied by 10000, rounded to integers and histogrammed. Three
    statistics are then looked up for each row of `X_all` and of `X_fake`:

    * ``<feature>_count``     - how often the row's value occurs in `X_all`,
    * ``<feature>_density``   - the Gaussian-smoothed count at that value,
    * ``<feature>_deviation`` - count divided by (smoothed count + eps).

    All statistics are derived from `X_all` only; `X_fake` is just looked up.

    NOTE(review): this assumes every `X_fake` value lies inside the value range
    of `X_all` for that feature. A smaller value would become a negative index
    and silently read from the end of the array; a larger one raises IndexError.

    Args:
        X_all: DataFrame containing the `features` columns.
        X_fake: DataFrame with the same feature columns.

    Returns:
        Tuple ``(X_all, count_names, density_names, deviation_names, X_fake)``
        where both frames are new frames with the statistic columns appended
        and the three name lists give the added column names.
    """
    kinds = ('count', 'density', 'deviation')
    stats_all = {kind: np.zeros((X_all.shape[0], len(features))) for kind in kinds}
    stats_fake = {kind: np.zeros((X_fake.shape[0], len(features))) for kind in kinds}

    for i, var in enumerate(tqdm(features)):
        # Integer bin index of every value, shifted so the smallest X_all value is bin 0
        X_all_var_int = (X_all[var].values * 10000).round().astype(int)
        X_fake_var_int = (X_fake[var].values * 10000).round().astype(int)
        lo = X_all_var_int.min()
        X_all_var_int -= lo
        X_fake_var_int -= lo
        hi = X_all_var_int.max() + 1
        counts_all = np.bincount(X_all_var_int, minlength=hi).astype(float)

        # Geometric mean of twice sigma_base and a sigma_scaled which is scaled to the length of array
        sigma_scaled = counts_all.shape[0] * sigma_fac
        sigma = np.power(sigma_base * sigma_base * sigma_scaled, 1/3)
        # Public namespace; scipy.ndimage.filters is deprecated.
        counts_all_smooth = scipy.ndimage.gaussian_filter1d(counts_all, sigma)
        deviation = counts_all / (counts_all_smooth + eps)

        per_bin = {'count': counts_all, 'density': counts_all_smooth, 'deviation': deviation}
        for kind in kinds:
            stats_all[kind][:, i] = per_bin[kind][X_all_var_int]
            stats_fake[kind][:, i] = per_bin[kind][X_fake_var_int]

    names = {kind: [f'{var}_{kind}' for var in features] for kind in kinds}

    def _with_stats(frame, stats):
        "Return `frame` with the three statistic blocks appended as columns."
        blocks = [pd.DataFrame(stats[kind], columns=names[kind], index=frame.index) for kind in kinds]
        return pd.concat([frame] + blocks, axis=1)

    X_all = _with_stats(X_all, stats_all)
    X_fake = _with_stats(X_fake, stats_fake)
    return X_all, names['count'], names['density'], names['deviation'], X_fake

X_all, features_count, features_density, features_deviation, X_fake = get_count(X_all, X_fake)
print(X_all.shape)



# Column groups that get_standardized() scales: raw features and their counts.
features_to_scale = [features, features_count]

from sklearn.preprocessing import StandardScaler

def get_standardized(X_all, X_fake):
    """Standardize the `features_to_scale` columns of both frames in place.

    The scaler is fitted on `X_all` only and then applied to `X_all` and
    `X_fake`. Both frames are modified and also returned.
    """
    columns = []
    for group in features_to_scale:
        columns.extend(group)

    scaler = StandardScaler().fit(X_all[columns])
    # Transform both frames before writing anything back.
    scaled_all = scaler.transform(X_all[columns])
    scaled_fake = scaler.transform(X_fake[columns])
    X_all[columns] = scaled_all
    X_fake[columns] = scaled_fake
    return X_all, X_fake

X_all, X_fake = get_standardized(X_all, X_fake)

print(X_all.shape)


# Undo the earlier concatenation: the first `train_length` rows are the
# training rows, the remainder the test rows.
X_train, X_test = X_all.iloc[:train_length], X_all.iloc[train_length:]
del X_all
import gc
gc.collect()
print(X_train.shape, X_test.shape)

# Column groups from which train_trees() takes the i-th name of each group.
features_used = [features, features_count]


# Base LightGBM parameters. train_trees() overwrites 'max_bin', 'learning_rate',
# 'reg_alpha' and 'num_leaves' for every feature before training.
params = dict(
    boost_from_average='false',
    boost='gbdt',
    feature_fraction=1,
    learning_rate=0.08,
    max_depth=-1,
    metric='binary_logloss',
    num_leaves=4,
    num_threads=8,
    tree_learner='serial',
    objective='binary',
    reg_alpha=2,
    reg_lambda=0,
    verbosity=1,
    max_bin=256,
)


# Per-feature hyper-parameter tables.
# Each `*_values` list holds the candidate settings; the matching `*_var` list
# holds, for feature position i, the index into `*_values` that train_trees()
# uses (e.g. params['reg_alpha'] = reg_alpha_values[reg_alpha_var[i]]).

# reg_alpha
reg_alpha_values = [0.75, 1, 2, 3]
reg_alpha_var = [3, 0, 2, 3, 2, 0, 1, 1, 3, 2, 2, 0, 2, 0, 2, 2, 2, 1, 1, 2, 
                 1, 2, 3, 3, 2, 1, 3, 1, 3, 2, 2, 3, 1, 1, 3, 2, 0, 1, 0, 2, 
                 1, 1, 2, 3, 0, 3, 3, 3, 2, 0, 3, 1, 3, 1, 1, 0, 2, 2, 0, 0, 
                 0, 1, 2, 1, 0, 1, 3, 2, 0, 2, 1, 2, 0, 0, 1, 3, 3, 1, 2, 3, 
                 3, 2, 0, 1, 2, 3, 3, 2, 3, 3, 0, 0, 3, 0, 1, 0, 1, 0, 2, 3, 
                 1, 0, 3, 1, 3, 2, 3, 1, 3, 3, 3, 1, 3, 2, 3, 2, 1, 0, 1, 2, 
                 0, 3, 0, 3, 0, 3, 2, 1, 0, 0, 2, 2, 2, 0, 1, 0, 0, 2, 3, 2, 
                 2, 1, 1, 0, 1, 2, 2, 2, 1, 0, 2, 3, 2, 3, 1, 1, 3, 1, 1, 2, 
                 1, 2, 0, 3, 1, 3, 3, 2, 0, 1, 3, 3, 0, 1, 0, 3, 1, 3, 1, 3, 
                 0, 3, 0, 3, 1, 0, 0, 0, 3, 0, 3, 0, 0, 2, 0, 3, 1, 0, 3, 2]

# max_bin
max_bin_values = [256, 512, 1024]
max_bin_var = [0, 0, 1, 0, 0, 0, 2, 0, 0, 2, 0, 2, 0, 0, 1, 1, 1, 0, 0, 0, 0, 
               2, 2, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 1, 0, 0, 1, 0, 
               1, 1, 0, 1, 0, 0, 0, 2, 1, 1, 1, 1, 0, 0, 0, 0, 1, 2, 1, 0, 0, 
               1, 2, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 0, 
               1, 1, 0, 1, 0, 0, 1, 2, 1, 2, 1, 0, 0, 1, 0, 2, 0, 1, 0, 0, 2, 
               1, 1, 1, 0, 0, 0, 2, 0, 0, 2, 1, 0, 0, 1, 0, 1, 2, 0, 0, 0, 0, 
               0, 2, 2, 2, 2, 1, 1, 2, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 2, 
               0, 1, 0, 1, 1, 0, 2, 1, 1, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 
               0, 1, 0, 2, 0, 1, 0, 1, 2, 0, 0, 0, 0, 2, 0, 0, 2, 0, 1, 1, 0, 
               2, 0, 0, 0, 1, 2, 0, 0, 1, 0, 2]

# learning_rate
learning_rate_values = [0.06, 0.08, 0.12]
learning_rate_var = [2, 2, 2, 1, 2, 2, 2, 0, 1, 2, 0, 2, 2, 2, 0, 2, 2, 0, 2, 
                     1, 2, 2, 2, 2, 2, 0, 1, 0, 2, 0, 0, 2, 0, 2, 2, 2, 1, 2, 
                     0, 0, 2, 0, 0, 1, 2, 1, 2, 0, 0, 2, 1, 2, 2, 2, 2, 0, 0, 
                     2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 
                     1, 0, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 1, 
                     2, 0, 2, 0, 2, 0, 2, 1, 0, 0, 1, 2, 0, 2, 2, 2, 0, 2, 2, 
                     2, 2, 1, 0, 2, 1, 2, 2, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 
                     1, 2, 1, 0, 2, 1, 1, 2, 2, 2, 2, 0, 2, 1, 2, 1, 2, 2, 2, 
                     2, 2, 2, 2, 1, 1, 0, 1, 2, 0, 2, 2, 0, 1, 2, 2, 2, 1, 0, 
                     1, 2, 1, 2, 1, 1, 1, 2, 1, 2, 1, 0, 0, 2, 0, 1, 2, 1, 1, 
                     2, 2, 2, 2, 2, 2, 2, 2, 1, 1]

# num_leaves
num_leaves_values = [3, 4, 5]
num_leaves_var = [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 
                  0, 1, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 
                  0, 0, 0, 1, 0, 0, 0, 0, 2, 2, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 
                  0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 2, 1, 2, 0, 0, 0, 0, 0, 1, 1, 
                  0, 0, 2, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
                  0, 1, 0, 1, 1, 2, 0, 0, 0, 0, 1, 0, 1, 2, 1, 1, 1, 0, 2, 0, 0, 
                  0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 2, 0, 1, 0, 2, 
                  2, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 2, 0, 
                  0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 
                  0, 1, 2, 1, 1, 0, 0, 0, 2, 1, 2]



# Cross-validation settings read by train_trees()
n_folds = 5
early_stopping_rounds = 10
# One entry per setting to evaluate; train_trees() loops over this list.
settings = [4]
settings_best_ind = []

# Seed NumPy's global RNG; train_trees() draws its per-feature fold seeds from it.
np.random.seed(47)

def train_trees():
    """Train one LightGBM booster per feature position and fold, collecting predictions.

    For each position ``i`` in ``features`` the per-feature hyper-parameters
    are written into the module-level ``params`` dict, a fresh stratified
    ``n_folds`` split is drawn, and a booster is trained per fold on the
    columns ``feature_set[i]`` taken from every list in ``features_used``.
    Predictions are stored after the transform ``sqrt(p - mean(p) + 0.1)``,
    not as raw model outputs.

    Side effects: mutates ``params``, appends to ``settings_best_ind``,
    consumes NumPy's global RNG and prints progress. Reads ``X_train``,
    ``X_test``, ``X_fake``, ``target_train``, ``target_test``, ``features``,
    ``features_used`` and ``use_experimental`` from module scope.

    Returns:
        Tuple ``(preds_oof, preds_test, preds_train, preds_fake)``. Each array
        has one row per row of the corresponding frame and one column per
        entry of ``features``.
    """
    preds_oof = np.zeros((len(X_train), len(features)))
    preds_test = np.zeros((len(X_test), len(features)))
    preds_train = np.zeros((len(X_train), len(features)))
    preds_fake = np.zeros((len(X_fake), len(features)))

    # Restrict all three frames to the columns that any feature set refers to.
    features_used_flatten = [var for sublist in features_used for var in sublist]
    X_train_used = X_train[features_used_flatten]
    X_test_used = X_test[features_used_flatten]
    X_fake_used = X_fake[features_used_flatten]

    for i in range(len(features)):
        # Per-feature hyper-parameters: each *_var list holds an index into its *_values list.
        params['max_bin'] = max_bin_values[max_bin_var[i]]
        params['learning_rate'] = learning_rate_values[learning_rate_var[i]]
        params['reg_alpha'] = reg_alpha_values[reg_alpha_var[i]]
        params['num_leaves'] = num_leaves_values[num_leaves_var[i]]
        # Columns for this model: the i-th entry of every list in features_used.
        features_train = [feature_set[i] for feature_set in features_used] 
        print(f'Training on: {features_train}')
        # A new split per feature; its seed comes from the global NumPy RNG.
        folds = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=np.random.randint(100000))
        list_folds = list(folds.split(X_train_used.values, target_train.values))
        # One column per candidate setting; the best column is copied out below.
        preds_oof_temp = np.zeros((preds_oof.shape[0], len(settings)))
        preds_test_temp = np.zeros((preds_test.shape[0], len(settings)))
        preds_train_temp = np.zeros((preds_train.shape[0], len(settings)))
        preds_fake_temp = np.zeros((preds_fake.shape[0], len(settings)))

        scores = []
        for j, setting in enumerate(settings):
            # setting is a hook for hyperparameter tuning; here you can add something like params['num_leaves'] = setting
            print('\nsetting: ', setting)
            for k, (trn_idx, val_idx) in enumerate(list_folds):
                print("Fold: {}".format(k+1), end="")
                trn_data = lgb.Dataset(X_train_used.iloc[trn_idx][features_train], label=target_train.iloc[trn_idx])
                val_data = lgb.Dataset(X_train_used.iloc[val_idx][features_train], label=target_train.iloc[val_idx])

                # Binary Log Loss
                clf = lgb.train(params, trn_data, 2000, valid_sets=[trn_data, val_data], verbose_eval=False, early_stopping_rounds=early_stopping_rounds) 

                prediction_val1 = clf.predict(X_train_used.iloc[val_idx][features_train])
                prediction_test1 = clf.predict(X_test_used[features_train])
                prediction_train1 = clf.predict(X_train_used.iloc[trn_idx][features_train])
                prediction_fake1 = clf.predict(X_fake_used[features_train])

                # Validation-fold metrics (loss is printed multiplied by 1000)
                s1 = roc_auc_score(target_train.iloc[val_idx], prediction_val1)
                s1_log = log_loss(target_train.iloc[val_idx], prediction_val1)
                print(' - val AUC: {:<8.4f} - loss: {:<8.3f}'.format(s1, s1_log*1000), end='')

                # Test metrics; only computed when use_experimental is set
                if use_experimental:
                    s1_test = roc_auc_score(target_test, prediction_test1)
                    s1_log_test = log_loss(target_test, prediction_test1)
                    print(' - test AUC: {:<8.4f} - loss: {:<8.3f}'.format(s1_test, s1_log_test*1000), end='')

                # Metrics on the rows this fold trained on
                s1_train = roc_auc_score(target_train.iloc[trn_idx], prediction_train1)
                s1_log_train = log_loss(target_train.iloc[trn_idx], prediction_train1)
                print(' - train AUC: {:<8.4f} - loss: {:<8.3f}'.format(s1_train, s1_log_train*1000), end='')
                if use_experimental:
                    print('',clf.feature_importance(), end='')

                print('')


                # Store sqrt(p - mean(p) + 0.1); this is NaN wherever p - mean(p) < -0.1.
                # Validation rows are written once; test/fake are averaged over the folds;
                # train rows are averaged over the n_folds-1 folds that trained on them.
                preds_oof_temp[val_idx,j] += np.sqrt(prediction_val1 - prediction_val1.mean() + 0.1) 
                preds_test_temp[:,j] += np.sqrt(prediction_test1 - prediction_test1.mean() + 0.1) / n_folds
                preds_train_temp[trn_idx,j] += np.sqrt(prediction_train1 - prediction_train1.mean() + 0.1) / (n_folds-1)
                preds_fake_temp[:,j] += np.sqrt(prediction_fake1 - prediction_fake1.mean() + 0.1) / n_folds

            # NOTE(review): the losses below pass np.exp() of the sqrt-transformed values
            # to log_loss; exp is not the inverse of the transform above, so confirm
            # these loss figures (which drive the best-setting choice) are meaningful.
            score_setting = roc_auc_score(target_train, preds_oof_temp[:,j])
            score_setting_log = 1000*log_loss(target_train, np.exp(preds_oof_temp[:,j]))
            scores.append(score_setting_log)
            print("Score:  - val AUC: {:<8.4f} - loss: {:<8.3f}".format(score_setting, score_setting_log), end='')
            if use_experimental:
                score_setting_test = roc_auc_score(target_test, preds_test_temp[:,j])
                score_setting_log_test = 1000*log_loss(target_test, np.exp(preds_test_temp[:,j]))  
                print(" - test AUC: {:<8.4f} - loss: {:<8.3f}".format(score_setting_test, score_setting_log_test), end='')

            score_setting_train = roc_auc_score(target_train, preds_train_temp[:,j])
            score_setting_log_train = 1000*log_loss(target_train, np.exp(preds_train_temp[:,j]))
            print(" - train AUC: {:<8.4f} - loss: {:<8.3f}".format(score_setting_train, score_setting_log_train))

        # Keep the predictions of the setting with the lowest out-of-fold loss.
        best_ind = np.argmin(scores)
        settings_best_ind.append(best_ind)
        preds_oof[:,i] = preds_oof_temp[:,best_ind]
        preds_test[:,i] = preds_test_temp[:,best_ind]
        preds_train[:,i] = preds_train_temp[:,best_ind]
        preds_fake[:,i] = preds_fake_temp[:,best_ind]


        # Running ensemble: mean over the feature columns filled so far.
        print('\nbest setting: ', settings[best_ind])
        preds_oof_cum = preds_oof[:,:i+1].mean(axis=1)
        print("Cum CV val  : {:<8.4f} - loss: {:<8.3f}".format(roc_auc_score(target_train, preds_oof_cum), 1000*log_loss(target_train, np.exp(preds_oof_cum))))
        if use_experimental:        
            preds_test_cum = preds_test[:,:i+1].mean(axis=1)
            print("Cum CV test : {:<8.4f} - loss: {:<8.3f}".format(roc_auc_score(target_test, preds_test_cum), 1000*log_loss(target_test, np.exp(preds_test_cum))))
        preds_train_cum = preds_train[:,:i+1].mean(axis=1)
        print("Cum CV train: {:<8.4f} - loss: {:<8.3f}".format(roc_auc_score(target_train, preds_train_cum), 1000*log_loss(target_train, np.exp(preds_train_cum))))
        print('*****' * 10 + '\n')
        
    return preds_oof, preds_test, preds_train, preds_fake

# Run the per-feature training; the four matrices are consumed by the cells below.
preds_oof, preds_test, preds_train, preds_fake = train_trees()


['.config', 'train.csv', '.ipynb_checkpoints', 'test.csv', 'sample_data']


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


100000
100000


HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))


50000
50000
(300000, 202)


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))


(300000, 802)
(300000, 802)
(200000, 802) (100000, 802)


In [None]:
# Report how the AUC evolves as the per-feature prediction columns are added
# one by one. Running sums (not means) are accumulated; AUC only depends on
# the ranking of the scores.
preds_oof_cum = np.zeros(preds_oof.shape[0])
if use_experimental:
    preds_test_cum = np.zeros(preds_test.shape[0])
preds_train_cum = np.zeros(preds_train.shape[0])
for i in range(len(features)):
    preds_oof_cum += preds_oof[:,i]
    preds_train_cum += preds_train[:,i]
    print("var_{} Cum val: {:<8.5f}".format(i,roc_auc_score(target_train, preds_oof_cum)), end="")
    # Test-set scores are only printed when use_experimental is set.
    if use_experimental:
        preds_test_cum += preds_test[:,i]
        print(" - test : {:<8.5f}".format(roc_auc_score(target_test, preds_test_cum)), end="")
    print(" - train: {:<8.5f}".format(roc_auc_score(target_train, preds_train_cum)))


# Candidate settings and, per feature, the index of the one train_trees() selected.
print(settings)
print(settings_best_ind)


from scipy.interpolate import interp1d
# gaussian_filter is imported from the public scipy.ndimage namespace;
# scipy.ndimage.filters is a deprecated alias of it.
from scipy.ndimage import gaussian_filter
import matplotlib.pyplot as plt 

# Plot, for the first 20 variables, the per-feature test prediction as a
# function of the raw feature value, together with a Gaussian-smoothed curve.
features_to_show = np.arange(20)
plt.figure(figsize = (20,20))

for i in features_to_show:
    var = 'var_'+str(i)
    signal = X_test[var].values
    logits = preds_test[:,i]
    # Interpolate prediction vs. feature value, then resample on a uniform
    # 4000-point grid so the filter operates on evenly spaced samples.
    func = interp1d(signal, logits)
    space = np.linspace(signal.min(), signal.max(), 4000)
    activations = func(space)
    activations_smooth = gaussian_filter(activations, 10)
    
    # Smoothed curve mapped back onto the original sample positions.
    func_smooth = interp1d(space, activations_smooth)
    logits_smooth = func_smooth(signal)
    plt.subplot(5,4,i+1)
    plt.plot(space, activations)
    plt.plot(space, activations_smooth)


import keras

# Number of stratified folds used by train_NN().
n_splits = 7
# Inputs per feature fed to the network: get_features() interleaves this many
# blocks and get_model_3() uses it as the first conv's kernel size and stride.
num_preds = 5
# Training epochs per fold; also sets the switch point in lr_scheduler().
epochs = 60
# Initial Adam learning rate, and the rate lr_scheduler() returns for early epochs.
learning_rate_init = 0.02
# Mini-batch size passed to model.fit().
batch_size = 4000

num_features = len(features)

def get_features(preds, df):
    """Build the NN input matrix with the per-feature inputs placed side by side.

    The five sources (tree predictions, raw values, counts, deviations and
    densities) are concatenated column-wise and then re-ordered so that the
    ``num_preds`` columns belonging to feature ``i`` are adjacent, in source
    order. Assumes every source has ``num_features`` columns - TODO confirm.

    Args:
        preds: 2-D array of per-feature tree predictions.
        df: frame holding the ``features``, ``features_count``,
            ``features_deviation`` and ``features_density`` columns.

    Returns:
        2-D array with ``num_features * num_preds`` columns.
    """
    sources = [
        preds,
        df[features].values,
        df[features_count].values,
        df[features_deviation],
        df[features_density],
    ]
    stacked = np.concatenate(sources, axis=1)
    # Column p*num_features + i is source p of feature i. Transposing the
    # (source, feature) index grid lists, for each feature, its columns in order.
    grid = np.arange(num_preds * num_features).reshape(num_preds, num_features)
    column_order = grid.T.ravel()
    return stacked[:, column_order]

def get_model_3():
    """Build the stacking network: a shared per-feature 1-D conv stack plus a dense head.

    The flat input has ``num_features * num_preds`` values. The first
    convolution uses kernel size and stride ``num_preds``, so it produces one
    step per group of ``num_preds`` consecutive inputs; the following
    kernel-size-1 convolutions act on each step independently. The result is
    flattened, average-pooled in pairs, batch-normalised and mapped to a
    single sigmoid output.

    Returns:
        An uncompiled ``keras.Model``.
    """
    layers = keras.layers
    flat_width = num_features*num_preds

    inp = layers.Input((flat_width,))
    x = layers.Reshape((flat_width, 1))(inp)

    # (filters, kernel_size, strides) of the conv blocks followed by batch norm.
    conv_bn_specs = (
        (32, num_preds, num_preds),
        (24, 1, 1),
        (16, 1, 1),
    )
    for filters, kernel_size, strides in conv_bn_specs:
        x = layers.Conv1D(filters, kernel_size, strides=strides, activation='elu')(x)
        x = layers.BatchNormalization()(x)

    # Last conv leaves 4 channels per step and has no batch norm after it.
    x = layers.Conv1D(4, 1, activation='elu')(x)
    x = layers.Flatten()(x)
    x = layers.Reshape((num_features*4, 1))(x)
    x = layers.AveragePooling1D(2)(x)
    x = layers.Flatten()(x)
    x = layers.BatchNormalization()(x)
    out = layers.Dense(1, activation='sigmoid')(x)
    return keras.Model(inputs=inp, outputs=out)


def lr_scheduler(epoch):
    """Return the learning rate for ``epoch``.

    The initial rate is used up to and including epoch ``0.8 * epochs``;
    afterwards a tenth of it is returned.
    """
    still_at_full_rate = epoch <= epochs*0.8
    return learning_rate_init if still_at_full_rate else learning_rate_init * 0.1

def train_NN(features_oof, features_test, features_train, features_fake):
    """Train the stacking network with stratified CV and return its predictions.

    One model is trained per fold on the training part of ``features_oof``.
    Each training row receives its out-of-fold prediction from the single
    model that did NOT see it during training, so the returned OOF vector
    gives an unbiased validation score. Test and fake predictions are
    averaged over all fold models.

    Args:
        features_oof: NN input matrix for the training rows (aligned with the
            module-level ``target_train``).
        features_test: NN input matrix for the real test rows.
        features_train: unused; kept so existing callers keep working.
        features_fake: NN input matrix for the synthetic test rows.

    Returns:
        Tuple ``(preds_nn_oof, preds_nn_test, preds_nn_fake)`` of 1-D arrays,
        one value per row of the corresponding input matrix.
    """
    folds = StratifiedKFold(n_splits=n_splits)

    preds_nn_oof = np.zeros(features_oof.shape[0])
    preds_nn_test = np.zeros(features_test.shape[0])
    preds_nn_fake = np.zeros(features_fake.shape[0])

    for trn_idx, val_idx in folds.split(features_oof, target_train):
        features_oof_tr = features_oof[trn_idx, :]
        target_oof_tr = target_train.values[trn_idx]
        features_oof_val = features_oof[val_idx, :]
        target_oof_val = target_train.values[val_idx]

        optimizer = keras.optimizers.Adam(lr = learning_rate_init, decay = 0.00001)
        model = get_model_3()
        callbacks = [keras.callbacks.LearningRateScheduler(lr_scheduler)]
        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
        model.fit(features_oof_tr, target_oof_tr, validation_data=(features_oof_val, target_oof_val), epochs=epochs, verbose=2, batch_size=batch_size, callbacks=callbacks)

        # Only the held-out rows are written: predicting all training rows with
        # every fold model would blend in models that were fitted on those rows
        # and inflate the reported OOF AUC.
        preds_nn_oof[val_idx] = model.predict(features_oof_val, batch_size=2000)[:,0]
        preds_nn_test += model.predict(features_test, batch_size=2000)[:,0]
        preds_nn_fake += model.predict(features_fake, batch_size=2000)[:,0]

        # AUC of this fold's model on its own validation rows.
        print(roc_auc_score(target_oof_val, preds_nn_oof[val_idx]))
        if use_experimental:
            # Running NN ensemble on the test rows vs. the plain tree average.
            print(roc_auc_score(target_test, preds_nn_test))
            print(roc_auc_score(target_test, preds_test.mean(axis=1)))

    # Test/fake predictions were summed over the folds; OOF rows were each
    # written exactly once and need no averaging.
    preds_nn_test /= n_splits
    preds_nn_fake /= n_splits
    return preds_nn_oof, preds_nn_test, preds_nn_fake


# Build the NN input matrices. When not running in experimental mode, each
# source frame is deleted right after its last use in this cell.
features_oof = get_features(preds_oof, X_train)
features_test = get_features(preds_test, X_test)
if not use_experimental:
    del X_test
features_train = get_features(preds_train, X_train)
if not use_experimental:
    del X_train
features_fake = get_features(preds_fake, X_fake)
if not use_experimental:
    del X_fake
    del preds_oof
    del preds_fake
    del preds_train
    # preds_test is only read by train_NN() when use_experimental is set.
    del preds_test

print(get_model_3().summary())
    
preds_nn_oof, preds_nn_test, preds_nn_fake = train_NN(features_oof, features_test, features_train, features_fake)

print(roc_auc_score(target_train, preds_nn_oof))
if use_experimental:
    print('test AUC: ', roc_auc_score(target_test, preds_nn_test))


# The NN outputs are used as the final predictions without further blending.
preds_oof_final = preds_nn_oof
preds_test_final = preds_nn_test
preds_fake_final = preds_nn_fake

print('oof  : ', roc_auc_score(target_train, preds_oof_final))
if use_experimental:
    print('test : ', roc_auc_score(target_test, preds_test_final))
    # NOTE(review): this line is labelled 'train' but scores the fake-row
    # predictions against target_fake - confirm the intended label.
    print('train: ', roc_auc_score(target_fake, preds_fake_final))

if not use_experimental:
    # Scatter the real-row and fake-row predictions back to their positions
    # in test_df and write the submission file.
    sub = pd.DataFrame({"ID_code": test_df.ID_code.values})
    predictions_all = np.zeros(test_df.shape[0])
    predictions_all[indices_real] = preds_test_final
    predictions_all[indices_fake] = preds_fake_final
    sub["target"] = predictions_all
    sub.to_csv("submission.csv", index=False)
    print(sub.head(20))


#### Programmatic case 3

In [None]:
import gc

import numpy as np
import pandas as pd
import lightgbm as lgb
import datetime

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import json, math, os, sys

import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
from multiprocessing import Pool
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

import warnings
warnings.filterwarnings('ignore')

# Data directory. NOTE(review): not referenced in the visible part of this cell - confirm it is still needed.
PATH="../content/"#santander-customer-transaction-prediction/"
# Number of folds and the shuffle seed for the StratifiedKFold used in the training loop.
N_SPLITS = 10
SEED_SKF = 4221


def merge_train_test(df_train, df_test):
    """Stack the train and test frames into one frame with a fresh 0..n-1 index.

    If ``df_test`` has no ``"target"`` column, its rows get ``target == -1``
    in the merged frame so they can later be told apart from training rows.
    The caller's ``df_test`` is not modified.

    Args:
        df_train: training frame (expected to contain ``"target"``).
        df_test: test frame, with or without a ``"target"`` column.

    Returns:
        A new DataFrame with the train rows followed by the test rows.
    """
    if "target" not in df_test.columns:
        # assign() returns a copy, so the caller's frame keeps its columns.
        df_test = df_test.assign(target=-1)
    return pd.concat([df_train, df_test], ignore_index=True)

def split_train_test(df):
    """Split a merged frame back into its train and test parts.

    Rows with ``target >= 0`` are training rows, rows with ``target <= -1``
    are test rows. Both parts get a fresh 0..n-1 index.

    Args:
        df: merged frame with ``"target"`` and ``"ID_code"`` columns.

    Returns:
        Tuple ``(df_train, df_test)``.

    Raises:
        AssertionError: if the ID codes of either part are not exactly
            ``train_0, train_1, ...`` / ``test_0, test_1, ...`` in order.
    """
    df_train = df[df["target"] >= 0].reset_index(drop=True)
    df_test = df[df["target"] <= -1].reset_index(drop=True)
    # Sanity check that row order survived the merge. The expected length is
    # taken from the data itself so the check works for any dataset size.
    assert list(df_train["ID_code"].values) == [f"train_{i}" for i in range(len(df_train))]
    assert list(df_test["ID_code"].values) == [f"test_{i}" for i in range(len(df_test))]
    return df_train, df_test


class CountEncoder:
    """Encode each value by the number of times it occurred in the fitted series."""

    def fit(self, series):
        """Store, for every distinct value of ``series``, how often it occurs."""
        grouped_by_value = series.groupby(series)
        self.counts = grouped_by_value.count()

    def transform(self, series):
        """Map each value to its fitted count as int16; unseen values become 0."""
        looked_up = series.map(self.counts)
        filled = looked_up.fillna(0)
        return filled.astype(np.int16)


# separate into real and fake

# For every variable, count how often each test value occurs within the test set.
df_cnt = pd.DataFrame()
for v in range(200):
    sr = test_df[f"var_{v}"]
    enc = CountEncoder()
    enc.fit(sr)
    df_cnt[f"cnt_{v}"] = enc.transform(sr)
# A row is treated as real when at least one of its values is unique (row-wise
# minimum count of 1). The minimum is clipped to [1, 2] so the flag is always
# exactly -1 or -2: without the clip a row whose rarest value occurs 3+ times
# would get -3 or lower and slip through the `!= -2` filter that selects real
# rows for count encoding.
test_df["target"] = -df_cnt.min(1).clip(lower=1, upper=2)  # target==-1 -> real, target==-2 -> fake
del df_cnt


# Stack train and (flagged) test rows into one frame.
df_merged = merge_train_test(train_df, test_df)
df_merged.tail()


# count encoding

# Counts are fitted on every row whose target is not -2 (i.e. the fake test
# rows are excluded) and then applied to all rows, fake rows included.
count_enc = [None] * 200
df_real = df_merged[df_merged["target"]!=-2]
for v in range(200):
    enc = CountEncoder()
    enc.fit(df_real[f"var_{v}"])
    count_enc[v] = enc.transform(df_merged[f"var_{v}"])
    
# The encoded columns are attached in a second pass, after all encoders ran.
for v in range(200):
    df_merged[f"cnt_{v}"] = count_enc[v]

del df_real


# normalize

# Scale every raw variable and every count column, fitting the scalers on the
# merged (train + test) frame.
# NOTE: despite the "_minmax" suffix, the var_* columns are standardised with
# StandardScaler; only the cnt_* columns are min-max scaled.
for v in range(200):
    df_merged[f"var_{v}_minmax"] = StandardScaler().fit_transform(df_merged[f"var_{v}"].values.reshape(-1, 1))
    df_merged[f"cnt_{v}_minmax"] = MinMaxScaler().fit_transform(df_merged[f"cnt_{v}"].values.reshape(-1, 1))
# Only the scaled columns are kept.
df_merged.drop(columns=[f"var_{v}" for v in range(200)]+[f"cnt_{v}" for v in range(200)], inplace=True)
train_df, test_df = split_train_test(df_merged)
target = train_df['target']
gc.collect()
print(train_df.shape)
test_df.head()


# nn model

class Model(nn.Module):
    """Stack of six kernel-size-1 Conv1d layers followed by a linear 2-class head.

    With kernel size 1 every position along the last axis is processed
    independently with shared weights; the channel count grows
    2 -> 64 -> 128 -> 256 -> 512 -> 1024 -> 2048. The linear head expects the
    flattened conv output to have 2048 * 200 values per sample.
    """

    def __init__(self):
        super().__init__()
        self.hidden_size = 64
        self.relu = nn.ReLU()

        # Channel widths of the conv chain: the input has 2 channels, then the
        # width doubles with every layer starting at hidden_size.
        widths = [2] + [self.hidden_size * 2 ** k for k in range(6)]
        for n, (c_in, c_out) in enumerate(zip(widths, widths[1:]), start=1):
            # Registered as conv1 .. conv6, in this order.
            setattr(self, f"conv{n}", nn.Conv1d(c_in, c_out, kernel_size=1))

        self.fc = nn.Linear(widths[-1] * 200, 2)

    def forward(self, x_):
        """Return the two class logits for each sample in the batch ``x_``."""
        x = x_
        for n in range(1, 7):
            conv = getattr(self, f"conv{n}")
            x = self.relu(conv(x))

        # Flatten channels and positions before the linear head.
        flat = x.view(x.shape[0], -1)
        return self.fc(flat)


# dataset

class TrainData(torch.utils.data.Dataset):
    """Training dataset that can re-shuffle feature values within each class.

    Items are ``(sample, label, index)`` triples. ``shuffle()`` serves as
    augmentation: for every position along the last axis it permutes the
    samples among the rows that share the same label.
    """

    def __init__(self, trn_X, trn_y):
        self.trn_X, self.trn_y = trn_X, trn_y

    def __len__(self):
        return self.trn_X.shape[0]

    def __getitem__(self, idx):
        sample = self.trn_X[idx]
        label = self.trn_y[idx]
        return sample, label, idx

    def shuffle(self):
        """Permute, per last-axis position and per class, which row holds which values.

        The slice along axis 1 moves as a unit, so values that sit at the same
        last-axis position of one sample stay together. Positive rows
        (label 1) are processed before negative rows (label 0). The result is
        stored back in ``self.trn_X`` on the module-level ``device``.
        """
        feats = self.trn_X.to("cpu").numpy()
        labels = self.trn_y.to("cpu").numpy()

        for class_label in (1, 0):
            mask = labels == class_label
            # Move the last axis to the front so that iterating yields, for
            # each position, a (rows, axis-1) block to shuffle along rows.
            per_position = feats[mask].transpose(2, 0, 1)
            for block in per_position:
                np.random.shuffle(block)
            # Undo the axis permutation and write the class rows back.
            feats[mask] = per_position.transpose(1, 2, 0)

        self.trn_X = torch.from_numpy(feats).to(device)
    
class ValidData(torch.utils.data.Dataset):
    """Validation dataset yielding ``(sample, label, index)`` triples."""

    def __init__(self, val_X, val_y):
        self.val_X, self.val_y = val_X, val_y

    def __len__(self):
        n_samples = self.val_X.shape[0]
        return n_samples

    def __getitem__(self, idx):
        # The index is returned too so callers can scatter outputs back by position.
        sample = self.val_X[idx]
        label = self.val_y[idx]
        return sample, label, idx
    
class TestData(torch.utils.data.Dataset):
    """Unlabelled dataset yielding ``(sample, -1, index)`` triples."""

    def __init__(self, test_X):
        self.test_X = test_X

    def __len__(self):
        n_samples = self.test_X.shape[0]
        return n_samples

    def __getitem__(self, idx):
        # -1 fills the label slot so the item layout matches the labelled datasets.
        sample = self.test_X[idx]
        return sample, -1, idx


from scipy.special import logit, expit

# Mini-batch size of the training loader.
BATCH_SIZE = 256
# Training stops once more than this many epochs have passed since the best validation AUC.
EARLY_STOPPING = 20
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED_SKF)
# Out-of-fold probabilities for the training rows (filled per fold on its validation rows).
oof = np.zeros(len(train_df))
# Per-fold copies of the validation and test probabilities.
fold_oof = np.zeros((N_SPLITS, len(train_df)))
fold_preds = np.zeros((N_SPLITS, len(test_df)))
# Accumulates each fold's test probabilities divided by N_SPLITS.
predictions = np.zeros(len(test_df))

# The network emits two logits per sample, scored with cross-entropy.
loss_func = nn.CrossEntropyLoss()

# Cross-validated training of the conv net. NOTE: the `break` at the end of
# the loop body stops after the first fold, so only fold 0 is ever trained.
for fold_, (trn_idx, val_idx) in enumerate(skf.split(train_df.values, target.values)):
    print("fold n°{}".format(fold_))
    
    # 200 scaled value columns followed by 200 scaled count columns.
    features = [f"var_{v}_minmax" for v in range(200)] + [f"cnt_{v}_minmax" for v in range(200)]
    
    trn_X_npy, trn_y_npy = train_df.iloc[trn_idx][features].values.astype(np.float32), target.iloc[trn_idx].values
    val_X_npy, val_y_npy = train_df.iloc[val_idx][features].values.astype(np.float32), target.iloc[val_idx].values
    # Each 400-wide row is reshaped to (2, 200): channel 0 holds the values,
    # channel 1 the counts, matching the 2-channel first conv of Model.
    trn_X, trn_y = torch.tensor(trn_X_npy.reshape(-1, 2, 200)).to(device), torch.tensor(trn_y_npy).to(device)       
    val_X, val_y = torch.tensor(val_X_npy.reshape(-1, 2, 200)).to(device), torch.tensor(val_y_npy).to(device)     
    test_X = torch.tensor(test_df[features].values.astype(np.float32).reshape(-1, 2, 200)).to(device)
    trn_dataset = TrainData(trn_X, trn_y)
    val_dataset = ValidData(val_X, val_y)
    test_dataset = TestData(test_X)
    #trn_loader = torch.utils.data.DataLoader(dataset=trn_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=256) #batch_size=len(val_idx))
    test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=256)
    # Checkpoint file holding the weights of this fold's best epoch.
    filename_net = f"net_{fold_}.pth"
    
    net = Model().to(device)
    optimizer = optim.Adam(net.parameters(), lr=0.00002)
    
    best_epoch = 0
    # Best validation AUC seen so far (despite the name); a checkpoint is
    # only written once the AUC exceeds 0.5.
    min_auc = 0.5
    for epoch in range(100):
        # Early stopping: give up after EARLY_STOPPING epochs without improvement.
        if epoch - EARLY_STOPPING > best_epoch:
            break
            
        # train dataset with shuffling
        # (shuffle() re-permutes feature values within each class every epoch;
        # the loader is rebuilt so it sees the freshly shuffled tensor)
        trn_dataset.shuffle()
        trn_loader = torch.utils.data.DataLoader(dataset=trn_dataset, batch_size=BATCH_SIZE, shuffle=True)

        # train
        net = net.train()
        # Logits produced on the training batches during this epoch
        # (training-time outputs, not out-of-fold values).
        oof_ = np.zeros((len(trn_idx), 2), dtype=np.float32)

        for data, label, idx in trn_loader:
            optimizer.zero_grad()
            output = net(data)
            loss = loss_func(output, label)
            loss.backward()
            oof_[idx.numpy()] = output.detach().cpu().numpy()
            optimizer.step()
            
        # eval
        net = net.eval()
        with torch.no_grad():
            # train data
            # (metrics over the logits collected while training this epoch;
            # the logit difference is used as the ranking score for AUC)
            loss = loss_func(torch.from_numpy(oof_), torch.from_numpy(trn_y_npy))
            auc = roc_auc_score(trn_y_npy, oof_[:, 1] - oof_[:, 0])
            print(f"epoch {epoch}: train loss: {loss:.5f}, train auc: {auc:.5f}, ", end="")

            # valid data
            output = np.zeros((len(val_idx), 2), dtype=np.float32)
            for data, _, idx in val_loader:
                output[idx.numpy()] = net(data).detach().cpu().numpy()
            loss = loss_func(torch.from_numpy(output), torch.from_numpy(val_y_npy))
            auc = roc_auc_score(val_y_npy, output[:, 1] - output[:, 0])
            print(f"valid loss: {loss:.5f}, valid auc: {auc:.5f}")

            # Checkpoint whenever the validation AUC improves.
            if auc > min_auc:
                torch.save(net.state_dict(), filename_net)
                min_auc = auc
                best_epoch = epoch

    # Restore the best checkpoint (the file only exists if some epoch beat
    # AUC 0.5) and score the validation rows with it.
    net.load_state_dict(torch.load(filename_net))
    output = np.zeros((len(val_idx), 2), dtype=np.float32)
    for data, _, idx in val_loader:
        output[idx.numpy()] = net(data).detach().cpu().numpy()
    val_auc = roc_auc_score(val_y_npy, output[:, 1] - output[:, 0])
    print(f"fold {fold_} auc: {val_auc:.5f}")
    # expit of the logit difference equals the softmax probability of class 1.
    oof[val_idx] = expit(output[:, 1] - output[:, 0])
    fold_oof[fold_, val_idx] = oof[val_idx]
    
    # Test-set probabilities from the same checkpoint.
    output = np.zeros((len(test_dataset), 2), dtype=np.float32)
    for data, _, idx in test_loader:
        output[idx.numpy()] = net(data).detach().cpu().numpy()
    fold_preds[fold_, :] = expit(output[:, 1] - output[:, 0])
    predictions += fold_preds[fold_] / N_SPLITS
    
    # Only the first fold runs: `oof` stays 0 outside fold 0's validation rows
    # and `predictions` holds fold 0's output divided by N_SPLITS.
    break  # due to execution time limitation
    

# Persist the out-of-fold and test predictions.
np.save("oof.npy", oof)
np.save("fold_oof.npy", fold_oof)
np.save("fold_preds.npy", fold_preds)
np.save("predictions.npy", predictions)
# NOTE: this AUC is computed over all training rows; rows that were never in
# a trained fold's validation set still carry their initial 0.
print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))


# Write the submission file from the accumulated test predictions.
sub_df = pd.DataFrame({"ID_code":test_df["ID_code"].values})
sub_df["target"] = predictions
sub_df.to_csv("submission.csv", index=False)
