In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.nn import functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
import math
# Disable pandas' display truncation: passing None lifts the limit on the
# number of columns and rows rendered when a DataFrame is shown.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import time
import uuid
import optuna
import plotly
import matplotlib

In [2]:
# Use the first CUDA GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [3]:
class AmericanExpressProfileTimeSeriesDataset(Dataset):
    """Map-style dataset that serves one labelled customer profile per CSV row.

    The CSV is read eagerly with pandas. Besides ``customer_ID`` and ``target``
    every parsed column holds a stringified list such as ``"[0.1, 0.2]"``
    (elements separated by ``", "``), which is converted to floats on access.

    Args:
        dataset_file: Path passed to ``pd.read_csv``.
        nrows: Number of rows to read from the top of the file.
        transformation: Optional callable applied to the data tensor after it
            has been moved to ``device``; any falsy value disables it.
    """

    def __init__(self, dataset_file, nrows, transformation=False):
        self.dataset = pd.read_csv(dataset_file, nrows=nrows)
        self.transformation = transformation

    def __len__(self):
        return len(self.dataset)

    @staticmethod
    def _parse_series(cell):
        """Convert one stringified list (``"[a, b, c]"``) into a list of floats."""
        # Strip the surrounding brackets, then split on the ", " separator.
        return [float(item) for item in cell[1:-1].split(", ")]

    def __getitem__(self, idx):
        """Return ``(data, label)`` for row ``idx``.

        ``data`` is a float32 tensor with one row per parsed column; ``label``
        is a float32 tensor of shape ``(1,)``. Both live on ``device``.
        """
        row = self.dataset.iloc[[idx]]
        label = row["target"].values
        cells = row.drop(['customer_ID', 'target'], axis=1).values[0].tolist()

        # The first remaining column is skipped without being parsed
        # (presumably an index column written with the CSV - confirm).
        series = [self._parse_series(cell) for cell in cells[1:]]

        # Inputs are plain data, not learnable parameters, so they must not
        # require gradients: flagging them makes autograd track the input
        # through the whole network and prevents DataLoader workers from
        # collating the samples.
        data = torch.tensor(np.array(series, dtype=np.float32), dtype=torch.float32)
        label = torch.tensor(label, dtype=torch.float32)
        label = label.to(device)
        data = data.to(device)
        if self.transformation:
            data = self.transformation(data)
        return data, label

In [None]:
class Block(nn.Module):
    """Residual block made of three length-preserving 1-D convolutions.

    Each convolution is followed by batch normalisation. The block maps 189
    channels to 189 channels, so the input can be added back onto the output
    of the third convolution before the final activation.

    Args:
        trial: Optuna trial (or any object with the same ``suggest_*`` methods)
            used to pick the activation and the three kernel sizes.
        i: Index of this block; it is embedded in the hyper-parameter names.
    """

    def __init__(self, trial, i):
        super().__init__()

        act_name = trial.suggest_categorical(f"activation_conv_layer{i}", ["LeakyReLU", "ReLU", "ELU"])
        self.activ = getattr(nn, act_name)()

        # Fixed value; the source carried a commented-out Optuna choice among
        # "zeros", "reflect" and "replicate" at this point.
        pad_mode = "replicate"

        k1 = trial.suggest_int(f"conv1_kernel_layer{i}", 2, 13)
        k2 = trial.suggest_int(f"conv2_kernel_layer{i}", 2, 13)
        k3 = trial.suggest_int(f"conv3_kernel_layer{i}", 2, 13)

        # Fixed value; the source carried a commented-out Optuna range
        # (100..500, step 50) for this width.
        hidden = 189

        def conv_bn(c_in, c_out, kernel):
            """Convolution with 'same' padding followed by batch normalisation."""
            return nn.Sequential(
                nn.Conv1d(c_in, c_out, kernel_size=kernel, stride=1, padding="same", padding_mode=pad_mode),
                nn.BatchNorm1d(c_out),
            )

        self.conv1 = conv_bn(189, hidden, k1)
        self.conv2 = conv_bn(hidden, hidden, k2)
        self.conv3 = conv_bn(hidden, 189, k3)

    def forward(self, x):
        """Apply conv-activation twice, a third conv, the skip connection, then the activation."""
        out = self.activ(self.conv1(x))
        out = self.activ(self.conv2(out))
        out = self.conv3(out)
        return self.activ(out + x)

class ResNet(nn.Module):
    """Stack of residual ``Block`` modules followed by a fully connected head.

    The architecture is sampled from an Optuna trial: the number of residual
    blocks (1-10), the number of hidden linear layers (1-20) and the width of
    each hidden layer (100-300 in steps of 50). The head ends in a single
    sigmoid unit.

    Args:
        trial: Optuna trial used to sample the architecture.
    """

    def __init__(self, trial):
        super().__init__()

        n_layers_blocks = trial.suggest_int("n_layers_blocks", 1, 10)
        self.blocks = nn.Sequential()
        for i in range(n_layers_blocks):
            self.blocks.append(Block(trial, i))

        # Built once here instead of on every forward pass. The module has no
        # parameters, so the state dict is unaffected.
        self.pool = nn.AvgPool1d(13, stride=1)

        self.linear = nn.Sequential()
        # The head expects 189 features, i.e. the pooled sequence must have
        # length 1 (assumes inputs of length 13 - TODO confirm).
        linear_in = 189

        n_layers_linear = trial.suggest_int("n_layers_linear", 1, 20)
        for k in range(n_layers_linear):
            # `step` is passed by keyword: newer Optuna releases treat it as
            # keyword-only and warn when it is given positionally.
            linear_out = trial.suggest_int(f"linear_size_layer{k}", 100, 300, step=50)
            self.linear.append(nn.Linear(linear_in, linear_out))
            self.linear.append(nn.LeakyReLU())
            linear_in = linear_out

        self.linear.append(nn.Linear(linear_in, 1))
        self.linear.append(nn.Sigmoid())

    def forward(self, x):
        """Run the residual blocks, average-pool, flatten and apply the head."""
        x = self.blocks(x)
        x = self.pool(x)
        x = x.view(x.shape[0], -1)
        return self.linear(x)

In [15]:
class ConvNet(nn.Module):
    """Plain 1-D convolutional classifier whose architecture is sampled by Optuna.

    Sampled hyper-parameters: the convolution and linear activations, the
    number of convolution layers (1-30) with per-layer width, kernel size and
    optional batch norm, the pooling type, and the number of hidden linear
    layers (1-10) with per-layer width and optional batch norm. The network
    ends in a single sigmoid unit.

    Every hyper-parameter, including the pooling type, is sampled in
    ``__init__`` so that ``forward`` never touches the trial (and therefore
    never hits the study storage) and the model keeps working after the trial
    has finished.

    Args:
        trial: Optuna trial used to sample the architecture. It is still
            stored on ``self.trial`` for backward compatibility.
    """

    def __init__(self, trial):
        super().__init__()

        self.trial = trial

        conv_activation = trial.suggest_categorical("activation_conv", ["LeakyReLU", "ReLU", "ELU"])
        self.conv_activ = getattr(nn, conv_activation)()

        lin_activation = trial.suggest_categorical("activation_lin", ["LeakyReLU", "ReLU", "ELU"])
        self.lin_activ = getattr(nn, lin_activation)()

        n_layers_blocks = trial.suggest_int("n_layers_blocks", 1, 30)
        self.blocks = nn.Sequential()
        conv_in = 189
        for i in range(n_layers_blocks):
            # `step` is passed by keyword: newer Optuna releases treat it as
            # keyword-only and warn when it is given positionally.
            conv_out = trial.suggest_int(f"conv_size_layer{i}", 100, 300, step=10)
            kernel_size = trial.suggest_int(f"kernel_size_layer{i}", 1, 13)
            self.blocks.append(nn.Conv1d(conv_in, conv_out, kernel_size=kernel_size, stride=1, padding="same", padding_mode="replicate"))
            # The same activation instance is reused across layers.
            self.blocks.append(self.conv_activ)
            batchnorm = trial.suggest_categorical(f"conv_batchnorm_layer{i}", [True, False])
            if batchnorm:
                self.blocks.append(nn.BatchNorm1d(conv_out))
            conv_in = conv_out

        # Sampled and built once here rather than on every forward pass.
        pool = trial.suggest_categorical("pool", ["AvgPool1d", "MaxPool1d"])
        self.pooling = getattr(nn, pool)(13, stride=1)

        self.linear = nn.Sequential()
        # The head expects `conv_out` features, i.e. the pooled sequence must
        # have length 1 (assumes inputs of length 13 - TODO confirm).
        linear_in = conv_out
        n_layers_linear = trial.suggest_int("n_layers_linear", 1, 10)
        for k in range(n_layers_linear):
            linear_out = trial.suggest_int(f"linear_size_layer{k}", 100, 300, step=10)
            self.linear.append(nn.Linear(linear_in, linear_out))
            self.linear.append(self.lin_activ)
            batchnorm = trial.suggest_categorical(f"linear_batchnorm_layer{k}", [True, False])
            if batchnorm:
                self.linear.append(nn.BatchNorm1d(linear_out))
            linear_in = linear_out
        self.linear.append(nn.Linear(linear_in, 1))
        self.linear.append(nn.Sigmoid())

    def forward(self, x):
        """Run the convolution stack, pool, flatten and apply the linear head."""
        x = self.blocks(x)
        x = self.pooling(x)
        x = x.view(x.shape[0], -1)
        return self.linear(x)

In [None]:
class LSTM(nn.Module):
    """Stacked LSTM followed by a small fully connected sigmoid head.

    The final hidden states of the last two LSTM layers are concatenated
    (``2 * hidden_size`` features) and fed through
    Linear -> ReLU -> Linear -> ReLU -> Linear -> Sigmoid.

    Args:
        input_size: Number of features per time step.
        hidden_size: Hidden state size of every LSTM layer.
        num_layers: Number of stacked LSTM layers. ``forward`` reads the last
            two layers, so at least 2 are needed.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)

        self.linear = nn.Sequential(
            # Width follows the concatenation in `forward`. It was hard-coded
            # to 20, which only matched hidden_size == 10.
            nn.Linear(2 * hidden_size, 10),
            nn.ReLU(),
            nn.Linear(10, 5),
            nn.ReLU(),
            nn.Linear(5, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return one sigmoid output per batch element.

        ``x`` is batch-first (the LSTM is built with ``batch_first=True``).
        """
        batch = x.size(0)
        # Create the zero initial states on the input's own device and dtype
        # instead of relying on a notebook-level device variable.
        h_0 = torch.zeros(self.num_layers, batch, self.hidden_size, device=x.device, dtype=x.dtype)
        c_0 = torch.zeros(self.num_layers, batch, self.hidden_size, device=x.device, dtype=x.dtype)
        _, (hn, _) = self.lstm(x, (h_0, c_0))
        hidden = torch.cat((hn[-2, :, :], hn[-1, :, :]), dim=1)
        return self.linear(hidden)

# LSTM baseline: 189 input features, hidden size 10, 10 stacked layers,
# placed on the selected device.
model = LSTM(189, 10, 10).to(device)

In [8]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Return the mean of the normalised weighted Gini and the top-4% capture rate.

    Args:
        y_true: Frame with a ``target`` column; rows with target 0 get weight
            20, all other rows weight 1.
        y_pred: Frame with a ``prediction`` column, index-aligned with ``y_true``.

    Returns:
        ``0.5 * (G + D)`` where ``G`` is the weighted Gini of the predictions
        divided by the weighted Gini of a perfect ranking, and ``D`` is the
        share of positives found within the top 4% of total weight.
    """

    def _ranked(truth: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        """Join both frames column-wise, order by score (highest first), add weights."""
        ranked = pd.concat([truth, scores], axis='columns').sort_values('prediction', ascending=False)
        ranked['weight'] = ranked['target'].map(lambda t: 20 if t == 0 else 1)
        return ranked

    def top_four_percent_captured(truth: pd.DataFrame, scores: pd.DataFrame) -> float:
        ranked = _ranked(truth, scores)
        cutoff = int(0.04 * ranked['weight'].sum())
        inside = ranked['weight'].cumsum() <= cutoff
        captured = (ranked.loc[inside, 'target'] == 1).sum()
        return captured / (ranked['target'] == 1).sum()

    def weighted_gini(truth: pd.DataFrame, scores: pd.DataFrame) -> float:
        ranked = _ranked(truth, scores)
        weight = ranked['weight']
        weighted_pos = ranked['target'] * weight
        random_curve = (weight / weight.sum()).cumsum()
        lorentz_curve = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz_curve - random_curve) * weight).sum()

    def normalized_weighted_gini(truth: pd.DataFrame, scores: pd.DataFrame) -> float:
        # A perfect ranking is obtained by using the targets as predictions.
        perfect = truth.rename(columns={'target': 'prediction'})
        return weighted_gini(truth, scores) / weighted_gini(truth, perfect)

    gini_part = normalized_weighted_gini(y_true, y_pred)
    capture_part = top_four_percent_captured(y_true, y_pred)
    return 0.5 * (gini_part + capture_part)



In [5]:
def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            pred = torch.round(pred)
            correct += (pred == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    #print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return test_loss

In [6]:
def validate(dataloader, model):
    """Return the Amex metric of ``model`` over the whole ``dataloader``.

    Predictions and targets of all batches are collected first and the metric
    is computed once over the full set. Previously the function returned from
    inside the loop, so only the first batch was ever scored; for a loader
    with a single batch the result is unchanged.

    The model is put into evaluation mode while predicting and its previous
    mode is restored afterwards. Gradients are not tracked.

    Args:
        dataloader: Iterable yielding ``(X, y)`` batches.
        model: Callable mapping ``X`` to one prediction per sample.

    Returns:
        The value of ``amex_metric``, or ``None`` when ``dataloader`` yields
        no batches.
    """
    is_module = isinstance(model, torch.nn.Module)
    was_training = model.training if is_module else False
    if is_module:
        model.eval()

    preds, targets = [], []
    try:
        with torch.no_grad():
            for X, y in dataloader:
                X = X.to(device)
                y = y.to(device)
                # Flatten to one value per sample so batches can be concatenated.
                preds.append(model(X).detach().cpu().reshape(-1))
                targets.append(y.detach().cpu().reshape(-1))
    finally:
        if is_module:
            model.train(was_training)

    if not preds:
        return None

    # amex_metric reads the "prediction" and "target" columns and aligns the
    # two frames on their (default, identical) index.
    pred_df = pd.DataFrame({"prediction": torch.cat(preds).numpy()})
    target_df = pd.DataFrame({"target": torch.cat(targets).numpy()})
    return amex_metric(target_df, pred_df)

In [7]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Train ``model`` for one pass over ``dataloader``.

    For every batch: move it to ``device``, compute the loss, back-propagate
    and take one optimizer step. The model is put into training mode first.

    Args:
        dataloader: Iterable yielding ``(X, y)`` batches.
        model: Model to optimise.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: Optimizer holding the model's parameters.

    Returns:
        The loss of the last batch as a float, or ``nan`` when ``dataloader``
        yields no batches (this used to fail on an unbound variable).
    """
    if isinstance(model, torch.nn.Module):
        model.train()

    loss = None
    for X, y in dataloader:
        # Compute prediction and loss
        X = X.to(device)
        y = y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Convert to a Python float once, after the loop.
    return loss.item() if loss is not None else float("nan")

In [19]:
# "Small" split: the first 10,000 CSV rows, 90% for training and the rest for testing.
train_test_ration = 0.9
full_dataset = AmericanExpressProfileTimeSeriesDataset(
    "transformed_dataset.csv",
    nrows=10_000,
)  # a transposing `transformation=lambda data: data.T` can be passed if needed

train_size = int(train_test_ration * len(full_dataset))
test_size = len(full_dataset) - train_size

# No generator is passed, so the split differs from run to run.
train_dataset1, test_dataset1 = torch.utils.data.random_split(
    full_dataset,
    [train_size, test_size],
)

# No DataLoaders are created here; `objective` builds them per trial.

In [21]:
# "Medium" split: the first 100,000 CSV rows, 90% for training and the rest for testing.
train_test_ration = 0.9
full_dataset = AmericanExpressProfileTimeSeriesDataset(
    "transformed_dataset.csv",
    nrows=100_000,
)  # a transposing `transformation=lambda data: data.T` can be passed if needed

train_size = int(train_test_ration * len(full_dataset))
test_size = len(full_dataset) - train_size

# No generator is passed, so the split differs from run to run.
train_dataset2, test_dataset2 = torch.utils.data.random_split(
    full_dataset,
    [train_size, test_size],
)

# No DataLoaders are created here; `objective` builds them per trial.

In [23]:
# "Large" split: the first 500,000 CSV rows, 90% for training and the rest for testing.
train_test_ration = 0.9
full_dataset = AmericanExpressProfileTimeSeriesDataset(
    "transformed_dataset.csv",
    nrows=500_000,
)  # a transposing `transformation=lambda data: data.T` can be passed if needed

train_size = int(train_test_ration * len(full_dataset))
test_size = len(full_dataset) - train_size

# No generator is passed, so the split differs from run to run.
train_dataset3, test_dataset3 = torch.utils.data.random_split(
    full_dataset,
    [train_size, test_size],
)

# No DataLoaders are created here; `objective` builds them per trial.

In [None]:
def objective(trial):
    """Optuna objective: train a sampled ``ConvNet`` and return its test loss.

    Sampled per trial: the network architecture (inside ``ConvNet``), learning
    rate, optimizer, multiplicative learning-rate decay factor, batch size,
    dataset size ("small" / "medium" / "large") and number of epochs.

    After every epoch the test loss is reported to the trial and the pruner is
    consulted, so an unpromising trial stops early. Previously the pruning
    check only ran after the last epoch, when no work could be saved.

    Args:
        trial: The Optuna trial to sample from and report to.

    Returns:
        The test loss (binary cross-entropy) after the final epoch.

    Raises:
        optuna.exceptions.TrialPruned: When the pruner asks to stop the trial.
    """
    model = ConvNet(trial).to(device)
    loss_fn = nn.BCELoss()
    lr = trial.suggest_float("lr", 1e-7, 0.1, log=True)
    optim_ = trial.suggest_categorical("optimizer", ["AdamW", "Adam", "SGD", "Adagrad", "NAdam"])
    optimizer = getattr(optim, optim_)(model.parameters(), lr=lr)
    lr_lambda = trial.suggest_float("lr_lambda", 0.4, 0.99)

    # `step` is passed by keyword: newer Optuna releases treat it as
    # keyword-only and warn when it is given positionally.
    batch_size = trial.suggest_int("batch_size", 16, 128, step=16)

    dataset = trial.suggest_categorical("dataset_size", ["small", "medium", "large"])

    # Resolved lazily so that only the chosen split has to exist in the kernel.
    if dataset == "small":
        train_dataset, test_dataset = train_dataset1, test_dataset1
    elif dataset == "medium":
        train_dataset, test_dataset = train_dataset2, test_dataset2
    else:
        train_dataset, test_dataset = train_dataset3, test_dataset3

    # NOTE(review): the training loader is not shuffled between epochs - confirm this is intended.
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Multiply the learning rate by the constant factor `lr_lambda` after every epoch.
    scheduler = torch.optim.lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lambda epoch: lr_lambda)

    epochs = trial.suggest_int("epochs", 1, 10)

    for t in range(epochs):
        train_loop(train_dataloader, model, loss_fn, optimizer)
        loss = test_loop(test_dataloader, model, loss_fn)

        trial.report(loss, t)
        # Check right after reporting so pruning can skip the remaining epochs.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

        scheduler.step()

    return loss

In [24]:
# Create the study in the local SQLite database, or reopen it if a study with
# this name already exists there. Lower objective values are better.
study = optuna.create_study(
    study_name="optuna-study-convolution",
    storage='sqlite:///optuna-study.db',
    direction="minimize",
    load_if_exists=True,
)

[32m[I 2022-09-15 10:14:42,788][0m A new study created in RDB with name: optuna-study-convolution[0m


In [25]:
# Run 100 trials of `objective`; results are stored in the study.
study.optimize(objective, n_trials=100)

# Report the best trial seen so far: its objective value and every sampled
# hyper-parameter, one per line.
print("Best trial:")
trial = study.best_trial
print(" Value: ", trial.value)
print(" Params: ")
for key, value in trial.params.items():
    print("     {}: {}".format(key, value))

[32m[I 2022-09-15 10:15:38,783][0m Trial 0 finished with value: 0.4446168430149555 and parameters: {'activation_conv': 'LeakyReLU', 'activation_lin': 'ELU', 'n_layers_blocks': 22, 'conv_size_layer0': 270, 'kernel_size_layer0': 4, 'conv_batchnorm_layer0': True, 'conv_size_layer1': 100, 'kernel_size_layer1': 2, 'conv_batchnorm_layer1': False, 'conv_size_layer2': 300, 'kernel_size_layer2': 10, 'conv_batchnorm_layer2': True, 'conv_size_layer3': 270, 'kernel_size_layer3': 2, 'conv_batchnorm_layer3': False, 'conv_size_layer4': 130, 'kernel_size_layer4': 9, 'conv_batchnorm_layer4': False, 'conv_size_layer5': 130, 'kernel_size_layer5': 9, 'conv_batchnorm_layer5': True, 'conv_size_layer6': 210, 'kernel_size_layer6': 8, 'conv_batchnorm_layer6': False, 'conv_size_layer7': 100, 'kernel_size_layer7': 3, 'conv_batchnorm_layer7': False, 'conv_size_layer8': 280, 'kernel_size_layer8': 13, 'conv_batchnorm_layer8': True, 'conv_size_layer9': 290, 'kernel_size_layer9': 6, 'conv_batchnorm_layer9': False, 

Best trial:
 Value:  0.23998218443658617
 Params: 
     activation_conv: ELU
     activation_lin: ELU
     batch_size: 112
     conv_batchnorm_layer0: True
     conv_batchnorm_layer1: False
     conv_batchnorm_layer10: False
     conv_batchnorm_layer11: False
     conv_batchnorm_layer12: False
     conv_batchnorm_layer13: True
     conv_batchnorm_layer14: False
     conv_batchnorm_layer2: False
     conv_batchnorm_layer3: True
     conv_batchnorm_layer4: True
     conv_batchnorm_layer5: False
     conv_batchnorm_layer6: False
     conv_batchnorm_layer7: True
     conv_batchnorm_layer8: False
     conv_batchnorm_layer9: True
     conv_size_layer0: 290
     conv_size_layer1: 260
     conv_size_layer10: 130
     conv_size_layer11: 150
     conv_size_layer12: 160
     conv_size_layer13: 190
     conv_size_layer14: 280
     conv_size_layer2: 180
     conv_size_layer3: 270
     conv_size_layer4: 220
     conv_size_layer5: 190
     conv_size_layer6: 220
     conv_size_layer7: 100
     conv_si

In [None]:
class AmericanExpressProfileTimeSeriesValidationDataset(Dataset):
    """Map-style dataset serving unlabelled customer profiles from a CSV file.

    Besides ``customer_ID`` every parsed column holds a stringified list such
    as ``"[0.1, 0.2]"`` (elements separated by ``", "``), which is converted
    to floats on access. Items are data tensors only; the customer IDs stay
    available in ``self.dataset["customer_ID"]``.

    Args:
        dataset_file: Path passed to ``pd.read_csv``.
        transformation: Optional callable applied to the data tensor after it
            has been moved to ``device``; any falsy value disables it.
        nrows: Number of rows to read from the top of the file, or ``None``
            for the whole file. Defaults to 10, the value that used to be
            hard-coded (it looks like a debugging limit - confirm before
            producing real predictions).
    """

    def __init__(self, dataset_file, transformation=False, nrows=10):
        self.dataset = pd.read_csv(dataset_file, nrows=nrows)
        self.transformation = transformation

    def __len__(self):
        return len(self.dataset)

    @staticmethod
    def _parse_series(cell):
        """Convert one stringified list (``"[a, b, c]"``) into a list of floats."""
        # Strip the surrounding brackets, then split on the ", " separator.
        return [float(item) for item in cell[1:-1].split(", ")]

    def __getitem__(self, idx):
        """Return the float32 data tensor of row ``idx``, located on ``device``."""
        row = self.dataset.iloc[[idx]]
        cells = row.drop(['customer_ID'], axis=1).values[0].tolist()

        # The first remaining column is skipped without being parsed
        # (presumably an index column written with the CSV - confirm).
        series = [self._parse_series(cell) for cell in cells[1:]]

        # Inputs are plain data, not learnable parameters, so they must not
        # require gradients.
        data = torch.tensor(np.array(series, dtype=np.float32), dtype=torch.float32)
        data = data.to(device)
        if self.transformation:
            data = self.transformation(data)
        return data

In [None]:
# Batch size used for inference. This cell used to read a notebook-level
# `batch_size`, but that name is only ever defined inside `objective`, so the
# cell failed with a NameError on a fresh kernel.
VALIDATION_BATCH_SIZE = 64

# Each sample is transposed by the transformation before it is returned.
# NOTE(review): the path has no ".csv" extension - confirm the file name.
full_validation_dataset = AmericanExpressProfileTimeSeriesValidationDataset("transformed_test_dataset_normalized", transformation=lambda data: data.T)

validation_dataloader = DataLoader(full_validation_dataset, batch_size=VALIDATION_BATCH_SIZE)

In [None]:
# Sanity check: shape of the first validation sample (after the transpose
# applied by the dataset's transformation).
full_validation_dataset[0].shape

In [None]:
def validation_loop(dataloader, model, out_file):
    """Write one ``customer_ID,prediction`` line per sample to ``<out_file>.csv``.

    Two batch layouts are accepted:

    * ``(X, customer_ID)`` pairs - the IDs of the batch are used directly.
    * a bare tensor ``X`` - the IDs are taken, in order, from the
      ``customer_ID`` column of ``dataloader.dataset.dataset``. This requires
      a sequential (unshuffled) loader so rows and predictions line up.

    Predictions are flattened to one scalar per sample before writing, so a
    model output of shape ``(batch, 1)`` is written as ``0.5`` rather than
    ``[0.5]``. The model is put into evaluation mode while predicting and its
    previous mode is restored afterwards. Gradients are not tracked.

    Args:
        dataloader: Iterable of batches in one of the layouts above.
        model: Callable mapping ``X`` to one prediction per sample.
        out_file: Output path without the ``.csv`` suffix.

    Raises:
        ValueError: If batches carry no IDs and they cannot be recovered from
            the loader's dataset, or the loader is not sequential.
    """
    is_module = isinstance(model, torch.nn.Module)
    was_training = model.training if is_module else False
    if is_module:
        model.eval()

    # Fallback source of IDs for datasets that only yield the data tensor.
    frame = getattr(getattr(dataloader, "dataset", None), "dataset", None)
    offset = 0

    try:
        # The context manager closes the file even if prediction fails midway.
        with torch.no_grad(), open(f"{out_file}.csv", "w") as data_file:
            data_file.write("customer_ID,prediction\n")
            for batch in dataloader:
                if isinstance(batch, (list, tuple)):
                    X, customer_ID = batch
                else:
                    X = batch
                    if not (isinstance(frame, pd.DataFrame) and "customer_ID" in frame.columns):
                        raise ValueError(
                            "batches carry no customer_ID and the dataloader's dataset "
                            "does not expose a DataFrame with a 'customer_ID' column"
                        )
                    sampler = getattr(dataloader, "sampler", None)
                    if not isinstance(sampler, torch.utils.data.SequentialSampler):
                        raise ValueError(
                            "customer_IDs can only be recovered from the dataset "
                            "when the dataloader iterates sequentially"
                        )
                    customer_ID = frame["customer_ID"].iloc[offset:offset + len(X)].tolist()
                offset += len(X)

                pred = model(X).reshape(-1).tolist()
                for cid, value in zip(customer_ID, pred):
                    data_file.write(f"{cid},{value}\n")
    finally:
        if is_module:
            model.train(was_training)

In [None]:
# Write the predictions for the validation loader to ./test_data/test_labels.csv.
# NOTE(review): the only notebook-level `model` defined in this file is the
# LSTM instance created right after the LSTM class; `objective` keeps its
# ConvNet local, and no training of that LSTM is visible here. Confirm this is
# the model that should produce the predictions.
validation_loop(validation_dataloader, model, "./test_data/test_labels")