In [25]:
import copy
import math
import time
import uuid

import numpy as np
import optuna
import pandas as pd
import torch
import torch.optim as optim
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# Use the first CUDA GPU when PyTorch reports one as available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [132]:
class AmericanExpressProfileTimeSeriesDataset(Dataset):
    """Map-style dataset over a CSV whose feature cells are stringified float lists.

    Every row must contain ``customer_ID`` and ``target`` columns. Once those
    two are dropped, the first remaining column is discarded and each other
    cell is parsed from text such as ``"[0.1, 0.2]"`` into a list of floats.

    Args:
        dataset_file: Path (or buffer) passed straight to ``pandas.read_csv``.
        nrows: Maximum number of rows to read; forwarded to ``pandas.read_csv``.
        transformation: Optional callable applied to the feature tensor after
            it has been moved to ``device``. Any falsy value (the default
            ``False``) disables it.
    """

    def __init__(self, dataset_file, nrows, transformation=False):
        self.dataset = pd.read_csv(dataset_file, nrows=nrows)
        self.transformation = transformation

    def __len__(self):
        """Number of rows that were read from the CSV."""
        return len(self.dataset)

    @staticmethod
    def _parse_float_list(cell):
        """Convert a string like ``"[1.0, 2.0]"`` into ``[1.0, 2.0]``."""
        # Strip the surrounding brackets, then split on the ", " separator.
        return [float(token) for token in cell[1:-1].split(", ")]

    def __getitem__(self, idx):
        """Return ``(features, label)`` for row ``idx`` as float32 tensors on ``device``.

        ``features`` has one row per parsed column; ``label`` has shape ``(1,)``.
        """
        row = self.dataset.iloc[[idx]]
        label = row["target"].values
        cells = row.drop(['customer_ID', 'target'], axis=1).values[0].tolist()

        # The first remaining column is skipped, as before (presumably a
        # row-index column rather than a feature sequence -- TODO confirm).
        features = [self._parse_float_list(cell) for cell in cells[1:]]

        # Inputs are plain data: they are created without requires_grad so that
        # autograd does not track gradients with respect to the features.
        data = torch.tensor(np.array(features, dtype=np.float32), dtype=torch.float32)
        label = torch.tensor(label, dtype=torch.float32)
        label = label.to(device)
        data = data.to(device)
        if self.transformation:
            data = self.transformation(data)
        return data, label

In [154]:
class Block(nn.Module):
    """Residual block of three 1-D convolutions with batch normalisation.

    The block maps ``(batch, 189, length)`` to the same shape: every
    convolution uses stride 1 with ``padding="same"`` and the last one
    projects back to 189 channels, so the input can be added as a skip
    connection.

    Hyper-parameters requested from ``trial``:
        * ``padding_mode`` -- one of ``"zeros"``, ``"reflect"``, ``"replicate"``
        * ``conv1_kernel`` / ``conv2_kernel`` / ``conv3_kernel`` -- 2..13
        * ``channel_size_conv_intermediate`` -- 100..500 in steps of 50

    Args:
        trial: Object exposing Optuna's ``suggest_categorical`` and
            ``suggest_int`` methods.
    """

    def __init__(self, trial):
        super().__init__()

        self.relu = nn.LeakyReLU()

        padding_mode = trial.suggest_categorical("padding_mode", ["zeros", "reflect", "replicate"])

        kernel_size1 = trial.suggest_int("conv1_kernel", 2, 13)
        kernel_size2 = trial.suggest_int("conv2_kernel", 2, 13)
        kernel_size3 = trial.suggest_int("conv3_kernel", 2, 13)

        # `step` is passed by keyword: newer Optuna releases declare it
        # keyword-only, and the keyword form also works on older releases.
        channel_size_conv_intermediate = trial.suggest_int(
            "channel_size_conv_intermediate", 100, 500, step=50
        )

        self.conv1 = nn.Sequential(
            nn.Conv1d(189, channel_size_conv_intermediate, kernel_size=kernel_size1, stride=1, padding="same", padding_mode=padding_mode),
            nn.BatchNorm1d(channel_size_conv_intermediate),
        )

        self.conv2 = nn.Sequential(
            nn.Conv1d(channel_size_conv_intermediate, channel_size_conv_intermediate, kernel_size=kernel_size2, stride=1, padding="same", padding_mode=padding_mode),
            nn.BatchNorm1d(channel_size_conv_intermediate),
        )

        self.conv3 = nn.Sequential(
            nn.Conv1d(channel_size_conv_intermediate, 189, kernel_size=kernel_size3, stride=1, padding="same", padding_mode=padding_mode),
            nn.BatchNorm1d(189),
        )

    def forward(self, x):
        """Apply conv1 -> conv2 -> conv3 with LeakyReLU in between, then add the input."""
        identity = x
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))
        x = self.conv3(x)
        # Residual connection: activation is applied after the skip addition.
        x = x + identity
        return self.relu(x)

class ResNet(nn.Module):
    """Stack of residual ``Block`` modules followed by a small classifier head.

    Hyper-parameters requested from ``trial``:
        * ``n_layers_blocks`` -- 1..26 in steps of 5 (number of ``Block`` modules)
        * ``linear_intermediate_layer_size`` -- 200..500 in steps of 50

    Input is ``(batch, 189, length)``; output is ``(batch, 1)`` after a
    sigmoid, i.e. values in ``(0, 1)``.

    Args:
        trial: Object exposing Optuna's ``suggest_*`` methods; it is also
            handed to every ``Block``.
    """

    def __init__(self, trial):
        super().__init__()

        # Not used in forward(); kept so the module's attributes are unchanged.
        self.relu = nn.ReLU()

        # `step` is passed by keyword: newer Optuna releases declare it
        # keyword-only, and the keyword form also works on older releases.
        n_layers_blocks = trial.suggest_int("n_layers_blocks", 1, 26, step=5)

        self.blocks = nn.Sequential(*[Block(trial) for _ in range(n_layers_blocks)])

        linear_intermediate_layer_size = trial.suggest_int(
            "linear_intermediate_layer_size", 200, 500, step=50
        )

        self.linear = nn.Sequential(
            nn.Linear(189, linear_intermediate_layer_size),
            nn.LeakyReLU(),
            nn.Linear(linear_intermediate_layer_size, 1),
            nn.Sigmoid()
        )

        # Global average pooling over the time axis. For the length-13 inputs
        # the previous AvgPool1d(13) handled this yields the same single value
        # per channel, but it also works for any other sequence length and is
        # built once instead of on every forward pass.
        self.pool = nn.AdaptiveAvgPool1d(1)

    def forward(self, x):
        """Run the residual stack, average over time and classify."""
        x = self.blocks(x)
        x = self.pool(x)
        x = x.flatten(start_dim=1)  # (batch, 189, 1) -> (batch, 189)
        return self.linear(x)

In [None]:
class Encoder(nn.Module):
    """Three unpadded Conv1d + LeakyReLU stages reducing 189 channels to 100.

    Channel progression is 189 -> 170 -> 140 -> 100 with kernel sizes 8, 5
    and 3 at stride 1. No padding is configured, so each stage shortens the
    sequence by ``kernel_size - 1``.
    """

    def __init__(self):
        super().__init__()

        # (attribute name, input channels, output channels, kernel size)
        stage_specs = (
            ("conv1", 189, 170, 8),
            ("conv2", 170, 140, 5),
            ("conv3", 140, 100, 3),
        )
        for attr_name, channels_in, channels_out, width in stage_specs:
            stage = nn.Sequential(
                nn.Conv1d(channels_in, channels_out, kernel_size=width, stride=1),
                nn.LeakyReLU(),
            )
            setattr(self, attr_name, stage)

    def forward(self, x):
        """Apply conv1, conv2 and conv3 in that order."""
        return self.conv3(self.conv2(self.conv1(x)))

class Decoder(nn.Module):
    """Three ConvTranspose1d + LeakyReLU stages expanding 100 channels to 189.

    ``forward`` runs conv1 (100 -> 140, kernel 3), conv2 (140 -> 170,
    kernel 5) and conv3 (170 -> 189, kernel 8), all at stride 1, so each
    stage lengthens the sequence by ``kernel_size - 1``.
    """

    def __init__(self):
        super().__init__()

        # Stages are registered as conv3, conv2, conv1 -- the same order the
        # attributes were originally assigned in.
        stage_specs = (
            ("conv3", 170, 189, 8),
            ("conv2", 140, 170, 5),
            ("conv1", 100, 140, 3),
        )
        for attr_name, channels_in, channels_out, width in stage_specs:
            stage = nn.Sequential(
                nn.ConvTranspose1d(channels_in, channels_out, kernel_size=width, stride=1),
                nn.LeakyReLU(),
            )
            setattr(self, attr_name, stage)

    def forward(self, x):
        """Apply conv1, conv2 and conv3 in that order."""
        return self.conv3(self.conv2(self.conv1(x)))

class Autoencoder(nn.Module):
    """Pairs an ``Encoder`` with a ``Decoder`` and exposes each half separately."""

    def __init__(self):
        super().__init__()
        encoder_net = Encoder()
        decoder_net = Decoder()
        self.encoder = encoder_net
        self.decoder = decoder_net

    def forward(self, x):
        """Encode ``x`` and immediately decode the result."""
        latent = self.encoder(x)
        return self.decoder(latent)

    def encode(self, x):
        """Run only the encoder half."""
        return self.encoder(x)

    def decode(self, x):
        """Run only the decoder half."""
        return self.decoder(x)

# Build the autoencoder and place it on the selected device in one step.
model = Autoencoder().to(device)

In [None]:
class ConvolutionalNetwork(nn.Module):
    """Fixed 1-D CNN classifier ending in a sigmoid.

    The layer stack is hard-coded. With stride 1 throughout, the six
    kernel-2 convolutions, the two ``MaxPool1d(2)`` layers and the final
    ``MaxPool1d(5)`` shorten the sequence by 12 in total, so an input of
    shape ``(batch, 197, 13)`` is reduced to length 1 and flattens to the
    1024 features expected by the first linear layer. Output is ``(batch, 1)``.

    Args:
        convolution_layers: Unused; accepted for backward compatibility.
        linear_layers: Unused; accepted for backward compatibility.
        activation: Unused; accepted for backward compatibility.
        final_activation: Unused; accepted for backward compatibility.

    None of the four arguments influences the network, so they now default
    to ``None`` and the class can be instantiated without arguments.
    """

    def __init__(self, convolution_layers=None, linear_layers=None, activation=None, final_activation=None):
        super().__init__()

        self.conv = nn.Sequential(
            nn.Conv1d(197, 256, 2, stride=1),
            nn.ReLU(),
            nn.Conv1d(256, 512, 2, stride=1),
            nn.ReLU(),
            nn.MaxPool1d(2, stride=1),
            nn.Conv1d(512, 1024, 2, stride=1),
            nn.ReLU(),
            nn.Conv1d(1024, 1024, 2, stride=1),
            nn.ReLU(),
            nn.MaxPool1d(2, stride=1),
            nn.Conv1d(1024, 1024, 2, stride=1),
            nn.ReLU(),
            nn.Conv1d(1024, 1024, 2, stride=1),
            nn.ReLU(),
            nn.MaxPool1d(5, stride=1),
            nn.Flatten(),
            nn.Linear(1024, 256),
            nn.ReLU(),
            nn.Linear(256, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Run the whole convolution + classifier stack."""
        return self.conv(x)

#model = ConvolutionalNetwork()
#model = model.to(device)

In [None]:
class LSTM(nn.Module):
    """Stacked LSTM whose two top-layer final hidden states feed a sigmoid classifier.

    Args:
        input_size: Number of features per time step.
        hidden_size: Hidden state width of every LSTM layer.
        num_layers: Number of stacked LSTM layers; must be at least 2 because
            ``forward`` concatenates the final hidden states of the last two
            layers.

    Input is ``(batch, seq_len, input_size)`` (``batch_first=True``); output
    is ``(batch, 1)`` in ``(0, 1)``.

    Raises:
        ValueError: If ``num_layers`` is smaller than 2.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        if num_layers < 2:
            # forward() indexes hn[-2]; fail early with a clear message instead
            # of an IndexError on the first batch.
            raise ValueError("LSTM requires num_layers >= 2, got {}".format(num_layers))
        self.num_layers = num_layers
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)

        self.linear = nn.Sequential(
            # Two hidden states are concatenated, so the head takes
            # 2 * hidden_size features (20 for the previous hidden_size=10).
            nn.Linear(2 * hidden_size, 10),
            nn.ReLU(),
            nn.Linear(10, 5),
            nn.ReLU(),
            nn.Linear(5, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Run the LSTM from zero initial states and classify its final hidden states."""
        batch = x.size(0)
        # Create the initial states on the input's own device/dtype so the
        # module does not depend on a module-level `device` variable.
        h_0 = torch.zeros(self.num_layers, batch, self.hidden_size, device=x.device, dtype=x.dtype)
        c_0 = torch.zeros(self.num_layers, batch, self.hidden_size, device=x.device, dtype=x.dtype)
        _, (hn, _) = self.lstm(x, (h_0, c_0))
        hidden = torch.cat((hn[-2], hn[-1]), dim=1)
        return self.linear(hidden)

# Replace `model` with an LSTM (189 input features, hidden size 10, 10 layers) on the selected device.
model = LSTM(189, 10, 10).to(device)

In [None]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Average of the normalized weighted Gini and the top-4% capture rate.

    Args:
        y_true: Frame with a ``target`` column.
        y_pred: Frame with a ``prediction`` column, row-aligned with ``y_true``.

    Rows whose target is 0 carry weight 20, all other rows weight 1.

    Returns:
        ``0.5 * (gini + capture)``.
    """

    def ranked_with_weights(truth: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Join side by side, order by descending prediction, attach row weights.
        ranked = pd.concat([truth, scores], axis='columns')
        ranked = ranked.sort_values('prediction', ascending=False)
        ranked['weight'] = ranked['target'].apply(lambda t: 20 if t == 0 else 1)
        return ranked

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        ranked = ranked_with_weights(y_true, y_pred)
        weight_budget = int(0.04 * ranked['weight'].sum())
        within_budget = ranked['weight'].cumsum() <= weight_budget
        captured_targets = ranked.loc[within_budget, 'target']
        return (captured_targets == 1).sum() / (ranked['target'] == 1).sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        ranked = ranked_with_weights(y_true, y_pred)
        weight = ranked['weight']
        random_share = (weight / weight.sum()).cumsum()
        weighted_positives = ranked['target'] * weight
        lorentz = weighted_positives.cumsum() / weighted_positives.sum()
        return ((lorentz - random_share) * weight).sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Scoring the targets against themselves gives the best achievable Gini.
        perfect_pred = y_true.rename(columns={'target': 'prediction'})
        achieved = weighted_gini(y_true, y_pred)
        best_possible = weighted_gini(y_true, perfect_pred)
        return achieved / best_possible

    gini_part = normalized_weighted_gini(y_true, y_pred)
    capture_part = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (gini_part + capture_part)



In [114]:
def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            pred = torch.round(pred)
            correct += (pred == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    #print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return test_loss

In [90]:
def validate(dataloader, model):
    """Compute the Amex metric of ``model`` over every batch of ``dataloader``.

    Predictions and targets from all batches are collected and scored once.
    Previously the function returned from inside the loop, so only the first
    batch was ever scored; for a single-batch dataloader the result is
    unchanged.

    The model runs in evaluation mode without gradient tracking, and its
    previous training/eval mode is restored afterwards.

    Args:
        dataloader: Iterable of ``(X, y)`` batches.
        model: ``nn.Module`` producing one prediction per sample.

    Returns:
        The value of ``amex_metric`` for the collected predictions, or
        ``None`` when the dataloader yields no batches.
    """
    predictions, targets = [], []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for X, y in dataloader:
                X = X.to(device)
                y = y.to(device)
                pred = model(X)
                predictions.append(pred.detach().cpu().reshape(-1))
                targets.append(y.detach().cpu().reshape(-1))
    finally:
        model.train(was_training)

    if not predictions:
        return None

    # amex_metric only reads the "target" and "prediction" columns, so the
    # random uuid columns that used to be attached are not needed.
    pred_df = pd.DataFrame({"prediction": torch.cat(predictions).numpy()})
    target_df = pd.DataFrame({"target": torch.cat(targets).numpy()})

    return amex_metric(target_df, pred_df)

            #print(f"Amex Score (batch {batch:>1d}): {amex_metric_value:>8f} \n")

In [57]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Train ``model`` for one pass over ``dataloader``.

    The model is put into training mode first, so a preceding evaluation
    that left it in eval mode cannot silently disable BatchNorm/Dropout
    training behaviour.

    Args:
        dataloader: Iterable of ``(X, y)`` batches; each batch is moved to
            the module-level ``device``.
        model: ``nn.Module`` to optimise.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: Optimizer over ``model``'s parameters.

    Returns:
        The loss of the last batch as a Python float.

    Raises:
        ValueError: If the dataloader yields no batches (previously this
            surfaced as an unbound-variable error on ``loss``).
    """
    model.train()

    loss = None
    for X, y in dataloader:
        X = X.to(device)
        y = y.to(device)

        # Forward pass and loss
        loss = loss_fn(model(X), y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if loss is None:
        raise ValueError("train_loop received a dataloader that yielded no batches")
    return loss.item()

In [None]:
# Registry of models keyed by epoch index; the training loop stores into it.
models = {}

In [77]:
# Settings shared by the data-loading and training cells.
epochs = 1
learning_rate = 0.01
# Only referenced by a commented-out MultiplicativeLR scheduler in the training cell.
lr_lambda = 0.5
batch_size = 16
# Passed as `shuffle=` to the DataLoaders.
shuffle=True
# Fraction of the full dataset assigned to the training split; the remainder becomes the test split.
train_test_ration = 0.9

In [134]:
# Load at most the first 50000 rows of the transformed CSV (no transformation is applied).
full_dataset = AmericanExpressProfileTimeSeriesDataset("transformed_dataset.csv", nrows=50000)#, transformation=lambda data: data.T)

# Random train/test split sized by `train_test_ration`; the test split gets whatever is left.
train_size = int(train_test_ration * len(full_dataset))
test_size = len(full_dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_size])

# Both loaders use the same batch size and shuffle setting.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=shuffle)
#validate_dataloader = DataLoader(test_dataset, batch_size=math.floor(test_size/2), shuffle=shuffle)

In [None]:
# Binary cross-entropy on the models' sigmoid outputs, optimised with Adam.
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)
#optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
#scheduler = torch.optim.lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lambda epoch: lr_lambda)
#scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=0.0001, max_lr=0.01, cycle_momentum=False)

# The loop below scores the Amex metric on `validate_dataloader`, which was
# only ever defined in a commented-out line. Define it here over the test
# split in two large batches (the max() guards against a batch size of 0).
validate_dataloader = DataLoader(test_dataset, batch_size=max(1, test_size // 2), shuffle=shuffle)

for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loss = train_loop(train_dataloader, model, loss_fn, optimizer)
    # Store an independent snapshot: assigning `model` itself would make every
    # entry of `models` point at the same, still-training object.
    models[t] = copy.deepcopy(model)
    test_loss = test_loop(test_dataloader, model, loss_fn)
    amex_score = validate(validate_dataloader, model)
    print(f"train loss (last batch): {train_loss:>8f} | test loss: {test_loss:>8f} | amex: {amex_score}")
    # No scheduler is defined while both scheduler lines above stay commented
    # out, so there is nothing to step; re-enable this together with one of them.
    #scheduler.step()
print("Done!")

In [None]:
#torch.save(models[11], "./model")
#model = torch.load("./model")
#model.eval()

In [150]:
def objective(trial):
    """Optuna objective: train a ``ResNet`` for one epoch and return its test loss.

    Samples the optimizer (``Adam``, ``SGD`` or ``AdamW``) and a log-uniform
    learning rate in ``[1e-5, 1e-1]`` from ``trial``; the network's own
    hyper-parameters are sampled inside ``ResNet``. Uses the module-level
    ``train_dataloader``, ``test_dataloader``, ``loss_fn`` and ``device``.

    Args:
        trial: Optuna trial supplying the hyper-parameters.

    Returns:
        The loss reported by ``test_loop`` after one training epoch.

    Raises:
        optuna.exceptions.TrialPruned: If the trial's pruner asks to stop.
    """
    model = ResNet(trial).to(device)
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "SGD", "AdamW"])
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)

    # Only a single epoch is trained for the time being.
    train_loop(train_dataloader, model, loss_fn, optimizer)
    loss = test_loop(test_dataloader, model, loss_fn)

    # Report against an explicit step. The previous code passed a global `t`
    # left over from the manual training loop, which is undefined unless that
    # cell happened to run first.
    trial.report(loss, step=0)

    if trial.should_prune():
        raise optuna.exceptions.TrialPruned()

    return loss

In [155]:
# Run 100 trials of `objective`, minimising the value it returns.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

# Summarise the best trial: its objective value and the sampled hyper-parameters.
print("Best trial:")
trial = study.best_trial
print(" Value: ", trial.value)
print(" Params: ")
for key, value in trial.params.items():
    print("     {}: {}".format(key, value))

[32m[I 2022-09-08 15:45:36,309][0m A new study created in memory with name: no-name-43ef482f-790d-482e-82a4-dd6eb7d19584[0m


In [None]:
Best trial:
 Value:  0.06196432560682297
 Params: 
     n_layers_blocks: 4
     optimizer: Adam
     lr: 0.0009447607858409636

In [None]:
class AmericanExpressProfileTimeSeriesValidationDataset(Dataset):
    """Unlabelled counterpart of the training dataset: yields feature tensors only.

    Every row must contain a ``customer_ID`` column. Once it is dropped, the
    first remaining column is discarded and each other cell is parsed from
    text such as ``"[0.1, 0.2]"`` into a list of floats.

    Args:
        dataset_file: Path (or buffer) passed straight to ``pandas.read_csv``.
        transformation: Optional callable applied to the feature tensor after
            it has been moved to ``device``. Any falsy value (the default
            ``False``) disables it.
        nrows: Maximum number of rows to read. Defaults to 10, the value that
            used to be hard-coded; pass ``None`` to read the whole file.

    NOTE(review): ``__getitem__`` returns only the tensor, while
    ``validation_loop`` in this file unpacks ``(X, customer_ID)`` from each
    batch -- confirm which of the two is the intended contract.
    """

    def __init__(self, dataset_file, transformation=False, nrows=10):
        self.dataset = pd.read_csv(dataset_file, nrows=nrows)
        self.transformation = transformation

    def __len__(self):
        """Number of rows that were read from the CSV."""
        return len(self.dataset)

    @staticmethod
    def _parse_float_list(cell):
        """Convert a string like ``"[1.0, 2.0]"`` into ``[1.0, 2.0]``."""
        return [float(token) for token in cell[1:-1].split(", ")]

    def __getitem__(self, idx):
        """Return the float32 feature tensor of row ``idx`` on ``device``."""
        row = self.dataset.iloc[[idx]]
        cells = row.drop(['customer_ID'], axis=1).values[0].tolist()

        # The first remaining column is skipped, as before (presumably a
        # row-index column rather than a feature sequence -- TODO confirm).
        features = [self._parse_float_list(cell) for cell in cells[1:]]

        # Inputs are plain data: no gradient tracking is requested for them.
        data = torch.tensor(np.array(features, dtype=np.float32), dtype=torch.float32)
        data = data.to(device)
        if self.transformation:
            data = self.transformation(data)
        return data

In [None]:
# Unlabelled dataset for prediction; every sample tensor is transposed via `.T`.
full_validation_dataset = AmericanExpressProfileTimeSeriesValidationDataset("transformed_test_dataset_normalized", transformation=lambda data: data.T)

# Batched in dataset order (no shuffle argument is passed).
validation_dataloader = DataLoader(full_validation_dataset, batch_size=batch_size)

In [None]:
# Sanity check: shape of the first sample after the transpose transformation.
full_validation_dataset[0].shape

In [None]:
def validation_loop(dataloader, model, out_file):
    """Write ``model``'s predictions to ``{out_file}.csv``.

    The file has a ``customer_ID,prediction`` header followed by one line
    per sample. Predictions are flattened first, so a model output of shape
    ``(batch, 1)`` is written as a plain number rather than a one-element
    list. The model runs in evaluation mode without gradient tracking, and
    its previous training/eval mode is restored afterwards.

    Args:
        dataloader: Iterable of ``(X, customer_ID)`` batches, where
            ``customer_ID`` is a sequence with one identifier per sample.
        model: ``nn.Module`` producing one prediction per sample.
        out_file: Output path without the ``.csv`` extension.

    Raises:
        ValueError: If a batch yields a different number of predictions
            than customer IDs.

    NOTE(review): the validation dataset defined in this file yields only
    tensors, not ``(X, customer_ID)`` pairs -- confirm the loader passed here
    provides the IDs.
    """
    was_training = model.training
    model.eval()
    try:
        # `with` guarantees the file is closed even if a batch fails midway.
        with torch.no_grad(), open(f"{out_file}.csv", "w") as data_file:
            data_file.write("customer_ID,prediction\n")
            for X, customer_ID in dataloader:
                scores = model(X).reshape(-1).tolist()
                if len(scores) != len(customer_ID):
                    raise ValueError(
                        "got {} predictions for {} customer IDs".format(len(scores), len(customer_ID))
                    )
                for cid, score in zip(customer_ID, scores):
                    data_file.write(f"{cid},{score}\n")
    finally:
        model.train(was_training)

In [None]:
# Write predictions for the unlabelled data to ./test_data/test_labels.csv (the function appends ".csv").
validation_loop(validation_dataloader, model, "./test_data/test_labels")