In [1]:
import copy
import math
import time
import uuid

import matplotlib
import numpy as np
import optuna
import pandas as pd
import plotly
import torch
import torch.optim as optim
import torchmetrics
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.utils.tensorboard import SummaryWriter
# Use the first CUDA GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

### Dataset class for the AmEx data

In [2]:
class AmericanExpressPreprocessedProfileTimeSeriesDataset(Dataset):
    """Map-style dataset over a preprocessed ``.npz`` archive.

    The archive is read for ``train_floats`` / ``train_cat`` / ``train_y``, or
    for ``test_floats`` / ``test_cat`` when ``test`` is true. Each 2-D array is
    reshaped to ``(rows, -1, 13)`` and the numeric and categorical parts are
    concatenated along axis 1, so one sample has shape ``(features, 13)``.

    Args:
        dataset_file: Path (or file object) handed to ``np.load``.
        test: Read the ``test_*`` arrays; labels are then all zeros.
        nrows: If truthy, keep only the first ``nrows`` rows.
        transformation: Optional callable applied to each sample tensor after
            it has been moved to the module-level ``device``.
    """

    def __init__(self, dataset_file, test=False, nrows=False, transformation=False):
        prefix = "test" if test else "train"
        # np.load returns a lazily-read archive that holds a file handle;
        # pull the arrays out inside `with` so the handle is released.
        with np.load(dataset_file) as data:
            floats = data[f"{prefix}_floats"]
            cats = data[f"{prefix}_cat"]
            labels = None if test else data["train_y"]

        num_data = floats.reshape(floats.shape[0], -1, 13)
        cat_data = cats.reshape(cats.shape[0], -1, 13)
        # The test branch does not read labels from the archive; zeros act as placeholders.
        self.y = np.zeros(num_data.shape[0]) if labels is None else labels

        if nrows:
            num_data = num_data[0:nrows]
            cat_data = cat_data[0:nrows]
            self.y = self.y[0:nrows]

        self.dataset = np.concatenate([num_data, cat_data], axis=1)
        self.transformation = transformation

    def __len__(self):
        """Number of samples (rows) in the dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(data, label)`` as float32 tensors on ``device``.

        ``label`` has shape ``(1,)``. ``data`` is a plain input tensor: it does
        not require gradients, so autograd does not track the inputs.
        """
        data = torch.tensor(self.dataset[idx], dtype=torch.float32)
        label = torch.tensor([self.y[idx]], dtype=torch.float32)
        label = label.to(device)
        data = data.to(device)
        if self.transformation:
            data = self.transformation(data)
        return data, label

### CNN for time series classification

In [None]:
class ConvNet(nn.Module):
    """1-D CNN whose architecture is sampled from an Optuna ``trial``.

    All hyperparameters (activations, number and width of the convolution and
    linear layers, kernel sizes, pooling type) are drawn from ``trial`` once,
    in ``__init__``. ``forward`` never touches the trial, so it does not hit
    the study storage on every batch and the network stays usable after the
    trial has finished.

    Input is expected as ``(batch, 178, 13)``: 178 input channels for the first
    convolution, and a sequence length of 13 so that pooling with kernel 13
    leaves one value per channel for the first linear layer.

    Args:
        trial: Object exposing ``suggest_categorical`` and ``suggest_int``
            (an ``optuna`` trial).
    """

    def __init__(self, trial):
        super(ConvNet, self).__init__()

        self.trial = trial
        activation_names = ["LeakyReLU", "ReLU", "ELU"]

        conv_activation = trial.suggest_categorical("activation_conv", activation_names)
        self.conv_activ = getattr(nn, conv_activation)()

        lin_activation = trial.suggest_categorical("activation_lin", activation_names)
        self.lin_activ = getattr(nn, lin_activation)()

        n_layers_blocks = trial.suggest_int("n_layers_conv", 1, 30)
        self.blocks = nn.Sequential()
        conv_in = 178
        for i in range(n_layers_blocks):
            # `step` is passed by keyword: the positional form is deprecated in Optuna.
            conv_out = trial.suggest_int(f"conv_size_layer{i}", 100, 300, step=10)
            kernel_size = trial.suggest_int(f"kernel_size_layer{i}", 1, 13)
            # padding="same" keeps the sequence length unchanged through every block.
            self.blocks.append(nn.Conv1d(conv_in, conv_out, kernel_size=kernel_size, stride=1,
                                         padding="same", padding_mode="replicate"))
            # The activation holds no parameters, so one instance is shared by all blocks.
            self.blocks.append(self.conv_activ)
            conv_in = conv_out

        self.linear = nn.Sequential()
        linear_in = conv_in
        n_layers_linear = trial.suggest_int("n_layers_linear", 1, 10)
        for k in range(n_layers_linear):
            linear_out = trial.suggest_int(f"lin_size_layer{k}", 100, 300, step=10)
            self.linear.append(nn.Linear(linear_in, linear_out))
            self.linear.append(self.lin_activ)
            linear_in = linear_out
        self.linear.append(nn.Linear(linear_in, 1))
        self.linear.append(nn.Sigmoid())

        # Sampled here rather than in forward(); pooling layers have no
        # parameters, so the state_dict is unaffected.
        pool = trial.suggest_categorical("pool", ["AvgPool1d", "MaxPool1d"])
        self.pool = getattr(nn, pool)(13, stride=1)

    def forward(self, x):
        """Return sigmoid outputs of shape ``(batch, 1)``."""
        x = self.blocks(x)
        x = self.pool(x)
        x = x.view(x.shape[0], -1)
        return self.linear(x)

In [None]:
def test_loop(dataloader, model, loss_fn, epoch):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            pred = torch.round(pred)
            correct += (pred == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    return test_loss

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer, epoch):
    """Run one optimisation pass over ``dataloader``.

    Each batch is moved to the module-level ``device``, followed by a forward
    pass, a backward pass and an optimizer step. The model is put in training
    mode first, so the result does not depend on the mode the caller left it in.

    Args:
        dataloader: Iterable of ``(X, y)`` batches.
        model: Module being trained.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: Optimizer over ``model``'s parameters.
        epoch: Unused; kept so existing call sites keep working.

    Returns:
        Loss of the last batch as a float, or ``nan`` when ``dataloader``
        yields no batches.
    """
    model.train()
    loss = None
    for X, y in dataloader:
        X = X.to(device)
        y = y.to(device)
        loss = loss_fn(model(X), y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # An empty loader leaves nothing to report; avoid referencing an unbound loss.
    return loss.item() if loss is not None else float("nan")

In [None]:
# Fraction of rows used for training; the remainder is held out.
train_test_ration = 0.7
# Load only the first 10,000 rows of the training arrays.
full_dataset = AmericanExpressPreprocessedProfileTimeSeriesDataset("./amex_preprocessed.npz", nrows=10000)

train_size = int(train_test_ration * len(full_dataset))
test_size = len(full_dataset) - train_size
# A seeded generator gives the same split on every Restart & Run All.
train_dataset, test_dataset = torch.utils.data.random_split(
    full_dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42),
)

### Hyperparameter optimization using optuna

In [None]:
def objective(trial):
    """Optuna objective: train a sampled ``ConvNet`` and return its held-out loss.

    Learning rate, optimizer class, multiplicative LR decay factor, batch size
    and epoch count are drawn from ``trial``. After every epoch the held-out
    loss is reported to the trial and the pruner is consulted, so an
    unpromising trial is stopped straight away instead of after its last epoch.

    Args:
        trial: The Optuna trial providing the hyperparameter suggestions.

    Returns:
        Held-out loss after the final epoch.

    Raises:
        optuna.exceptions.TrialPruned: If the pruner stops the trial.
    """
    model = ConvNet(trial).to(device)
    loss_fn = nn.BCELoss()
    lr = trial.suggest_float("lr", 1e-7, 0.1, log=True)
    optim_ = trial.suggest_categorical("optimizer", ["AdamW", "Adam", "SGD", "Adagrad", "NAdam"])
    optimizer = getattr(optim, optim_)(model.parameters(), lr=lr)
    lr_lambda = trial.suggest_float("lr_lambda", 0.4, 0.99)
    # `step` is passed by keyword: the positional form is deprecated in Optuna.
    batch_size = trial.suggest_int("batch_size", 16, 128, step=16)
    # Reshuffle training batches every epoch; evaluation order stays fixed.
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    # The learning rate is multiplied by `lr_lambda` at every scheduler.step().
    scheduler = torch.optim.lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lambda epoch: lr_lambda)
    epochs = trial.suggest_int("epochs", 1, 3)

    for t in range(epochs):
        train_loop(train_dataloader, model, loss_fn, optimizer, t)
        loss = test_loop(test_dataloader, model, loss_fn, t)
        trial.report(loss, t)
        # Must run inside the loop; after the loop there is nothing left to save.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
        scheduler.step()

    return loss

In [None]:
# The study is stored in a local SQLite file; with load_if_exists=True an
# existing study of the same name is loaded instead of raising an error.
study = optuna.create_study(
    study_name="optuna-study-convolution",
    direction="minimize",
    storage='sqlite:///optuna-study-conv.db',
    load_if_exists=True,
)

In [None]:
# Run 500 trials of `objective` on the study created above.
study.optimize(objective, n_trials=500)

In [None]:
# Summarise the best trial: its objective value, then one line per hyperparameter.
print("Best trial:")
trial = study.best_trial
print(" Value: ", trial.value)
print(" Params: ")
for name, setting in trial.params.items():
    print(f"     {name}: {setting}")

### CNN with hyperparameters computed by optuna

In [3]:
class ConvNetOptimized(nn.Module):
    """Fixed-architecture 1-D CNN ending in a sigmoid.

    Four ``Conv1d`` + ``ReLU`` stages (the second and fourth followed by
    ``BatchNorm1d``) feed a max-pool with kernel 13 and an eight-layer
    fully-connected head that produces one value per sample.
    """

    def __init__(self):
        super().__init__()

        # (in_channels, out_channels, kernel_size, batch-norm after the ReLU?)
        conv_plan = [
            (178, 300, 9, False),
            (300, 200, 12, True),
            (200, 180, 5, False),
            (180, 140, 6, True),
        ]
        conv_stack = []
        for c_in, c_out, width, normalise in conv_plan:
            conv_stack.append(
                nn.Conv1d(c_in, c_out, kernel_size=width, stride=1,
                          padding="same", padding_mode="replicate")
            )
            conv_stack.append(nn.ReLU())
            if normalise:
                conv_stack.append(nn.BatchNorm1d(c_out))
        self.blocks = nn.Sequential(*conv_stack)

        # Consecutive entries are the (in, out) sizes of the hidden linear layers.
        hidden_sizes = [140, 200, 220, 260, 280, 100, 130, 230]
        dense_stack = []
        for n_in, n_out in zip(hidden_sizes[:-1], hidden_sizes[1:]):
            dense_stack.extend([nn.Linear(n_in, n_out), nn.ReLU()])
        dense_stack.extend([nn.Linear(hidden_sizes[-1], 1), nn.Sigmoid()])
        self.linear = nn.Sequential(*dense_stack)

    def forward(self, x):
        """Map ``x`` to sigmoid outputs of shape ``(batch, 1)``."""
        features = self.blocks(x)
        pooled = nn.MaxPool1d(13)(features)
        flat = pooled.view(pooled.shape[0], -1)
        return self.linear(flat)

In [4]:
# Instantiate the fixed-architecture network and move it to `device`.
model = ConvNetOptimized().to(device)

In [5]:
# Scalar training settings (presumably the values found by the Optuna search - confirm).
lr = 0.0013045542577553354
lr_lambda = 0.8976748771328298
batch_size = 112
epochs = 15

# Binary cross-entropy on the network's sigmoid output.
loss_fn = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
# The learning rate is multiplied by `lr_lambda` at every scheduler.step().
scheduler = torch.optim.lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lambda epoch: lr_lambda)

In [6]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Average of the normalized weighted Gini and the top-4% capture rate.

    Rows with ``target == 0`` are weighted 20 and all other rows 1.

    Args:
        y_true: Frame with a ``target`` column.
        y_pred: Frame with a ``prediction`` column, row-aligned with ``y_true``.

    Returns:
        ``0.5 * (normalized_gini + top_four_percent_capture)``.
    """

    def _ranked(truth: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Join targets and scores, rank by descending score, attach row weights.
        ranked = pd.concat([truth, scores], axis='columns').sort_values('prediction', ascending=False)
        ranked['weight'] = ranked['target'].eq(0).map({True: 20, False: 1})
        return ranked

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        ranked = _ranked(y_true, y_pred)
        four_pct_cutoff = int(0.04 * ranked['weight'].sum())
        # Rows whose cumulative weight still fits inside the top 4% of total weight.
        within_cutoff = ranked['weight'].cumsum() <= four_pct_cutoff
        is_positive = ranked['target'] == 1
        return (is_positive & within_cutoff).sum() / is_positive.sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        ranked = _ranked(y_true, y_pred)
        weight = ranked['weight']
        random_share = (weight / weight.sum()).cumsum()
        weighted_pos = ranked['target'] * weight
        lorentz = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz - random_share) * weight).sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Divide by the Gini of a ranking that uses the targets themselves as scores.
        ideal_scores = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, ideal_scores)

    gini_part = normalized_weighted_gini(y_true, y_pred)
    capture_part = top_four_percent_captured(y_true, y_pred)
    return 0.5 * (gini_part + capture_part)

def get_amex(pred, y):
    """Compute ``amex_metric`` from raw prediction and target values.

    Both inputs are wrapped in single-column frames (``prediction`` and
    ``target``), and the same freshly generated ``uuid`` column is prepended
    to each before the metric is evaluated.

    Args:
        pred: Data accepted by ``pd.DataFrame(..., columns=["prediction"])``.
        y: Data accepted by ``pd.DataFrame(..., columns=["target"])``.

    Returns:
        The value returned by ``amex_metric``.
    """
    predictions = pd.DataFrame(pred, columns=["prediction"])
    targets = pd.DataFrame(y, columns=["target"])
    # One random identifier per prediction row, shared by both frames.
    row_ids = pd.DataFrame({"uuid": [uuid.uuid4() for _ in predictions.index]})
    return amex_metric(
        pd.concat([row_ids, targets], axis=1),
        pd.concat([row_ids, predictions], axis=1),
    )

In [7]:
# Global batch counter: `train_loop` increments it once per training batch and
# uses it as the step value for its TensorBoard scalars.
train_loop_num = 0

In [8]:
# Fraction of rows used for training; the remainder is held out.
train_test_ration = 0.7
full_dataset = AmericanExpressPreprocessedProfileTimeSeriesDataset("./amex_preprocessed.npz")

train_size = int(train_test_ration * len(full_dataset))
test_size = len(full_dataset) - train_size
# A seeded generator gives the same split on every Restart & Run All.
train_dataset, test_dataset = torch.utils.data.random_split(
    full_dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42),
)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Evaluation needs no shuffling; a fixed order keeps the reported loss repeatable.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [9]:
def test_loop(dataloader, model, loss_fn, epoch):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            pred = torch.round(pred)
            correct += (pred == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    return test_loss

In [10]:
def train_loop(dataloader, model, loss_fn, optimizer, writer, test_dataset):
    """Train for one pass over ``dataloader`` with TensorBoard logging.

    Every batch increments the global ``train_loop_num`` and logs the batch
    loss under ``BatchLoss/train``. On every 100th batch index (except 0) the
    held-out loss is computed with ``test_loop`` and logged under ``Loss/test``.

    Args:
        dataloader: Iterable of ``(X, y)`` training batches.
        model: Module being trained.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: Optimizer over ``model``'s parameters.
        writer: Object with ``add_scalar(tag, value, step)``.
        test_dataset: Only forwarded as the fourth argument of ``test_loop``.
            NOTE(review): evaluation iterates the module-level
            ``test_dataloader``, not this argument - confirm that is intended.

    Returns:
        Loss of the last batch as a float, or ``nan`` when ``dataloader``
        yields no batches.
    """
    global train_loop_num
    model.train()
    loss = None
    for batch, (X, y) in enumerate(dataloader):
        X = X.to(device)
        y = y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loop_num += 1
        # Log a plain float rather than a tensor that is still attached to the graph.
        writer.add_scalar('BatchLoss/train', loss.item(), train_loop_num)
        if batch % 100 == 0 and batch != 0:
            test_loss = test_loop(test_dataloader, model, loss_fn, test_dataset)
            # Make sure training continues in train mode whatever the evaluation did.
            model.train()
            writer.add_scalar('Loss/test', test_loss, train_loop_num)
    return loss.item() if loss is not None else float("nan")

In [11]:
# Registry filled by the training loop: epoch index -> model recorded for that epoch.
models = {}

In [12]:
for t in range(epochs):
    # One TensorBoard run directory per epoch.
    writer = SummaryWriter(log_dir="runs_cnn/"+str(t))
    try:
        train_loop(train_dataloader, model, loss_fn, optimizer, writer, test_dataset)
    finally:
        # Flush pending events and release the event file even if training is interrupted.
        writer.close()
    # Store an independent copy: keeping `model` itself would make every entry
    # refer to the same object, i.e. to the weights of the last epoch run.
    models[t] = copy.deepcopy(model)
    scheduler.step()

2022-09-25 12:21:46.213930: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-25 12:21:46.394939: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-25 12:21:46.394971: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-09-25 12:21:46.428346: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-09-25 12:21:47.224533: W tensorflow/stream_executor/platform/de

KeyboardInterrupt: 

In [13]:
# Persist the entry stored under epoch index 9 (the tenth epoch) to the file
# "model_cnn". Raises KeyError if the training loop stored fewer than ten entries.
torch.save(models[9], "model_cnn")

In [14]:
# Model used for the test-set predictions below: the entry stored under epoch index 9.
model_ = models[9]

In [None]:
# Rebind `full_dataset` / `test_dataloader` to the archive's test arrays
# (labels are zero placeholders); order is preserved so predictions stay row-aligned.
full_dataset = AmericanExpressPreprocessedProfileTimeSeriesDataset(
    "./amex_preprocessed.npz",
    test=True,
)
test_dataloader = DataLoader(
    full_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [None]:
# Inference must run in eval mode so that BatchNorm layers use their running
# statistics instead of the statistics of each prediction batch.
model_.eval()

batch_preds = []
with torch.no_grad():
    for X, _labels in test_dataloader:
        batch_preds.append(model_(X))

# Concatenate once at the end instead of re-allocating `res` on every batch.
res = torch.cat(batch_preds) if batch_preds else torch.Tensor([]).to(device)

In [None]:
# Copy the predictions to host memory, wrap them in a DataFrame and export
# them without the index column.
res = pd.DataFrame(res.cpu().numpy())
res.to_csv('pred_cnn.csv', index=False)

In [None]:
# Write `res` to pred.csv as well, again without the index column.
res.to_csv('pred.csv', index=False)

In [None]:
# Show the shape of `res` as the cell output.
res.shape

In [None]:
# Read the "test_y" array from the archive; the `with` block closes the
# archive's file handle once the array has been loaded.
with np.load("./amex_preprocessed.npz") as data:
    y = data["test_y"]

In [None]:
# Wrap the label array in a DataFrame.
y = pd.DataFrame(y)

In [None]:
# Show the shape of `y` as the cell output.
y.shape

In [None]:
# Sum of absolute differences between predictions and labels.
# `res` is a DataFrame once the export cell has run (DataFrames have no .cpu()),
# but may still be a tensor if that cell was skipped; handle both.
pred_values = res.cpu().numpy() if torch.is_tensor(res) else np.asarray(res)
# Flatten both sides so an (N, 1) column and an (N,) vector compare element-wise.
# assumes one prediction per label, in the same order - TODO confirm
np.abs(pred_values.reshape(-1) - np.asarray(y).reshape(-1)).sum()