In [1]:
import copy
import math
import time
import uuid

import matplotlib
import numpy as np
import optuna
import pandas as pd
import plotly
import torch
import torch.optim as optim
import torchmetrics
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.utils.tensorboard import SummaryWriter
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

### Dataset class for the AmEx dataset

In [17]:
class AmericanExpressPreprocessedProfileTimeSeriesDataset(Dataset):
    """Dataset over the preprocessed AmEx arrays stored in an ``.npz`` archive.

    The archive must provide ``train_floats``, ``train_cat`` and ``train_y``
    (or ``test_floats`` and ``test_cat`` when ``test=True``).  Each feature
    array is reshaped to ``(n_samples, -1, 13)`` and the numeric and
    categorical blocks are concatenated along axis 1.

    Args:
        dataset_file: Path (or file object) accepted by ``np.load``.
        test: When true, load the ``test_*`` arrays and use all-zero
            placeholder labels instead of real targets.
        nrows: If truthy, keep only the first ``nrows`` samples.
        transformation: Optional callable applied to every sample tensor
            after it has been moved to the module-level ``device``.
    """

    def __init__(self, dataset_file, test=False, nrows=False, transformation=False):
        prefix = "test" if test else "train"
        # The context manager closes the archive's file handle once the
        # arrays are materialised; each array is read from the zip only once.
        with np.load(dataset_file) as data:
            floats = data[f"{prefix}_floats"]
            cats = data[f"{prefix}_cat"]
            if test:
                # No labels are read for the test split; zeros act as placeholders.
                self.y = np.zeros(floats.shape[0])
            else:
                self.y = data["train_y"]

        num_data = floats.reshape(floats.shape[0], -1, 13)
        cat_data = cats.reshape(cats.shape[0], -1, 13)

        if nrows:
            num_data = num_data[0:nrows]
            cat_data = cat_data[0:nrows]
            self.y = self.y[0:nrows]

        self.dataset = np.concatenate([num_data, cat_data], axis=1)
        self.transformation = transformation

    def __len__(self):
        """Return the number of samples."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(data, label)`` for sample ``idx`` as float32 tensors on ``device``.

        ``label`` has shape ``(1,)``.  The input tensor does not require
        gradients: only model parameters are optimised, so tracking
        gradients for the inputs would just cost memory and compute.
        """
        data = torch.tensor(self.dataset[idx], dtype=torch.float32)
        label = torch.tensor([self.y[idx]], dtype=torch.float32)
        label = label.to(device)
        data = data.to(device)
        if self.transformation:
            data = self.transformation(data)
        return data, label

### FCN for time series classification

In [22]:
class FCN(nn.Module):
    """Fully connected binary classifier whose architecture is sampled from an Optuna trial.

    The trial chooses the activation (``activation_lin``), the number of
    hidden layers (``n_layers_linear``, 1-10), the width of every hidden
    layer (``linear_size_layer{k}``, 100-300 in steps of 10) and whether a
    ``BatchNorm1d`` follows it (``linear_batchnorm_layer{k}``).  The stack
    ends with a single-unit linear layer and a sigmoid.

    Args:
        trial: Object exposing Optuna's ``suggest_categorical`` and
            ``suggest_int`` methods.
        in_features: Size of the flattened input vector.  Defaults to 2314,
            the value that was previously hard-coded.
    """

    def __init__(self, trial, in_features=2314):
        super().__init__()

        self.trial = trial

        lin_activation = trial.suggest_categorical("activation_lin", ["LeakyReLU", "ReLU", "ELU"])
        # One activation instance is shared by all hidden layers; these
        # activations hold no parameters, so sharing is safe.
        self.lin_activ = getattr(nn, lin_activation)()

        layers = []
        linear_in = in_features
        n_layers_linear = trial.suggest_int("n_layers_linear", 1, 10)
        for k in range(n_layers_linear):
            # `step` is passed by keyword so the call also works where the
            # trial API declares it keyword-only.
            linear_out = trial.suggest_int(f"linear_size_layer{k}", 100, 300, step=10)
            layers.append(nn.Linear(linear_in, linear_out))
            layers.append(self.lin_activ)
            batchnorm = trial.suggest_categorical(f"linear_batchnorm_layer{k}", [True, False])
            if batchnorm:
                layers.append(nn.BatchNorm1d(linear_out))
            linear_in = linear_out

        layers.append(nn.Linear(linear_in, 1))
        layers.append(nn.Sigmoid())
        self.linear = nn.Sequential(*layers)

    def forward(self, x):
        """Flatten every sample to one vector and return outputs of shape ``(batch, 1)``."""
        x = x.view(x.shape[0], -1)
        return self.linear(x)

In [14]:
def test_loop(dataloader, model, loss_fn, epoch):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            pred = torch.round(pred)
            correct += (pred == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    return test_loss

In [15]:
def train_loop(dataloader, model, loss_fn, optimizer, epoch):
    """Run one optimisation pass over ``dataloader``.

    Args:
        dataloader: Iterable of ``(X, y)`` batches.
        model: Module to train; it is put into training mode first.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: Optimizer holding ``model``'s parameters.
        epoch: Unused; kept for call-site compatibility.

    Returns:
        float: Loss of the last processed batch, or ``nan`` when the
        dataloader yielded no batches.
    """
    # Make sure BatchNorm/Dropout layers are in training mode even if an
    # earlier evaluation left the model in eval mode.
    model.train()

    loss = None
    for X, y in dataloader:
        X = X.to(device)
        y = y.to(device)
        loss = loss_fn(model(X), y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return float("nan") if loss is None else loss.item()

In [18]:
# Fraction of the samples used for training; the remainder is held out.
train_test_ration = 0.7
# Only the first 10000 rows are loaded to keep each trial of the search cheap.
full_dataset = AmericanExpressPreprocessedProfileTimeSeriesDataset("./amex_preprocessed.npz", nrows=10000)

train_size = int(train_test_ration * len(full_dataset))
test_size = len(full_dataset) - train_size
# A seeded generator makes the split identical after a kernel restart, so
# trials added to the persistent study later are scored on the same held-out set.
train_dataset, test_dataset = torch.utils.data.random_split(
    full_dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42),
)

### Hyperparameter optimization using optuna

In [23]:
def objective(trial):
    """Optuna objective: train a sampled ``FCN`` and return its held-out loss.

    Besides the architecture (sampled inside ``FCN``), the trial chooses the
    learning rate, optimizer class, multiplicative LR decay factor, batch
    size and number of epochs.  After every epoch the held-out loss is
    reported to the trial and the trial is pruned as soon as the pruner asks
    for it.

    Args:
        trial: The ``optuna`` trial supplying hyperparameters.

    Returns:
        float: Mean per-batch BCE loss on ``test_dataset`` after the last epoch.

    Raises:
        optuna.exceptions.TrialPruned: When the pruner stops the trial early.
    """
    model = FCN(trial).to(device)
    loss_fn = nn.BCELoss()
    lr = trial.suggest_float("lr", 1e-7, 0.1, log=True)
    optim_ = trial.suggest_categorical("optimizer", ["AdamW", "Adam", "SGD", "Adagrad", "NAdam"])
    optimizer = getattr(optim, optim_)(model.parameters(), lr=lr)
    lr_lambda = trial.suggest_float("lr_lambda", 0.4, 0.99)
    # `step` is passed by keyword so the call also works where it is keyword-only.
    batch_size = trial.suggest_int("batch_size", 16, 128, step=16)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    # The learning rate is multiplied by `lr_lambda` after every epoch.
    scheduler = torch.optim.lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lambda epoch: lr_lambda)
    epochs = trial.suggest_int("epochs", 1, 3)

    for t in range(epochs):
        train_loop(train_dataloader, model, loss_fn, optimizer, t)
        loss = test_loop(test_dataloader, model, loss_fn, t)
        trial.report(loss, t)
        # Check right after reporting: asking only once all epochs are done
        # would never save any training time.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
        scheduler.step()

    return loss

In [24]:
# Create the study, or re-open it when it already exists in the SQLite file;
# lower objective values are better.
study = optuna.create_study(
    study_name="optuna-study-linear",
    storage='sqlite:///optuna-study-linear.db',
    direction="minimize",
    load_if_exists=True,
)

[32m[I 2022-09-25 12:18:29,078][0m A new study created in RDB with name: optuna-study-linear[0m


In [25]:
# Evaluate up to 1000 trials of `objective` on the study.
study.optimize(objective, n_trials=1000)

[32m[I 2022-09-25 12:18:33,557][0m Trial 0 finished with value: 0.7767959700690376 and parameters: {'activation_lin': 'ELU', 'n_layers_linear': 3, 'linear_size_layer0': 250, 'linear_batchnorm_layer0': False, 'linear_size_layer1': 150, 'linear_batchnorm_layer1': False, 'linear_size_layer2': 170, 'linear_batchnorm_layer2': True, 'lr': 2.664125370807947e-06, 'optimizer': 'SGD', 'lr_lambda': 0.7185788573513253, 'batch_size': 48, 'epochs': 1}. Best is trial 0 with value: 0.7767959700690376.[0m
[32m[I 2022-09-25 12:18:35,633][0m Trial 1 finished with value: 0.33902112974060905 and parameters: {'activation_lin': 'ReLU', 'n_layers_linear': 10, 'linear_size_layer0': 220, 'linear_batchnorm_layer0': True, 'linear_size_layer1': 110, 'linear_batchnorm_layer1': True, 'linear_size_layer2': 200, 'linear_batchnorm_layer2': True, 'linear_size_layer3': 110, 'linear_batchnorm_layer3': True, 'linear_size_layer4': 160, 'linear_batchnorm_layer4': False, 'linear_size_layer5': 250, 'linear_batchnorm_layer

KeyboardInterrupt: 

In [10]:
# Report the best trial of the study: its objective value followed by one
# line per sampled hyperparameter.
print("Best trial:")
trial = study.best_trial
print(" Value: ", trial.value)
print(" Params: ")
for key, value in trial.params.items():
    print(f"     {key}: {value}")

Best trial:
 Value:  0.25591892500718433
 Params: 
     activation_lin: ReLU
     batch_size: 112
     epochs: 2
     linear_batchnorm_layer0: False
     linear_batchnorm_layer1: False
     linear_size_layer0: 1370
     linear_size_layer1: 880
     lr: 0.001051905292972931
     lr_lambda: 0.6837203763543799
     n_layers_linear: 2
     optimizer: Adam


### FCN with hyperparameters computed by Optuna

In [36]:
class FCN_optimized(nn.Module):
    """Fixed-architecture fully connected binary classifier.

    Four ReLU-activated hidden layers (2314 -> 210 -> 130 -> 170 -> 280)
    followed by a single-unit linear layer and a sigmoid.
    """

    def __init__(self):
        super().__init__()
        # Input width followed by the width of every hidden layer.
        widths = (2314, 210, 130, 170, 280)

        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        stack += [nn.Linear(widths[-1], 1), nn.Sigmoid()]

        self.linear = nn.Sequential(*stack)

    def forward(self, x):
        """Flatten each sample and return outputs of shape ``(batch, 1)``."""
        flat = x.view(x.shape[0], -1)
        return self.linear(flat)

In [37]:
# Build the fixed-architecture network, then move its parameters to `device`.
model = FCN_optimized()
model = model.to(device)

In [38]:
# Training configuration for the fixed-architecture model.
epochs = 30
batch_size = 80
lr = 0.00030123940814154353
lr_lambda = 0.6093962250032199

loss_fn = nn.BCELoss()
optimizer = optim.AdamW(params=model.parameters(), lr=lr)
# On every scheduler.step() the current learning rate is multiplied by `lr_lambda`.
scheduler = torch.optim.lr_scheduler.MultiplicativeLR(optimizer, lr_lambda=lambda epoch: lr_lambda)

In [39]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Return the mean of the normalized weighted Gini and the top-4% capture rate.

    Args:
        y_true: Frame with a ``target`` column.
        y_pred: Frame with a ``prediction`` column, index-aligned with ``y_true``.

    Rows whose target equals 0 carry weight 20, all other rows weight 1.
    """

    def _sample_weight(target_value):
        return 20 if target_value == 0 else 1

    def _ranked(truth: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Join truth and scores column-wise, order by descending score, attach weights.
        ranked = pd.concat([truth, scores], axis='columns').sort_values('prediction', ascending=False)
        ranked['weight'] = ranked['target'].apply(_sample_weight)
        return ranked

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Share of all positives found among the highest-scored rows whose
        # cumulative weight stays within 4% of the total weight.
        ranked = _ranked(y_true, y_pred)
        budget = int(0.04 * ranked['weight'].sum())
        within_budget = ranked['weight'].cumsum() <= budget
        positives = ranked['target'] == 1
        return (positives & within_budget).sum() / positives.sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Weighted gap between the cumulative-positives curve and the
        # cumulative-weight curve along the ranking.
        ranked = _ranked(y_true, y_pred)
        weighted_pos = ranked['target'] * ranked['weight']
        random_curve = (ranked['weight'] / ranked['weight'].sum()).cumsum()
        lorentz_curve = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz_curve - random_curve) * ranked['weight']).sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Normalise by the Gini obtained when the targets themselves are the scores.
        perfect_scores = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, perfect_scores)

    gini_part = normalized_weighted_gini(y_true, y_pred)
    capture_part = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (gini_part + capture_part)

def get_amex(pred, y):
    """Compute ``amex_metric`` for raw predictions and labels.

    ``pred`` and ``y`` are wrapped into single-column frames named
    ``prediction`` and ``target``; both receive the same freshly generated
    ``uuid`` column before being handed to ``amex_metric``.
    """
    scores = pd.DataFrame(pred, columns=["prediction"])
    labels = pd.DataFrame(y, columns=["target"])
    # One random identifier per prediction row, shared by both frames.
    row_ids = pd.DataFrame(data={"uuid": [uuid.uuid4() for _ in scores.index]})
    scores = pd.concat([row_ids, scores], axis=1)
    labels = pd.concat([row_ids, labels], axis=1)
    return amex_metric(labels, scores)

In [40]:
# Global step counter: train_loop increments it once per batch and uses it as
# the step value for the TensorBoard scalars.
train_loop_num = 0

In [31]:
# Fraction of the samples used for training; the remainder is held out.
train_test_ration = 0.7
full_dataset = AmericanExpressPreprocessedProfileTimeSeriesDataset("./amex_preprocessed.npz")

train_size = int(train_test_ration * len(full_dataset))
test_size = len(full_dataset) - train_size
# A seeded generator keeps the split reproducible across kernel restarts.
train_dataset, test_dataset = torch.utils.data.random_split(
    full_dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42),
)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Evaluation does not depend on sample order, so the held-out loader is not shuffled.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [41]:
def test_loop(dataloader, model, loss_fn, epoch):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            pred = torch.round(pred)
            correct += (pred == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    return test_loss

In [42]:
def train_loop(dataloader, model, loss_fn, optimizer, writer, test_dataset):
    """Train for one pass over ``dataloader`` while logging to ``writer``.

    Every batch increments the global ``train_loop_num`` and logs the batch
    loss under ``BatchLoss/train``.  On every 100th batch (except the first)
    the model is evaluated with ``test_loop`` on ``test_dataset`` and the
    result is logged under ``Loss/test``.

    Args:
        dataloader: Iterable of ``(X, y)`` training batches.
        model: Module to train; it is put into training mode first.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: Optimizer holding ``model``'s parameters.
        writer: Object exposing ``add_scalar(tag, value, step)``.
        test_dataset: Held-out dataset used for the periodic evaluation.

    Returns:
        float: Loss of the last processed batch, or ``nan`` when the
        dataloader yielded no batches.
    """
    global train_loop_num

    # Evaluate on the dataset that was passed in rather than on whatever a
    # notebook-level `test_dataloader` happens to point to at call time.
    eval_loader = DataLoader(
        test_dataset,
        batch_size=getattr(dataloader, "batch_size", None) or 1,
        shuffle=False,
    )

    model.train()
    loss = None
    for batch, (X, y) in enumerate(dataloader):
        X = X.to(device)
        y = y.to(device)
        loss = loss_fn(model(X), y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loop_num += 1
        writer.add_scalar('BatchLoss/train', loss.item(), train_loop_num)
        if batch % 100 == 0 and batch != 0:
            test_loss = test_loop(eval_loader, model, loss_fn, train_loop_num)
            # Guard against an evaluation helper that leaves the model in eval mode.
            model.train()
            writer.add_scalar('Loss/test', test_loss, train_loop_num)

    return float("nan") if loss is None else loss.item()

In [43]:
# Per-epoch model registry, keyed by epoch index; filled by the training loop.
models = {}

In [44]:
for t in range(epochs):
    # One TensorBoard run directory per epoch.
    writer = SummaryWriter(log_dir="runs_fcn/"+str(t))
    try:
        train_loop(train_dataloader, model, loss_fn, optimizer, writer, test_dataset)
    finally:
        # Flush pending events and release the file handle even if training
        # is interrupted.
        writer.close()
    # Store an independent copy: assigning `model` itself would make every
    # entry refer to the same object, so all "snapshots" would hold the
    # weights of the latest epoch.
    models[t] = copy.deepcopy(model)
    scheduler.step()

KeyboardInterrupt: 

In [45]:
# Persist the entry stored under key 6 of `models`; the whole module object
# is serialized, not only its state_dict.
torch.save(models[6], "model_fcn")

In [46]:
# Model used for the inference cells below: the entry stored under key 6.
model_ = models[6]

In [48]:
# Load the test arrays of the archive (labels are all-zero placeholders) and
# iterate them in file order so predictions keep the original row order.
# NOTE: this rebinds `full_dataset` and `test_dataloader`, which earlier cells
# used for the training data and the held-out loader.
full_dataset = AmericanExpressPreprocessedProfileTimeSeriesDataset("./amex_preprocessed.npz", test=True)
test_dataloader = DataLoader(full_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Predict the whole test loader. Batches are collected in a list and
# concatenated once; concatenating inside the loop would copy all previous
# predictions on every iteration.
model_.eval()  # inference behaviour for layers such as BatchNorm/Dropout
batches = []

with torch.no_grad():
    for X, y in test_dataloader:
        batches.append(model_(X))

# Fall back to an empty tensor when the loader yields nothing.
res = torch.cat(batches) if batches else torch.Tensor([]).to(device)

In [None]:
# Copy the predictions to host memory, wrap them in a DataFrame and write
# them to CSV without the index column.
res = pd.DataFrame(res.cpu().numpy())
res.to_csv('pred_fcn.csv', index=False)

In [None]:
# Re-open the archive to fetch reference labels for the test split.
# NOTE(review): assumes the archive contains a "test_y" array; the dataset
# class has that lookup commented out, so confirm the key exists.
data = np.load("./amex_preprocessed.npz")
y = data["test_y"]

In [None]:
# Wrap the labels in a DataFrame so they can be subtracted from the prediction frame.
y = pd.DataFrame(y)

In [None]:
# Sum of absolute differences between the raw model outputs and the labels.
np.sum(np.abs(res - y))