In [None]:
import math
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import wandb
from torch import nn, optim
from tqdm.auto import tqdm

sys.path.append("..")
from otc.data.dataloader import TabDataLoader
from otc.data.dataset import TabDataset
from otc.features.build_features import features_classical_size
from otc.optim.early_stopping import EarlyStopping

In [None]:
# Google Cloud project id exported to the process environment via the
# `GCLOUD_PROJECT` variable.
# NOTE(review): presumably read by the cloud client used for artifact storage -- confirm.
GCLOUD_PROJECT_ID = "flowing-mantis-239216"
os.environ["GCLOUD_PROJECT"] = GCLOUD_PROJECT_ID

In [None]:
# Fully qualified name (entity/project/artifact:alias) of the dataset artifact.
dataset = "fbv/thesis/ise_supervised_log_standardized_clipped:latest"

# Start a Weights & Biases run; the run object is needed to resolve the artifact.
run = wandb.init(project="thesis", entity="fbv")

# Declare the artifact as an input of this run and download its files;
# `data_dir` is the local directory returned by the download.
artifact = run.use_artifact(dataset)
data_dir = artifact.download()

In [None]:
# Fraction of the training / validation rows to draw. Lower it for quick
# experiments. Note that `DataFrame.sample` returns rows in random order, so
# even `frac = 1` reshuffles the frame.
frac = 1


def _load_split(file_name, sample_frac=None):
    """Read one parquet split and separate features from the label.

    Args:
        file_name: name of the parquet file inside `data_dir`.
        sample_frac: if given, fraction of rows drawn with `DataFrame.sample`;
            if None, the frame is used exactly as read.

    Returns:
        Tuple `(X, y)` with the `features_classical_size` columns and the
        `buy_sell` column of the (optionally sampled) frame.
    """
    frame = pd.read_parquet(Path(data_dir, file_name), engine="fastparquet")
    if sample_frac is not None:
        frame = frame.sample(frac=sample_frac)
    return frame[features_classical_size], frame["buy_sell"]


# training and validation sets are sampled, the test set is used as stored
X_train, y_train = _load_split("train_set.parquet", sample_frac=frac)
X_val, y_val = _load_split("val_set.parquet", sample_frac=frac)
X_test, y_test = _load_split("test_set.parquet")

In [None]:
class LogisticRegression(nn.Module):
    """Logistic regression implemented as a single linear layer.

    The module returns raw logits; no sigmoid is applied in `forward`.

    Args:
        input_size: number of input features, i.e. the combined width of the
            categorical and continuous inputs passed to `forward`.
        num_classes: number of outputs of the linear layer.
    """

    def __init__(self, input_size, num_classes):
        super().__init__()
        self.linear = nn.Linear(input_size, num_classes)

    def forward(self, x_cat, x_cont):
        """Compute logits for a batch.

        Args:
            x_cat: tensor of categorical features, or None (or an empty
                tensor) if there are none.
            x_cont: tensor of continuous features.

        Returns:
            Output of the linear layer applied to `x_cat` and `x_cont`
            concatenated along dimension 1, or to `x_cont` alone when no
            categorical features are given.
        """
        # Test for presence explicitly: the truth value of a tensor with more
        # than one element is ambiguous and raises a RuntimeError.
        if x_cat is not None and x_cat.numel() > 0:
            x = torch.cat((x_cat, x_cont), 1)
        else:
            x = x_cont
        return self.linear(x)

In [None]:
# Wrap every split in a dataset object.
training_data = TabDataset(X_train, y_train)
val_data = TabDataset(X_val, y_val)
test_data = TabDataset(X_test, y_test)

# Settings shared by all three loaders.
dl_params = {
    "batch_size": 32768,
    "device": "cuda",
    "shuffle": True,
}


def _build_loader(data):
    """Create a `TabDataLoader` over the tensors of `data` using `dl_params`."""
    return TabDataLoader(data.x_cat, data.x_cont, data.weight, data.y, **dl_params)


train_loader = _build_loader(training_data)
val_loader = _build_loader(val_data)
test_loader = _build_loader(test_data)

In [None]:
# Optimizer hyperparameters: learning rate and decoupled weight decay.
optim_params = {"lr": 1e-4, "weight_decay": 0.00001}

# One logit per sample; the input width equals the number of feature columns.
n_features = X_train.shape[1]
clf = LogisticRegression(input_size=n_features, num_classes=1)
clf = clf.to("cuda")

# The loss operates on raw logits, so the model itself applies no sigmoid.
criterion = nn.BCEWithLogitsLoss()

optimizer = optim.AdamW(clf.parameters(), **optim_params)

In [None]:
# Train with automatic mixed precision, validate after every epoch and stop
# early once the validation accuracy no longer improves (or a loss is NaN).
# half precision, see https://pytorch.org/docs/stable/amp.html
scaler = torch.cuda.amp.GradScaler()
early_stopping = EarlyStopping(patience=15)
epochs = 100

step = 0
best_accuracy = -1
best_step = -1


for epoch in tqdm(range(epochs)):
    # perform training; switching to train mode once per epoch is enough,
    # as eval mode is only entered after the last training batch
    clf.train()
    loss_in_epoch_train = 0

    batch = 0

    for x_cat, x_cont, weights, targets in train_loader:
        optimizer.zero_grad()

        with torch.autocast(device_type="cuda", dtype=torch.float16):
            logits = clf(x_cat, x_cont).flatten()
            train_loss = criterion(logits, targets)

        # scale the loss before backward and let the scaler drive the
        # optimizer step, as required for float16 autocast
        scaler.scale(train_loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Detach before accumulating / logging: summing the attached loss
        # would keep the autograd history of every batch alive for the whole
        # epoch.
        step_loss = train_loss.detach()

        # add the mini-batch training loss to epoch loss
        loss_in_epoch_train += step_loss
        wandb.log({"train_loss_step": step_loss, "epoch": epoch, "batch": batch})

        batch += 1
        step += 1

    # perform validation
    clf.eval()
    loss_in_epoch_val = 0.0
    correct = 0

    with torch.no_grad():
        for x_cat, x_cont, weights, targets in val_loader:
            logits = clf(x_cat, x_cont).flatten()

            val_loss = criterion(logits, targets)

            # get probabilities and round to nearest integer
            preds = torch.sigmoid(logits).round()
            correct += (preds == targets).sum().item()

            loss_in_epoch_val += val_loss
            # `batch` keeps counting from the training loop, so validation
            # batches are logged with indices following the training batches
            wandb.log({"val_loss_step": val_loss, "epoch": epoch, "batch": batch})

            batch += 1

    # loss average over all batches
    train_loss = loss_in_epoch_train / len(train_loader)
    val_loss = loss_in_epoch_val / len(val_loader)

    # correct samples / no samples
    val_accuracy = correct / len(X_val)
    if best_accuracy < val_accuracy:
        best_accuracy = val_accuracy
        best_step = step

    wandb.log({"train_loss": train_loss, "epoch": epoch})
    wandb.log({"val_loss": val_loss, "epoch": epoch})
    # wandb.log({"val_accuracy": val_accuracy, 'epoch': epoch})

    print(f"train:{train_loss} val:{val_loss}")
    print(f"val accuracy:{val_accuracy}")

    # return early if val accuracy doesn't improve. Minus to minimize.
    early_stopping(-val_accuracy)
    if early_stopping.early_stop or math.isnan(train_loss) or math.isnan(val_loss):
        print("early stopping now.")
        break

In [None]:
# Score the trained classifier on the test set.
y_pred, y_true = [], []

# inference only: use eval mode and do not record autograd history
clf.eval()
with torch.no_grad():
    for x_cat, x_cont, weights, targets in test_loader:
        # flatten (rather than squeeze) so that a batch holding a single
        # sample stays one-dimensional; zero-dimensional arrays cannot be
        # concatenated below
        logits = clf(x_cat, x_cont).flatten()

        # map between zero and one, sigmoid is otherwise included in loss already
        # https://stackoverflow.com/a/66910866/5755604
        preds = torch.sigmoid(logits)
        y_pred.append(preds.detach().cpu().numpy())
        y_true.append(targets.detach().cpu().numpy())

# round prediction to nearest int
y_pred = np.rint(np.concatenate(y_pred))
y_true = np.concatenate(y_true)

# calculate accuracy on test set
acc = (y_pred == y_true).sum() / len(y_true)
print(acc)