In [1]:
import os, glob
import sys

import math
from pathlib import Path
from typing import List, Optional

import numpy as np
import pandas as pd
import torch
import wandb
from torch import nn
from torch import nn, optim
from tqdm.auto import tqdm


In [2]:
# Export the Google Cloud project id through the GCLOUD_PROJECT environment variable.
os.environ.update({"GCLOUD_PROJECT": "flowing-mantis-239216"})

# Direct GCS file-system access is currently disabled:
# fs = gcsfs.GCSFileSystem(project="thesis")
# fs_prefix = "gs://"


In [3]:
# Open a Weights & Biases run; it is reused further down for logging and
# for naming checkpoint files via `run.id`.
run = wandb.init(entity="fbv", project="thesis")

# Fetch the data set artifact and remember the local directory it was
# downloaded to.
dataset = "fbv/thesis/ise_supervised_log_standardized_clipped:latest"
artifact = run.use_artifact(dataset)
data_dir = artifact.download()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkarelze[0m ([33mfbv[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Downloading large artifact ise_supervised_log_standardized_clipped:latest, 5205.52MB. 3 files... 
[34m[1mwandb[0m:   3 of 3 files downloaded.  
Done. 0:0:6.8


In [4]:
sys.path.append("..")
from otc.data.dataset import TabDataset
from otc.data.dataloader import TabDataLoader
from otc.features.build_features import features_classical, features_classical_size
from otc.optim.early_stopping import EarlyStopping


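Default FT-Transformer hyperparameters, taken from Gorishniy et al. (2021), "Revisiting Deep Learning Models for Tabular Data": https://arxiv.org/pdf/2106.11959.pdf

| Hyperparameter | Value | Comment |
| --- | --- | --- |
| Layer count | 3 | |
| Feature embedding size | 192 | |
| Head count | 8 | |
| Activation & FFN size factor | (ReGLU, 4/3) | |
| Attention dropout | 0.2 | |
| FFN dropout | 0.1 | |
| Residual dropout | 0.0 | |
| Initialization | Kaiming (He et al., 2015a) | |
| Parameter count | 929K | The value is given for 100 numerical features |
| Optimizer | AdamW | |
| Learning rate | 1e-4 | |
| Weight decay | 1e-5 | 0.0 for Feature Tokenizer, LayerNorm and biases |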


In [5]:
# Fraction of the train / validation rows to keep; lower it for quick test runs.
# NOTE(review): `DataFrame.sample` is called without `random_state` and without
# re-sorting, so even `frac = 1` returns the rows in a random, unseeded order.
frac = 1


def _read_split(file_name, sample_frac=None):
    """Read one parquet split from `data_dir` and return `(features, target)`.

    When `sample_frac` is given, the frame is passed through
    `DataFrame.sample(frac=sample_frac)` before features and target are split off.
    """
    frame = pd.read_parquet(Path(data_dir, file_name), engine="fastparquet")
    if sample_frac is not None:
        frame = frame.sample(frac=sample_frac)
    return frame[features_classical_size], frame["buy_sell"]


# train and validation sets are sampled, the test set is used as stored
X_train, y_train = _read_split("train_set.parquet", sample_frac=frac)
X_val, y_val = _read_split("val_set.parquet", sample_frac=frac)
X_test, y_test = _read_split("test_set.parquet")

In [6]:
class LogisticRegression(nn.Module):
    """Logistic regression expressed as a single linear layer.

    The layer returns raw logits; the sigmoid is expected to be applied by the
    loss (e.g. `nn.BCEWithLogitsLoss`) or by the caller at inference time.

    Args:
        input_size: total number of input columns (categorical + continuous).
        num_classes: number of output logits per sample.
    """

    def __init__(self, input_size, num_classes):
        super().__init__()
        self.linear = nn.Linear(input_size, num_classes)

    def forward(self, x_cat, x_cont):
        """Compute logits for one batch.

        Args:
            x_cat: categorical columns as a tensor, or `None` / an empty tensor
                if there are none.
            x_cont: continuous columns as a tensor.

        Returns:
            Tensor with `num_classes` logits per row.
        """
        # `if x_cat:` would raise for any tensor holding more than one element
        # ("Boolean value of Tensor ... is ambiguous"), so test explicitly.
        if x_cat is not None and x_cat.numel() > 0:
            # cast so that integer-coded categories can be concatenated with
            # the floating-point continuous columns
            x = torch.cat((x_cat.to(x_cont.dtype), x_cont), 1)
        else:
            x = x_cont
        return self.linear(x)

In [11]:
# Settings shared by all three loaders (note: shuffling is enabled for
# validation and test as well).
dl_params = {"batch_size": 32768, "device": "cuda", "shuffle": True}


def _make_loader(data):
    """Wrap the tensors of a `TabDataset` in a `TabDataLoader` using `dl_params`."""
    return TabDataLoader(data.x_cat, data.x_cont, data.weight, data.y, **dl_params)


# wrap the feature / target frames of every split in a data set ...
training_data, val_data, test_data = (
    TabDataset(features, target)
    for features, target in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# ... and create one loader per split
train_loader, val_loader, test_loader = (
    _make_loader(split) for split in (training_data, val_data, test_data)
)

In [16]:
# Optimizer settings: learning rate and weight decay.
optim_params = {"lr": 1e-4, "weight_decay": 0.00001}

# one input per feature column, a single output logit; lives on the GPU
n_features = X_train.shape[1]
clf = LogisticRegression(input_size=n_features, num_classes=1)
clf = clf.to("cuda")

# binary cross-entropy on raw logits (the sigmoid is part of the loss)
criterion = nn.BCEWithLogitsLoss()

optimizer = optim.AdamW(clf.parameters(), **optim_params)


In [None]:
def checkpoint(model, filename):
    """Save `model.state_dict()` to `filename` and drop older checkpoints of this run.

    The new file is written first; only afterwards are the other files matching
    `checkpoints/{run.id}*` removed, so a failed save never leaves the run
    without a checkpoint.

    Args:
        model: module whose `state_dict()` is stored.
        filename: path of the checkpoint file to write. Missing parent
            directories are created.
    """
    # create the target directory
    target_dir = os.path.dirname(filename) or "."
    os.makedirs(target_dir, exist_ok=True)

    # save new file under the name requested by the caller
    print("saving new checkpoints.")
    torch.save(model.state_dict(), filename)

    # remove old files of the current run, keeping the one just written
    new_path = os.path.abspath(filename)
    for stale_path in glob.glob(f"checkpoints/{run.id}*"):
        if os.path.abspath(stale_path) != new_path:
            os.remove(stale_path)

In [17]:
# Mixed-precision training, see https://pytorch.org/docs/stable/amp.html
scaler = torch.cuda.amp.GradScaler()
early_stopping = EarlyStopping(patience=15)
epochs = 100

step = 0  # number of optimizer steps taken so far, across all epochs
best_accuracy = -1
best_step = -1


for epoch in tqdm(range(epochs)):

    # --- training ---------------------------------------------------------
    # switching to train mode once per epoch is enough
    clf.train()
    loss_in_epoch_train = 0.0

    # per-epoch batch counter; it keeps counting through the validation pass
    batch = 0

    for x_cat, x_cont, weights, targets in train_loader:

        optimizer.zero_grad()

        with torch.autocast(device_type="cuda", dtype=torch.float16):
            logits = clf(x_cat, x_cont).flatten()
            train_loss = criterion(logits, targets)

        scaler.scale(train_loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Accumulate a plain Python float: summing the loss tensor itself
        # would keep every batch's autograd history alive for the whole epoch.
        train_loss_value = train_loss.item()
        loss_in_epoch_train += train_loss_value
        wandb.log(
            {"train_loss_step": train_loss_value, "epoch": epoch, "batch": batch}
        )

        batch += 1
        step += 1

    # --- validation -------------------------------------------------------
    clf.eval()
    loss_in_epoch_val = 0.0
    correct = 0

    with torch.no_grad():
        for x_cat, x_cont, weights, targets in val_loader:

            logits = clf(x_cat, x_cont).flatten()
            val_loss_value = criterion(logits, targets).item()

            # get probabilities and round to nearest integer
            preds = torch.sigmoid(logits).round()
            correct += (preds == targets).sum().item()

            loss_in_epoch_val += val_loss_value
            wandb.log(
                {"val_loss_step": val_loss_value, "epoch": epoch, "batch": batch}
            )

            batch += 1

    # loss average over all batches
    train_loss = loss_in_epoch_train / len(train_loader)
    val_loss = loss_in_epoch_val / len(val_loader)

    # correct samples / no. of samples
    val_accuracy = correct / len(X_val)
    if best_accuracy < val_accuracy:
        checkpoint(clf, f"checkpoints/{run.id}-{step}.ptx")
        best_accuracy = val_accuracy
        best_step = step

    wandb.log({"train_loss": train_loss, "epoch": epoch})
    wandb.log({"val_loss": val_loss, "epoch": epoch})
    # wandb.log({"val_accuracy": val_accuracy, 'epoch': epoch})

    print(f"train:{train_loss} val:{val_loss}")
    print(f"val accuracy:{val_accuracy}")

    # Stop early if validation accuracy doesn't improve. The accuracy is
    # negated because the early-stopping criterion is minimized.
    early_stopping(-val_accuracy)
    if early_stopping.early_stop or math.isnan(train_loss) or math.isnan(val_loss):
        print("early stopping now.")
        break


  0%|          | 0/100 [00:00<?, ?it/s]

saving new checkpoints.
train:0.6196326017379761 val:0.6251112222671509
val accuracy:0.6699623499176299
saving new checkpoints.
train:0.5817510485649109 val:0.6106944680213928
val accuracy:0.6789899480779357
saving new checkpoints.
train:0.557647705078125 val:0.6030252575874329
val accuracy:0.6861189210454878
saving new checkpoints.
train:0.5399504899978638 val:0.5986984372138977
val accuracy:0.6924417385902207
saving new checkpoints.
train:0.5262736678123474 val:0.5967432260513306
val accuracy:0.6959893920280118
saving new checkpoints.
train:0.5157594680786133 val:0.5966953039169312
val accuracy:0.6979309389452708
saving new checkpoints.
train:0.50788813829422 val:0.5979034900665283
val accuracy:0.6987825588487865
saving new checkpoints.
train:0.5022001266479492 val:0.6003912091255188
val accuracy:0.6991216608144285
saving new checkpoints.
train:0.49826887249946594 val:0.6031561493873596
val accuracy:0.699231507881883
train:0.49568480253219604 val:0.6058236360549927
val accuracy:0.699

In [18]:
# Look up the checkpoint file(s) stored for the current run.
checkpoint_pattern = f"checkpoints/{run.id}*"
cp = glob.glob(checkpoint_pattern)

In [19]:
# Restore the weights from the first checkpoint file found for this run.
best_state = torch.load(cp[0])
clf.load_state_dict(best_state)

<All keys matched successfully>

In [20]:
# Evaluate the restored model on the test set.
y_pred, y_true = [], []

# inference only: use eval mode and do not record autograd history
clf.eval()
with torch.no_grad():
    for x_cat, x_cont, weights, targets in test_loader:
        # Keep logits 1-D. An extra `squeeze()` would turn a final batch of
        # size one into a 0-d array, which `np.concatenate` rejects.
        logits = clf(x_cat, x_cont).flatten()

        # map between zero and one, sigmoid is otherwise included in loss already
        # https://stackoverflow.com/a/66910866/5755604
        preds = torch.sigmoid(logits)
        y_pred.append(preds.cpu().numpy())
        y_true.append(targets.cpu().numpy())  # type: ignore

# round prediction to nearest int
y_pred = np.rint(np.concatenate(y_pred))
y_true = np.concatenate(y_true)

acc = (y_pred == y_true).sum() / len(y_true)
print(acc)

0.668656308078952
