In [None]:
import os
import math
from pathlib import Path
from typing import List, Optional

import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import torch
import wandb
from sklearn.metrics import accuracy_score
from torch import nn
from torch import nn, optim
from tqdm.auto import tqdm


In [None]:
# Collect unreachable Python objects first, then release the CUDA memory that
# PyTorch's caching allocator is holding but no longer using.
import torch, gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Expose the Google Cloud project id through the GCLOUD_PROJECT environment variable.
os.environ["GCLOUD_PROJECT"] = "flowing-mantis-239216"
# fs = gcsfs.GCSFileSystem(project="thesis")
# fs_prefix = "gs://"


In [None]:
# Start a Weights & Biases run. `run` is reused further down (`run.id` names the checkpoints).
run = wandb.init(project="thesis", entity="fbv")

# Register the dataset artifact as an input of this run and download it;
# `data_dir` is the local directory the parquet files are read from.
dataset = "fbv/thesis/ise_supervised_log_standardized:latest"
artifact = run.use_artifact(dataset)
data_dir = artifact.download()


In [None]:
import sys

sys.path.append("..")
from otc.models.fttransformer import FeatureTokenizer, FTTransformer, Transformer
from otc.models.activation import ReGLU
from otc.data.dataset import TabDataset
from otc.data.dataloader import TabDataLoader
from otc.features.build_features import features_classical, features_classical_size
from otc.optim.early_stopping import EarlyStopping


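Default FT-Transformer hyperparameters from "Revisiting Deep Learning Models for Tabular Data" (https://arxiv.org/pdf/2106.11959.pdf):

| Hyperparameter | Value | Comment |
|---|---|---|
| Layer count | 3 | |
| Feature embedding size | 192 | |
| Head count | 8 | |
| Activation & FFN size factor | (ReGLU, 4/3) | |
| Attention dropout | 0.2 | |
| FFN dropout | 0.1 | |
| Residual dropout | 0.0 | |
| Initialization | Kaiming (He et al., 2015a) | |
| Parameter count | 929K | The value is given for 100 numerical features |
| Optimizer | AdamW | |
| Learning rate | 1e−4 | |
| Weight decay | 1e−5 | 0.0 for Feature Tokenizer, LayerNorm and biases |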


In [None]:
# Work on a small share of the data to keep experiments fast.

frac = 0.05


# Training set: keep the last `frac` share of the rows (`tail` preserves the row order),
# split off the label column "buy_sell" and restrict to the selected feature columns.
X_train = pd.read_parquet(Path(data_dir, "train_set.parquet"), engine="fastparquet")
X_train = X_train.tail(int(len(X_train)*frac))# .sample(frac=frac)# .sample(frac=frac, random_state=42).sort_index()
y_train = X_train["buy_sell"]
X_train = X_train[features_classical_size]

# Validation set: random `frac` sample of the rows.
# NOTE(review): no random_state is passed and the sample is not re-sorted, so the
# selection differs between runs and the original row order is not kept.
X_val = pd.read_parquet(Path(data_dir, "val_set.parquet"), engine="fastparquet").sample(frac=frac)# .sample(frac=frac, random_state=42).sort_index()
y_val = X_val["buy_sell"]
X_val = X_val[features_classical_size]

# Disabled experiment: label smoothing of the 0/1 targets by `eps`.
# eps = 0.1

# y_train[np.where(y_train == 0)] = eps
# y_train[np.where(y_train == 1)] = 1.0 - eps

# y_val[np.where(y_val == 0)] = eps
# y_val[np.where(y_val == 1)] = 1.0 - eps

In [None]:
# Test set: read completely (no sub-sampling), split off the label column "buy_sell"
# and restrict to the same feature columns as the training and validation sets.
X_test = pd.read_parquet(Path(data_dir, "test_set.parquet"), engine="fastparquet")
y_test = X_test["buy_sell"]
X_test = X_test[features_classical_size]


In [None]:
# Column-wise maximum of the test features.
X_test.max()

In [None]:
# Column-wise 99 % quantile of the test features.
X_test.quantile(q=0.99)

In [None]:
# Winsorize every feature at its 1 % / 99 % quantile.
# The bounds are estimated on the training set only and re-used for the validation
# and test sets, so that all splits share one transformation and no statistics of
# held-out data enter the preprocessing.
clip_lower = X_train.quantile(q=0.01)
clip_upper = X_train.quantile(q=0.99)

# Re-assign instead of `inplace=True`: the frames are column selections of larger
# frames, and an in-place write on such a selection is ambiguous in pandas.
X_train = X_train.clip(lower=clip_lower, upper=clip_upper, axis=1)
X_val = X_val.clip(lower=clip_lower, upper=clip_upper, axis=1)
X_test = X_test.clip(lower=clip_lower, upper=clip_upper, axis=1)

## Test Ground Adult Dataset

In [None]:
# Download the train and test part of the UCI Adult data set.
# NOTE(review): no `header`/`names` argument is passed, so pandas takes the first line
# of each file as column names; the label column is addressed as ' <=50K' further down.
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data")
df_test = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test")

In [None]:
# Turn the index of the test frame back into regular columns, give the test frame the
# column names of the training frame and stack both frames (train rows first).
# NOTE(review): presumably needed because adult.test is parsed with part of the
# fields as index -- confirm by inspecting `df_test` before this cell.
df_test = df_test.reset_index()
df_test.columns = df.columns
df_tot = pd.concat([df, df_test])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, QuantileTransformer, OrdinalEncoder
from sklearn.compose import ColumnTransformer


In [None]:
# Binary target from the income column.
# Strip "." from the labels. `regex=False` makes the removal literal and independent of
# the pandas version (as a regular expression "." would match every character).
y = df_tot[' <=50K'].str.replace(".", "", regex=False)
# "<..." -> 0 and ">..." -> 1; adding 0 converts the boolean mask to integers
y = y.str.contains(">", regex=False) + 0
#y = np.where(y, -1,1)
# Features: everything except the label column.
X = df_tot.drop(columns=[' <=50K'])

In [None]:
# Positional split of the stacked frame: rows [0, 26048) are used for training,
# rows [26048, 32561) for validation and all remaining rows for testing.
train_rows = slice(None, 26048)
val_rows = slice(26048, 32561)
test_rows = slice(32561, None)

X_train, y_train = X.iloc[train_rows], y.iloc[train_rows]
X_val, y_val = X.iloc[val_rows], y.iloc[val_rows]
X_test, y_test = X.iloc[test_rows], y.iloc[test_rows]

# X_train, X_test, y_train, y_t 

In [None]:
# Column groups are derived from the dtypes of the training frame.
categorical_features = list(X_train.select_dtypes(include=['object']).columns)
continuous_features = list(X_train.select_dtypes(include=['float', "int64"]).columns)

# Categorical columns -> integer codes; categories not seen during `fit` are encoded as -1.
categorical_transformer = Pipeline(
    steps=[('le', OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))]
)
# Continuous columns -> quantile-transformed to a normal distribution
# (settings adapted from Gorishniy et al.).
continuous_transformer = Pipeline(steps=[("ce", QuantileTransformer(
    output_distribution='normal',
    n_quantiles=1000,
    # must be an integer: scikit-learn validates parameter types and rejects the float 1e9
    subsample=int(1e9),
    random_state=42,
))])
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ("cont", continuous_transformer, continuous_features),
    ]
)
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])


# Fit on the training split only; validation and test are merely transformed.
X_train = pipeline.fit_transform(X_train)
X_val = pipeline.transform(X_val)
X_test = pipeline.transform(X_test)


In [None]:
# Output column names of the fitted pipeline, partitioned by the transformer prefix
# ("cat_" for the categorical branch, "cont_" for the continuous branch).
feature_names = pipeline.get_feature_names_out()
cat_feature_names, cont_feature_names = [], []
for col in feature_names:
    if col.startswith("cat_"):
        cat_feature_names.append(col)
    if col.startswith("cont_"):
        cont_feature_names.append(col)

In [None]:
# Cardinality per categorical column: number of distinct training values plus 2.
# NOTE(review): the reason for the offset of 2 is not visible here -- presumably room
# for unknown categories; confirm against the embedding layer.
temp = pd.DataFrame(data=X_train, columns=feature_names)
val_temp = temp.loc[:, cat_feature_names].nunique().add(2)
cat_unique_counts = tuple(val_temp.to_numpy())

# Positions of the categorical columns within the transformed feature matrix.
all_feature_names = list(feature_names)
cat_idx = []
for name in cat_feature_names:
    if name in all_feature_names:
        cat_idx.append(all_feature_names.index(name))

In [None]:
# Display the column positions of the categorical features.
cat_idx

In [None]:
# Wrap the transformed arrays in data frames with named columns and store the
# categorical codes as integers.
int_columns = {name: int for name in cat_feature_names}

X_train = pd.DataFrame(X_train, columns=feature_names).astype(int_columns)
X_val = pd.DataFrame(X_val, columns=feature_names).astype(int_columns)
X_test = pd.DataFrame(X_test, columns=feature_names).astype(int_columns)

In [None]:
# Gradient-boosting baseline on the Adult data, trained on the GPU.
from catboost import CatBoostClassifier

clf = CatBoostClassifier(
    iterations=1000, 
    #learning_rate=0.1, 
    # loss_function='CrossEntropy',
    task_type="GPU",
)

# `cat_idx` tells CatBoost which column positions hold categorical codes;
# the validation split is passed as evaluation set.
clf.fit(X_train, y_train, 
        cat_features=cat_idx, 
        eval_set=(X_val, y_val), 
)

# Score of the fitted classifier on the test split.
acc = clf.score(X_test, y_test)
print(acc)

In [None]:
# https://colab.research.google.com/github/Yura52/rtdl/blob/main/examples/rtdl.ipynb#scrollTo=RtYkwZjE4mEx
import rtdl
import torch
import torch.nn as nn
import torch.nn.functional as F

device = torch.device('cuda')
task_type = 'binclass'
d_out = 1
epochs = 1000

# Reference FT-Transformer of the rtdl package in its default configuration.
# NOTE: this re-binds `clf`.
clf = rtdl.FTTransformer.make_default(
    n_num_features= len(cont_feature_names),
    cat_cardinalities=list(cat_unique_counts),
    last_layer_query_idx=[-1],  # it makes the model faster and does NOT affect its output
    d_out=d_out,
)

clf.to(device)
# NOTE(review): `lr` and `weight_decay` are not defined in this cell; the AdamW branch
# is only evaluated when `clf` is not an rtdl.FTTransformer.
optimizer = (
    clf.make_default_optimizer()
    if isinstance(clf, rtdl.FTTransformer)
    else torch.optim.AdamW(clf.parameters(), lr=lr, weight_decay=weight_decay)
)
# Binary cross-entropy that takes raw logits.
criterion = (
    F.binary_cross_entropy_with_logits
)

In [None]:
device = torch.device('cuda')

dl_params = {
    "batch_size": 256,  # DataParallel splits batches across devices
    "shuffle": False,
    "device": device,
}


def _as_loader(features, labels):
    """Build a TabDataLoader from a feature frame and its label series.

    Categorical columns are passed as long tensors, continuous columns and labels as
    float tensors; no sample weights are used (third argument is ``None``).
    """
    return TabDataLoader(
        torch.Tensor(features[cat_feature_names].values).long(),
        torch.Tensor(features[cont_feature_names].values),
        None,
        torch.Tensor(labels.values),
        **dl_params,
    )


train_loader = _as_loader(X_train, y_train)
val_loader = _as_loader(X_val, y_val)
test_loader = _as_loader(X_test, y_test)

## California Housing

In [None]:
# from https://colab.research.google.com/github/Yura52/rtdl/blob/main/examples/rtdl.ipynb#scrollTo=3bzc8TEGEvmh
# !!! NOTE !!! The dataset splits, preprocessing and other details are
# significantly different from those used in the
# paper "Revisiting Deep Learning Models for Tabular Data",
# so the results will be different from the reported in the paper.
import sklearn.datasets
import sklearn.model_selection
# Imported explicitly: LabelEncoder and StandardScaler below are accessed through
# `sklearn.preprocessing`, which must not depend on an import made in another cell.
import sklearn.preprocessing
dataset = sklearn.datasets.fetch_california_housing()
task_type = 'regression'
device = "cuda"
# dataset = sklearn.datasets.fetch_covtype()
# task_type = 'multiclass'

assert task_type in ['binclass', 'multiclass', 'regression']

# float32 features; float32 targets for regression, integer class labels otherwise
X_all = dataset['data'].astype('float32')
y_all = dataset['target'].astype('float32' if task_type == 'regression' else 'int64')
if task_type != 'regression':
    y_all = sklearn.preprocessing.LabelEncoder().fit_transform(y_all).astype('int64')
n_classes = int(max(y_all)) + 1 if task_type == 'multiclass' else None

# 80 % / 20 % split into train+val / test, then again 80 % / 20 % into train / val.
# NOTE: this re-binds `X` and `y` as dicts keyed by 'train', 'val' and 'test'.
X = {}
y = {}
X['train'], X['test'], y['train'], y['test'] = sklearn.model_selection.train_test_split(
    X_all, y_all, train_size=0.8
)
X['train'], X['val'], y['train'], y['val'] = sklearn.model_selection.train_test_split(
    X['train'], y['train'], train_size=0.8
)

# not the best way to preprocess features, but enough for the demonstration
# (the scaler is fitted on the training part only)
preprocess = sklearn.preprocessing.StandardScaler().fit(X['train'])
X = {
    k: torch.tensor(preprocess.transform(v), device=device)
    for k, v in X.items()
}
y = {k: torch.tensor(v, device=device) for k, v in y.items()}

# !!! CRUCIAL for neural networks when solving regression problems !!!
# standardize the targets with the mean / std of the training targets
y_mean = y['train'].mean().item()
y_std = y['train'].std().item()
y = {k: (v - y_mean) / y_std for k, v in y.items()}


In [None]:
dl_params = {
    "batch_size": 256,  # DataParallel splits batches across devices
    "shuffle": False,
    "device": device,
}

# X[...] and y[...] already are tensors on `device`, so they are handed over as-is.
# Re-wrapping them with the legacy `torch.Tensor(...)` constructor adds nothing and
# may be rejected for tensors that do not live on the CPU.
# No categorical features (first argument) and no sample weights (third argument).
train_loader = TabDataLoader(None, X["train"], None, y["train"], **dl_params)

val_loader = TabDataLoader(None, X["val"], None, y["val"], **dl_params)


In [None]:
import rtdl
import torch
import torch.nn as nn
import torch.nn.functional as F

d_out = 1
epochs = 1000


# Reference FT-Transformer of the rtdl package in its default configuration,
# built for numerical features only (`cat_cardinalities=None`).
model = rtdl.FTTransformer.make_default(
    n_num_features=X_all.shape[1],
    cat_cardinalities=None,
    last_layer_query_idx=[-1],  # it makes the model faster and does NOT affect its output
    d_out=d_out,
)


model.to(device)
# NOTE(review): `lr` and `weight_decay` are not defined in this cell; the AdamW branch
# is only evaluated when `model` is not an rtdl.FTTransformer.
optimizer = (
    model.make_default_optimizer()
    if isinstance(model, rtdl.FTTransformer)
    else torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
)
# Loss function chosen by task type: BCE on logits, cross-entropy or mean squared error.
criterion = (
    F.binary_cross_entropy_with_logits
    if task_type == 'binclass'
    else F.cross_entropy
    if task_type == 'multiclass'
    else F.mse_loss
)

## Run Area

In [None]:
# Configuration of the actual training run with the project's own FT-Transformer.
frac=1

device = "cuda"
batch_size = 16192
epochs = 100

# Architecture hyperparameters (same values as the defaults listed at the top of the notebook).
d_token = 192
n_blocks = 3
attention_dropout = 0.2
ffn_dropout = 0.1
residual_dropout = 0.0
attention_heads = 8


# clipping_value = 5
reduction = "mean"

# other_kwargs = {
#  "clipping_value": clipping_value,
#  "frac": frac,
# }

# feature_tokenizer_kwargs = {
#     "num_continous": len(continuous_features),
#     "cat_cardinalities": list(cat_unique_counts),
#     "d_token": d_token,
# }
# Active variant: every column of X_train is treated as continuous, no categoricals.
# NOTE(review): `module_params` below still passes `cat_idx` / `cat_unique_counts`;
# confirm that this matches the data set that is loaded when this cell runs.
feature_tokenizer_kwargs = {
    "num_continous": len(X_train.columns.tolist()),
    "cat_cardinalities": (),
    "d_token": d_token,
}

dl_params = {
    "batch_size": batch_size,  # DataParallel splits batches across devices
    "shuffle": False,
    "device": device,
}

transformer_kwargs = {
    "d_token": d_token,
    "n_blocks": n_blocks,
    "attention_n_heads": attention_heads,
    "attention_initialization": "kaiming",
    "ffn_activation": ReGLU,
    "attention_normalization": nn.LayerNorm,
    "ffn_normalization": nn.LayerNorm,
    "ffn_dropout": ffn_dropout,
    # fix at 4/3, as activation (see search space B in
    # https://arxiv.org/pdf/2106.11959v2.pdf)
    # is static with ReGLU / GeGLU
    "ffn_d_hidden": int(d_token * (4 / 3)),
    "attention_dropout": attention_dropout,
    "residual_dropout": residual_dropout,  # see search space (B)
    "prenormalization": True,
    "first_prenormalization": False,
    "last_layer_query_idx": None,
    "n_tokens": None,
    "kv_compression_ratio": None,
    "kv_compression_sharing": None,
    "head_activation": nn.GELU, # nn.ReLU
    "head_normalization": nn.LayerNorm,
    "d_out": 1,  # fix at 1, due to binary classification
}


# Settings for the AdamW optimizer that is created in a later cell.
optim_params = {"lr": 1e-4, "weight_decay": 0.00001}

module_params = {
    "transformer": Transformer(**transformer_kwargs),  # type: ignore
    "feature_tokenizer": FeatureTokenizer(**feature_tokenizer_kwargs),  # type: ignore # noqa: E501
    "cat_features": cat_idx,
    "cat_cardinalities": cat_unique_counts,
}

clf = FTTransformer(**module_params)
# use multiple gpus, if available
clf = nn.DataParallel(clf).to(device)


# Binary cross-entropy on raw logits (the sigmoid is part of the loss).
criterion = nn.BCEWithLogitsLoss()
# wandb.log(other_kwargs)
# wandb.log(transformer_kwargs)
# wandb.log(optim_params)
# wandb.log(feature_tokenizer_kwargs)
# wandb.log(dl_params)

In [None]:
# Wrap the three splits in TabDataset objects ...
training_data = TabDataset(X_train, y_train)
val_data = TabDataset(X_val, y_val)
test_data = TabDataset(X_test, y_test)


def _loader_for(data):
    """Create a TabDataLoader over the tensors of a TabDataset using `dl_params`."""
    return TabDataLoader(data.x_cat, data.x_cont, data.weight, data.y, **dl_params)


# ... and build one loader per split.
train_loader = _loader_for(training_data)
val_loader = _loader_for(val_data)
test_loader = _loader_for(test_data)

In [None]:
class CosineWarmupScheduler(optim.lr_scheduler._LRScheduler):
    """Cosine learning-rate decay with a linear warm-up phase.

    The base learning rate of every parameter group is multiplied by
    ``0.5 * (1 + cos(pi * t / max_iters))``, where ``t`` is the current step
    counter of the scheduler. While ``t <= warmup`` the factor is additionally
    scaled by ``t / warmup``, i.e. it ramps up linearly from zero.

    Args:
        optimizer: optimizer whose learning rates are scheduled.
        warmup: number of warm-up steps; ``0`` disables the warm-up.
        max_iters: number of steps after which the cosine factor reaches zero.
    """

    def __init__(self, optimizer, warmup, max_iters):
        # Both attributes must exist before the base-class constructor runs,
        # because it already queries `get_lr()`.
        self.warmup = warmup
        self.max_num_iters = max_iters
        super().__init__(optimizer)

    def get_lr(self):
        """Return the scheduled learning rate of every parameter group."""
        lr_factor = self.get_lr_factor(epoch=self.last_epoch)
        return [base_lr * lr_factor for base_lr in self.base_lrs]

    def get_lr_factor(self, epoch):
        """Return the multiplicative learning-rate factor at step ``epoch``."""
        lr_factor = 0.5 * (1 + np.cos(np.pi * epoch / self.max_num_iters))
        # Apply the ramp only if a warm-up was requested: with `warmup == 0` the
        # ratio below would divide by zero at step 0.
        if self.warmup > 0 and epoch <= self.warmup:
            lr_factor *= epoch * 1.0 / self.warmup
        return lr_factor

In [None]:
# https://lightning.ai/docs/pytorch/latest/notebooks/course_UvA-DL/05-transformers-and-MH-attention.html?highlight=warmup
# Needed for initializing the lr scheduler
# NOTE(review): `p` is not referenced again in this cell -- the optimizer below is
# built from `clf.parameters()`.
p = nn.Parameter(torch.empty(4, 4))

# clf = FTTransformer(**module_params)
# clf = model
# use multiple gpus, if available
# clf = nn.DataParallel(clf).to(device)


# AdamW with the learning rate / weight decay from `optim_params`.
optimizer = optim.AdamW(clf.parameters(),
    lr=optim_params["lr"],
    weight_decay=optim_params["weight_decay"],
)

# The scheduler is stepped once per batch, so its horizon is counted in batches.
max_iters = epochs * len(train_loader)
# saw recommendation of 5 - 10 % of total training budget or 100 to 500 steps
warmup = int(0.05 * max_iters)
print(f"warmup steps: {warmup}")
print(max_iters)

scheduler = CosineWarmupScheduler(optimizer=optimizer, warmup=warmup, max_iters=max_iters)

# Plotting
# Visual check of the learning-rate factor over the whole training horizon.
epochs_plt = list(range(max_iters))
plt.figure(figsize=(8, 3))
plt.plot(epochs_plt, [scheduler.get_lr_factor(e) for e in epochs_plt])
plt.ylabel("Learning rate factor")
plt.xlabel("Iterations (in batches)")
plt.title("Cosine Warm-up Learning Rate Scheduler")
plt.show()

In [None]:
import os, glob



def checkpoint(model, filename, run_id=None):
    """Save the weights of ``model`` to ``filename`` and drop older checkpoints of the run.

    The new file is written first; only afterwards are all other files in the same
    directory whose name starts with ``run_id`` removed. A failing save therefore
    never deletes the previous checkpoint.

    Args:
        model: module whose ``state_dict()`` is serialized with ``torch.save``.
        filename: path of the checkpoint to write; a missing parent directory
            is created.
        run_id: file-name prefix identifying checkpoints of the same run.
            Defaults to ``run.id`` of the notebook's global wandb run.
    """
    if run_id is None:
        run_id = run.id

    # create_dir
    dir_checkpoints = os.path.dirname(filename) or "."
    os.makedirs(dir_checkpoints, exist_ok=True)

    # save new file
    print("saving new checkpoints.")
    torch.save(model.state_dict(), filename)

    # remove old files (everything with the run prefix except the file just written);
    # glob.escape keeps special characters in the directory or run id literal
    pattern = os.path.join(glob.escape(dir_checkpoints), f"{glob.escape(str(run_id))}*")
    new_file = os.path.abspath(filename)
    for stale_file in glob.glob(pattern):
        if os.path.abspath(stale_file) != new_file:
            os.remove(stale_file)

In [None]:
# half precision, see https://pytorch.org/docs/stable/amp.html
# The gradient scaler guards the float16 backward pass against underflow.
scaler = torch.cuda.amp.GradScaler()

# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.1, verbose=True)

# compiled_clf = clf #torch.compile(clf)

early_stopping = EarlyStopping(patience=15)

# see https://stackoverflow.com/a/53628783/5755604
# the criterion is applied to raw logits: no sigmoid required; numerically more stable

step = 0  # number of optimizer steps over all epochs
best_accuracy = -1
best_step = -1

for epoch in tqdm(range(epochs)):

    # ---------------------------------------------------------------- training
    clf.train()
    loss_in_epoch_train = 0.0

    # batch counter within the epoch; keeps counting through the validation batches
    batch = 0

    for x_cat, x_cont, weights, targets in train_loader:

        optimizer.zero_grad()

        # for rtdl implementation
        # logits = clf(x_cont,x_cat).flatten() #
        # for my implementation
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            logits = clf(x_cat, x_cont).flatten()
            train_loss = criterion(logits, targets)

        # scaled backward pass and optimizer step, followed by the per-batch lr update
        scaler.scale(train_loss).backward()
        scaler.step(optimizer)
        scaler.update()

        scheduler.step()

        # Convert to a Python float before accumulating / logging: summing the loss
        # tensors themselves would keep their autograd history alive across the epoch.
        train_loss_step = train_loss.item()
        loss_in_epoch_train += train_loss_step
        wandb.log({"train_loss_step": train_loss_step, "epoch": epoch, "batch": batch})

        batch += 1
        step += 1

    # -------------------------------------------------------------- validation
    clf.eval()
    loss_in_epoch_val = 0.0
    correct = 0

    with torch.no_grad():
        for x_cat, x_cont, weights, targets in val_loader:
            # for rtdl implementation
            # logits = clf(x_cont,x_cat).flatten() #
            # for my implementation
            logits = clf(x_cat, x_cont).flatten()

            val_loss_step = criterion(logits, targets).item()

            # get probabilities and round to nearest integer
            preds = torch.sigmoid(logits).round()
            correct += (preds == targets).sum().item()

            loss_in_epoch_val += val_loss_step
            wandb.log({"val_loss_step": val_loss_step, "epoch": epoch, "batch": batch})

            batch += 1

    # loss average over all batches
    train_loss = loss_in_epoch_train / len(train_loader)
    val_loss = loss_in_epoch_val / len(val_loader)

    # correct samples / no samples
    val_accuracy = correct / len(X_val)
    # keep the weights of the best epoch (by validation accuracy) on disk
    if best_accuracy < val_accuracy:
        checkpoint(clf, f"checkpoints/{run.id}-{step}.ptx")
        best_accuracy = val_accuracy
        best_step = step

    wandb.log({"train_loss": train_loss, 'epoch': epoch})
    wandb.log({"val_loss": val_loss, 'epoch': epoch})
    # wandb.log({"val_accuracy": val_accuracy, 'epoch': epoch})

    print(f"train:{train_loss} val:{val_loss}")
    print(f"val accuracy:{val_accuracy}")

    # return early if val accuracy doesn't improve. Minus to minimize.
    early_stopping(-val_accuracy)
    # also stop as soon as one of the epoch losses has become NaN
    if early_stopping.early_stop or math.isnan(train_loss) or math.isnan(val_loss):
        print("meh... early stopping")
        break


In [None]:
# Checkpoint file(s) written for the current run.
cp = glob.glob(f"checkpoints/{run.id}*")
# show what was found (the former `print(filenames)` referenced an undefined name)
print(cp)

In [None]:
# Restore the weights from the first checkpoint file found for this run.
clf.load_state_dict(torch.load(cp[0]))

In [None]:
# Accuracy of the restored model on the test set.
y_pred, y_true = [], []

# Pure inference: put the model into evaluation mode explicitly (do not rely on the
# mode the training loop left behind) and skip autograd bookkeeping.
clf.eval()
with torch.no_grad():
    for x_cat, x_cont, weights, targets in test_loader:
        # for rtdl implementation
        # logits = clf(x_cont,x_cat).flatten() #
        # for my implementation
        logits = clf(x_cat, x_cont).flatten()

        # map between zero and one, sigmoid is otherwise included in loss already
        # https://stackoverflow.com/a/66910866/5755604
        # (no `squeeze()`: a final batch with a single sample has to stay 1-d,
        # otherwise `np.concatenate` below rejects the 0-d array)
        preds = torch.sigmoid(logits)
        y_pred.append(preds.cpu().numpy())
        y_true.append(targets.cpu().numpy())  # type: ignore

# round prediction to nearest int
y_pred = np.rint(np.concatenate(y_pred))
y_true = np.concatenate(y_true)

# scikit-learn's argument order is (y_true, y_pred)
acc = accuracy_score(y_true, y_pred)
print(acc)