In [1]:
# Imports

import os
import re
from argparse import ArgumentParser

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import torch
import pytorch_lightning as pl

import torchmetrics.functional as metrics

from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig, TabNetModelConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig, ExperimentConfig
from pytorch_tabular.utils import get_balanced_sampler, get_class_weighted_cross_entropy



import wandb

# Default to the CPU; replaced by the first CUDA device below when one is available.
DEVICE = torch.device("cpu")

if torch.cuda.is_available():
    # Ensure that all operations are deterministic on GPU (if used) for reproducibility.
    # The flag is spelled `deterministic`; a misspelled attribute name is accepted
    # without error and leaves the real flag untouched.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    DEVICE = torch.device("cuda:0")

# Folder where the datasets are / should be downloaded (e.g. CIFAR10).
# Taken from the PATH_DATASETS environment variable when it is set.
DATASET_PATH = os.getenv("PATH_DATASETS", "data/")
# Folder where the pretrained models are saved.
# Taken from the PATH_CHECKPOINT environment variable when it is set.
CHECKPOINT_PATH = os.getenv("PATH_CHECKPOINT", "saved_models/")

# Fix the global seed so that repeated runs are comparable.
pl.seed_everything(42)

# Report which compute device was selected.
print(f"CUDA: {torch.cuda.is_available()}")
print(f"Device: {DEVICE}")

Global seed set to 42


CUDA: True
Device: cuda:0


In [2]:
def in_ipython():
    """Tell whether the code is running inside an IPython session.

    Returns:
        The value of the ``__IPYTHON__`` name when it can be resolved,
        otherwise ``False``.
    """
    try:
        running_in_ipython = __IPYTHON__
    except NameError:
        # The name is not defined, so this is not an IPython session.
        running_in_ipython = False
    return running_in_ipython

In [3]:
# Load the raw dataset; the path is resolved relative to the current working directory.
df = pd.read_csv('./data.csv')

In [4]:
# Remove every row labelled 'n9' before splitting.
# NOTE(review): presumably this class is too rare for the stratified splits below - confirm.
drop_n9 = df[df['type'] == 'n9'].index
df = df.drop(index=drop_n9)

# Column roles: the prediction target, the categorical features,
# and every remaining column as a continuous feature.
target_col = ['type']
cat_columns = ['name', 'isA', 'isR']

target_and_cat = target_col + cat_columns
cont_columns = df.columns.drop(target_and_cat).tolist()

target = df['type']

# Two stratified splits with a fixed seed: first hold out the test set,
# then carve the validation set out of the remaining training rows.
train, test = train_test_split(df, stratify=target, random_state=42)
train, val = train_test_split(train, stratify=train['type'], random_state=42)

# Number of distinct classes in the full frame and in the training split.
print(len(df['type'].unique()))
print(len(train['type'].unique()))

Int64Index([5], dtype='int64')
37
37


In [5]:
def print_metrics(y_true, y_pred, tag):
    """Print accuracy and macro-averaged F1 for a set of predictions.

    The metrics are computed with NumPy/pandas directly so that arbitrary
    (including string) class labels are supported.

    Args:
        y_true: Ground-truth labels as a pandas DataFrame/Series or array-like.
        y_pred: Predicted labels, same length as ``y_true``.
        tag: Prefix printed in front of each metric name (e.g. ``"Holdout"``).

    Raises:
        ValueError: If the inputs are empty or their lengths differ.
    """
    def _as_flat_array(labels):
        # Accept pandas containers as well as plain array-likes; always return 1-D.
        if isinstance(labels, (pd.DataFrame, pd.Series)):
            labels = labels.values
        return np.asarray(labels).ravel()

    y_true = _as_flat_array(y_true)
    y_pred = _as_flat_array(y_pred)
    if y_true.size == 0 or y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must be non-empty and of equal length, "
            f"got {y_true.size} and {y_pred.size}"
        )

    hits = y_true == y_pred
    val_acc = float(hits.mean())

    # Macro F1: unweighted mean of the per-class F1 over every label that occurs
    # in either the ground truth or the predictions.
    per_class_f1 = []
    for label in pd.unique(np.concatenate([y_true, y_pred])):
        is_true = y_true == label
        is_pred = y_pred == label
        true_pos = np.sum(is_true & is_pred)
        # F1 = 2*TP / (2*TP + FP + FN) = 2*TP / (#true + #predicted)
        support = is_true.sum() + is_pred.sum()
        per_class_f1.append(2 * true_pos / support if support else 0.0)
    val_f1 = float(np.mean(per_class_f1))

    print(f"{tag} Acc: {val_acc} | {tag} F1: {val_f1}")

In [6]:
# train.py
def main(hparams):
    """Train a TabNet classifier on the prepared splits and report hold-out metrics.

    Uses the module-level ``train``, ``val`` and ``test`` frames together with
    ``cont_columns`` and ``cat_columns``; logs the experiment to Weights & Biases.

    Args:
        hparams: Parsed command-line arguments, or ``None`` when called from
            IPython. Currently unused.
    """
    # Finish any W&B run that is still open before a new experiment starts.
    wandb.finish()

    # The sampler weights must come from the training split: it is passed to
    # `fit` as `train_sampler` and therefore has to line up with `train`,
    # not with the full data frame.
    sampler = get_balanced_sampler(train['type'].values.ravel())

    data_config = DataConfig(
        target=['type'], #target should always be a list. Multi-targets are only supported for regression. Multi-Task Classification is not implemented
        continuous_cols=cont_columns,
        categorical_cols=cat_columns
    )

    trainer_config = TrainerConfig(
        auto_lr_find=True, # Runs the LRFinder to automatically derive a learning rate
        batch_size=1024,
        max_epochs=10,
        # index of the GPU to use. -1 means all available GPUs, None, means CPU
        gpus=1 if torch.cuda.is_available() else None,
    )

    optimizer_config = OptimizerConfig()

    model_config = TabNetModelConfig(
        task="classification",
    )

    experiment_config = ExperimentConfig(
        project_name='bachelor',
        exp_watch='gradients',
        log_target='wandb',
        log_logits=True
    )

    # NOTE(review): the recorded run of this notebook ends in
    # "TypeError: __init__() got an unexpected keyword argument 'checkpoint_callback'",
    # which looks like a pytorch_tabular / pytorch_lightning version mismatch -
    # confirm the installed versions are compatible.
    tabular_model = TabularModel(
        data_config=data_config,
        model_config=model_config,
        optimizer_config=optimizer_config,
        trainer_config=trainer_config,
        experiment_config=experiment_config
    )

    tabular_model.fit(train=train, validation=val, train_sampler=sampler)
    result = tabular_model.evaluate(test)

    print(result)

    # Score the hold-out predictions against the true labels.
    pred_df = tabular_model.predict(test)
    print_metrics(test['type'], pred_df["prediction"], tag="Holdout")

In [7]:
if __name__ == "__main__":
    if in_ipython():
        # Interactive (IPython) run: no hyperparameters are passed.
        main(None)
    else:
        # Script run: resolve the script's folder and parse the command line.
        root_dir = os.path.dirname(os.path.realpath(__file__))
        parser = ArgumentParser(add_help=False)
        hyperparams = parser.parse_args()

        # TRAIN
        main(hyperparams)

[34m[1mwandb[0m: Currently logged in as: [33mcaigh[0m. Use [1m`wandb login --relogin`[0m to force relogin


Global seed set to 42
[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


TypeError: __init__() got an unexpected keyword argument 'checkpoint_callback'