# Debugging autoreload

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Load packages

In [None]:
from pytorch_tabular.utils import load_covertype_dataset
from rich.pretty import pprint
import torch
from glob import glob
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import copy
from sklearn.model_selection import train_test_split
import numpy as np
from pytorch_tabular.utils import make_mixed_dataset, print_metrics
from pytorch_tabular import available_models
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig, GANDALFConfig, TabNetModelConfig, FTTransformerConfig, DANetConfig
from pytorch_tabular.config import DataConfig, OptimizerConfig, TrainerConfig
from pytorch_tabular.models.common.heads import LinearHeadConfig
from pytorch_tabular.tabular_model_tuner import TabularModelTuner
from torchmetrics.functional.regression import mean_absolute_error, pearson_corrcoef
from sklearn.model_selection import RepeatedStratifiedKFold
from pytorch_tabular import MODEL_SWEEP_PRESETS
import pandas as pd
from pytorch_tabular import model_sweep
from src.pt.model_sweep import model_sweep_custom
import warnings
from src.utils.configs import read_parse_config
from src.utils.hash import dict_hash


# Load data

In [None]:
# Input locations: the data/feature tables and the PyTorch Tabular YAML configs.
path_data = "D:/YandexDisk/Work/bbd/immunology/002_central_vs_yakutia/classification"
path_configs = "D:/Work/bbs/notebooks/immunology/002_central_vs_yakutia/pt_configs"
data = pd.read_excel(f"{path_data}/data.xlsx", index_col=0)
feats = pd.read_excel(f"{path_data}/feats.xlsx", index_col=0).index.values.tolist()

# Which precomputed train/test split to use and how the validation folds are cut.
test_split_id = 0

val_n_splits = 4
val_random_state = 1337
val_fold_id = 0

split_col = f"Split_{test_split_id}"
fold_cols = [f"Fold_{i}" for i in range(val_n_splits)]

# Each fold column starts as a copy of the split column; the "trn_val" rows of
# the two regions are relabelled to "trn"/"val" below.
for fold_col in fold_cols:
    data[fold_col] = data[split_col]

# Index labels of the train+validation samples, grouped by region.
in_trn_val = data[split_col] == 'trn_val'
stratify_cat_parts = {
    region: data.index[(data['Region'] == region) & in_trn_val].values
    for region in ('Central', 'Yakutia')
}

# The splitter is built once: with an integer random_state every call to
# split() is seeded identically, exactly as when it is rebuilt per region.
num_bins = 5
k_fold = RepeatedStratifiedKFold(
    n_splits=val_n_splits,
    n_repeats=1,
    random_state=val_random_state
)
for part, ids in stratify_cat_parts.items():
    print(f"{part}: {len(ids)}")
    # Stratify on Age: cut the (10 %-padded) age range into equal-width bins.
    con = data.loc[ids, 'Age'].values
    ptp = np.ptp(con)
    age_lo = np.min(con) - 0.1 * ptp
    age_hi = np.max(con) + 0.1 * ptp
    bins = np.linspace(age_lo, age_hi, num_bins + 1)
    binned = np.digitize(con, bins) - 1
    # Per-bin occupancy, kept for inspection.
    unique, counts = np.unique(binned, return_counts=True)
    occ = dict(zip(unique, counts))
    splits = k_fold.split(X=ids, y=binned, groups=binned)

    for fold_id, (ids_trn, ids_val) in enumerate(splits):
        fold_col = f"Fold_{fold_id}"
        data.loc[ids[ids_trn], fold_col] = "trn"
        data.loc[ids[ids_val], fold_col] = "val"

# Frames handed to the models: features plus the 'Region' target column.
model_cols = feats + ['Region']
val_fold_col = f"Fold_{val_fold_id}"
test = data.loc[data[split_col] == "tst", model_cols]
train_validation = data.loc[data[split_col] == "trn_val", model_cols + fold_cols]
train_only = data.loc[data[val_fold_col] == "trn", model_cols]
validation_only = data.loc[data[val_fold_col] == "val", model_cols]


def _fold_positions(frame, fold_col, role):
    """Return positional indices of rows of `frame` whose index label has `role` in `fold_col`."""
    labels = frame.index[frame[fold_col] == role]
    return np.where(frame.index.isin(labels))[0]


# One (train positions, validation positions) pair per fold, relative to train_validation.
cv_indexes = [
    (
        _fold_positions(train_validation, fold_col, 'trn'),
        _fold_positions(train_validation, fold_col, 'val'),
    )
    for fold_col in fold_cols
]

# Simple TabularModel training

In [8]:
# Start from the trainer settings stored in YAML, then override three options
# for this run before passing the config object to TabularModel.
trainer_config = read_parse_config(f"{path_configs}/TrainerConfig.yaml", TrainerConfig)
trainer_config['checkpoints'] = 'valid_loss'  # checkpoint on validation loss (see the "valid_loss=..." checkpoint file name in the evaluate output)
trainer_config['load_best'] = True  # presumably reloads the best checkpoint after fitting -- confirm against TrainerConfig docs
trainer_config['auto_lr_find'] = True  # run the LR finder before training (its log appears in this cell's output)

# Data, model and optimizer configs are given as YAML paths; only the trainer
# config is passed as the already-parsed (and overridden) object.
tabular_model = TabularModel(
    data_config=f"{path_configs}/DataConfig.yaml",
    model_config=f"{path_configs}/models/CategoryEmbeddingModelConfig.yaml",
    optimizer_config=f"{path_configs}/OptimizerConfig.yaml",
    trainer_config=trainer_config,
    verbose=True,
    suppress_lightning_logger=False
)

# Fit on the training part of fold `val_fold_id`, validating on its held-out part.
tabular_model.fit(
    train=train_only,
    validation=validation_only,
    # target_transform=[np.log, np.exp],
    # callbacks=[DeviceStatsMonitor()],
)

Seed set to 42


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


You are using a CUDA device ('NVIDIA GeForce RTX 4090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Missing logger folder: D:\Work\bbs\notebooks\immunology\002_central_vs_yakutia\lightning_logs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
C:\Users\user\anaconda3\envs\py311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.
C:\Users\user\anaconda3\envs\py311\Lib\site-packages\pytorch_lightning\loops\fit_loop.py:293: The number of training batches (1) is smaller than the logging interval Trainer(log_every_n

Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

LR finder stopped early after 93 steps due to diverging loss.
Learning rate set to 0.006918309709189364
Restoring states from the checkpoint path at D:\Work\bbs\notebooks\immunology\002_central_vs_yakutia\.lr_find_68ee2bd5-c7c0-4f8c-8769-f83995731cf8.ckpt
Restored all states from the checkpoint at D:\Work\bbs\notebooks\immunology\002_central_vs_yakutia\.lr_find_68ee2bd5-c7c0-4f8c-8769-f83995731cf8.ckpt


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name             | Type                      | Params
---------------------------------------------------------------
0 | _backbone        | CategoryEmbeddingBackbone | 49.6 K
1 | _embedding_layer | Embedding1dLayer          | 64    
2 | head             | LinearHead                | 130   
3 | loss             | CrossEntropyLoss          | 0     
---------------------------------------------------------------
49.8 K    Trainable params
0         Non-trainable params
49.8 K    Total params
0.199     Total estimated model params size (MB)


<pytorch_lightning.trainer.trainer.Trainer at 0x1b9b26364d0>

## Play with the trained model

In [10]:
# Predict on the held-out test frame; the displayed result has one probability
# column per class plus a `prediction` column.
tabular_model.predict(test, progress_bar='rich')

Output()

Unnamed: 0_level_0,Central_probability,Yakutia_probability,prediction
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
L10,0.999515,0.000485,Central
F1-L9,0.949430,0.050570,Central
F1-L18,0.949235,0.050765,Central
MQ2,0.997707,0.002293,Central
MQ13,0.926011,0.073989,Central
...,...,...,...
DP-8,0.431097,0.568903,Yakutia
T-85,0.725689,0.274311,Central
T-25,0.435630,0.564370,Yakutia
T-63,0.499097,0.500903,Yakutia


In [9]:
# Compute test metrics using the weights of the best saved checkpoint
# (ckpt_path="best"), as confirmed by the "Restoring states" log line below.
tabular_model.evaluate(test, verbose=True, ckpt_path="best")

Restoring states from the checkpoint path at D:\YandexDisk\Work\bbd\immunology\002_central_vs_yakutia\classification\pytorch_tabular\classification-1_epoch=5-valid_loss=0.46.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at D:\YandexDisk\Work\bbd\immunology\002_central_vs_yakutia\classification\pytorch_tabular\classification-1_epoch=5-valid_loss=0.46.ckpt
C:\Users\user\anaconda3\envs\py311\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


[{'test_loss': 0.35063597559928894,
  'test_accuracy': 0.8977272510528564,
  'test_f1_score': 0.8932405710220337,
  'test_precision': 0.9034373760223389,
  'test_recall': 0.8977272510528564,
  'test_specificity': 0.7998917102813721,
  'test_cohen_kappa': 0.7474489808082581,
  'test_auroc': 0.9476190209388733}]

In [None]:
# Show the `checkpoints_path` entry of the model config (used as the save/load directory in the cells below).
tabular_model.config['checkpoints_path']

In [None]:
# Print the path of the best checkpoint recorded by the trainer's checkpoint callback.
print(tabular_model.trainer.checkpoint_callback.best_model_path)

In [None]:
# Display the model summary.
tabular_model.summary()

In [None]:
# Persist the fitted model into the directory named by `checkpoints_path`.
# NOTE(review): this is presumably the same directory the checkpoint callback
# writes its .ckpt files to -- confirm whether save_model clears a non-empty
# target directory before relying on those checkpoints (e.g. ckpt_path="best")
# after this cell has run.
tabular_model.save_model(tabular_model.config['checkpoints_path'])

In [None]:
# Write the model's config into the same directory.
# NOTE(review): possibly redundant if save_model already stores the config -- confirm.
tabular_model.save_config(tabular_model.config['checkpoints_path'])

In [None]:
# Round-trip check: reload the saved model from disk. The path is read from the
# in-memory model before `tabular_model` is rebound to the loaded instance.
tabular_model = TabularModel.load_model(tabular_model.config['checkpoints_path'])