In [1]:
from xgboost import XGBClassifier
import warnings
from tabpfn_new.scripts.transformer_prediction_interface import TabPFNClassifier
import numpy as np
import pandas as pd
import os
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from data_prep_utils import *
from evaluate import *
from load_models import *
import matplotlib.pyplot as plt
import torch
import openml
import time

In [2]:
# Load the microbiome dataset and prepare it for the benchmark below.
# The helpers used here come from the project modules star-imported in the
# first cell; their exact semantics are not visible in this notebook.
path = "datasets/data_all.csv"

# get_microbiome returns a (features, labels) pair for the CSV at `path`.
data_raw, labels_raw = get_microbiome(path)

# NOTE(review): presumably keeps only the features with the most non-zero
# entries -- confirm against the helper's definition.
data_filtered = top_non_zero(data_raw)

# Shuffle features and labels together; `data` / `labels` are the names the
# later cells consume.
data, labels = unison_shuffled_copies(data_filtered, labels_raw)

In [5]:
# Benchmark several classifiers under each sampling strategy.
#
# The configuration below does not depend on the sampling strategy, so it is
# defined once instead of being rebuilt on every loop iteration.
cv = 3                 # forwarded to cross_validate_sample (number of CV folds, presumably)
strat_split = True     # forwarded to cross_validate_sample (stratified splitting, presumably)
n_optim = 1000         # passed to XGBoostOptim as n_optim
ft_epochs = 10         # passed to TabForestPFNClassifier as max_epochs
max_samples = 1000     # forwarded to cross_validate_sample
metrics = ["accuracy", "precision", "recall", "roc_auc"]

# One results table per sampling strategy, keyed by a readable label.
# Without this, only the table of the last strategy survives the loop.
results_by_sampling = {}

for sampling in [None, undersample]:
    # The estimators are re-created for every strategy, so each run starts
    # from new model objects.
    models = [
        XGBClassifier(n_estimators=5, max_depth=5, learning_rate=1, objective='binary:logistic'),
        XGBoostOptim(n_optim=n_optim),
        LogisticRegression(max_iter=500),
        TabPFNClassifier(device='cpu', N_ensemble_configurations=5, no_preprocess_mode=True),
        TabForestPFNClassifier(
            "saved_models/tabforest/mix600k/tabforestpfn.pt",
            "saved_models/tabforest/mix600k/config_run.yaml",
            max_epochs=ft_epochs,
        ),
    ]

    # One row per model, one column per metric plus a trailing "runtime" column.
    results = pd.DataFrame(
        np.zeros((len(models), len(metrics) + 1)),
        index=[m.__class__.__name__ for m in models],
        columns=metrics + ["runtime"],
    )

    for ii, model in enumerate(models):
        # cross_validate_sample must return one value per column of `results`
        # (the metrics followed by the runtime) for this row assignment to work.
        results.iloc[ii, :] = cross_validate_sample(
            model, data, labels, metrics, strat_split, cv, sampling, max_samples
        )

    # Keep this strategy's table; `results` itself still ends up bound to the
    # table of the last strategy, which the following cells rely on.
    sampling_label = "none" if sampling is None else getattr(sampling, "__name__", repr(sampling))
    results_by_sampling[sampling_label] = results

    print(results)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
[32m2024-10-02 21:21:43.961[0m | [1mINFO    [0m | [36mtabularbench.core.trainer_finetune[0m:[36mtrain[0m:[36m83[0m - [1mEpoch 000 | Train loss: -.---- | Train score: -.---- | Val loss: 0.1896 | Val score: 0.9450[0m
[32m2024-10-02 21:21:46.607[0m | [1mINFO    [0m | [36mtabularbench.core.trainer_finetune[0m:[36mtrain[0m:[36m94[0m - [1mEpoch 001 | Train loss: 0.1990 | Train score: 0.9437 | Val loss: 0.1940 | Val score: 0.9450[0m
[32m2024-10-02 21:21:49.487[0m | [1mINFO    [0m | [36mtabularbench.core.trainer_finetune[0m:[36mtrain[0m:[36m94[0m - [1mEpoch 002 | Train loss: 0.2016 | Train score: 0.9375 | Val loss: 0.1959 | Val score: 0.9450[0m
[32m2024-10-02 21:21:52.263[0m | [1mINFO    [0m | [36mtabularbench.core.trainer_finetune[0m:[36mtrain[0m:[36m94[0m - [1mEpoch 003 | Train loss: 0.1619 | Train sco

                        accuracy  precision    recall   roc_auc     runtime
XGBClassifier           0.925852   0.303644  0.166667  0.570540    0.028001
XGBoostOptim            0.946560   0.708333  0.211111  0.602357  149.215310
LogisticRegression      0.927188   0.243590  0.100000  0.540050    0.054667
TabPFNClassifier        0.939212   0.000000  0.000000  0.499645    4.207857
TabForestPFNClassifier  0.943888   0.512500  0.200000  0.595736   28.593347


[32m2024-10-02 21:27:50.387[0m | [1mINFO    [0m | [36mtabularbench.core.trainer_finetune[0m:[36mtrain[0m:[36m83[0m - [1mEpoch 000 | Train loss: -.---- | Train score: -.---- | Val loss: 0.6394 | Val score: 0.6667[0m
[32m2024-10-02 21:27:51.087[0m | [1mINFO    [0m | [36mtabularbench.core.trainer_finetune[0m:[36mtrain[0m:[36m94[0m - [1mEpoch 001 | Train loss: 0.5949 | Train score: 0.6000 | Val loss: 0.6519 | Val score: 0.6250[0m
[32m2024-10-02 21:27:51.799[0m | [1mINFO    [0m | [36mtabularbench.core.trainer_finetune[0m:[36mtrain[0m:[36m94[0m - [1mEpoch 002 | Train loss: 0.4604 | Train score: 0.8000 | Val loss: 0.6919 | Val score: 0.6667[0m
[32m2024-10-02 21:27:52.502[0m | [1mINFO    [0m | [36mtabularbench.core.trainer_finetune[0m:[36mtrain[0m:[36m94[0m - [1mEpoch 003 | Train loss: 0.4103 | Train score: 0.8500 | Val loss: 0.7167 | Val score: 0.6667[0m
[32m2024-10-02 21:27:53.153[0m | [1mINFO    [0m | [36mtabularbench.core.trainer_finetun

                        accuracy  precision    recall   roc_auc    runtime
XGBClassifier           0.661991   0.110978  0.655556  0.658979   0.011333
XGBoostOptim            0.732799   0.134467  0.633333  0.686247  92.377979
LogisticRegression      0.639947   0.088202  0.533333  0.590050   0.064170
TabPFNClassifier        0.649967   0.119187  0.733333  0.688984   0.916393
TabForestPFNClassifier  0.700735   0.140319  0.755556  0.726392   6.917409


In [6]:
# Rank the models by ROC AUC, worst first. `results` holds the table produced
# by the last sampling strategy of the benchmark loop.
results_sorted = results.sort_values(by="roc_auc", ascending=True)

In [7]:
# Show the models ordered by ascending ROC AUC (best model in the last row).
print(results_sorted)

                        accuracy  precision    recall   roc_auc    runtime
LogisticRegression      0.639947   0.088202  0.533333  0.590050   0.064170
XGBClassifier           0.661991   0.110978  0.655556  0.658979   0.011333
XGBoostOptim            0.732799   0.134467  0.633333  0.686247  92.377979
TabPFNClassifier        0.649967   0.119187  0.733333  0.688984   0.916393
TabForestPFNClassifier  0.700735   0.140319  0.755556  0.726392   6.917409
