In [1]:
from xgboost import XGBClassifier
import warnings
from tabpfn import TabPFNClassifier
import numpy as np
import pandas as pd
import os
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from data_prep_utils import *
from evaluate import *
from load_models import *
import matplotlib.pyplot as plt
import torch
import openml

In [2]:
# Read the dataset from disk; `get_microbiome` hands back a (data, labels) pair.
path = "datasets/data_all.csv"
data, labels = get_microbiome(path)

# Run the features through `top_non_zero` (presumably a feature filter — confirm in
# data_prep_utils), then pass features and labels through `unison_shuffled_copies` together.
# NOTE(review): no random seed is set in the visible cells; if the shuffle is random,
# results will differ between runs — confirm.
data, labels = unison_shuffled_copies(top_non_zero(data), labels)

In [3]:
# --- Experiment configuration ---------------------------------------------------------
cv = 3            # passed to cross_validate_sample (presumably the number of CV folds)
n_optim = 10      # forwarded to XGBoostOptim(n_optim=...)
ft_epochs = 10    # not referenced in this cell; kept because later cells may read it
sampling = None   # passed to cross_validate_sample as its last argument
metrics = ["accuracy", "precision", "recall", "roc_auc"]

# --- Models under comparison ----------------------------------------------------------
models = [
    XGBClassifier(n_estimators=5, max_depth=5, learning_rate=1, objective='binary:logistic'),
    XGBoostOptim(n_optim=n_optim),
    LogisticRegression(max_iter=500),
    TabPFNClassifier(device='cpu', N_ensemble_configurations=3),
    TabForestPFNClassifier("saved_models/tabforest/mix600k/tabforestpfn.pt", "saved_models/tabforest/mix600k/config_run.yaml")
]

# One row per model (labelled by class name), one column per metric, initialised to zero.
results = pd.DataFrame(np.zeros((len(models), len(metrics))), 
                       index=[m.__class__.__name__ for m in models],
                      columns=metrics)

# Fill the table row by row. Rows are addressed by position (`iloc`) rather than by label,
# so the assignment stays unambiguous even if two models share a class name.
for ii, model in enumerate(models):
    results.iloc[ii,:] = cross_validate_sample(model, data, labels, metrics, cv, sampling)

[32m2024-09-10 17:19:26.986[0m | [1mINFO    [0m | [36mtabularbench.core.trainer_finetune[0m:[36mtrain[0m:[36m83[0m - [1mEpoch 000 | Train loss: -.---- | Train score: -.---- | Val loss: 0.1753 | Val score: 0.9405[0m
[32m2024-09-10 17:20:19.164[0m | [1mINFO    [0m | [36mtabularbench.core.trainer_finetune[0m:[36mtrain[0m:[36m94[0m - [1mEpoch 001 | Train loss: 0.1268 | Train score: 0.9609 | Val loss: 0.1781 | Val score: 0.9411[0m
[32m2024-09-10 17:21:04.797[0m | [1mINFO    [0m | [36mtabularbench.core.trainer_finetune[0m:[36mtrain[0m:[36m94[0m - [1mEpoch 002 | Train loss: 0.1474 | Train score: 0.9492 | Val loss: 0.1721 | Val score: 0.9444[0m
[32m2024-09-10 17:21:49.023[0m | [1mINFO    [0m | [36mtabularbench.core.trainer_finetune[0m:[36mtrain[0m:[36m94[0m - [1mEpoch 003 | Train loss: 0.1570 | Train score: 0.9570 | Val loss: 0.1717 | Val score: 0.9405[0m
[32m2024-09-10 17:22:45.020[0m | [1mINFO    [0m | [36mtabularbench.core.trainer_finetun

In [4]:
# Print the `results` table (one row per model, one column per metric) as plain text.
print(results)

                        accuracy  precision    recall   roc_auc
XGBClassifier           0.932030   0.423717  0.248904  0.613192
XGBoostOptim            0.942414   0.850587  0.102089  0.550206
LogisticRegression      0.933601   0.310317  0.053072  0.522628
TabPFNClassifier        0.938574   1.000000  0.015378  0.507689
TabForestPFNClassifier  0.943635   0.735670  0.150965  0.573668


In [5]:
# Replace `data` and `labels` with the output of `oversample`. The previous values are
# overwritten, so re-running this cell feeds the already-oversampled data back in.
# NOTE(review): presumably this rebalances the classes; if cross-validation follows,
# confirm that oversampling before the split does not leak duplicated samples across folds.
data, labels = oversample(data, labels)