In [1]:
from xgboost import XGBClassifier
import warnings
from tabpfn import TabPFNClassifier
import numpy as np
import pandas as pd
import os
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from data_prep_utils import *
from evaluate import *
from load_models import *
import matplotlib.pyplot as plt
import torch
import openml

In [2]:
# Read the feature table and labels from the combined CSV via the project
# helper `get_microbiome`.
path = "datasets/data_all.csv"
data, labels = get_microbiome(path)

# Pass the features through `top_non_zero`, then hand features and labels to
# `unison_shuffled_copies` together.
# NOTE(review): presumably this keeps the most-populated features and shuffles
# rows and labels with a shared permutation — confirm in data_prep_utils.
data, labels = unison_shuffled_copies(top_non_zero(data), labels)

In [3]:
# --- Experiment configuration -------------------------------------------------
n_optim = 100  # forwarded to XGBoostOptim below
# NOTE(review): `ft_epochs` and `sampling` are defined here but are not passed
# to any model or to `cross_validate_sample` in this cell. Confirm whether they
# are used later in the notebook or were meant to be wired in here.
ft_epochs = 10
sampling = undersample
# Metric names; they double as the column labels of the results table.
metrics = ["accuracy", "precision", "recall", "roc_auc"]

# --- Models under comparison --------------------------------------------------
models = [
    XGBClassifier(n_estimators=5, max_depth=5, learning_rate=1, objective='binary:logistic'),
    XGBoostOptim(n_optim=n_optim),
    LogisticRegression(max_iter=500),
    TabPFNClassifier(device='cpu', N_ensemble_configurations=3),
    TabForestPFNClassifier("saved_models/tabforest/mix600k/tabforestpfn.pt", "saved_models/tabforest/mix600k/config_run.yaml"),
]

# --- Results table ------------------------------------------------------------
# One row per model (labelled with the model's class name) and one column per
# metric, pre-filled with zeros and overwritten row by row in the loop below.
results = pd.DataFrame(
    np.zeros((len(models), len(metrics))),
    index=[m.__class__.__name__ for m in models],
    columns=metrics,
)

# Evaluate every model on the same data and store its scores in its own row.
# NOTE(review): assumes `cross_validate_sample` returns one value per entry of
# `metrics`, in the same order — confirm against its definition.
for ii, model in enumerate(models):
    results.iloc[ii, :] = cross_validate_sample(model, data, labels, metrics)

[32m2024-09-03 09:59:23.937[0m | [1mINFO    [0m | [36mtabularbench.core.trainer_finetune[0m:[36mtrain[0m:[36m83[0m - [1mEpoch 000 | Train loss: -.---- | Train score: -.---- | Val loss: 0.1709 | Val score: 0.9411[0m
[32m2024-09-03 10:00:02.452[0m | [1mINFO    [0m | [36mtabularbench.core.trainer_finetune[0m:[36mtrain[0m:[36m94[0m - [1mEpoch 001 | Train loss: 0.2245 | Train score: 0.9238 | Val loss: 0.1706 | Val score: 0.9424[0m
[32m2024-09-03 10:00:42.244[0m | [1mINFO    [0m | [36mtabularbench.core.trainer_finetune[0m:[36mtrain[0m:[36m94[0m - [1mEpoch 002 | Train loss: 0.1528 | Train score: 0.9512 | Val loss: 0.1643 | Val score: 0.9438[0m
[32m2024-09-03 10:01:21.822[0m | [1mINFO    [0m | [36mtabularbench.core.trainer_finetune[0m:[36mtrain[0m:[36m94[0m - [1mEpoch 003 | Train loss: 0.1690 | Train score: 0.9492 | Val loss: 0.1643 | Val score: 0.9438[0m
[32m2024-09-03 10:02:06.360[0m | [1mINFO    [0m | [36mtabularbench.core.trainer_finetun

In [4]:
# Print the results table: one row per model, one column per metric.
print(results)

                        accuracy  precision    recall   roc_auc
XGBClassifier           0.931769   0.405109  0.247091  0.611600
XGBoostOptim            0.946514   0.715359  0.203088  0.598897
LogisticRegression      0.937353   0.406318  0.061643  0.527895
TabPFNClassifier        0.939709   0.794872  0.021398  0.510328
TabForestPFNClassifier  0.943897   0.639189  0.185478  0.589303


In [5]:
# Replace `data` and `labels` with the output of the project's `oversample`
# helper (presumably class rebalancing — confirm in data_prep_utils).
# NOTE(review): this rebinds the variables produced by the loading cell, so
# running this cell more than once feeds already-oversampled data back into
# `oversample`. Restart and run all cells from the top for a clean state.
# NOTE(review): if cross-validation is run on this output, resampling before
# the train/validation split may put copies of the same sample on both sides —
# confirm where the split happens.
data, labels = oversample(data, labels)