In [1]:
from xgboost import XGBClassifier
import warnings
from tabpfn import TabPFNClassifier
import numpy as np
import pandas as pd
import os
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from data_prep_utils import *
from evaluate import *
from load_models import *
import matplotlib.pyplot as plt
import torch
import openml
import time

In [2]:
# Load the raw table, filter it with top_non_zero, then pass features and
# labels through unison_shuffled_copies together (presumably one shared
# permutation for both -- confirm in data_prep_utils).
path = "datasets/data_all.csv"
data, labels = get_microbiome(path)
data, labels = unison_shuffled_copies(top_non_zero(data), labels)

In [3]:
# Evaluation settings; all of these are forwarded to the model constructors
# or to cross_validate_sample below.
cv = 3
strat_split = True
n_optim = 1000   # forwarded to XGBoostOptim(n_optim=...)
ft_epochs = 10   # forwarded to TabForestPFNClassifier(max_epochs=...)
sampling = None
metrics = ["accuracy", "precision", "recall", "roc_auc"]

# Candidate models, evaluated in this order.
models = [
    XGBClassifier(n_estimators=5, max_depth=5, learning_rate=1, objective='binary:logistic'),
    XGBoostOptim(n_optim=n_optim),
    LogisticRegression(max_iter=500),
    TabPFNClassifier(device='cpu', N_ensemble_configurations=3),
    TabForestPFNClassifier(
        "saved_models/tabforest/mix600k/tabforestpfn.pt",
        "saved_models/tabforest/mix600k/config_run.yaml",
        max_epochs=ft_epochs,
    ),
]

# One row per model (labelled by class name), one column per metric plus
# a trailing "runtime" column; pre-filled with zeros and overwritten below.
results = pd.DataFrame(
    np.zeros((len(models), len(metrics) + 1)),
    index=[m.__class__.__name__ for m in models],
    columns=metrics + ["runtime"],
)

# Rows are filled positionally, so the order of `models` defines the row order.
for ii, model in enumerate(models):
    results.iloc[ii, :] = cross_validate_sample(model, data, labels, metrics, strat_split, cv, sampling)

In [4]:
# Show the metric/runtime table: one row per entry of `models`, labelled by class name.
print(results)

               accuracy  precision    recall   roc_auc     runtime
XGBClassifier  0.930785   0.379902  0.213162  0.595287    0.167719
XGBoostOptim   0.947630   0.742648  0.216023  0.605595  601.872008


In [5]:
def stratified_split(data, labels, cv=3):
    """Split a binary-labelled dataset into ``cv`` equally sized, class-balanced folds.

    Rows labelled 0 and rows labelled 1 are shuffled independently, and every
    fold receives the same number of rows from each class, chosen so that the
    class ratio of a fold approximates the ratio of the whole dataset.  Fold
    sizes are rounded down, so a few rows may not end up in any fold.

    Parameters
    ----------
    data : array
        2-D feature array with one row per sample (indexed as ``data[mask]``
        and ``rows[a:b, :]``).
    labels : array
        1-D array of class labels aligned with ``data``.  Only rows whose
        label equals 0 or 1 are used.
    cv : int, default 3
        Number of folds to build.

    Returns
    -------
    data_folds : list of arrays
        ``cv`` feature arrays, one per fold.
    labels_folds : list of arrays
        ``cv`` float label arrays (0.0 / 1.0) aligned with ``data_folds``.

    Notes
    -----
    Class counts are taken from the rows actually selected for each class, so
    labels containing a single class (or values other than 0/1) yield
    consistent folds rather than an ``IndexError`` or mismatched fold lengths.
    Uses the global NumPy random state; seed it beforehand for reproducible
    folds.
    """
    size = labels.shape[0]
    fold_size = size // cv

    # For NumPy arrays, boolean-mask indexing returns a copy, so the in-place
    # shuffles below do not reorder the caller's ``data``.
    c0_data = data[labels == 0]
    c1_data = data[labels == 1]

    # Per-fold quota of each class: floor(fold_size * class_count / size).
    # The quota never exceeds class_count / cv, so the slices in the loop
    # below always stay inside the class subsets.
    n_c0, n_c1 = c0_data.shape[0], c1_data.shape[0]
    c0_size = fold_size * n_c0 // size if size else 0
    c1_size = fold_size * n_c1 // size if size else 0

    # Shuffle class 0 first, then class 1; empty subsets are skipped because
    # there is nothing to shuffle.
    for class_rows in (c0_data, c1_data):
        if class_rows.shape[0]:
            np.random.shuffle(class_rows)

    data_folds, labels_folds = [], []
    for f in range(cv):
        # Consecutive, non-overlapping slices: no row appears in two folds.
        data_single_fold = np.concatenate((
            c0_data[c0_size * f:c0_size * (f + 1), :],
            c1_data[c1_size * f:c1_size * (f + 1), :],
        ))
        labels_single_fold = np.concatenate((np.zeros(c0_size), np.ones(c1_size)))
        # Mix the two classes inside the fold (project helper; presumably
        # applies one shared permutation to rows and labels).
        data_single_fold, labels_single_fold = unison_shuffled_copies(data_single_fold, labels_single_fold)
        data_folds.append(data_single_fold)
        labels_folds.append(labels_single_fold)

    return data_folds, labels_folds
# Sanity check for stratified_split: plant an unmistakable outlier in one row
# and confirm it shows up in at most one fold. The outlier goes into a copy so
# the shared `data` array stays intact for every other cell (and for re-runs).
data_probe = data.copy()
data_probe[100] = 1e10
data_folds, labels_folds = stratified_split(data_probe, labels)

# Fraction of the first (lowest) label value in each fold; should match across folds.
for fold in labels_folds:
    counts = np.unique(fold, return_counts=True)
    print(counts[1][0]/np.sum(counts[1]))
# Largest value per fold; the planted 1e10 should appear in no more than one.
for fold in data_folds:
    print(np.max(fold))

0.9389892642052894
0.9389892642052894
0.9389892642052894
152.9787
156.81378
10000000000.0
