In [2]:
import pandas as pd
import numpy as np
import sys
import pickle
import warnings
warnings.filterwarnings("ignore")

sys.path.insert(0, '..')
from DataModule.Data_Preparation import CoronnaCERTAINDataset
import EvaluationModule
from ModelModule import models

import xgboost as xgb
import sklearn
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.pipeline import Pipeline

In [2]:
dataset = CoronnaCERTAINDataset(
    library_root = '/Users/gaskell/Dropbox/Mac/Desktop/Autoimmune_Disease/Code/ML_RA_EHR/Dataset/',
    challenge = 'regression_delta_binary', #option: regression, regression_delta, classification, binary_classification, regression_delta_binary
    dataset = 'CORRONA CERTAIN', 
    process_approach = 'SC', #option: KVB, SC
    imputation = 'IterativeImputer', #option: SimpleFill, KNN, SoftImpute, BiScaler, NuclearNormMinimization, IterativeImputer, IterativeSVD, None(raw)
    patient_group = ['bionaive TNF'], #option: "all", "bioexp nTNF", "bionaive TNF", "bionaive orencia", "KVB"
    drug_group = 'all', #option: "all", "actemra", "cimzia", "enbrel", "humira", "orencia", "remicade", "rituxan", "simponi"
    time_points = (0,3), 
    train_test_rate = 0.8,
    remove_low_DAS = True,
    save_csv = False, 
    random_state = 2022,
    verbose=False)

# read train, test from dataloader
train_set, train_loc = dataset.get_train()
test_set, test_loc = dataset.get_test()

In [3]:
# model_builder = models.make_models("SC_Jul24_test",['Linear','Ridge','Lasso','SVM','KNN','XGBoost','Random Forest'],[],'Regression')

In [4]:
# for key in model_builder.model_dict:
#     print(model_builder.model_dict[key])

In [5]:
aml = EvaluationModule.AutoBuild(seed=dataset.random_state, project_name="SC_Jul24_test", challenge=dataset.challenge, balance_class=2)
# define models
if "regression" in dataset.challenge:
#     for key in model_builder.model_dict:
#         model = model_builder.model_dict[key]
#         aml.validate(key, model, train_set, test_set)
    model = FineTuneModule.fine_tune(train=train_set, model="drf_regression", search_methods="RandomSearch")
elif "classification" in dataset.challenge:
    model = FineTuneModule.fine_tune(train=train_set, model="drf_classification", search_methods="RandomSearch")
    
aml.validate('rf', model, train_set, test_set)
# aml.validation_output(dataset)
# aml.test_output(dataset)

param_search_grid: {'max_samples': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0, None], 'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}
best_params: {'n_estimators': 1100, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_samples': None, 'max_features': 'log2', 'max_depth': 40, 'bootstrap': False}
before sampling: Responder       208
Nonresponder     65
dtype: int64
after sampling: Nonresponder    208
Responder       198
dtype: int64
before sampling: Responder       208
Nonresponder     65
dtype: int64
after sampling: Nonresponder    208
Responder       203
dtype: int64
before sampling: Responder       208
Nonresponder     65
dtype: int64
after sampling: Nonresponder    208
Responder       203
dtype: int64
before samplin

In [6]:
aml.train_perf

Unnamed: 0,model,MAE,MSE,RMSE,R2,Pearson_Correlation,Accuracy,F1-Score
0,rf,0.084686,0.018719,0.136816,0.979473,0.995321,1.0,1.0
1,rf,0.08742,0.019906,0.141087,0.978055,0.99486,1.0,1.0
2,rf,0.084854,0.018899,0.137474,0.979299,0.995024,1.0,1.0
3,rf,0.086161,0.019911,0.141108,0.976139,0.994365,1.0,1.0
4,rf,0.083758,0.018707,0.136773,0.978545,0.995132,1.0,1.0
5,rf,0.083384,0.018025,0.134257,0.979231,0.995335,1.0,1.0
6,rf,0.083345,0.01956,0.139855,0.975597,0.994196,1.0,1.0
7,rf,0.084389,0.018556,0.136221,0.979482,0.994915,1.0,1.0
8,rf,0.087105,0.018985,0.137788,0.980388,0.99508,1.0,1.0
9,rf,0.086269,0.019062,0.138065,0.97955,0.995139,1.0,1.0


In [7]:
aml.val_perf

Unnamed: 0,model,MAE,MSE,RMSE,R2,Pearson_Correlation,Accuracy,F1-Score
0,rf,3.348017,12.649644,3.556634,-7.243906,0.311026,0.741935,0.425926
1,rf,3.470257,13.222314,3.63625,-9.443264,0.326711,0.741935,0.425926
2,rf,3.448496,13.310552,3.648363,-10.19232,0.149196,0.741935,0.425926
3,rf,3.314047,12.410373,3.522836,-7.177222,0.359093,0.774194,0.436364
4,rf,3.469023,12.978661,3.602591,-11.220044,0.385845,0.766667,0.433962
5,rf,3.451814,13.564143,3.682953,-8.93927,0.110938,0.766667,0.433962
6,rf,3.326988,13.063471,3.614342,-3.275856,0.595319,0.766667,0.433962
7,rf,3.231633,11.831097,3.439636,-5.185799,0.524591,0.766667,0.433962
8,rf,3.400032,12.657605,3.557753,-8.722722,0.396396,0.766667,0.433962
9,rf,3.461685,13.245359,3.639417,-9.30633,0.269751,0.766667,0.433962


In [8]:
aml.test_perf

Unnamed: 0,model,MAE,MSE,RMSE,R2,Pearson_Correlation,Accuracy,F1-Score
0,rf,0.983521,1.400372,1.183373,-0.005724,0.240549,0.705882,0.413793


In [9]:
aml.confusion_matrix("XGBoost")

KeyError: 'XGBoost'

In [None]:
aml.confusion_matrix("Linear")