In [None]:
import h2o
from h2o.automl import H2OAutoML
from h2o.estimators.infogram import H2OInfogram
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators import H2ORandomForestEstimator
# from imblearn.over_sampling import SMOTE
# from imblearn.under_sampling import RandomUnderSampler
import pandas as pd
import numpy as np
import sys
# Put the parent of the current working directory at the front of the module
# search path so the project-local imports below can be resolved.
# NOTE(review): presumably DataModule/ and EvaluationModule live one level up
# from this notebook -- this only works when the kernel's cwd is the notebook's
# own folder; confirm.
sys.path.insert(0, '..')
from DataModule.Data_Preparation import CoronnaCERTAINDataset
import EvaluationModule
# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole
# session. This only hides the warning; it does not change what the assignment does.
pd.options.mode.chained_assignment = None

In [None]:
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [None]:
# Configure the dataset wrapper that produces the train/test splits used below.
# NOTE(review): library_root is an absolute path on one developer's machine;
# the notebook will not run elsewhere until it is pointed at a local copy.
dataset = CoronnaCERTAINDataset(
    library_root='/Users/gaskell/Dropbox/Mac/Desktop/Autoimmune_Disease/Code/ML_RA_EHR/Dataset/',
    # challenge options: regression, regression_delta, classification, binary_classification
    challenge="binary_classification",
    dataset='CORRONA CERTAIN',
    # process_approach options: KVB, SC
    process_approach='SC',
    # imputation options: SimpleFill, KNN, SoftImpute, BiScaler,
    # NuclearNormMinimization, IterativeImputer, IterativeSVD, None(raw)
    imputation="IterativeImputer",
    # patient_group options: "all", "bioexp nTNF", "bionaive TNF", "bionaive orencia", "KVB"
    patient_group='bionaive TNF',
    # drug_group options: "all", "actemra", "cimzia", "enbrel", "humira",
    # "orencia", "remicade", "rituxan", "simponi"
    drug_group='all',
    time_points=(0, 3),
    train_test_rate=0.8,
    remove_low_DAS=True,
    save_csv=False,
    balance_class=True,
    random_state=2022,
)

In [None]:
# Fetch the two splits from the dataset wrapper. Each getter returns a pair:
# the split itself (displayed in the next cell) and a location that is later
# converted with str() and handed to h2o.upload_file.
# NOTE(review): presumably *_set is a pandas DataFrame and *_loc a path to the
# same data on disk -- confirm against CoronnaCERTAINDataset.
train_set, train_loc = dataset.get_train()
test_set, test_loc = dataset.get_test()

In [None]:
# Display the training split returned by dataset.get_train() as a quick visual check.
train_set

In [None]:
# Connect to a local H2O cluster, starting one if none is running.
h2o.init()

# Push the train/test files written by the dataset wrapper into the cluster.
train_h2o = h2o.upload_file(str(train_loc))
test_h2o = h2o.upload_file(str(test_loc))

# Target column and feature columns.
# (Alternative target kept from the original notebook: y = "DAS28_CRP_3M")
response = "DrugResponse_binary"
# Every column except the last one is treated as a predictor.
# NOTE(review): this assumes the response is the final column of the uploaded file -- confirm.
predictors = train_h2o.columns[:-1]

# Mark the categorical features and the response as factors (enum columns)
# in both frames, so that H2O fits a classifier rather than a regressor.
factor_columns = list(dataset.categorical) + [response]
for frame in (train_h2o, test_h2o):
    for column in factor_columns:
        frame[column] = frame[column].asfactor()

# Hold out 20% of the training frame; it is used below as the
# validation/calibration frame.
train, valid = train_h2o.split_frame(ratios=[.8], seed=1)

In [None]:
# Configure the gradient-boosting classifier (training happens in the next cell).
#
# Fixes relative to the previous version of this cell:
#   * added the missing comma after the model_id argument (was a SyntaxError);
#   * removed binomial_double_trees=True -- that option belongs to
#     H2ORandomForestEstimator (DRF) and is not accepted by
#     H2OGradientBoostingEstimator.
RAdrugs_gbm = H2OGradientBoostingEstimator(
    model_id="GBM_grid_1_AutoML_1_20220711_223756_model_8",
    nfolds=10,           # 10-fold cross-validation on the training frame
    ntrees=50,
    max_depth=20,
    min_rows=10,
#     balance_classes = True,
#     max_after_balance_size = 1.0,
    # Calibrate the predicted class probabilities using the held-out `valid`
    # frame created by split_frame in the previous cell.
    calibrate_model=True,
    calibration_frame=valid,
    histogram_type="Random")

In [None]:
# Fit the GBM on the 80% training split and score it on the held-out split.
# The model object defined in this notebook is RAdrugs_gbm; the former
# reference to RAdrugs_drf pointed at a name that is never defined and raised
# NameError on a fresh kernel.
RAdrugs_gbm.train(x=predictors,
                  y=response,
                  training_frame=train,
                  validation_frame=valid)

In [None]:
# Eval performance of the trained GBM.
# Called without arguments, model_performance() reports the metrics stored on
# the model rather than scoring a new frame; pass a frame (e.g. test_h2o) to
# evaluate on other data.
# (Was RAdrugs_drf, a name that is never defined in this notebook.)
perf = RAdrugs_gbm.model_performance()

In [None]:
# Generate predictions for the held-out validation split with the trained GBM.
# (Was RAdrugs_drf, a name that is never defined in this notebook.)
pred = RAdrugs_gbm.predict(valid)

In [None]:
# Display the prediction frame returned by predict(valid) in the previous cell.
pred

In [None]:
# Show the first 100 rows of the prediction frame.
pred.head(rows=100)

In [None]:
# Show the first 100 true labels of the validation split, to compare by eye
# with the first 100 predictions.
valid[response].head(rows=100)