## With H2O:

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
import h2o
from h2o.automl import H2OAutoML

In [5]:
# Initialise the H2O connection; every H2OFrame / H2OAutoML call below needs this to have run first.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,12 mins 15 secs
H2O_cluster_timezone:,Asia/Jerusalem
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.1.3
H2O_cluster_version_age:,2 months and 2 days
H2O_cluster_name:,H2O_from_python_liorn_2hplcc
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.919 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


# Classification

Pre-Processing included:

On CAN'd data:
* White spaces to nan

* `abn_pap` column - 99 to mode

* imputing missing values to modes

* new features - `weight_diff`, `num_of_prior_diseases`, `body_BMI`

* removed columns containing only zeros

On combined dataset:

* Removed sessions with duration of less than 5 minutes

* Dropped columns - `body_meas_height`, `body_meas_bmi`, `body_meas_prior_pregnancy_weight`, `mon_sess_time_zone_diff`, `mon_sess_date`, `mon_sess_start_time`, `on_sess_duration`, `dem_det_age`, `subject_id`, `session_id`, `current_weight_kg`, `weight_before_kg`, `current_height_cm`, `body_meas_current_weight`, `MasterID`.

* Dropped features with a high portion of missing values

* Configured LTI_STI features

* Filled missing values with means

* One-Hot encoded demographic features

* Dropped features with low cardinality

#### Importing data:

In [6]:
# Load the pre-processed baseline features and the target table from the working directory.
# NOTE(review): later cells join the two frames column-wise, so they are presumably
# row-aligned - confirm against the preprocessing notebook.
df_feats, df_targets = (
    pd.read_csv(csv_name)
    for csv_name in ('df_feats_baselines.csv', 'df_targets_baselines.csv')
)

#### Training (5-Fold CV)

In [None]:
# Name of the binary classification target.
y = "cesdclinmA"

# Feature matrix without the subject identifier, joined column-wise with the target column.
X = df_feats.drop(columns=['study_id'])
df_auto = pd.concat([X, df_targets[y]], axis=1)

# Convert to an H2O frame and cast the target to a factor so it is handled as a class label.
df_h2o = h2o.H2OFrame(df_auto)
df_h2o[y] = df_h2o[y].asfactor()

# AutoML run with 5-fold CV: at most 10 models, leaderboard ranked by AUC.
# CV predictions are kept on the trained models.
aml_proc_clf = H2OAutoML(
    max_models=10,
    seed=1,
    nfolds=5,
    sort_metric='auc',
    keep_cross_validation_predictions=True,
)
aml_proc_clf.train(y=y, training_frame=df_h2o)

aml_proc_clf.leaderboard

Parse progress: |█████████████████████████████████████████████████████████| 100%
AutoML progress: |█

#### Top Model

In [None]:
# Leader of the 5-fold AutoML run; print its id, then display its summary as the cell output.
top_model_proc = aml_proc_clf.leader
print(top_model_proc.model_id)
top_model_proc.summary()

#### Top Model CV Results:

In [None]:
# Cross-validation metrics summary of the 5-fold leader, converted to a pandas DataFrame.
# The frame is used again further down, in the cell that combines the 5- and 10-fold results.
top_model_proc_cv_metrics = top_model_proc.cross_validation_metrics_summary().as_data_frame()
print(top_model_proc.model_id)
top_model_proc_cv_metrics

#### Top models vs Top features heatmap

In [None]:
# Build a (model x feature) table holding each model's variable-importance share
# ("percentage" field of varimp()) for every non-stacked model on the 5-fold leaderboard.

# leaderboard to dataframe
leaderboard = aml_proc_clf.leaderboard.as_data_frame()

model_ids = []
importance_rows = []

for model_id in leaderboard['model_id']:

    # Stacked ensembles are left out of the comparison.
    if model_id.startswith('Stacked'):
        continue

    model = h2o.get_model(model_id)
    varimp = model.varimp()

    # A model without variable importances yields nothing to tabulate; skip it
    # instead of failing while iterating over an empty/None result.
    if not varimp:
        continue

    model_ids.append(model_id)
    # Key every value by its feature name so rows are aligned by name, not by position.
    importance_rows.append({col: percent for col, rel_imp, scal_imp, percent in varimp})

# One row per model; pandas lines the dict records up on feature name.
df_imp = pd.DataFrame(importance_rows, index=pd.Index(model_ids, name='Model'))

# Alphabetical feature order; a feature a model did not report counts as zero importance.
column_names = sorted(df_imp.columns)
df_imp = df_imp.reindex(columns=column_names).fillna(0.0)

# normalizing coefficients so that each model's row sums to 1
df_imp = df_imp.abs()
df_imp = df_imp.div(df_imp.sum(axis=1), axis=0)

In [None]:
fig, ax = plt.subplots(figsize=(12,10))

# Filtering values: keep only models (rows) and features (columns) whose largest share exceeds 5%.
top_coefs = df_imp.loc[df_imp.max(axis=1) > 0.05, df_imp.max(axis=0) > 0.05]

# Draw on the axes created above and ask seaborn to label every column, so the
# tick labels always correspond to the plotted features.
sns.heatmap(top_coefs, robust=True, annot=True, cmap="Blues", ax=ax, xticklabels=True)

# Rotate the labels seaborn placed instead of overwriting them with a separate list,
# which could disagree with the ticks actually drawn.
plt.setp(ax.get_xticklabels(), rotation=45)
plt.show()

#### Feature Importances of top 3 models

In [None]:
# Variable-importance plot for each of the first three leaderboard rows.
for rank in range(3):
    m_id = leaderboard['model_id'].iloc[rank]
    print(m_id)
    model = h2o.get_model(m_id)
    model.varimp_plot()

In [None]:
# Cross-validation metrics summary for each of the first three leaderboard rows.
for rank in range(3):
    m_id = leaderboard['model_id'].iloc[rank]
    print(m_id)
    model = h2o.get_model(m_id)
    cv_summary = model.cross_validation_metrics_summary()
    print(cv_summary)

#### Saving coefficients of top model to csv

In [None]:
# Export the 5-fold leader's variable importances (one row per feature) to CSV.
varimp_columns = ['variable', 'relative_importance', 'scaled_importance', 'percentage']

top_clf_coefs = top_model_proc.varimp()
top_clf_coefs_df = pd.DataFrame(top_clf_coefs, columns=varimp_columns)

top_clf_coefs_df.to_csv('Top_AutoML_Classifier_Coefficients_5_fold(Processed_Data - baseline_data).csv')

#### Training with 10-Fold CV:

In [None]:
# Name of the binary classification target.
y = "cesdclinmA"

# Feature matrix without the subject identifier, joined column-wise with the target column.
X = df_feats.drop(columns=['study_id'])
df_auto = pd.concat([X, df_targets[y]], axis=1)

# Convert to an H2O frame and cast the target to a factor so it is handled as a class label.
df_h2o = h2o.H2OFrame(df_auto)
df_h2o[y] = df_h2o[y].asfactor()

# AutoML run with 10-fold CV: at most 10 models, ranked by AUC, with the
# DeepLearning algorithm excluded from this run.
aml_proc_clf_ten = H2OAutoML(
    max_models=10,
    seed=1,
    nfolds=10,
    sort_metric='auc',
    exclude_algos=["DeepLearning"],
    keep_cross_validation_predictions=True,
)
aml_proc_clf_ten.train(y=y, training_frame=df_h2o)

aml_proc_clf_ten.leaderboard

#### Top Model

In [None]:
# Leader of the 10-fold AutoML run; print its id, then display its summary as the cell output.
top_model_proc_ten = aml_proc_clf_ten.leader
print(top_model_proc_ten.model_id)
top_model_proc_ten.summary()

#### Top Model CV Results:

In [None]:
# Cross-validation metrics summary of the 10-fold leader, converted to a pandas DataFrame.
# The frame is used again further down, in the cell that combines the 5- and 10-fold results.
top_model_proc_ten_cv_metrics = top_model_proc_ten.cross_validation_metrics_summary().as_data_frame()
print(top_model_proc_ten.model_id)
top_model_proc_ten_cv_metrics

#### Top models vs Top features heatmap

In [None]:
# Build a (model x feature) table holding each model's variable-importance share
# ("percentage" field of varimp()) for every non-stacked model on the 10-fold leaderboard.

# leaderboard to dataframe
leaderboard = aml_proc_clf_ten.leaderboard.as_data_frame()

model_ids = []
importance_rows = []

for model_id in leaderboard['model_id']:

    # Stacked ensembles are left out of the comparison.
    if model_id.startswith('Stacked'):
        continue

    model = h2o.get_model(model_id)
    varimp = model.varimp()

    # A model without variable importances yields nothing to tabulate; skip it
    # instead of failing while iterating over an empty/None result.
    if not varimp:
        continue

    model_ids.append(model_id)
    # Key every value by its feature name so rows are aligned by name, not by position.
    importance_rows.append({col: percent for col, rel_imp, scal_imp, percent in varimp})

# One row per model; pandas lines the dict records up on feature name.
df_imp = pd.DataFrame(importance_rows, index=pd.Index(model_ids, name='Model'))

# Alphabetical feature order; a feature a model did not report counts as zero importance.
column_names = sorted(df_imp.columns)
df_imp = df_imp.reindex(columns=column_names).fillna(0.0)

In [None]:
fig, ax = plt.subplots(figsize=(12,10))

# Keep only models (rows) and features (columns) whose largest importance share exceeds 5%.
keep_models = df_imp.max(axis=1) > 0.05
keep_features = df_imp.max(axis=0) > 0.05
top_coefs = df_imp.loc[keep_models, keep_features]

# Draw explicitly on the axes created above (the current axes at this point).
sns.heatmap(top_coefs, robust=True, annot=True, cmap="Blues", ax=ax)
plt.show()

#### Feature Importances of top 3 models

In [None]:
# Variable-importance plot for each of the first three leaderboard rows.
for rank in range(3):
    m_id = leaderboard['model_id'].iloc[rank]
    print(m_id)
    model = h2o.get_model(m_id)
    model.varimp_plot()

In [None]:
# Cross-validation metrics summary for each of the first three leaderboard rows.
for rank in range(3):
    m_id = leaderboard['model_id'].iloc[rank]
    print(m_id)
    model = h2o.get_model(m_id)
    cv_summary = model.cross_validation_metrics_summary()
    print(cv_summary)

#### Saving coefficients of top model to csv

In [None]:
# Export the 10-fold leader's variable importances (one row per feature) to CSV.
varimp_columns = ['variable', 'relative_importance', 'scaled_importance', 'percentage']

top_clf_coefs = top_model_proc_ten.varimp()
top_clf_coefs_df = pd.DataFrame(top_clf_coefs, columns=varimp_columns)

top_clf_coefs_df.to_csv('Top_AutoML_Classifier_Coefficients_10_fold(Processed_Data - baseline_data).csv')

#### Combining results of both K-Fold (5 and 10) CV to one dataframe:

In [None]:
def _tag_cv_metrics(cv_metrics, k_fold, model_id):
    """Return a copy of a CV-metrics table with 'K-Fold' and 'model' columns at positions 1 and 2.

    Working on a copy leaves the source frame untouched, so this cell can be
    re-run without pandas raising because the columns already exist.
    """
    tagged = cv_metrics.copy()
    tagged.insert(1, 'K-Fold', k_fold)
    tagged.insert(2, 'model', model_id)
    return tagged


# Stack the 5-fold and 10-fold leader results; the 5-fold rows have no values
# for the cv_6..cv_10 columns, so those are left missing after the concat.
combined = pd.concat(
    [
        _tag_cv_metrics(top_model_proc_cv_metrics, 5, top_model_proc.model_id),
        _tag_cv_metrics(top_model_proc_ten_cv_metrics, 10, top_model_proc_ten.model_id),
    ],
    axis=0,
)
combined.columns = ["metric", "K-Fold", 'model', "mean", "sd", "cv_1_valid", "cv_2_valid", "cv_3_valid", "cv_4_valid", "cv_5_valid",
                   "cv_6_valid", "cv_7_valid", "cv_8_valid", "cv_9_valid", "cv_10_valid"]
combined = combined.set_index(["metric", "model"]).sort_values(by='metric')
combined

In [None]:
# Persist the combined 5-/10-fold CV results table to CSV.
combined.to_csv('Top_AutoML_Classifiers_CV_Results_combined(Processed_baseline_data).csv')

#### Testing top model hyper-parameters (of 5-Fold CV auto-ML) with Sklearn:

In [None]:
# Re-fit a scikit-learn gradient-boosting classifier on the features the 5-fold
# AutoML leader actually used, and report stratified 5-fold accuracy.
tmp_feat_arr = top_model_proc.varimp()

# Keep only features with a non-zero relative importance (2nd field of each varimp entry).
feat_lst = [coef[0] for coef in tmp_feat_arr if coef[1] != 0]

X = df_feats.drop(columns=['study_id'])
y = df_targets["cesdclinmA"]

X = X[feat_lst]

# NOTE(review): the scaler is fitted on all rows before the CV split, so validation
# rows influence the scaling statistics - confirm this is acceptable for this check.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)

clf_test = GradientBoostingClassifier(n_estimators=63, max_depth=3, min_samples_leaf=2, max_leaf_nodes=4, random_state=0)

skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)

# Running sum of the validation accuracies across folds.
avg = 0

# cross-validating
for ind, (train_index, val_index) in enumerate(skf.split(X, y)):

    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    clf_test.fit(X_train, y_train)

    train_acc = clf_test.score(X_train, y_train)
    acc = clf_test.score(X_val, y_val)

    avg += acc

    print(f'{ind+1}-fold:  test accuracy = {acc}, train accuracy = {train_acc}')

# Average over the number of folds actually run (5 here), not a hard-coded 10.
print(f' CV accuracy = {round(avg/skf.get_n_splits(), 4)}')

#### Feature Importance

In [None]:
# Rank the features of the last-fitted classifier by importance divided by the
# feature's standard error, and plot the top 15.
coef_df = pd.DataFrame(clf_test.feature_importances_, index = feat_lst, columns = ['Coefficient']).sort_values(by='Coefficient')

# Standard error of the mean per feature, kept as a Series indexed by feature name.
# coef_df is already sorted by coefficient, so a plain list in feat_lst order would be
# attached to the wrong rows; a Series is aligned on the feature-name index instead.
err = X[feat_lst].std() / np.sqrt(len(X))

coef_df['err'] = err

coef_df['t'] = coef_df['Coefficient'] / coef_df['err']

top_coef = coef_df.sort_values(by='t', ascending=False)[0:15]

plt.figure(figsize=(10,10))
plt.barh(top_coef.index, top_coef['t'])
# Highest score at the top of the chart.
plt.gca().invert_yaxis()
plt.title("Feature Importance - Gradient Boosting Classifier")
plt.xlabel('Importance Score');

#### Testing top model hyper-parameters (of 10-Fold CV auto-ML) with Sklearn:

In [None]:
# Re-fit a scikit-learn gradient-boosting classifier on the features the 10-fold
# AutoML leader actually used, and report stratified 10-fold accuracy.
tmp_feat_arr = top_model_proc_ten.varimp()

# Keep only features with a non-zero relative importance (2nd field of each varimp entry).
feat_lst = [coef[0] for coef in tmp_feat_arr if coef[1] != 0]

X = df_feats.drop(columns=['study_id'])
y = df_targets["cesdclinmA"]

X = X[feat_lst]

# NOTE(review): the scaler is fitted on all rows before the CV split, so validation
# rows influence the scaling statistics - confirm this is acceptable for this check.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns = X.columns)

clf_test = GradientBoostingClassifier(n_estimators=87, max_depth=3, min_samples_leaf=2, max_leaf_nodes=4, random_state=0)

skf = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)

# Running sum of the validation accuracies across folds.
avg = 0

# cross-validating
for ind, (train_index, val_index) in enumerate(skf.split(X, y)):

    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    clf_test.fit(X_train, y_train)

    train_acc = clf_test.score(X_train, y_train)
    acc = clf_test.score(X_val, y_val)

    avg += acc

    print(f'{ind+1}-fold:  test accuracy = {acc}, train accuracy = {train_acc}')

# Divide by the splitter's own fold count so the mean stays right if n_splits changes.
print(f' CV accuracy = {round(avg/skf.get_n_splits(), 4)}')

#### Feature Importance

In [None]:
# Rank the features of the last-fitted classifier by importance divided by the
# feature's standard error, and plot the top 15.
coef_df = pd.DataFrame(clf_test.feature_importances_, index = feat_lst, columns = ['Coefficient']).sort_values(by='Coefficient')

# Standard error of the mean per feature, kept as a Series indexed by feature name.
# coef_df is already sorted by coefficient, so a plain list in feat_lst order would be
# attached to the wrong rows; a Series is aligned on the feature-name index instead.
err = X[feat_lst].std() / np.sqrt(len(X))

coef_df['err'] = err

coef_df['t'] = coef_df['Coefficient'] / coef_df['err']

top_coef = coef_df.sort_values(by='t', ascending=False)[0:15]

plt.figure(figsize=(10,10))
plt.barh(top_coef.index, top_coef['t'])
# Highest score at the top of the chart.
plt.gca().invert_yaxis()
plt.title("Feature Importance - Gradient Boosting Classifier")
plt.xlabel('Importance Score');

## Regression

### On processed data:

In [None]:
# Name of the regression target; it is left numeric (no factor cast).
y = "psatotmA"

# Feature matrix without the subject identifier, joined column-wise with the target column.
X = df_feats.drop(columns=['study_id'])
df_auto = pd.concat([X, df_targets[y]], axis=1)

# converting to H2O data object
df_h2o_reg = h2o.H2OFrame(df_auto)

# AutoML run with 5-fold CV: at most 10 models, leaderboard ranked by RMSE.
aml_proc_reg = H2OAutoML(
    max_models=10,
    seed=1,
    nfolds=5,
    sort_metric='rmse',
    keep_cross_validation_predictions=True,
)
aml_proc_reg.train(y=y, training_frame=df_h2o_reg)

aml_proc_reg.leaderboard.head()

#### Top Model:

In [None]:
# Leader of the regression AutoML run; its summary is displayed as the cell output.
top_model_proc_reg = aml_proc_reg.leader
top_model_proc_reg.summary()

#### Top Model CV Results (+Saving to csv):

In [None]:
# Cross-validation metrics summary of the regression leader as a pandas DataFrame,
# written to CSV and then displayed as the cell output.
top_model_cv_metrics = top_model_proc_reg.cross_validation_metrics_summary().as_data_frame()
top_model_cv_metrics.to_csv('Top_AutoML_Regressor_CV_Results - baseline_data.csv')
top_model_cv_metrics

#### Top Model - Feature Importance

In [None]:
# Variable-importance plot of the regression leader.
top_model_proc_reg.varimp_plot()

#### And saving coefficients to csv

In [None]:
# Export the regression leader's variable importances (one row per feature) to CSV.
# The model defined in this notebook is `top_model_proc_reg`; the name used here
# before (`top_model_orig_reg`) is not defined anywhere in it.
top_reg_coefs = top_model_proc_reg.varimp()
top_reg_coefs_df = pd.DataFrame(top_reg_coefs, columns = ['variable', 'relative_importance', 'scaled_importance', 'percentage'])
top_reg_coefs_df.to_csv('Top_AutoML_Regressor_Coefficients - baseline_data.csv')