In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit,GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score
from tqdm import tqdm


In [6]:
def get_target_data(data_dir='../../data/raw/curated_metagenomics', study_name='YachidaS_2019'):
    """Load the curated metagenomics tables and return one study's stool samples.

    Reads ``relative_abundance.csv`` (transposed after loading, so its index can
    be matched against the metadata ``sample_id`` index) and ``metadata.csv``
    from ``data_dir``, then filters the metadata as follows:

    * keep only rows whose ``body_site`` is ``'stool'``;
    * overwrite ``disease`` with ``'obesity'`` where ``BMI >= 30`` and with
      ``'severe_underweight'`` where ``BMI < 16``;
    * drop rows whose ``disease`` is still missing;
    * drop rows whose ``age_category`` is ``'newborn'``;
    * keep only samples present in both tables;
    * keep only rows whose ``study_name`` equals ``study_name``.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Directory containing ``relative_abundance.csv`` and ``metadata.csv``.
        Defaults to the path that was previously hard-coded.
    study_name : str, optional
        Value of the ``study_name`` metadata column to select.

    Returns
    -------
    (target_metadata, target_microbiome) : tuple of pandas.DataFrame
        Metadata and abundance rows for the selected study, in the same
        (sorted-by-sample-id) row order.
    """
    import os  # local import: keeps this cell self-contained

    microbiome = pd.read_csv(os.path.join(data_dir, 'relative_abundance.csv'), index_col=0).transpose()
    metadata = pd.read_csv(os.path.join(data_dir, 'metadata.csv'), index_col='sample_id', low_memory=False)

    # Keep stool samples only; copy so the label edits below write to an
    # independent frame rather than to a slice of the one just read.
    metadata = metadata.loc[metadata.body_site == 'stool', :].copy()

    # BMI-derived disease tags override whatever label was there before
    metadata.loc[metadata.BMI >= 30, 'disease'] = 'obesity'
    metadata.loc[metadata.BMI < 16, 'disease'] = 'severe_underweight'

    # Remove all samples whose disease label is missing
    metadata = metadata.loc[metadata.disease.notna(), :]

    # Drop newborn samples
    metadata = metadata.loc[metadata.age_category != 'newborn', :]

    # Overlapping samples between metadata and microbiome data. Sorting makes
    # the row order deterministic: iterating a set of strings can change order
    # between interpreter sessions, which would silently change every seeded
    # train/test split built on top of these frames.
    overlapping_samples = sorted(set(metadata.index) & set(microbiome.index))
    microbiome = microbiome.loc[overlapping_samples, :]
    metadata = metadata.loc[overlapping_samples, :]

    target_metadata = metadata.loc[metadata.study_name == study_name, :]
    target_microbiome = microbiome.loc[target_metadata.index, :]

    return target_metadata, target_microbiome

# Load the filtered metadata and abundance tables for the target study
# (get_target_data returns them as a (metadata, microbiome) pair).
metadata, microbiome = get_target_data()

In [7]:
# Column labels of the abundance table, kept for later reference.
feature_names = microbiome.columns

# Binary target: 1 for every sample whose disease tag is anything other than
# 'healthy', 0 for samples tagged 'healthy'.
y = np.asarray(metadata['disease'].ne('healthy'), dtype=int)

In [8]:
# Hyper-parameter grid handed to GridSearchCV for the ExtraTreesClassifier.
# Only n_estimators, max_depth, min_samples_leaf and bootstrap actually vary
# (4 * 4 * 3 * 2 = 96 combinations); the other entries are pinned to one value.
param_grid = dict(
    n_estimators=[100, 300, 800, 1000],
    criterion=['gini'],
    max_depth=[2, 5, 7, None],
    min_samples_split=[2],
    min_samples_leaf=[1, 5, 10],
    max_features=['sqrt'],
    random_state=[512],
    bootstrap=[False, True],
)

In [9]:
# Build the feature matrix as a NEW frame instead of aliasing `microbiome`:
# casting columns through an alias would silently rewrite `microbiome` itself,
# so re-running earlier cells and this one could see different data.
# Only columns whose dtype is exactly int64 are cast to float; every other
# column is carried over unchanged.
int64_cols = microbiome.columns[microbiome.dtypes == 'int64']
X_raw = microbiome.astype({col: float for col in int64_cols})

X = X_raw
feature_names = list(X_raw.columns)

In [10]:

# Fraction of samples to hold out as the test set.
# NOTE(review): presumably meant to drive the outer StratifiedShuffleSplit's
# test_size -- confirm against the evaluation cell.
test_data_ratio = 0.2

# Number of repeated runs.
# NOTE(review): presumably meant to drive the outer split's n_splits -- confirm.
num_stability_runs = 50

In [11]:
# Nested evaluation: for each outer stratified hold-out split, a grid search
# (scored by ROC AUC on its own inner stratified splits) picks the
# hyper-parameters on the train+validation part, and the refit best model is
# then scored on the held-out test part.
shuffle_counter = 0

# The outer split is sized from the config constants rather than from
# duplicated literals, so changing the config cell actually changes the run.
sss1 = StratifiedShuffleSplit(
    n_splits=num_stability_runs,
    test_size=test_data_ratio,
    random_state=42,
)

results_dict = {
    'train_scores': [],
    'val_scores': [],
    'test_scores': [],
}

# split() yields from a generator with no length, so tqdm is given the total
# explicitly; otherwise the bar cannot show progress or an ETA.
for train_val, test in tqdm(sss1.split(X_raw, y), total=num_stability_runs):
    shuffle_counter += 1

    X_train_val, y_train_val = X_raw.iloc[train_val, :], y[train_val]
    X_test, y_test = X_raw.iloc[test, :], y[test]

    model = ExtraTreesClassifier()

    # Inner splitter used by the grid search for model selection
    # (validation fraction 0.2, fixed seed).
    sss2 = StratifiedShuffleSplit(test_size=0.2, random_state=42)
    search = GridSearchCV(
        model,
        param_grid,
        scoring='roc_auc',
        cv=sss2,
        n_jobs=-1,
        verbose=0,
        refit=True,
    )

    search.fit(X_train_val, y_train_val)

    # refit=True: best_estimator_ is already retrained on all of train+val
    best_model = search.best_estimator_

    # Scores are keyed by name so they cannot drift out of step with the
    # key order of results_dict.
    out_scores = {
        'train_scores': roc_auc_score(y_train_val, best_model.predict_proba(X_train_val)[:, 1]),
        'val_scores': search.best_score_,
        'test_scores': roc_auc_score(y_test, best_model.predict_proba(X_test)[:, 1]),
    }

    for key, score in out_scores.items():
        results_dict[key].append(score)

0it [00:20, ?it/s]


KeyboardInterrupt: 

In [None]:
    # Report mean +- standard deviation (rounded to 3 decimals) of each
    # collected score list.
    for key, scores in results_dict.items():
        mean_score = np.round(np.mean(scores), 3)
        std_score = np.round(np.std(scores), 3)
        print(key+':\t', mean_score, '+-', std_score)