## Description

Random forest regression models of bone 16S rRNA data collapsed at taxonomic level 2 (phylum), built in the ml_new environment from rarefied data with no normalization applied during import. Each model uses n = 1000 estimators, 6-fold cross-validation grouped by host (regular, non-nested cross-validation), and grid-search hyperparameter tuning. The metadata features season, accumulated humidity, and accumulated precipitation are incorporated in all combinations.

In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing, svm, metrics
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV, GroupKFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVR
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_error, roc_auc_score, precision_score, make_scorer
from sklearn.linear_model import LassoCV, Lasso, LassoLarsIC, ElasticNet, LassoLarsCV
from sklearn.externals import joblib
from scipy.stats import randint as sp_randint
from matplotlib import pyplot as plt
from sklearn.feature_selection import RFECV
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier



In [2]:
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sns
%matplotlib inline
import biom
import calour as ca
from scipy import stats
import scipy
import pickle
import time
import math
import inspect
# Raise pandas' display limit so up to 10000 rows are shown without truncation.
pd.set_option('display.max_rows', 10000)

  import pandas.util.testing as pdt


## Import rarefied table collapsed at L2

In [3]:
# Load the rarefied, phylum-collapsed (L2) feature table together with its
# sample mapping file. normalize=None keeps the counts as stored (no
# normalization on import); min_reads=0 presumably disables the minimum-read
# sample filter -- TODO confirm against the calour documentation.
# NOTE(review): the table path is an absolute, machine-specific location.
exp_L2 = ca.read_amplicon(
    '/Users/heatherdeel/Dropbox/PMI_3_analyses/bone/01_16S/01_qiime2_analysis/feature_tables/collapsed_tables/frag_ins_noChloMito_17098_L2_table.biom',
    '../02_metadata/maps/map3.txt',
    min_reads=0,
    normalize=None,
)




In [4]:
# Count the samples per host (body); host_subject_id is the grouping key used
# for cross-validation further down.
exp_L2.sample_metadata['host_subject_id'].value_counts()

STAFS2016.011    8
STAFS2016.067    8
STAFS2016.065    8
STAFS2016.064    7
STAFS2016.007    7
STAFS2016.024    7
Name: host_subject_id, dtype: int64

In [5]:
# Inspect the ADD_0 column, which is used below as the regression target.
exp_L2.sample_metadata['ADD_0']

#SampleID
11553.SHSU.STAFS2016.024.R09      708.0555556
11553.STAFS.2016.065.L12          2268.055556
11553.STAFS.2016.067.R12          2414.444444
11553.SHSU.STAFS2016.007.L09      1911.111111
11553.SHSU.STAFS2016.011.L08      1301.666667
11553.STAFS.2016.065.R11          4756.388889
11553.STAFS.2016.065.R09          3456.111111
11553.SHSU.STAFS2016.024.L12      4285.833333
11553.SHSU.STAFS2016.007.L11      2584.444444
11553.SHSU.STAFS2016.011.L10      3777.222222
11553.STAFS.2016.064.R10          3456.111111
11553.SHSU.STAFS2016.024.R11      2584.444444
11553.STAFS.2016.067.L11.june     5201.388889
11553.SHSU.STAFS2016.007.R12      1301.666667
11553.STAFS.2016.065.R10          2804.722222
11553.STAFS.2016.064.R09          592.7777778
11553.STAFS.2016.067.L10          3623.055556
11553.SHSU.STAFS2016.011.R12      708.0555556
11553.STAFS.2016.064.R11          1300.555556
11553.SHSU.STAFS2016.011.L11      4285.833333
11553.SHSU.STAFS2016.007.L10      4821.388889
11553.STAFS.2016.065.R08

In [6]:
# Show the feature metadata table (one row per phylum-level feature id).
print(exp_L2.feature_metadata)

                                                     _feature_id
k__Archaea;p__Crenarchaeota          k__Archaea;p__Crenarchaeota
k__Archaea;p__Euryarchaeota          k__Archaea;p__Euryarchaeota
k__Bacteria;__                                    k__Bacteria;__
k__Bacteria;p__Acidobacteria        k__Bacteria;p__Acidobacteria
k__Bacteria;p__Actinobacteria      k__Bacteria;p__Actinobacteria
k__Bacteria;p__Armatimonadetes    k__Bacteria;p__Armatimonadetes
k__Bacteria;p__BRC1                          k__Bacteria;p__BRC1
k__Bacteria;p__Bacteroidetes        k__Bacteria;p__Bacteroidetes
k__Bacteria;p__Chlorobi                  k__Bacteria;p__Chlorobi
k__Bacteria;p__Chloroflexi            k__Bacteria;p__Chloroflexi
k__Bacteria;p__Cyanobacteria        k__Bacteria;p__Cyanobacteria
k__Bacteria;p__Deferribacteres    k__Bacteria;p__Deferribacteres
k__Bacteria;p__FBP                            k__Bacteria;p__FBP
k__Bacteria;p__Firmicutes              k__Bacteria;p__Firmicutes
k__Bacteria;p__Fusobacter

## Running the L2 no metadata model

In [7]:
# Grouped 6-fold cross-validation: all samples from one host_subject_id land
# in the same fold, so every fold holds out one body.
gkf = GroupKFold(n_splits=6)

# Feature matrix, regression target (ADD_0 as float) and per-sample group labels.
X = exp_L2.data
y = exp_L2.sample_metadata['ADD_0'].astype(float)
groups = exp_L2.sample_metadata['host_subject_id']

# Full grid searched during hyperparameter tuning; gs.best_params_ reports the
# winning combination once the search has been fitted.
param_grid = {
    "max_depth": [4, 8, 16, None],
    "max_features": ['sqrt', 'log2', 0.1],
    "min_samples_split": [0.001, 0.01, 0.1],
    "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
    "bootstrap": [True, False],
}

# Single-point grid for re-running with already-tuned values: uncomment it
# (after filling in the tuned values) to skip the full search.
#param_grid = {"max_depth": [8],
#          "max_features": [0.1],
#          "min_samples_split": [0.001],
#          "min_weight_fraction_leaf": [0.0001],
#          "bootstrap": [False]}

# Random forest trained and scored on mean absolute error; the CV splits come
# from the grouped folds defined above.
rf = RandomForestRegressor(criterion='mae', n_estimators=1000, random_state=999)
gs = GridSearchCV(
    rf,
    param_grid=param_grid,
    cv=gkf.split(X, y, groups),
    scoring='neg_mean_absolute_error',
    n_jobs=1,
)

In [8]:
# Run the grid search: every parameter combination (4*3*3*3*2 = 216) is
# evaluated on each of the 6 grouped folds with 1000-tree forests, so this
# cell is slow.
gs.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x11bf072d0>,
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_es...
                                             oob_score=False, random_state=999,
                                             verbose=0, warm_start=False

In [9]:
# Report the best-scoring hyperparameter combination. When the search is run
# with the full param grid, this shows which values to use for tuning.
print(gs.best_params_)

{'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 0.001, 'min_weight_fraction_leaf': 0.0001}


In [10]:
# Scoring is neg_mean_absolute_error, so negate best_score_ to report the
# cross-validated MAE as a positive number (same units as ADD_0).
print('The best mean absolute error is', round(-gs.best_score_,2))

The best mean absolute error is 765.78


In [11]:
# Persist the best estimator found by the grid search.
# NOTE(review): `joblib` is imported from `sklearn.externals` at the top of the
# notebook; newer scikit-learn releases no longer ship that module -- confirm
# the environment's version before re-running.
joblib.dump(gs.best_estimator_, 'models/bone_L2_no_metadata.pkl')

['models/bone_L2_no_metadata.pkl']

In [12]:
# Reload the saved model from disk and display its parameters to confirm the
# round trip. Only load pickles produced by this workflow: unpickling
# untrusted files can execute arbitrary code.
bone_L2 = joblib.load('models/bone_L2_no_metadata.pkl')
bone_L2

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                      max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features of L2 no metadata model

In [14]:
# Feature importances of the fitted forest, plus their standard deviation
# across the individual trees (std is not used in this cell; it is kept for
# any later cell that reads it).
importances = bone_L2.feature_importances_
std = np.std([tree.feature_importances_ for tree in bone_L2.estimators_],
             axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Export the feature ids in their original (model input) order.
feature_metadata = exp_L2.feature_metadata
np.savetxt("importances/bone_L2_no_metadata_features.csv", feature_metadata, delimiter=",", fmt='%s')


# Print the 25 most important ids in order while collecting every
# feature id / importance pair (flattened) for export.
general_importances = []
count = 0
print("Feature:\t\t\t\tImportance:")
for i in indices:
    feature_id = feature_metadata.index.values[i]
    # `i` is already the ranked feature position, so index importances directly.
    general_importances += (feature_id, importances[i])
    if count < 25:
        print(str(count+1)+". "+str(feature_id)+"\t"+str(importances[i]))
    count += 1

# Single column of alternating feature id / importance rows. reshape(-1, 1)
# derives the row count from the data instead of hard-coding 2 * n_features,
# so the cell keeps working if the number of features changes.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

# NOTE(review): this file name says "no_features" while the sibling export says
# "no_metadata" -- left unchanged in case downstream code reads this path.
np.savetxt("importances/bone_L2_no_features_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. k__Bacteria;p__Verrucomicrobia	0.11987154819410084
2. k__Bacteria;p__Firmicutes	0.11126658098985664
3. k__Bacteria;p__Bacteroidetes	0.10196757181199129
4. k__Bacteria;p__Proteobacteria	0.09685239546073014
5. k__Bacteria;p__Planctomycetes	0.09232379132237392
6. k__Bacteria;p__[Thermi]	0.06191102623999902
7. k__Bacteria;p__Actinobacteria	0.061261278052764344
8. k__Bacteria;p__Gemmatimonadetes	0.055267434195445025
9. k__Bacteria;p__Chloroflexi	0.05188271387007733
10. k__Bacteria;p__Fusobacteria	0.04660002309985257
11. k__Bacteria;p__Acidobacteria	0.04012072345796194
12. k__Bacteria;p__FBP	0.028449503493015484
13. k__Bacteria;p__BRC1	0.018189050030968656
14. k__Bacteria;p__Tenericutes	0.015946114482290028
15. k__Bacteria;p__Armatimonadetes	0.014579541867179757
16. k__Bacteria;p__Cyanobacteria	0.012725500877310213
17. k__Bacteria;p__GN02	0.011298221080422693
18. k__Bacteria;p__TM7	0.010324222212359605
19. k__Bacteria;p__Chlorobi	0.010014462811682145
20. k__Bacteri

## Run the L2 model with season

In [15]:
# Derive an experiment that also carries the 'season' sample metadata as model
# feature(s). exp_L2 itself is reused for the other variants below, so it is
# presumably not modified in place -- TODO confirm calour's default.
exp_L2_season = exp_L2.add_sample_metadata_as_features(['season'])

In [16]:
# Grouped 6-fold cross-validation: all samples from one host_subject_id land
# in the same fold, so every fold holds out one body.
gkf = GroupKFold(n_splits=6)

# Feature matrix (taxa + season), regression target (ADD_0 as float) and
# per-sample group labels.
X = exp_L2_season.data
y = exp_L2_season.sample_metadata['ADD_0'].astype(float)
groups = exp_L2_season.sample_metadata['host_subject_id']

# Full grid searched during hyperparameter tuning; gs.best_params_ reports the
# winning combination once the search has been fitted.
param_grid = {
    "max_depth": [4, 8, 16, None],
    "max_features": ['sqrt', 'log2', 0.1],
    "min_samples_split": [0.001, 0.01, 0.1],
    "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
    "bootstrap": [True, False],
}

# Single-point grid for re-running with already-tuned values: uncomment it
# (after filling in the tuned values) to skip the full search.
#param_grid = {"max_depth": [8],
#          "max_features": [0.1],
#          "min_samples_split": [0.001],
#          "min_weight_fraction_leaf": [0.0001],
#          "bootstrap": [False]}

# Random forest trained and scored on mean absolute error; the CV splits come
# from the grouped folds defined above.
rf = RandomForestRegressor(criterion='mae', n_estimators=1000, random_state=999)
gs = GridSearchCV(
    rf,
    param_grid=param_grid,
    cv=gkf.split(X, y, groups),
    scoring='neg_mean_absolute_error',
    n_jobs=1,
)

In [17]:
# Run the grid search for the season model (216 parameter combinations on 6
# grouped folds with 1000-tree forests -- slow).
gs.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x11c39ae50>,
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_es...
                                             oob_score=False, random_state=999,
                                             verbose=0, warm_start=False

In [18]:
# Report the best-scoring hyperparameter combination for the season model.
# When the search is run with the full param grid, this shows which values to
# use for tuning.
print(gs.best_params_)

{'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 0.001, 'min_weight_fraction_leaf': 0.0001}


In [19]:
# Scoring is neg_mean_absolute_error, so negate best_score_ to report the
# cross-validated MAE as a positive number.
print('The best mean absolute error is', round(-gs.best_score_,2))

The best mean absolute error is 774.35


In [20]:
# Persist the best season-model estimator found by the grid search.
joblib.dump(gs.best_estimator_, 'models/bone_L2_season.pkl')

['models/bone_L2_season.pkl']

In [21]:
# Reload the saved season model from disk and display its parameters.
bone_L2_season = joblib.load('models/bone_L2_season.pkl')
bone_L2_season

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                      max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features for the L2 season model

In [23]:
# Feature importances of the fitted season model, plus their standard
# deviation across the individual trees (std is not used in this cell; it is
# kept for any later cell that reads it).
importances = bone_L2_season.feature_importances_
std = np.std([tree.feature_importances_ for tree in bone_L2_season.estimators_],
             axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Export the feature ids in their original (model input) order.
feature_metadata = exp_L2_season.feature_metadata
np.savetxt("importances/bone_L2_season_features.csv", feature_metadata, delimiter=",", fmt='%s')


# Print the 25 most important ids in order while collecting every
# feature id / importance pair (flattened) for export.
general_importances = []
count = 0
print("Feature:\t\t\t\tImportance:")
for i in indices:
    feature_id = feature_metadata.index.values[i]
    # `i` is already the ranked feature position, so index importances directly.
    general_importances += (feature_id, importances[i])
    if count < 25:
        print(str(count+1)+". "+str(feature_id)+"\t"+str(importances[i]))
    count += 1

# Single column of alternating feature id / importance rows. reshape(-1, 1)
# derives the row count from the data instead of hard-coding 2 * n_features,
# so the cell keeps working if the number of features changes.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

np.savetxt("importances/bone_L2_season_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. k__Bacteria;p__Verrucomicrobia	0.11202654214514784
2. k__Bacteria;p__Firmicutes	0.1061343899218351
3. k__Bacteria;p__Proteobacteria	0.10365943935553606
4. k__Bacteria;p__Bacteroidetes	0.09751923573433713
5. k__Bacteria;p__Planctomycetes	0.0828123522834748
6. k__Bacteria;p__Actinobacteria	0.06484196957435172
7. k__Bacteria;p__[Thermi]	0.06104302756301653
8. k__Bacteria;p__Gemmatimonadetes	0.061024593421146404
9. k__Bacteria;p__Chloroflexi	0.049307982770428964
10. k__Bacteria;p__Fusobacteria	0.045694730950205646
11. k__Bacteria;p__Acidobacteria	0.036964782079307305
12. k__Bacteria;p__FBP	0.029595018439946844
13. k__Bacteria;p__BRC1	0.01754784738094031
14. k__Bacteria;p__Armatimonadetes	0.016860795249380427
15. k__Bacteria;p__Tenericutes	0.014989327027244446
16. k__Bacteria;p__Cyanobacteria	0.012999043307660637
17. k__Bacteria;p__GN02	0.0123824425026003
18. k__Bacteria;p__TM7	0.010931198471841397
19. k__Bacteria;p__Chlorobi	0.009754819639130307
20. k__Bacteria;p

## Run the L2 model with precipitation

In [24]:
# Derive an experiment that also carries accumulated precipitation (sample
# metadata column 'accumulated_precipitation_inches_base0') as a model feature.
exp_L2_precip = exp_L2.add_sample_metadata_as_features(['accumulated_precipitation_inches_base0'])

In [25]:
# Grouped 6-fold cross-validation: all samples from one host_subject_id land
# in the same fold, so every fold holds out one body.
gkf = GroupKFold(n_splits=6)

# Feature matrix (taxa + precipitation), regression target (ADD_0 as float)
# and per-sample group labels.
X = exp_L2_precip.data
y = exp_L2_precip.sample_metadata['ADD_0'].astype(float)
groups = exp_L2_precip.sample_metadata['host_subject_id']

# Full grid searched during hyperparameter tuning; gs.best_params_ reports the
# winning combination once the search has been fitted.
param_grid = {
    "max_depth": [4, 8, 16, None],
    "max_features": ['sqrt', 'log2', 0.1],
    "min_samples_split": [0.001, 0.01, 0.1],
    "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
    "bootstrap": [True, False],
}

# Single-point grid for re-running with already-tuned values: uncomment it
# (after filling in the tuned values) to skip the full search.
#param_grid = {"max_depth": [8],
#          "max_features": [0.1],
#          "min_samples_split": [0.001],
#          "min_weight_fraction_leaf": [0.0001],
#          "bootstrap": [False]}

# Random forest trained and scored on mean absolute error; the CV splits come
# from the grouped folds defined above.
rf = RandomForestRegressor(criterion='mae', n_estimators=1000, random_state=999)
gs = GridSearchCV(
    rf,
    param_grid=param_grid,
    cv=gkf.split(X, y, groups),
    scoring='neg_mean_absolute_error',
    n_jobs=1,
)

In [26]:
# Run the grid search for the precipitation model (216 parameter combinations
# on 6 grouped folds with 1000-tree forests -- slow).
gs.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x11c3f25d0>,
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_es...
                                             oob_score=False, random_state=999,
                                             verbose=0, warm_start=False

In [27]:
# Report the best-scoring hyperparameter combination for the precipitation
# model. When the search is run with the full param grid, this shows which
# values to use for tuning.
print(gs.best_params_)

{'bootstrap': False, 'max_depth': 16, 'max_features': 'sqrt', 'min_samples_split': 0.001, 'min_weight_fraction_leaf': 0.0001}


In [28]:
# Scoring is neg_mean_absolute_error, so negate best_score_ to report the
# cross-validated MAE as a positive number.
print('The best mean absolute error is', round(-gs.best_score_,2))

The best mean absolute error is 573.46


In [29]:
# Persist the best precipitation-model estimator found by the grid search.
joblib.dump(gs.best_estimator_, 'models/bone_L2_precip.pkl')

['models/bone_L2_precip.pkl']

In [30]:
# Reload the saved precipitation model from disk and display its parameters.
bone_L2_precip = joblib.load('models/bone_L2_precip.pkl')
bone_L2_precip

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                      max_depth=16, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features for L2 precipitation model

In [32]:
# Feature importances of the fitted precipitation model, plus their standard
# deviation across the individual trees (std is not used in this cell; it is
# kept for any later cell that reads it).
importances = bone_L2_precip.feature_importances_
std = np.std([tree.feature_importances_ for tree in bone_L2_precip.estimators_],
             axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Export the feature ids in their original (model input) order.
feature_metadata = exp_L2_precip.feature_metadata
np.savetxt("importances/bone_L2_precip_features.csv", feature_metadata, delimiter=",", fmt='%s')


# Print the 25 most important ids in order while collecting every
# feature id / importance pair (flattened) for export.
general_importances = []
count = 0
print("Feature:\t\t\t\tImportance:")
for i in indices:
    feature_id = feature_metadata.index.values[i]
    # `i` is already the ranked feature position, so index importances directly.
    general_importances += (feature_id, importances[i])
    if count < 25:
        print(str(count+1)+". "+str(feature_id)+"\t"+str(importances[i]))
    count += 1

# Single column of alternating feature id / importance rows. reshape(-1, 1)
# derives the row count from the data instead of hard-coding 2 * n_features,
# so the cell keeps working if the number of features changes.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

np.savetxt("importances/bone_L2_precip_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. accumulated_precipitation_inches_base0	0.2300840628569225
2. k__Bacteria;p__Verrucomicrobia	0.09549872441133539
3. k__Bacteria;p__Firmicutes	0.08026801321683104
4. k__Bacteria;p__Proteobacteria	0.07682713727378934
5. k__Bacteria;p__Bacteroidetes	0.07644480777495924
6. k__Bacteria;p__Planctomycetes	0.0656555017659463
7. k__Bacteria;p__Actinobacteria	0.049478547570133466
8. k__Bacteria;p__Gemmatimonadetes	0.046271528865142915
9. k__Bacteria;p__[Thermi]	0.04384558898415175
10. k__Bacteria;p__Chloroflexi	0.042268916887808136
11. k__Bacteria;p__Fusobacteria	0.03663265304639057
12. k__Bacteria;p__Acidobacteria	0.03152200093248848
13. k__Bacteria;p__FBP	0.01994894180144875
14. k__Bacteria;p__Tenericutes	0.013473214315324682
15. k__Bacteria;p__Armatimonadetes	0.013194357132992307
16. k__Bacteria;p__BRC1	0.0131872824675222
17. k__Bacteria;p__Cyanobacteria	0.010879001856716142
18. k__Bacteria;p__GN02	0.010266435571545912
19. k__Bacteria;p__TM7	0.008413356159061863
20. 

## Run the L2 model with humidity

In [33]:
# Derive an experiment that also carries accumulated humidity (sample metadata
# column 'accumulated_percent_humidity_days_base0') as a model feature.
exp_L2_hum = exp_L2.add_sample_metadata_as_features(['accumulated_percent_humidity_days_base0'])

In [34]:
# Grouped 6-fold cross-validation: all samples from one host_subject_id land
# in the same fold, so every fold holds out one body.
gkf = GroupKFold(n_splits=6)

# Feature matrix (taxa + humidity), regression target (ADD_0 as float) and
# per-sample group labels.
X = exp_L2_hum.data
y = exp_L2_hum.sample_metadata['ADD_0'].astype(float)
groups = exp_L2_hum.sample_metadata['host_subject_id']

# Full grid searched during hyperparameter tuning; gs.best_params_ reports the
# winning combination once the search has been fitted.
param_grid = {
    "max_depth": [4, 8, 16, None],
    "max_features": ['sqrt', 'log2', 0.1],
    "min_samples_split": [0.001, 0.01, 0.1],
    "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
    "bootstrap": [True, False],
}

# Single-point grid for re-running with already-tuned values: uncomment it
# (after filling in the tuned values) to skip the full search.
#param_grid = {"max_depth": [8],
#          "max_features": [0.1],
#          "min_samples_split": [0.001],
#          "min_weight_fraction_leaf": [0.0001],
#          "bootstrap": [False]}

# Random forest trained and scored on mean absolute error; the CV splits come
# from the grouped folds defined above.
rf = RandomForestRegressor(criterion='mae', n_estimators=1000, random_state=999)
gs = GridSearchCV(
    rf,
    param_grid=param_grid,
    cv=gkf.split(X, y, groups),
    scoring='neg_mean_absolute_error',
    n_jobs=1,
)

In [35]:
# Run the grid search for the humidity model (216 parameter combinations on 6
# grouped folds with 1000-tree forests -- slow).
gs.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x11c3f2550>,
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_es...
                                             oob_score=False, random_state=999,
                                             verbose=0, warm_start=False

In [36]:
# Report the best-scoring hyperparameter combination for the humidity model.
# When the search is run with the full param grid, this shows which values to
# use for tuning.
print(gs.best_params_)

{'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 0.001, 'min_weight_fraction_leaf': 0.0001}


In [37]:
# Scoring is neg_mean_absolute_error, so negate best_score_ to report the
# cross-validated MAE as a positive number.
print('The best mean absolute error is', round(-gs.best_score_,2))

The best mean absolute error is 530.6


In [38]:
# Persist the best humidity-model estimator found by the grid search.
joblib.dump(gs.best_estimator_, 'models/bone_L2_hum.pkl')

['models/bone_L2_hum.pkl']

In [39]:
# Reload the saved humidity model from disk and display its parameters.
bone_L2_hum = joblib.load('models/bone_L2_hum.pkl')
bone_L2_hum

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                      max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features for L2 humidity model

In [41]:
# Feature importances of the fitted humidity model, plus their standard
# deviation across the individual trees (std is not used in this cell; it is
# kept for any later cell that reads it).
importances = bone_L2_hum.feature_importances_
std = np.std([tree.feature_importances_ for tree in bone_L2_hum.estimators_],
             axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Export the feature ids in their original (model input) order.
feature_metadata = exp_L2_hum.feature_metadata
np.savetxt("importances/bone_L2_hum_features.csv", feature_metadata, delimiter=",", fmt='%s')


# Print the 25 most important ids in order while collecting every
# feature id / importance pair (flattened) for export.
general_importances = []
count = 0
print("Feature:\t\t\t\tImportance:")
for i in indices:
    feature_id = feature_metadata.index.values[i]
    # `i` is already the ranked feature position, so index importances directly.
    general_importances += (feature_id, importances[i])
    if count < 25:
        print(str(count+1)+". "+str(feature_id)+"\t"+str(importances[i]))
    count += 1

# Single column of alternating feature id / importance rows. reshape(-1, 1)
# derives the row count from the data instead of hard-coding 2 * n_features,
# so the cell keeps working if the number of features changes.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

np.savetxt("importances/bone_L2_hum_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. accumulated_percent_humidity_days_base0	0.2602281380745937
2. k__Bacteria;p__Verrucomicrobia	0.08825352934640605
3. k__Bacteria;p__Firmicutes	0.07799451806230023
4. k__Bacteria;p__Bacteroidetes	0.07451363313609981
5. k__Bacteria;p__Proteobacteria	0.0723793109852555
6. k__Bacteria;p__Planctomycetes	0.0661824048878819
7. k__Bacteria;p__[Thermi]	0.04558932869303888
8. k__Bacteria;p__Gemmatimonadetes	0.04512339466733629
9. k__Bacteria;p__Actinobacteria	0.04236629665023932
10. k__Bacteria;p__Chloroflexi	0.039874479522062155
11. k__Bacteria;p__Fusobacteria	0.03413452181308698
12. k__Bacteria;p__Acidobacteria	0.03106723790091354
13. k__Bacteria;p__FBP	0.019454670973470634
14. k__Bacteria;p__BRC1	0.014026873421212713
15. k__Bacteria;p__Cyanobacteria	0.013605784530183958
16. k__Bacteria;p__Armatimonadetes	0.012546633907356124
17. k__Bacteria;p__Tenericutes	0.01100558419037997
18. k__Bacteria;p__GN02	0.008501871067667017
19. k__Bacteria;p__TM7	0.008327591771425004
20. 

## Run the L2 model with season and precipitation

In [42]:
# Derive an experiment that carries both 'season' and accumulated
# precipitation sample metadata as additional model features.
exp_L2_season_precip = exp_L2.add_sample_metadata_as_features(['season', 'accumulated_precipitation_inches_base0'])

In [43]:
# Grouped 6-fold cross-validation: all samples from one host_subject_id land
# in the same fold, so every fold holds out one body.
gkf = GroupKFold(n_splits=6)

# Feature matrix (taxa + season + precipitation), regression target (ADD_0 as
# float) and per-sample group labels.
X = exp_L2_season_precip.data
y = exp_L2_season_precip.sample_metadata['ADD_0'].astype(float)
groups = exp_L2_season_precip.sample_metadata['host_subject_id']

# Full grid searched during hyperparameter tuning; gs.best_params_ reports the
# winning combination once the search has been fitted.
param_grid = {
    "max_depth": [4, 8, 16, None],
    "max_features": ['sqrt', 'log2', 0.1],
    "min_samples_split": [0.001, 0.01, 0.1],
    "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
    "bootstrap": [True, False],
}

# Single-point grid for re-running with already-tuned values: uncomment it
# (after filling in the tuned values) to skip the full search.
#param_grid = {"max_depth": [8],
#          "max_features": [0.1],
#          "min_samples_split": [0.001],
#          "min_weight_fraction_leaf": [0.0001],
#          "bootstrap": [False]}

# Random forest trained and scored on mean absolute error; the CV splits come
# from the grouped folds defined above.
rf = RandomForestRegressor(criterion='mae', n_estimators=1000, random_state=999)
gs = GridSearchCV(
    rf,
    param_grid=param_grid,
    cv=gkf.split(X, y, groups),
    scoring='neg_mean_absolute_error',
    n_jobs=1,
)

In [44]:
# Run the grid search for the season + precipitation model (216 parameter
# combinations on 6 grouped folds with 1000-tree forests -- slow).
gs.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x11c2be550>,
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_es...
                                             oob_score=False, random_state=999,
                                             verbose=0, warm_start=False

In [45]:
# Report the best-scoring hyperparameter combination for the season +
# precipitation model. When the search is run with the full param grid, this
# shows which values to use for tuning.
print(gs.best_params_)

{'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 0.001, 'min_weight_fraction_leaf': 0.0001}


In [46]:
# Scoring is neg_mean_absolute_error, so negate best_score_ to report the
# cross-validated MAE as a positive number.
print('The best mean absolute error is', round(-gs.best_score_,2))

The best mean absolute error is 568.2


In [47]:
# Persist the best season + precipitation estimator found by the grid search.
joblib.dump(gs.best_estimator_, 'models/bone_L2_season_precip.pkl')

['models/bone_L2_season_precip.pkl']

In [48]:
# Reload the saved season + precipitation model and display its parameters.
bone_L2_season_precip = joblib.load('models/bone_L2_season_precip.pkl')
bone_L2_season_precip

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                      max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features for L2 season and precipitation model

In [50]:
# Feature importances of the fitted season + precipitation model, plus their
# standard deviation across the individual trees (std is not used in this
# cell; it is kept for any later cell that reads it).
importances = bone_L2_season_precip.feature_importances_
std = np.std([tree.feature_importances_ for tree in bone_L2_season_precip.estimators_],
             axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Export the feature ids in their original (model input) order.
feature_metadata = exp_L2_season_precip.feature_metadata
np.savetxt("importances/bone_L2_season_precip_features.csv", feature_metadata, delimiter=",", fmt='%s')


# Print the 25 most important ids in order while collecting every
# feature id / importance pair (flattened) for export.
general_importances = []
count = 0
print("Feature:\t\t\t\tImportance:")
for i in indices:
    feature_id = feature_metadata.index.values[i]
    # `i` is already the ranked feature position, so index importances directly.
    general_importances += (feature_id, importances[i])
    if count < 25:
        print(str(count+1)+". "+str(feature_id)+"\t"+str(importances[i]))
    count += 1

# Single column of alternating feature id / importance rows. reshape(-1, 1)
# derives the row count from the data instead of hard-coding 2 * n_features,
# so the cell keeps working if the number of features changes.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

np.savetxt("importances/bone_L2_season_precip_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. accumulated_precipitation_inches_base0	0.22642135942611014
2. k__Bacteria;p__Verrucomicrobia	0.08953720111320916
3. k__Bacteria;p__Firmicutes	0.07811375976349073
4. k__Bacteria;p__Proteobacteria	0.07627497671424237
5. k__Bacteria;p__Bacteroidetes	0.0761270217757707
6. k__Bacteria;p__Planctomycetes	0.06406023670940361
7. k__Bacteria;p__Actinobacteria	0.04834312091298402
8. k__Bacteria;p__Gemmatimonadetes	0.0460506044133677
9. k__Bacteria;p__Chloroflexi	0.04225903452124425
10. k__Bacteria;p__[Thermi]	0.041853242588589974
11. k__Bacteria;p__Fusobacteria	0.037743340600356425
12. k__Bacteria;p__Acidobacteria	0.0307260355777873
13. k__Bacteria;p__FBP	0.02077014005888021
14. k__Bacteria;p__BRC1	0.014633468108830722
15. k__Bacteria;p__Armatimonadetes	0.013072169392086498
16. k__Bacteria;p__Tenericutes	0.012887937628393641
17. k__Bacteria;p__Cyanobacteria	0.011507818020866261
18. k__Bacteria;p__GN02	0.010099252607827361
19. k__Bacteria;p__Chlorobi	0.00846189727927441


## Run the L2 model with season and humidity

In [51]:
# Derive an experiment that carries both 'season' and accumulated humidity
# sample metadata as additional model features.
exp_L2_season_hum = exp_L2.add_sample_metadata_as_features(['season', 'accumulated_percent_humidity_days_base0'])

In [52]:
# Grouped 6-fold cross-validation: all samples from one host_subject_id land
# in the same fold, so every fold holds out one body.
gkf = GroupKFold(n_splits=6)

# Feature matrix (taxa + season + humidity), regression target (ADD_0 as
# float) and per-sample group labels.
X = exp_L2_season_hum.data
y = exp_L2_season_hum.sample_metadata['ADD_0'].astype(float)
groups = exp_L2_season_hum.sample_metadata['host_subject_id']

# Full grid searched during hyperparameter tuning; gs.best_params_ reports the
# winning combination once the search has been fitted.
param_grid = {
    "max_depth": [4, 8, 16, None],
    "max_features": ['sqrt', 'log2', 0.1],
    "min_samples_split": [0.001, 0.01, 0.1],
    "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
    "bootstrap": [True, False],
}

# Single-point grid for re-running with already-tuned values: uncomment it
# (after filling in the tuned values) to skip the full search.
#param_grid = {"max_depth": [8],
#          "max_features": [0.1],
#          "min_samples_split": [0.001],
#          "min_weight_fraction_leaf": [0.0001],
#          "bootstrap": [False]}

# Random forest trained and scored on mean absolute error; the CV splits come
# from the grouped folds defined above.
rf = RandomForestRegressor(criterion='mae', n_estimators=1000, random_state=999)
gs = GridSearchCV(
    rf,
    param_grid=param_grid,
    cv=gkf.split(X, y, groups),
    scoring='neg_mean_absolute_error',
    n_jobs=1,
)

In [53]:
# Run the grid search: every combination in param_grid is cross-validated on the
# group-wise folds (with the full grid above: 216 combinations x 6 folds).
# The fitted GridSearchCV object is the cell's displayed value.
gs.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x11c661050>,
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_es...
                                             oob_score=False, random_state=999,
                                             verbose=0, warm_start=False

In [54]:
# Report the winning hyperparameter combination. This is the tuning result when
# the full (first) param_grid was searched; with the single-value grid it simply
# echoes that grid.
print(gs.best_params_)

{'bootstrap': False, 'max_depth': 16, 'max_features': 'sqrt', 'min_samples_split': 0.001, 'min_weight_fraction_leaf': 0.0001}


In [55]:
# The search is scored with neg_mean_absolute_error, so negate the best score
# to report it as a (positive) mean absolute error.
best_mae = round(-gs.best_score_, 2)
print('The best mean absolute error is', best_mae)

The best mean absolute error is 531.33


In [56]:
# Persist the best estimator found by the grid search. joblib.dump returns the
# list of file names it wrote, which is what the cell displays.
joblib.dump(gs.best_estimator_, 'models/bone_L2_season_hum.pkl')

['models/bone_L2_season_hum.pkl']

In [57]:
# Reload the saved model from disk so the following cells work from the pickled
# artifact; the bare name on the last line displays its parameters.
bone_L2_season_hum = joblib.load('models/bone_L2_season_hum.pkl')
bone_L2_season_hum

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                      max_depth=16, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features for L2 season and humidity model

In [59]:
# Feature importances of the fitted forest, and their standard deviation across
# the individual trees (std is computed here but not used in this cell).
importances = bone_L2_season_hum.feature_importances_
std = np.std([tree.feature_importances_ for tree in bone_L2_season_hum.estimators_],
             axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Save the feature metadata table next to the importances file.
feature_metadata = exp_L2_season_hum.feature_metadata
np.savetxt("importances/bone_L2_season_hum_features.csv", feature_metadata, delimiter=",", fmt='%s')

feature_names = feature_metadata.index.values

# Print the top 25 features in order and collect every feature as a flat
# sequence: name, importance, name, importance, ...
general_importances = []
print("Feature:\t\t\t\tImportance:")
for count, i in enumerate(indices):
    general_importances += (feature_names[i], importances[i])
    if count < 25:
        print(str(count+1)+". "+str(feature_names[i])+"\t"+str(importances[i]))

# One value per row. -1 lets numpy infer the row count (2 * number of features)
# instead of hard-coding it, so the cell does not fail when the feature count changes.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

np.savetxt("importances/bone_L2_season_hum_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. accumulated_percent_humidity_days_base0	0.2533338834980347
2. k__Bacteria;p__Verrucomicrobia	0.08267015001633723
3. k__Bacteria;p__Bacteroidetes	0.07629975306835772
4. k__Bacteria;p__Firmicutes	0.07627558654901216
5. k__Bacteria;p__Proteobacteria	0.07206698359093507
6. k__Bacteria;p__Planctomycetes	0.06456756730294942
7. k__Bacteria;p__Gemmatimonadetes	0.045977148973445466
8. k__Bacteria;p__[Thermi]	0.0429611251761901
9. k__Bacteria;p__Actinobacteria	0.04236310292871346
10. k__Bacteria;p__Chloroflexi	0.04086679604356672
11. k__Bacteria;p__Fusobacteria	0.034681976230749
12. k__Bacteria;p__Acidobacteria	0.032192278489662836
13. k__Bacteria;p__FBP	0.018760050631492785
14. k__Bacteria;p__BRC1	0.014635959296300417
15. k__Bacteria;p__Cyanobacteria	0.013911033478922263
16. k__Bacteria;p__Armatimonadetes	0.013837826319549253
17. k__Bacteria;p__Tenericutes	0.01032783337688724
18. k__Bacteria;p__GN02	0.008747702894678243
19. k__Bacteria;p__Chlorobi	0.007769872544560402

## Run the L2 model with precipitation and humidity

In [60]:
# Sample-metadata columns to expose to the model as features next to the L2 taxa.
precip_hum_fields = ['accumulated_precipitation_inches_base0', 'accumulated_percent_humidity_days_base0']
exp_L2_precip_hum = exp_L2.add_sample_metadata_as_features(precip_hum_fields)


In [61]:
# GroupKFold with 6 folds, grouped on host_subject_id: all samples from one body
# stay on the same side of every train/validation split (intended to leave out
# one body per fold).
gkf = GroupKFold(6)

X = exp_L2_precip_hum.data
# Regression target, cast to float.
y = exp_L2_precip_hum.sample_metadata['ADD_0'].astype(float)

groups = exp_L2_precip_hum.sample_metadata['host_subject_id']

# Full grid searched during hyperparameter tuning.
# After tuning, the single-value grid further down can be swapped in (uncomment
# it and comment this one out) to refit with one fixed parameter set.
param_grid = {"max_depth": [4, 8, 16, None],
              "max_features": ['sqrt', 'log2', 0.1],
              "min_samples_split": [0.001, 0.01, 0.1],
              "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
              "bootstrap": [True, False]}

# NOTE(review): the values below differ from the best_params_ printed for this
# model further down; update them before switching to this grid.
#param_grid = {"max_depth": [8],
#          "max_features": [0.1],
#          "min_samples_split": [0.001],
#          "min_weight_fraction_leaf": [0.0001],
#          "bootstrap": [False]}

rf = RandomForestRegressor(n_estimators=1000, random_state=999, criterion='mae')

# Materialise the (train, test) index pairs as a list. gkf.split() returns a
# one-shot generator that the first gs.fit() would exhaust, so re-running the
# fit cell without re-running this one would find no splits. The folds are the same.
cv_splits = list(gkf.split(X, y, groups))
gs = GridSearchCV(rf, param_grid=param_grid, cv=cv_splits, scoring='neg_mean_absolute_error', n_jobs=1)

In [62]:
# Run the grid search: every combination in param_grid is cross-validated on the
# group-wise folds (with the full grid above: 216 combinations x 6 folds).
# The fitted GridSearchCV object is the cell's displayed value.
gs.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x11c661350>,
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_es...
                                             oob_score=False, random_state=999,
                                             verbose=0, warm_start=False

In [63]:
# Report the winning hyperparameter combination. This is the tuning result when
# the full (first) param_grid was searched; with the single-value grid it simply
# echoes that grid.
print(gs.best_params_)

{'bootstrap': False, 'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 0.001, 'min_weight_fraction_leaf': 0.0001}


In [64]:
# The search is scored with neg_mean_absolute_error, so negate the best score
# to report it as a (positive) mean absolute error.
best_mae = round(-gs.best_score_, 2)
print('The best mean absolute error is', best_mae)

The best mean absolute error is 406.11


In [65]:
# Persist the best estimator found by the grid search. joblib.dump returns the
# list of file names it wrote, which is what the cell displays.
joblib.dump(gs.best_estimator_, 'models/bone_L2_precip_hum.pkl')

['models/bone_L2_precip_hum.pkl']

In [66]:
# Reload the saved model from disk so the following cells work from the pickled
# artifact; the bare name on the last line displays its parameters.
bone_L2_precip_hum = joblib.load('models/bone_L2_precip_hum.pkl')
bone_L2_precip_hum

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                      max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features for L2 precipitation and humidity model

In [68]:
# Feature importances of the fitted forest, and their standard deviation across
# the individual trees (std is computed here but not used in this cell).
importances = bone_L2_precip_hum.feature_importances_
std = np.std([tree.feature_importances_ for tree in bone_L2_precip_hum.estimators_],
             axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Save the feature metadata table next to the importances file.
feature_metadata = exp_L2_precip_hum.feature_metadata
np.savetxt("importances/bone_L2_precip_hum_features.csv", feature_metadata, delimiter=",", fmt='%s')

feature_names = feature_metadata.index.values

# Print the top 25 features in order and collect every feature as a flat
# sequence: name, importance, name, importance, ...
general_importances = []
print("Feature:\t\t\t\tImportance:")
for count, i in enumerate(indices):
    general_importances += (feature_names[i], importances[i])
    if count < 25:
        print(str(count+1)+". "+str(feature_names[i])+"\t"+str(importances[i]))

# One value per row. -1 lets numpy infer the row count (2 * number of features)
# instead of hard-coding it, so the cell does not fail when the feature count changes.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

np.savetxt("importances/bone_L2_precip_hum_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. accumulated_percent_humidity_days_base0	0.22476041233242242
2. accumulated_precipitation_inches_base0	0.18446048104557278
3. k__Bacteria;p__Verrucomicrobia	0.06738326470369013
4. k__Bacteria;p__Firmicutes	0.06088789638473622
5. k__Bacteria;p__Proteobacteria	0.05740217841469185
6. k__Bacteria;p__Bacteroidetes	0.05681705845657508
7. k__Bacteria;p__Planctomycetes	0.05345361532778305
8. k__Bacteria;p__Gemmatimonadetes	0.0395605765973457
9. k__Bacteria;p__Actinobacteria	0.03812417822402574
10. k__Bacteria;p__[Thermi]	0.03458588385567324
11. k__Bacteria;p__Chloroflexi	0.02987870153081879
12. k__Bacteria;p__Fusobacteria	0.027273228691235257
13. k__Bacteria;p__Acidobacteria	0.02456483015677615
14. k__Bacteria;p__FBP	0.018024526565985556
15. k__Bacteria;p__BRC1	0.011493930602283589
16. k__Bacteria;p__Armatimonadetes	0.010971746797529248
17. k__Bacteria;p__Cyanobacteria	0.010127104265964072
18. k__Bacteria;p__Tenericutes	0.009152876261128328
19. k__Bacteria;p__GN02	0.0

## Run the L2 model with season, precipitation, and humidity

In [69]:
# Sample-metadata columns to expose to the model as features next to the L2 taxa.
season_precip_hum_fields = ['season', 'accumulated_precipitation_inches_base0', 'accumulated_percent_humidity_days_base0']
exp_L2_season_precip_hum = exp_L2.add_sample_metadata_as_features(season_precip_hum_fields)


In [70]:
# GroupKFold with 6 folds, grouped on host_subject_id: all samples from one body
# stay on the same side of every train/validation split (intended to leave out
# one body per fold).
gkf = GroupKFold(6)

X = exp_L2_season_precip_hum.data
# Regression target, cast to float.
y = exp_L2_season_precip_hum.sample_metadata['ADD_0'].astype(float)

groups = exp_L2_season_precip_hum.sample_metadata['host_subject_id']

# Full grid searched during hyperparameter tuning.
# After tuning, the single-value grid further down can be swapped in (uncomment
# it and comment this one out) to refit with one fixed parameter set.
param_grid = {"max_depth": [4, 8, 16, None],
              "max_features": ['sqrt', 'log2', 0.1],
              "min_samples_split": [0.001, 0.01, 0.1],
              "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
              "bootstrap": [True, False]}

# NOTE(review): the values below differ from the best_params_ printed for this
# model further down; update them before switching to this grid.
#param_grid = {"max_depth": [8],
#          "max_features": [0.1],
#          "min_samples_split": [0.001],
#          "min_weight_fraction_leaf": [0.0001],
#          "bootstrap": [False]}

rf = RandomForestRegressor(n_estimators=1000, random_state=999, criterion='mae')

# Materialise the (train, test) index pairs as a list. gkf.split() returns a
# one-shot generator that the first gs.fit() would exhaust, so re-running the
# fit cell without re-running this one would find no splits. The folds are the same.
cv_splits = list(gkf.split(X, y, groups))
gs = GridSearchCV(rf, param_grid=param_grid, cv=cv_splits, scoring='neg_mean_absolute_error', n_jobs=1)

In [71]:
# Run the grid search: every combination in param_grid is cross-validated on the
# group-wise folds (with the full grid above: 216 combinations x 6 folds).
# The fitted GridSearchCV object is the cell's displayed value.
gs.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x11ca212d0>,
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_es...
                                             oob_score=False, random_state=999,
                                             verbose=0, warm_start=False

In [72]:
# Report the winning hyperparameter combination. This is the tuning result when
# the full (first) param_grid was searched; with the single-value grid it simply
# echoes that grid.
print(gs.best_params_)

{'bootstrap': False, 'max_depth': 16, 'max_features': 'sqrt', 'min_samples_split': 0.001, 'min_weight_fraction_leaf': 0.0001}


In [73]:
# The search is scored with neg_mean_absolute_error, so negate the best score
# to report it as a (positive) mean absolute error.
best_mae = round(-gs.best_score_, 2)
print('The best mean absolute error is', best_mae)

The best mean absolute error is 406.7


In [74]:
# Persist the best estimator found by the grid search. joblib.dump returns the
# list of file names it wrote, which is what the cell displays.
joblib.dump(gs.best_estimator_, 'models/bone_L2_season_precip_hum.pkl')

['models/bone_L2_season_precip_hum.pkl']

In [75]:
# Reload the saved model from disk so the following cells work from the pickled
# artifact; the bare name on the last line displays its parameters.
bone_L2_season_precip_hum = joblib.load('models/bone_L2_season_precip_hum.pkl')
bone_L2_season_precip_hum

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                      max_depth=16, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features for L2 season, precipitation, humidity model

In [77]:
# Feature importances of the fitted forest, and their standard deviation across
# the individual trees (std is computed here but not used in this cell).
importances = bone_L2_season_precip_hum.feature_importances_
std = np.std([tree.feature_importances_ for tree in bone_L2_season_precip_hum.estimators_],
             axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Save the feature metadata table next to the importances file.
feature_metadata = exp_L2_season_precip_hum.feature_metadata
np.savetxt("importances/bone_L2_season_precip_hum_features.csv", feature_metadata, delimiter=",", fmt='%s')

feature_names = feature_metadata.index.values

# Print the top 25 features in order and collect every feature as a flat
# sequence: name, importance, name, importance, ...
general_importances = []
print("Feature:\t\t\t\tImportance:")
for count, i in enumerate(indices):
    general_importances += (feature_names[i], importances[i])
    if count < 25:
        print(str(count+1)+". "+str(feature_names[i])+"\t"+str(importances[i]))

# One value per row. -1 lets numpy infer the row count (2 * number of features)
# instead of hard-coding it, so the cell does not fail when the feature count changes.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

np.savetxt("importances/bone_L2_season_precip_hum_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. accumulated_percent_humidity_days_base0	0.22680683261232143
2. accumulated_precipitation_inches_base0	0.17959272299897452
3. k__Bacteria;p__Verrucomicrobia	0.07480774769635558
4. k__Bacteria;p__Firmicutes	0.06066531577550839
5. k__Bacteria;p__Bacteroidetes	0.05680288160051581
6. k__Bacteria;p__Proteobacteria	0.053385324101754325
7. k__Bacteria;p__Planctomycetes	0.051632336130879156
8. k__Bacteria;p__Actinobacteria	0.03795128541400078
9. k__Bacteria;p__Gemmatimonadetes	0.03286684337826081
10. k__Bacteria;p__[Thermi]	0.03228745628819685
11. k__Bacteria;p__Chloroflexi	0.03107105755467901
12. k__Bacteria;p__Fusobacteria	0.028624672876159583
13. k__Bacteria;p__Acidobacteria	0.026428551795106284
14. k__Bacteria;p__FBP	0.015394111157415563
15. k__Bacteria;p__BRC1	0.01072037299553048
16. k__Bacteria;p__Cyanobacteria	0.009541221772661306
17. k__Bacteria;p__Tenericutes	0.008948693912292685
18. k__Bacteria;p__Armatimonadetes	0.008796914714527584
19. k__Bacteria;p__GN02	