## Description

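Random forest regression models of bone 16S rRNA data that predict `ADD_0`, spring season only, built at every level (ASV and collapsed taxonomy tables). Run in the `ml_new` environment on rarefied tables imported without normalization. Each model uses n = 1000 estimators and grid-search hyperparameter tuning, scored by mean absolute error under 3-fold cross-validation (regular, non-nested; folds are grouped by `host_subject_id`, so each fold holds out one body). No metadata features are included.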

In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing, svm, metrics
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV, GroupKFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVR
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_error, roc_auc_score, precision_score, make_scorer
from sklearn.linear_model import LassoCV, Lasso, LassoLarsIC, ElasticNet, LassoLarsCV
from sklearn.externals import joblib
from scipy.stats import randint as sp_randint
from matplotlib import pyplot as plt
from sklearn.feature_selection import RFECV
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier



In [2]:
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sns
%matplotlib inline
import biom
import calour as ca
from scipy import stats
import scipy
import pickle
import time
import math
import inspect
# Raise the pandas row display cap so long Series/DataFrames are shown in full.
pd.set_option('display.max_rows', 10000)

  import pandas.util.testing as pdt


## Import rarefied ASV table

In [3]:
# Load the rarefied ASV table together with the sample map; normalize=None
# leaves the counts un-normalized on import.
# NOTE(review): the table path is absolute and user-specific, so this cell only
# runs where that Dropbox folder exists.
exp_ASV = ca.read_amplicon('/Users/heatherdeel/Dropbox/PMI_3_analyses/bone/01_16S/01_qiime2_analysis/feature_tables/exported-feature-tables/frag_ins_filtered_noChloMito_17098_table.biom', '../02_metadata/maps/map3.txt', min_reads=0, normalize = None)




In [4]:
# Number of samples per host_subject_id, before any season filtering.
exp_ASV.sample_metadata['host_subject_id'].value_counts()

STAFS2016.011    8
STAFS2016.067    8
STAFS2016.065    8
STAFS2016.024    7
STAFS2016.007    7
STAFS2016.064    7
Name: host_subject_id, dtype: int64

In [5]:
# Display the ADD_0 column; it is cast to float and used as the regression
# target in the modeling cells below.
exp_ASV.sample_metadata['ADD_0']

#SampleID
11553.SHSU.STAFS2016.024.R09      708.0555556
11553.STAFS.2016.065.L12          2268.055556
11553.STAFS.2016.067.R12          2414.444444
11553.SHSU.STAFS2016.007.L09      1911.111111
11553.SHSU.STAFS2016.011.L08      1301.666667
11553.STAFS.2016.065.R11          4756.388889
11553.STAFS.2016.065.R09          3456.111111
11553.SHSU.STAFS2016.024.L12      4285.833333
11553.SHSU.STAFS2016.007.L11      2584.444444
11553.SHSU.STAFS2016.011.L10      3777.222222
11553.STAFS.2016.064.R10          3456.111111
11553.SHSU.STAFS2016.024.R11      2584.444444
11553.STAFS.2016.067.L11.june     5201.388889
11553.SHSU.STAFS2016.007.R12      1301.666667
11553.STAFS.2016.065.R10          2804.722222
11553.STAFS.2016.064.R09          592.7777778
11553.STAFS.2016.067.L10          3623.055556
11553.SHSU.STAFS2016.011.R12      708.0555556
11553.STAFS.2016.064.R11          1300.555556
11553.SHSU.STAFS2016.011.L11      4285.833333
11553.SHSU.STAFS2016.007.L10      4821.388889
11553.STAFS.2016.065.R08

## Filter to only spring data

In [6]:
# Keep only the spring samples, then count how many remain per host_subject_id.
Spring_ASV = exp_ASV.filter_samples('season', 'spring')
Spring_ASV.sample_metadata['host_subject_id'].value_counts()

STAFS2016.011    8
STAFS2016.024    7
STAFS2016.007    7
Name: host_subject_id, dtype: int64

## Spring ASV modeling

In [7]:
# Grouped 3-fold CV: the spring subset contains three host_subject_id values,
# so each fold holds out every sample from one body.
gkf = GroupKFold(3)

# Feature matrix and regression target (ADD_0 cast to float) for the spring samples.
X = Spring_ASV.data
y = Spring_ASV.sample_metadata['ADD_0'].astype(float)

# Samples that share a host_subject_id are always kept in the same fold.
groups = Spring_ASV.sample_metadata['host_subject_id']

# Full search grid: use it for the initial tuning run, then read the winning
# combination from gs.best_params_.
param_grid = {"max_depth": [4, 8, 16, None],
              "max_features": ['sqrt', 'log2', 0.1],
              "min_samples_split": [0.001, 0.01, 0.1],
              "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
              "bootstrap": [True, False]}

# Single-combination grid: uncomment it (and comment out the grid above) to
# refit quickly with values that were already tuned.
#param_grid = {"max_depth": [8],
#          "max_features": [0.1],
#          "min_samples_split": [0.001],
#          "min_weight_fraction_leaf": [0.0001],
#          "bootstrap": [False]}

rf = RandomForestRegressor(n_estimators=1000, random_state=999, criterion='mae')

# Materialize the folds. gkf.split() returns a one-shot generator; passing it
# directly leaves an exhausted iterator on gs.cv after the first fit, so
# re-running gs.fit (or cloning gs) without re-running this cell fails. A list
# holds the same (train, test) index pairs and can be reused.
cv_splits = list(gkf.split(X, y, groups))
gs = GridSearchCV(rf, param_grid=param_grid, cv=cv_splits, scoring='neg_mean_absolute_error', n_jobs=1)

In [8]:
# Run the grid search: every param_grid combination is fitted on each CV fold,
# and the best one is exposed afterwards as gs.best_estimator_.
gs.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x1200a7150>,
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_es...
                                             oob_score=False, random_state=999,
                                             verbose=0, warm_start=False

In [9]:
# Print the winning hyperparameter combination. After a run with the full grid,
# these are the values to copy into the single-combination grid.
print(gs.best_params_)

{'bootstrap': False, 'max_depth': 4, 'max_features': 0.1, 'min_samples_split': 0.001, 'min_weight_fraction_leaf': 0.0001}


In [10]:
# The search is scored with 'neg_mean_absolute_error', so best_score_ is
# negated to report a positive MAE (same units as ADD_0).
print('The best mean absolute error is', round(-gs.best_score_,2))

The best mean absolute error is 872.02


In [11]:
# Persist the best estimator found by the grid search; the models/ directory
# must already exist.
joblib.dump(gs.best_estimator_, 'models/bone_spring_ASV_no_metadata.pkl')

['models/bone_spring_ASV_no_metadata.pkl']

In [12]:
# Reload the saved model from disk and display its parameters.
# joblib files are pickle-based: only load files from a trusted source.
bone_spring_ASV = joblib.load('models/bone_spring_ASV_no_metadata.pkl')
bone_spring_ASV

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                      max_depth=4, max_features=0.1, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features of spring ASV model

In [14]:
# Importance of every feature in the fitted forest, plus the spread of that
# importance across the individual trees.
importances = bone_spring_ASV.feature_importances_
# NOTE(review): std is not used in this cell; kept in case a later cell reads it.
std = np.std([tree.feature_importances_ for tree in bone_spring_ASV.estimators_],
             axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Save the feature metadata table (one row per feature, in table order).
feature_metadata = Spring_ASV.feature_metadata
np.savetxt("importances/bone_spring_ASV_no_metadata_features.csv", feature_metadata, delimiter=",", fmt='%s')

# Feature IDs in table order, looked up once rather than on every iteration.
feature_ids = feature_metadata.index.values

# Collect feature ID / importance in descending order of importance and print
# the 25 most important features.
general_importances = []
print("Feature:\t\t\t\tImportance:")
for rank, i in enumerate(indices):
    # += with a tuple extends the flat list by two items: the ID, then its importance.
    general_importances += (feature_ids[i], importances[i])
    if rank < 25:
        print(str(rank+1)+". "+str(feature_ids[i])+"\t"+str(importances[i]))

# One value per row, alternating feature ID and importance. reshape(-1, 1)
# infers the row count, so the cell keeps working when the number of features
# changes (a hard-coded row count raises on any other table size).
# NOTE(review): a two-column (feature, importance) layout may have been
# intended; the single-column layout is kept so the output file is unchanged.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

np.savetxt("importances/bone_spring_ASV_no_features_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. TACGTAGGGGGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGGGTACGTAGGCGGTTTAGAAAGTCAGATGTTAAAGCGTGAGGCTCAACCTCATAAAGCATTTGAAACTTCTAGACTTGAGTAGTGGAGAGGAAAGTGGAATTCCTAGT	0.07489245554322162
2. TACGTAGGGCGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGTTTGTCGCGTCTGGTGTGAAAACTCGAGGCTCAACCTCGAGCTTGCATCGGGTACGGGCAGACTAGAGTGCGGTAGGGGAGACTGGAATTCCTGG	0.05618468317160884
3. TACGTAGGGCGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCTTGTAGGTGGCTTGTCGCGTCTGCCGTGAAAACCCGAGGCTCAACCTCGGGCGTGCGGTGGGTACGGGCAGGCTAGAGTGTGGTAGGGGAGACTGGAACTCCTGG	0.047554065609534846
4. TACGTAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGTTTTGTAAGACAGTGGTGAAATCCCCGGGCTCAACCTGGGAACTGCCATTGTGACTGCAAGGCTAGAGTGCGGCAGAGGGGGATGGAATTCCGCG	0.045072869549405696
5. TACGTAGGGTGCAAGCGTTGTCCGGAATTATTGGGCGTAAAGAGCTCGTAGGCGGCTTGTCGCGTCTGCCGTGAAAACCTCAGGCTCAACCTGGGGCGTGCGGTGGGTACGGGCAGGCTAGAGTGCGGTAGGGGAGATTGGAATTCCTGG	0.042091762518432066
6. TACGAAGGGGGCTAGCGTTGCTCGGAATTACTGGGCGTAAAGGGCGCGTAGGCGGACAGTTTAGTCAGAGGTGAAAGCCCAGGGCTCAACCTTGGAATTG

## Import rarefied table collapsed at level 7

In [15]:
# Load the rarefied table collapsed at level 7 together with the sample map;
# normalize=None leaves the counts un-normalized on import.
# NOTE(review): the table path is absolute and user-specific, so this cell only
# runs where that Dropbox folder exists.
exp_L7 = ca.read_amplicon('/Users/heatherdeel/Dropbox/PMI_3_analyses/bone/01_16S/01_qiime2_analysis/feature_tables/collapsed_tables/frag_ins_noChloMito_17098_L7_table.biom', '../02_metadata/maps/map3.txt', min_reads=0, normalize = None)




In [16]:
# Keep only the spring samples, then count how many remain per host_subject_id.
Spring_L7 = exp_L7.filter_samples('season', 'spring')
Spring_L7.sample_metadata['host_subject_id'].value_counts()

STAFS2016.011    8
STAFS2016.024    7
STAFS2016.007    7
Name: host_subject_id, dtype: int64

In [17]:
# Show the feature metadata of the level-7 table (features are taxonomy strings).
print(Spring_L7.feature_metadata)

                                                                                          _feature_id
k__Archaea;p__Crenarchaeota;c__Thaumarchaeota;o...  k__Archaea;p__Crenarchaeota;c__Thaumarchaeota;...
k__Archaea;p__Euryarchaeota;c__Methanobacteria;...  k__Archaea;p__Euryarchaeota;c__Methanobacteria...
k__Bacteria;__;__;__;__;__;__                                           k__Bacteria;__;__;__;__;__;__
k__Bacteria;p__Acidobacteria;c__DA052;o__Ellin6...  k__Bacteria;p__Acidobacteria;c__DA052;o__Ellin...
k__Bacteria;p__Acidobacteria;c__Solibacteres;o_...  k__Bacteria;p__Acidobacteria;c__Solibacteres;o...
k__Bacteria;p__Acidobacteria;c__[Chloracidobact...  k__Bacteria;p__Acidobacteria;c__[Chloracidobac...
k__Bacteria;p__Acidobacteria;c__[Chloracidobact...  k__Bacteria;p__Acidobacteria;c__[Chloracidobac...
k__Bacteria;p__Actinobacteria;c__Acidimicrobiia...  k__Bacteria;p__Actinobacteria;c__Acidimicrobii...
k__Bacteria;p__Actinobacteria;c__Acidimicrobiia...  k__Bacteria;p__Actinobacteria;

## Spring L7 modeling

In [18]:
# Grouped 3-fold CV: the spring subset contains three host_subject_id values,
# so each fold holds out every sample from one body.
gkf = GroupKFold(3)

# Feature matrix and regression target (ADD_0 cast to float) for the spring samples.
X = Spring_L7.data
y = Spring_L7.sample_metadata['ADD_0'].astype(float)

# Samples that share a host_subject_id are always kept in the same fold.
groups = Spring_L7.sample_metadata['host_subject_id']

# Full search grid: use it for the initial tuning run, then read the winning
# combination from gs.best_params_.
param_grid = {"max_depth": [4, 8, 16, None],
              "max_features": ['sqrt', 'log2', 0.1],
              "min_samples_split": [0.001, 0.01, 0.1],
              "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
              "bootstrap": [True, False]}

# Single-combination grid: uncomment it (and comment out the grid above) to
# refit quickly with values that were already tuned.
#param_grid = {"max_depth": [8],
#          "max_features": [0.1],
#          "min_samples_split": [0.001],
#          "min_weight_fraction_leaf": [0.0001],
#          "bootstrap": [False]}

rf = RandomForestRegressor(n_estimators=1000, random_state=999, criterion='mae')

# Materialize the folds. gkf.split() returns a one-shot generator; passing it
# directly leaves an exhausted iterator on gs.cv after the first fit, so
# re-running gs.fit (or cloning gs) without re-running this cell fails. A list
# holds the same (train, test) index pairs and can be reused.
cv_splits = list(gkf.split(X, y, groups))
gs = GridSearchCV(rf, param_grid=param_grid, cv=cv_splits, scoring='neg_mean_absolute_error', n_jobs=1)

In [19]:
# Run the grid search: every param_grid combination is fitted on each CV fold,
# and the best one is exposed afterwards as gs.best_estimator_.
gs.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x11fc4a250>,
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_es...
                                             oob_score=False, random_state=999,
                                             verbose=0, warm_start=False

In [20]:
# Print the winning hyperparameter combination. After a run with the full grid,
# these are the values to copy into the single-combination grid.
print(gs.best_params_)

{'bootstrap': False, 'max_depth': 8, 'max_features': 0.1, 'min_samples_split': 0.001, 'min_weight_fraction_leaf': 0.0001}


In [21]:
# The search is scored with 'neg_mean_absolute_error', so best_score_ is
# negated to report a positive MAE (same units as ADD_0).
print('The best mean absolute error is', round(-gs.best_score_,2))

The best mean absolute error is 904.18


In [22]:
# Persist the best estimator found by the grid search; the models/ directory
# must already exist.
joblib.dump(gs.best_estimator_, 'models/bone_spring_L7_no_metadata.pkl')

['models/bone_spring_L7_no_metadata.pkl']

In [23]:
# Reload the saved model from disk and display its parameters.
# joblib files are pickle-based: only load files from a trusted source.
bone_spring_L7 = joblib.load('models/bone_spring_L7_no_metadata.pkl')
bone_spring_L7

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                      max_depth=8, max_features=0.1, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features of spring L7 model

In [25]:
# Importance of every feature in the fitted forest, plus the spread of that
# importance across the individual trees.
importances = bone_spring_L7.feature_importances_
# NOTE(review): std is not used in this cell; kept in case a later cell reads it.
std = np.std([tree.feature_importances_ for tree in bone_spring_L7.estimators_],
             axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Save the feature metadata table (one row per feature, in table order).
feature_metadata = Spring_L7.feature_metadata
np.savetxt("importances/bone_spring_L7_no_metadata_features.csv", feature_metadata, delimiter=",", fmt='%s')

# Feature IDs in table order, looked up once rather than on every iteration.
feature_ids = feature_metadata.index.values

# Collect feature ID / importance in descending order of importance and print
# the 25 most important features.
general_importances = []
print("Feature:\t\t\t\tImportance:")
for rank, i in enumerate(indices):
    # += with a tuple extends the flat list by two items: the ID, then its importance.
    general_importances += (feature_ids[i], importances[i])
    if rank < 25:
        print(str(rank+1)+". "+str(feature_ids[i])+"\t"+str(importances[i]))

# One value per row, alternating feature ID and importance. reshape(-1, 1)
# infers the row count, so the cell keeps working when the number of features
# changes (a hard-coded row count raises on any other table size).
# NOTE(review): a two-column (feature, importance) layout may have been
# intended; the single-column layout is kept so the output file is unchanged.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

np.savetxt("importances/bone_spring_L7_no_features_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__[Tissierellaceae];g__Gallicola;s__	0.05630369289508368
2. k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Promicromonosporaceae;g__Cellulosimicrobium;s__	0.05566772100379429
3. k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Dermabacteraceae;g__Brachybacterium;s__	0.04224946637057502
4. k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Dietziaceae;g__Dietzia;s__	0.03552387273347633
5. k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__[Tissierellaceae];g__;s__	0.03486331581467503
6. k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodospirillales;f__Acetobacteraceae;g__Roseococcus;s__	0.029799208108220177
7. k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Comamonadaceae;g__Leptothrix;s__	0.02345427299142483
8. k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Caulobacterales;f__Caulobacter

## Import rarefied table collapsed at level 6

In [26]:
# Load the rarefied table collapsed at level 6 together with the sample map;
# normalize=None leaves the counts un-normalized on import.
# NOTE(review): the table path is absolute and user-specific, so this cell only
# runs where that Dropbox folder exists.
exp_L6 = ca.read_amplicon('/Users/heatherdeel/Dropbox/PMI_3_analyses/bone/01_16S/01_qiime2_analysis/feature_tables/collapsed_tables/frag_ins_noChloMito_17098_L6_table.biom', '../02_metadata/maps/map3.txt', min_reads=0, normalize = None)




In [27]:
# Keep only the spring samples, then count how many remain per host_subject_id.
Spring_L6 = exp_L6.filter_samples('season', 'spring')
Spring_L6.sample_metadata['host_subject_id'].value_counts()

STAFS2016.011    8
STAFS2016.024    7
STAFS2016.007    7
Name: host_subject_id, dtype: int64

In [28]:
# Show the feature metadata of the level-6 table (features are taxonomy strings).
print(Spring_L6.feature_metadata)

                                                                                          _feature_id
k__Archaea;p__Crenarchaeota;c__Thaumarchaeota;o...  k__Archaea;p__Crenarchaeota;c__Thaumarchaeota;...
k__Archaea;p__Euryarchaeota;c__Methanobacteria;...  k__Archaea;p__Euryarchaeota;c__Methanobacteria...
k__Bacteria;__;__;__;__;__                                                 k__Bacteria;__;__;__;__;__
k__Bacteria;p__Acidobacteria;c__DA052;o__Ellin6...  k__Bacteria;p__Acidobacteria;c__DA052;o__Ellin...
k__Bacteria;p__Acidobacteria;c__Solibacteres;o_...  k__Bacteria;p__Acidobacteria;c__Solibacteres;o...
k__Bacteria;p__Acidobacteria;c__[Chloracidobact...  k__Bacteria;p__Acidobacteria;c__[Chloracidobac...
k__Bacteria;p__Acidobacteria;c__[Chloracidobact...  k__Bacteria;p__Acidobacteria;c__[Chloracidobac...
k__Bacteria;p__Actinobacteria;c__Acidimicrobiia...  k__Bacteria;p__Actinobacteria;c__Acidimicrobii...
k__Bacteria;p__Actinobacteria;c__Acidimicrobiia...  k__Bacteria;p__Actinobacteria;

## Spring L6 modeling

In [29]:
# Grouped 3-fold CV: the spring subset contains three host_subject_id values,
# so each fold holds out every sample from one body.
gkf = GroupKFold(3)

# Feature matrix and regression target (ADD_0 cast to float) for the spring samples.
X = Spring_L6.data
y = Spring_L6.sample_metadata['ADD_0'].astype(float)

# Samples that share a host_subject_id are always kept in the same fold.
groups = Spring_L6.sample_metadata['host_subject_id']

# Full search grid: use it for the initial tuning run, then read the winning
# combination from gs.best_params_.
param_grid = {"max_depth": [4, 8, 16, None],
              "max_features": ['sqrt', 'log2', 0.1],
              "min_samples_split": [0.001, 0.01, 0.1],
              "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
              "bootstrap": [True, False]}

# Single-combination grid: uncomment it (and comment out the grid above) to
# refit quickly with values that were already tuned.
#param_grid = {"max_depth": [8],
#          "max_features": [0.1],
#          "min_samples_split": [0.001],
#          "min_weight_fraction_leaf": [0.0001],
#          "bootstrap": [False]}

rf = RandomForestRegressor(n_estimators=1000, random_state=999, criterion='mae')

# Materialize the folds. gkf.split() returns a one-shot generator; passing it
# directly leaves an exhausted iterator on gs.cv after the first fit, so
# re-running gs.fit (or cloning gs) without re-running this cell fails. A list
# holds the same (train, test) index pairs and can be reused.
cv_splits = list(gkf.split(X, y, groups))
gs = GridSearchCV(rf, param_grid=param_grid, cv=cv_splits, scoring='neg_mean_absolute_error', n_jobs=1)

In [30]:
# Run the grid search: every param_grid combination is fitted on each CV fold,
# and the best one is exposed afterwards as gs.best_estimator_.
gs.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x11ffcd8d0>,
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_es...
                                             oob_score=False, random_state=999,
                                             verbose=0, warm_start=False

In [31]:
# Print the winning hyperparameter combination. After a run with the full grid,
# these are the values to copy into the single-combination grid.
print(gs.best_params_)

{'bootstrap': False, 'max_depth': 8, 'max_features': 0.1, 'min_samples_split': 0.001, 'min_weight_fraction_leaf': 0.0001}


In [32]:
# The search is scored with 'neg_mean_absolute_error', so best_score_ is
# negated to report a positive MAE (same units as ADD_0).
print('The best mean absolute error is', round(-gs.best_score_,2))

The best mean absolute error is 884.32


In [33]:
# Persist the best estimator found by the grid search; the models/ directory
# must already exist.
joblib.dump(gs.best_estimator_, 'models/bone_spring_L6_no_metadata.pkl')

['models/bone_spring_L6_no_metadata.pkl']

In [34]:
# Reload the saved model from disk and display its parameters.
# joblib files are pickle-based: only load files from a trusted source.
bone_spring_L6 = joblib.load('models/bone_spring_L6_no_metadata.pkl')
bone_spring_L6

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                      max_depth=8, max_features=0.1, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features of spring L6 model

In [36]:
# Importance of every feature in the fitted forest, plus the spread of that
# importance across the individual trees.
importances = bone_spring_L6.feature_importances_
# NOTE(review): std is not used in this cell; kept in case a later cell reads it.
std = np.std([tree.feature_importances_ for tree in bone_spring_L6.estimators_],
             axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Save the feature metadata table (one row per feature, in table order).
feature_metadata = Spring_L6.feature_metadata
np.savetxt("importances/bone_spring_L6_no_metadata_features.csv", feature_metadata, delimiter=",", fmt='%s')

# Feature IDs in table order, looked up once rather than on every iteration.
feature_ids = feature_metadata.index.values

# Collect feature ID / importance in descending order of importance and print
# the 25 most important features.
general_importances = []
print("Feature:\t\t\t\tImportance:")
for rank, i in enumerate(indices):
    # += with a tuple extends the flat list by two items: the ID, then its importance.
    general_importances += (feature_ids[i], importances[i])
    if rank < 25:
        print(str(rank+1)+". "+str(feature_ids[i])+"\t"+str(importances[i]))

# One value per row, alternating feature ID and importance. reshape(-1, 1)
# infers the row count, so the cell keeps working when the number of features
# changes (a hard-coded row count raises on any other table size).
# NOTE(review): a two-column (feature, importance) layout may have been
# intended; the single-column layout is kept so the output file is unchanged.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

np.savetxt("importances/bone_spring_L6_no_features_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__[Tissierellaceae];g__Gallicola	0.061123173745340494
2. k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Promicromonosporaceae;g__Cellulosimicrobium	0.06065807317052671
3. k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Dietziaceae;g__Dietzia	0.044960107070838216
4. k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__[Tissierellaceae];g__	0.037520632187693144
5. k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodospirillales;f__Acetobacteraceae;g__Roseococcus	0.027498634101359696
6. k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Comamonadaceae;g__Leptothrix	0.02531771681357879
7. k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__[Tissierellaceae];g__Peptoniphilus	0.02185763739339386
8. k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pseudomonadales;f__Pseudomonadaceae;g__Pseudomonas	0.0211023922512

## Import rarefied table collapsed at level 5

In [37]:
# Load the rarefied table collapsed at level 5 together with the sample map;
# normalize=None leaves the counts un-normalized on import.
# NOTE(review): the table path is absolute and user-specific, so this cell only
# runs where that Dropbox folder exists.
exp_L5 = ca.read_amplicon('/Users/heatherdeel/Dropbox/PMI_3_analyses/bone/01_16S/01_qiime2_analysis/feature_tables/collapsed_tables/frag_ins_noChloMito_17098_L5_table.biom', '../02_metadata/maps/map3.txt', min_reads=0, normalize = None)




In [38]:
# Keep only the spring samples, then count how many remain per host_subject_id.
Spring_L5 = exp_L5.filter_samples('season', 'spring')
Spring_L5.sample_metadata['host_subject_id'].value_counts()

STAFS2016.011    8
STAFS2016.024    7
STAFS2016.007    7
Name: host_subject_id, dtype: int64

In [39]:
# Show the feature metadata of the level-5 table (features are taxonomy strings).
print(Spring_L5.feature_metadata)

                                                                                          _feature_id
k__Archaea;p__Crenarchaeota;c__Thaumarchaeota;o...  k__Archaea;p__Crenarchaeota;c__Thaumarchaeota;...
k__Archaea;p__Euryarchaeota;c__Methanobacteria;...  k__Archaea;p__Euryarchaeota;c__Methanobacteria...
k__Bacteria;__;__;__;__                                                       k__Bacteria;__;__;__;__
k__Bacteria;p__Acidobacteria;c__DA052;o__Ellin6...  k__Bacteria;p__Acidobacteria;c__DA052;o__Ellin...
k__Bacteria;p__Acidobacteria;c__Solibacteres;o_...  k__Bacteria;p__Acidobacteria;c__Solibacteres;o...
k__Bacteria;p__Acidobacteria;c__[Chloracidobact...  k__Bacteria;p__Acidobacteria;c__[Chloracidobac...
k__Bacteria;p__Acidobacteria;c__[Chloracidobact...  k__Bacteria;p__Acidobacteria;c__[Chloracidobac...
k__Bacteria;p__Actinobacteria;c__Acidimicrobiia...  k__Bacteria;p__Actinobacteria;c__Acidimicrobii...
k__Bacteria;p__Actinobacteria;c__Acidimicrobiia...  k__Bacteria;p__Actinobacteria;

## Spring L5 modeling

In [40]:
# Grouped 3-fold CV: the spring subset contains three host_subject_id values,
# so each fold holds out every sample from one body.
gkf = GroupKFold(3)

# Feature matrix and regression target (ADD_0 cast to float) for the spring samples.
X = Spring_L5.data
y = Spring_L5.sample_metadata['ADD_0'].astype(float)

# Samples that share a host_subject_id are always kept in the same fold.
groups = Spring_L5.sample_metadata['host_subject_id']

# Full search grid: use it for the initial tuning run, then read the winning
# combination from gs.best_params_.
param_grid = {"max_depth": [4, 8, 16, None],
              "max_features": ['sqrt', 'log2', 0.1],
              "min_samples_split": [0.001, 0.01, 0.1],
              "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
              "bootstrap": [True, False]}

# Single-combination grid: uncomment it (and comment out the grid above) to
# refit quickly with values that were already tuned.
#param_grid = {"max_depth": [8],
#          "max_features": [0.1],
#          "min_samples_split": [0.001],
#          "min_weight_fraction_leaf": [0.0001],
#          "bootstrap": [False]}

rf = RandomForestRegressor(n_estimators=1000, random_state=999, criterion='mae')

# Materialize the folds. gkf.split() returns a one-shot generator; passing it
# directly leaves an exhausted iterator on gs.cv after the first fit, so
# re-running gs.fit (or cloning gs) without re-running this cell fails. A list
# holds the same (train, test) index pairs and can be reused.
cv_splits = list(gkf.split(X, y, groups))
gs = GridSearchCV(rf, param_grid=param_grid, cv=cv_splits, scoring='neg_mean_absolute_error', n_jobs=1)

In [41]:
# Run the grid search: every param_grid combination is fitted on each CV fold,
# and the best one is exposed afterwards as gs.best_estimator_.
gs.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x120e4f850>,
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_es...
                                             oob_score=False, random_state=999,
                                             verbose=0, warm_start=False

In [42]:
# Print the winning hyperparameter combination. After a run with the full grid,
# these are the values to copy into the single-combination grid.
print(gs.best_params_)

{'bootstrap': False, 'max_depth': 16, 'max_features': 0.1, 'min_samples_split': 0.001, 'min_weight_fraction_leaf': 0.0001}


In [43]:
# The search is scored with 'neg_mean_absolute_error', so best_score_ is
# negated to report a positive MAE (same units as ADD_0).
print('The best mean absolute error is', round(-gs.best_score_,2))

The best mean absolute error is 941.63


In [44]:
# Persist the best estimator found by the grid search; the models/ directory
# must already exist.
joblib.dump(gs.best_estimator_, 'models/bone_spring_L5_no_metadata.pkl')

['models/bone_spring_L5_no_metadata.pkl']

In [45]:
# Reload the saved model from disk and display its parameters.
# joblib files are pickle-based: only load files from a trusted source.
bone_spring_L5 = joblib.load('models/bone_spring_L5_no_metadata.pkl')
bone_spring_L5

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                      max_depth=16, max_features=0.1, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features of spring L5 model

In [47]:
# Importance of every feature in the fitted forest, plus the spread of that
# importance across the individual trees.
importances = bone_spring_L5.feature_importances_
# NOTE(review): std is not used in this cell; kept in case a later cell reads it.
std = np.std([tree.feature_importances_ for tree in bone_spring_L5.estimators_],
             axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Save the feature metadata table (one row per feature, in table order).
feature_metadata = Spring_L5.feature_metadata
np.savetxt("importances/bone_spring_L5_no_metadata_features.csv", feature_metadata, delimiter=",", fmt='%s')

# Feature IDs in table order, looked up once rather than on every iteration.
feature_ids = feature_metadata.index.values

# Collect feature ID / importance in descending order of importance and print
# the 25 most important features.
general_importances = []
print("Feature:\t\t\t\tImportance:")
for rank, i in enumerate(indices):
    # += with a tuple extends the flat list by two items: the ID, then its importance.
    general_importances += (feature_ids[i], importances[i])
    if rank < 25:
        print(str(rank+1)+". "+str(feature_ids[i])+"\t"+str(importances[i]))

# One value per row, alternating feature ID and importance. reshape(-1, 1)
# infers the row count, so the cell keeps working when the number of features
# changes (a hard-coded row count raises on any other table size).
# NOTE(review): a two-column (feature, importance) layout may have been
# intended; the single-column layout is kept so the output file is unchanged.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

np.savetxt("importances/bone_spring_L5_no_features_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Promicromonosporaceae	0.05477378880570991
2. k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Dietziaceae	0.0518577414018554
3. k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Peptostreptococcaceae	0.04140386696081289
4. k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Phyllobacteriaceae	0.03831272879518664
5. k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Planococcaceae	0.03686118761981151
6. k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Oxalobacteraceae	0.026478328182116095
7. k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Caulobacterales;f__Caulobacteraceae	0.02464191491115831
8. k__Bacteria;p__Firmicutes;c__Bacilli;o__Gemellales;f__Gemellaceae	0.023340968484004777
9. k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales;f__Bacillaceae	0.022864892665730725
10. k__Bacteria;p__Proteobacteria;c__

## Import rarefied table collapsed at level 4

In [3]:
# Load the rarefied table collapsed at level 4 together with the sample map;
# normalize=None leaves the counts un-normalized on import.
# NOTE(review): the table path is absolute and user-specific, so this cell only
# runs where that Dropbox folder exists.
exp_L4 = ca.read_amplicon('/Users/heatherdeel/Dropbox/PMI_3_analyses/bone/01_16S/01_qiime2_analysis/feature_tables/collapsed_tables/frag_ins_noChloMito_17098_L4_table.biom', '../02_metadata/maps/map3.txt', min_reads=0, normalize = None)




In [4]:
# Keep only the spring samples, then count how many remain per host_subject_id.
Spring_L4 = exp_L4.filter_samples('season', 'spring')
Spring_L4.sample_metadata['host_subject_id'].value_counts()

STAFS2016.011    8
STAFS2016.024    7
STAFS2016.007    7
Name: host_subject_id, dtype: int64

In [5]:
# Show the feature metadata of the level-4 table (features are taxonomy strings).
print(Spring_L4.feature_metadata)

                                                                                          _feature_id
k__Archaea;p__Crenarchaeota;c__Thaumarchaeota;o...  k__Archaea;p__Crenarchaeota;c__Thaumarchaeota;...
k__Archaea;p__Euryarchaeota;c__Methanobacteria;...  k__Archaea;p__Euryarchaeota;c__Methanobacteria...
k__Bacteria;__;__;__                                                             k__Bacteria;__;__;__
k__Bacteria;p__Acidobacteria;c__DA052;o__Ellin6513  k__Bacteria;p__Acidobacteria;c__DA052;o__Ellin...
k__Bacteria;p__Acidobacteria;c__Solibacteres;o_...  k__Bacteria;p__Acidobacteria;c__Solibacteres;o...
k__Bacteria;p__Acidobacteria;c__[Chloracidobact...  k__Bacteria;p__Acidobacteria;c__[Chloracidobac...
k__Bacteria;p__Actinobacteria;c__Acidimicrobiia...  k__Bacteria;p__Actinobacteria;c__Acidimicrobii...
k__Bacteria;p__Actinobacteria;c__Actinobacteria...  k__Bacteria;p__Actinobacteria;c__Actinobacteri...
k__Bacteria;p__Actinobacteria;c__Coriobacteriia...  k__Bacteria;p__Actinobacteria;

## Spring L4 modeling

In [6]:
# Grouped 3-fold CV: the spring subset contains three host_subject_id values,
# so each fold holds out every sample from one body.
gkf = GroupKFold(3)

# Feature matrix and regression target (ADD_0 cast to float) for the spring samples.
X = Spring_L4.data
y = Spring_L4.sample_metadata['ADD_0'].astype(float)

# Samples that share a host_subject_id are always kept in the same fold.
groups = Spring_L4.sample_metadata['host_subject_id']

# Full search grid (currently disabled): uncomment it, and comment out the
# single-combination grid below, to run the complete hyperparameter search.
#param_grid = {"max_depth": [4, 8, 16, None],
#              "max_features": ['sqrt', 'log2', 0.1],
#              "min_samples_split": [0.001, 0.01, 0.1],
#              "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
#              "bootstrap": [True, False]}

# Single-combination grid (active): fits one fixed set of hyperparameters, so
# gs.best_params_ simply echoes these values.
param_grid = {"max_depth": [8],
          "max_features": [0.1],
          "min_samples_split": [0.001],
          "min_weight_fraction_leaf": [0.0001],
          "bootstrap": [False]}

rf = RandomForestRegressor(n_estimators=1000, random_state=999, criterion='mae')

# Materialize the folds. gkf.split() returns a one-shot generator; passing it
# directly leaves an exhausted iterator on gs.cv after the first fit, so
# re-running gs.fit (or cloning gs) without re-running this cell fails. A list
# holds the same (train, test) index pairs and can be reused.
cv_splits = list(gkf.split(X, y, groups))
gs = GridSearchCV(rf, param_grid=param_grid, cv=cv_splits, scoring='neg_mean_absolute_error', n_jobs=1)

In [7]:
# run the grid search configured in the previous cell on the spring L4 data
gs.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x7fa88a959d50>,
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n...imators=1000, n_jobs=None,
                                             oob_score=False, random_state=999,
                                             v

In [8]:
# Show the best hyperparameter combination found by the search.
# When the exploratory (multi-value) param grid is active, these are the
# values to copy into the single-point param_grid for the tuned run.
print(gs.best_params_)

{'bootstrap': False, 'max_depth': 8, 'max_features': 0.1, 'min_samples_split': 0.001, 'min_weight_fraction_leaf': 0.0001}


In [9]:
# the search is scored with 'neg_mean_absolute_error', so negate best_score_ to report a positive MAE
print('The best mean absolute error is', round(-gs.best_score_,2))

The best mean absolute error is 1025.01


In [10]:
# persist the estimator selected by the grid search so it can be reloaded later
joblib.dump(gs.best_estimator_, 'models/bone_spring_L4_no_metadata.pkl')

['models/bone_spring_L4_no_metadata.pkl']

In [11]:
# read the saved L4 model back from disk; the bare name on the last line displays its parameters
# NOTE(review): joblib.load unpickles the file — only load model files from a trusted source
bone_spring_L4 = joblib.load('models/bone_spring_L4_no_metadata.pkl')
bone_spring_L4

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                      max_depth=8, max_features=0.1, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features of spring L4 model

In [59]:
# per-feature importance reported by the fitted forest, and the spread of
# that importance across the individual trees
importances = bone_spring_L4.feature_importances_
std = np.std([tree.feature_importances_ for tree in bone_spring_L4.estimators_],
             axis=0)
# feature positions ordered from most to least important
indices = np.argsort(importances)[::-1]

# write the feature identifiers (in table order) to disk
feature_metadata = Spring_L4.feature_metadata
np.savetxt("importances/bone_spring_L4_no_metadata_features.csv", feature_metadata, delimiter=",", fmt='%s')


#print the important ids in order
feature_names = Spring_L4.feature_metadata.index.values
general_importances = []
print("Feature:\t\t\t\tImportance:")
for rank, i in enumerate(indices, start=1):
    # flat list: feature name followed by its importance
    general_importances += (feature_names[i], importances[i])
    # only the top 25 are echoed to the notebook; all are saved below
    if rank <= 25:
        print(str(rank)+". "+str(feature_names[i])+"\t"+str(importances[i]))

# One value per row (name, importance, name, importance, ...). The row count
# is inferred with -1 instead of being hard-coded, so the cell keeps working
# when the number of features in the table changes.
# NOTE(review): a two-column (feature, importance) layout may have been
# intended — confirm before changing, since it would alter the file format.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

np.savetxt("importances/bone_spring_L4_no_features_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Caulobacterales	0.05475138328196004
2. k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales	0.04366773645599929
3. k__Bacteria;p__Bacteroidetes;c__Sphingobacteriia;o__Sphingobacteriales	0.04107538761418487
4. k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales	0.04029268806222716
5. k__Bacteria;p__Firmicutes;c__Erysipelotrichi;o__Erysipelotrichales	0.038332603761591914
6. k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Xanthomonadales	0.036851302183539
7. k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__	0.035501615549257945
8. k__Bacteria;p__Firmicutes;c__Bacilli;o__Bacillales	0.03268663400586559
9. k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Sphingomonadales	0.031720504826073984
10. k__Bacteria;p__Firmicutes;c__Bacilli;o__Gemellales	0.028205529085910404
11. k__Bacteria;p__Bacteroidetes;c__[Saprospirae];o__[Saprospirales]	0.02741886265250109
12. k__Bacteria;p__Proteoba

## Import rarefied table collapsed at level 3

In [60]:
# Load the rarefied table collapsed at taxonomic level 3 together with the
# sample map. normalize=None: no normalization is applied during import.
# The map is referenced with the same relative path used by the other import
# cells in this notebook instead of a user-specific absolute path.
# NOTE(review): assumes the notebook's working directory is a sibling of 02_metadata — confirm
exp_L3 = ca.read_amplicon(
    '/Users/heatherdeel/Dropbox/PMI_3_analyses/bone/01_16S/01_qiime2_analysis/feature_tables/collapsed_tables/frag_ins_noChloMito_17098_L3_table.biom',
    '../02_metadata/maps/map3.txt',
    min_reads=0,
    normalize=None,
)




In [61]:
# keep only the samples whose 'season' metadata value is 'spring'
Spring_L3 = exp_L3.filter_samples('season', 'spring')
# number of remaining samples per host_subject_id (body)
Spring_L3.sample_metadata['host_subject_id'].value_counts()

STAFS2016.011    8
STAFS2016.024    7
STAFS2016.007    7
Name: host_subject_id, dtype: int64

In [62]:
# list the features (level-3 taxonomy strings) present in the spring table
print(Spring_L3.feature_metadata)

                                                                                          _feature_id
k__Archaea;p__Crenarchaeota;c__Thaumarchaeota           k__Archaea;p__Crenarchaeota;c__Thaumarchaeota
k__Archaea;p__Euryarchaeota;c__Methanobacteria         k__Archaea;p__Euryarchaeota;c__Methanobacteria
k__Bacteria;__;__                                                                   k__Bacteria;__;__
k__Bacteria;p__Acidobacteria;c__DA052                           k__Bacteria;p__Acidobacteria;c__DA052
k__Bacteria;p__Acidobacteria;c__Solibacteres             k__Bacteria;p__Acidobacteria;c__Solibacteres
k__Bacteria;p__Acidobacteria;c__[Chloracidobact...  k__Bacteria;p__Acidobacteria;c__[Chloracidobac...
k__Bacteria;p__Actinobacteria;c__Acidimicrobiia       k__Bacteria;p__Actinobacteria;c__Acidimicrobiia
k__Bacteria;p__Actinobacteria;c__Actinobacteria       k__Bacteria;p__Actinobacteria;c__Actinobacteria
k__Bacteria;p__Actinobacteria;c__Coriobacteriia       k__Bacteria;p__Actinobacteri

## Spring L3 modeling

In [63]:
# groupKfold = 3, will leave out one body for each spring model
# (splits are grouped by host_subject_id, so all samples from one body land
# in the same fold)
gkf = GroupKFold(3)

# feature matrix and regression target
# NOTE(review): ADD_0 is presumably accumulated degree days — confirm against the map file
X = Spring_L3.data
y = Spring_L3.sample_metadata['ADD_0']
y = (y.astype(float))

groups = Spring_L3.sample_metadata['host_subject_id']

# used to test the param grid for parameter tuning
# use the output of this estimator below for input into new estimator (not commented out)
param_grid = {"max_depth": [4, 8, 16, None],
              "max_features": ['sqrt', 'log2', 0.1],
              "min_samples_split": [0.001, 0.01, 0.1],
              "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
              "bootstrap": [True, False]}

# single-point grid for the tuned run
# NOTE(review): the max_features value below (0.1) differs from what the L3
# search printed in gs.best_params_ ('log2') — update before enabling.
#param_grid = {"max_depth": [8],
#          "max_features": [0.1],
#          "min_samples_split": [0.001],
#          "min_weight_fraction_leaf": [0.0001],
#          "bootstrap": [False]}

rf = RandomForestRegressor(n_estimators=1000, random_state=999, criterion='mae')
# gkf.split() returns a one-shot generator; materialise it into a list so the
# same splits are still available if gs.fit() is run again without re-running
# this cell (an exhausted generator would leave the search with no folds).
gs = GridSearchCV(rf, param_grid=param_grid, cv=list(gkf.split(X, y, groups)), scoring='neg_mean_absolute_error', n_jobs=1)

In [64]:
# run the grid search configured in the previous cell on the spring L3 data
gs.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x122715cd0>,
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_es...
                                             oob_score=False, random_state=999,
                                             verbose=0, warm_start=False

In [65]:
# Show the best hyperparameter combination found by the search.
# When the exploratory (multi-value) param grid is active, these are the
# values to copy into the single-point param_grid for the tuned run.
print(gs.best_params_)

{'bootstrap': False, 'max_depth': 8, 'max_features': 'log2', 'min_samples_split': 0.001, 'min_weight_fraction_leaf': 0.0001}


In [66]:
# the search is scored with 'neg_mean_absolute_error', so negate best_score_ to report a positive MAE
print('The best mean absolute error is', round(-gs.best_score_,2))

The best mean absolute error is 987.59


In [67]:
# persist the estimator selected by the grid search so it can be reloaded later
joblib.dump(gs.best_estimator_, 'models/bone_spring_L3_no_metadata.pkl')

['models/bone_spring_L3_no_metadata.pkl']

In [68]:
# read the saved L3 model back from disk; the bare name on the last line displays its parameters
# NOTE(review): joblib.load unpickles the file — only load model files from a trusted source
bone_spring_L3 = joblib.load('models/bone_spring_L3_no_metadata.pkl')
bone_spring_L3

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                      max_depth=8, max_features='log2', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features of spring L3 model

In [70]:
# per-feature importance reported by the fitted forest, and the spread of
# that importance across the individual trees
importances = bone_spring_L3.feature_importances_
std = np.std([tree.feature_importances_ for tree in bone_spring_L3.estimators_],
             axis=0)
# feature positions ordered from most to least important
indices = np.argsort(importances)[::-1]

# write the feature identifiers (in table order) to disk
feature_metadata = Spring_L3.feature_metadata
np.savetxt("importances/bone_spring_L3_no_metadata_features.csv", feature_metadata, delimiter=",", fmt='%s')


#print the important ids in order
feature_names = Spring_L3.feature_metadata.index.values
general_importances = []
print("Feature:\t\t\t\tImportance:")
for rank, i in enumerate(indices, start=1):
    # flat list: feature name followed by its importance
    general_importances += (feature_names[i], importances[i])
    # only the top 25 are echoed to the notebook; all are saved below
    if rank <= 25:
        print(str(rank)+". "+str(feature_names[i])+"\t"+str(importances[i]))

# One value per row (name, importance, name, importance, ...). The row count
# is inferred with -1 instead of being hard-coded, so the cell keeps working
# when the number of features in the table changes.
# NOTE(review): a two-column (feature, importance) layout may have been
# intended — confirm before changing, since it would alter the file format.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

np.savetxt("importances/bone_spring_L3_no_features_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria	0.08448062230823035
2. k__Bacteria;p__Firmicutes;c__Erysipelotrichi	0.06961125680696126
3. k__Bacteria;p__Bacteroidetes;c__Sphingobacteriia	0.06669654295210935
4. k__Bacteria;p__Firmicutes;c__Clostridia	0.06337153783921855
5. k__Bacteria;p__Firmicutes;c__Bacilli	0.058919919707038707
6. k__Bacteria;p__Proteobacteria;c__Betaproteobacteria	0.05175102318565451
7. k__Bacteria;p__Bacteroidetes;c__Flavobacteriia	0.05118527235754803
8. k__Bacteria;p__Bacteroidetes;c__Bacteroidia	0.043538435302568346
9. k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria	0.04241445998603448
10. k__Bacteria;p__[Thermi];c__Deinococci	0.03768005104421956
11. k__Bacteria;p__Bacteroidetes;c__[Saprospirae]	0.03406603107901827
12. k__Bacteria;p__Cyanobacteria;c__4C0d-2	0.027354857636189497
13. k__Bacteria;p__Proteobacteria;c__Epsilonproteobacteria	0.025014212589116003
14. k__Bacteria;p__Actinobacteria;c__Actinobacteria	0.02495028014021742

## Import rarefied table collapsed at level 2

In [71]:
# Load the rarefied table collapsed at taxonomic level 2 together with the
# sample map. normalize=None: no normalization is applied during import.
exp_L2 = ca.read_amplicon(
    '/Users/heatherdeel/Dropbox/PMI_3_analyses/bone/01_16S/01_qiime2_analysis/feature_tables/collapsed_tables/frag_ins_noChloMito_17098_L2_table.biom',
    '../02_metadata/maps/map3.txt',
    min_reads=0,
    normalize=None,
)




In [72]:
# keep only the samples whose 'season' metadata value is 'spring'
Spring_L2 = exp_L2.filter_samples('season', 'spring')
# number of remaining samples per host_subject_id (body)
Spring_L2.sample_metadata['host_subject_id'].value_counts()

STAFS2016.011    8
STAFS2016.024    7
STAFS2016.007    7
Name: host_subject_id, dtype: int64

In [73]:
# list the features (level-2 taxonomy strings) present in the spring table
print(Spring_L2.feature_metadata)

                                                     _feature_id
k__Archaea;p__Crenarchaeota          k__Archaea;p__Crenarchaeota
k__Archaea;p__Euryarchaeota          k__Archaea;p__Euryarchaeota
k__Bacteria;__                                    k__Bacteria;__
k__Bacteria;p__Acidobacteria        k__Bacteria;p__Acidobacteria
k__Bacteria;p__Actinobacteria      k__Bacteria;p__Actinobacteria
k__Bacteria;p__Armatimonadetes    k__Bacteria;p__Armatimonadetes
k__Bacteria;p__BRC1                          k__Bacteria;p__BRC1
k__Bacteria;p__Bacteroidetes        k__Bacteria;p__Bacteroidetes
k__Bacteria;p__Chlorobi                  k__Bacteria;p__Chlorobi
k__Bacteria;p__Chloroflexi            k__Bacteria;p__Chloroflexi
k__Bacteria;p__Cyanobacteria        k__Bacteria;p__Cyanobacteria
k__Bacteria;p__Deferribacteres    k__Bacteria;p__Deferribacteres
k__Bacteria;p__FBP                            k__Bacteria;p__FBP
k__Bacteria;p__Firmicutes              k__Bacteria;p__Firmicutes
k__Bacteria;p__Fusobacter

## Spring L2 modeling

In [74]:
# groupKfold = 3, will leave out one body for each spring model
# (splits are grouped by host_subject_id, so all samples from one body land
# in the same fold)
gkf = GroupKFold(3)

# feature matrix and regression target
# NOTE(review): ADD_0 is presumably accumulated degree days — confirm against the map file
X = Spring_L2.data
y = Spring_L2.sample_metadata['ADD_0']
y = (y.astype(float))

groups = Spring_L2.sample_metadata['host_subject_id']

# used to test the param grid for parameter tuning
# use the output of this estimator below for input into new estimator (not commented out)
param_grid = {"max_depth": [4, 8, 16, None],
              "max_features": ['sqrt', 'log2', 0.1],
              "min_samples_split": [0.001, 0.01, 0.1],
              "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
              "bootstrap": [True, False]}

# single-point grid for the tuned run (fill in from gs.best_params_)
#param_grid = {"max_depth": [8],
#          "max_features": [0.1],
#          "min_samples_split": [0.001],
#          "min_weight_fraction_leaf": [0.0001],
#          "bootstrap": [False]}

rf = RandomForestRegressor(n_estimators=1000, random_state=999, criterion='mae')
# gkf.split() returns a one-shot generator; materialise it into a list so the
# same splits are still available if gs.fit() is run again without re-running
# this cell (an exhausted generator would leave the search with no folds).
gs = GridSearchCV(rf, param_grid=param_grid, cv=list(gkf.split(X, y, groups)), scoring='neg_mean_absolute_error', n_jobs=1)

In [75]:
# run the grid search configured in the previous cell on the spring L2 data
gs.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x122702f50>,
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_es...
                                             oob_score=False, random_state=999,
                                             verbose=0, warm_start=False

In [76]:
# Show the best hyperparameter combination found by the search.
# When the exploratory (multi-value) param grid is active, these are the
# values to copy into the single-point param_grid for the tuned run.
print(gs.best_params_)

{'bootstrap': False, 'max_depth': 8, 'max_features': 0.1, 'min_samples_split': 0.001, 'min_weight_fraction_leaf': 0.0001}


In [77]:
# the search is scored with 'neg_mean_absolute_error', so negate best_score_ to report a positive MAE
print('The best mean absolute error is', round(-gs.best_score_,2))

The best mean absolute error is 1074.76


In [78]:
# persist the estimator selected by the grid search so it can be reloaded later
joblib.dump(gs.best_estimator_, 'models/bone_spring_L2_no_metadata.pkl')

['models/bone_spring_L2_no_metadata.pkl']

In [79]:
# read the saved L2 model back from disk; the bare name on the last line displays its parameters
# NOTE(review): joblib.load unpickles the file — only load model files from a trusted source
bone_spring_L2 = joblib.load('models/bone_spring_L2_no_metadata.pkl')
bone_spring_L2

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                      max_depth=8, max_features=0.1, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features of spring L2 model

In [80]:
# per-feature importance reported by the fitted forest, and the spread of
# that importance across the individual trees
importances = bone_spring_L2.feature_importances_
std = np.std([tree.feature_importances_ for tree in bone_spring_L2.estimators_],
             axis=0)
# feature positions ordered from most to least important
indices = np.argsort(importances)[::-1]

# write the feature identifiers (in table order) to disk
feature_metadata = Spring_L2.feature_metadata
np.savetxt("importances/bone_spring_L2_no_metadata_features.csv", feature_metadata, delimiter=",", fmt='%s')


#print the important ids in order
feature_names = Spring_L2.feature_metadata.index.values
general_importances = []
print("Feature:\t\t\t\tImportance:")
for rank, i in enumerate(indices, start=1):
    # flat list: feature name followed by its importance
    general_importances += (feature_names[i], importances[i])
    # only the top 25 are echoed to the notebook; all are saved below
    if rank <= 25:
        print(str(rank)+". "+str(feature_names[i])+"\t"+str(importances[i]))

# One value per row (name, importance, name, importance, ...). The row count
# is inferred with -1 instead of being hard-coded, so the cell keeps working
# when the number of features in the table changes.
# NOTE(review): a two-column (feature, importance) layout may have been
# intended — confirm before changing, since it would alter the file format.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

np.savetxt("importances/bone_spring_L2_no_features_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. k__Bacteria;p__Firmicutes	0.13530715039970023
2. k__Bacteria;p__Bacteroidetes	0.10307049542398712
3. k__Bacteria;p__Proteobacteria	0.0987264809928522
4. k__Bacteria;p__[Thermi]	0.07440302969224813
5. k__Bacteria;p__Actinobacteria	0.06262480771454591
6. k__Bacteria;p__Planctomycetes	0.06113420010000938
7. k__Bacteria;p__Verrucomicrobia	0.0578917215512778
8. k__Bacteria;p__Fusobacteria	0.054603372264265426
9. k__Bacteria;p__Chloroflexi	0.04277828070587122
10. k__Bacteria;p__Cyanobacteria	0.034238303496331585
11. k__Bacteria;p__GN02	0.03130093319495075
12. k__Bacteria;p__Armatimonadetes	0.02724114167654929
13. k__Bacteria;p__Gemmatimonadetes	0.026806790368113127
14. k__Bacteria;p__Tenericutes	0.024549767112663032
15. k__Bacteria;p__Deferribacteres	0.024323105308209886
16. k__Bacteria;p__FBP	0.024246926476910798
17. k__Bacteria;p__Acidobacteria	0.023407665822652263
18. k__Bacteria;p__OP11	0.02151079451611744
19. k__Bacteria;p__BRC1	0.020546800080858508
20. k__Bac