## Description

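Random forest regression models of bone 16S rRNA data, summer season only, at all levels (ASVs and the collapsed taxonomic-level tables). Run in the `ml_new` environment on rarefied data, with no normalization during import. Each model uses n = 1000 estimators and hyperparameter tuning by grid search with k fold = 3 (regular, non-nested cross-validation, grouped by `host_subject_id` so that each fold holds out one body). Incorporates no metadata features.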

In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing, svm, metrics
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV, GroupKFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVR
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_error, roc_auc_score, precision_score, make_scorer
from sklearn.linear_model import LassoCV, Lasso, LassoLarsIC, ElasticNet, LassoLarsCV
from sklearn.externals import joblib
from scipy.stats import randint as sp_randint
from matplotlib import pyplot as plt
from sklearn.feature_selection import RFECV
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier



In [2]:
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sns
%matplotlib inline
import biom
import calour as ca
from scipy import stats
import scipy
import inspect
import math
import os
import pickle
import time
pd.set_option('display.max_rows', 10000)

  import pandas.util.testing as pdt


## Import rarefied ASV table

In [3]:
# Rarefied ASV-level feature table. The Dropbox location is resolved against the
# current user's home directory rather than a hard-coded /Users/<name> prefix,
# so the cell also runs for collaborators who sync the same Dropbox folder.
asv_table_path = os.path.expanduser(
    '~/Dropbox/PMI_3_analyses/bone/01_16S/01_qiime2_analysis/feature_tables/'
    'exported-feature-tables/frag_ins_filtered_noChloMito_17098_table.biom')
# normalize=None: the table is already rarefied, so no normalization is applied at import.
# min_reads=0: presumably disables calour's minimum-read sample filter -- TODO confirm.
exp_ASV = ca.read_amplicon(asv_table_path, '../02_metadata/maps/map3.txt', min_reads=0, normalize=None)




In [4]:
# number of samples per host_subject_id in the full table, before any season filtering
exp_ASV.sample_metadata['host_subject_id'].value_counts()

STAFS2016.067    8
STAFS2016.011    8
STAFS2016.065    8
STAFS2016.064    7
STAFS2016.024    7
STAFS2016.007    7
Name: host_subject_id, dtype: int64

In [5]:
# inspect the ADD_0 metadata column; it is used as the regression target in the modeling cells below
exp_ASV.sample_metadata['ADD_0']

#SampleID
11553.SHSU.STAFS2016.024.R09      708.0555556
11553.STAFS.2016.065.L12          2268.055556
11553.STAFS.2016.067.R12          2414.444444
11553.SHSU.STAFS2016.007.L09      1911.111111
11553.SHSU.STAFS2016.011.L08      1301.666667
11553.STAFS.2016.065.R11          4756.388889
11553.STAFS.2016.065.R09          3456.111111
11553.SHSU.STAFS2016.024.L12      4285.833333
11553.SHSU.STAFS2016.007.L11      2584.444444
11553.SHSU.STAFS2016.011.L10      3777.222222
11553.STAFS.2016.064.R10          3456.111111
11553.SHSU.STAFS2016.024.R11      2584.444444
11553.STAFS.2016.067.L11.june     5201.388889
11553.SHSU.STAFS2016.007.R12      1301.666667
11553.STAFS.2016.065.R10          2804.722222
11553.STAFS.2016.064.R09          592.7777778
11553.STAFS.2016.067.L10          3623.055556
11553.SHSU.STAFS2016.011.R12      708.0555556
11553.STAFS.2016.064.R11          1300.555556
11553.SHSU.STAFS2016.011.L11      4285.833333
11553.SHSU.STAFS2016.007.L10      4821.388889
11553.STAFS.2016.065.R08

## Filter to only summer data

In [6]:
# keep only the samples whose 'season' metadata value is 'summer',
# then re-count samples per host_subject_id to see which bodies remain
Summer_ASV = exp_ASV.filter_samples('season', 'summer')
Summer_ASV.sample_metadata.host_subject_id.value_counts()

STAFS2016.065    8
STAFS2016.067    8
STAFS2016.064    7
Name: host_subject_id, dtype: int64

## Summer ASV modeling

In [7]:
# GroupKFold with 3 folds over the 3 summer hosts: every fold holds out all
# samples from one body, so a model is never scored on a body it was trained on.
gkf = GroupKFold(3)

# feature matrix and regression target (ADD_0 cast to float)
X = Summer_ASV.data
y = Summer_ASV.sample_metadata['ADD_0'].astype(float)

# one group label per sample; used only to build the CV splits
groups = Summer_ASV.sample_metadata['host_subject_id']

# full search grid used for hyperparameter tuning
param_grid = {"max_depth": [4, 8, 16, None],
              "max_features": ['sqrt', 'log2', 0.1],
              "min_samples_split": [0.001, 0.01, 0.1],
              "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
              "bootstrap": [True, False]}

# single-point grid: swap it in for the grid above to refit with fixed values
# instead of repeating the search
#param_grid = {"max_depth": [8],
#          "max_features": [0.1],
#          "min_samples_split": [0.001],
#          "min_weight_fraction_leaf": [0.0001],
#          "bootstrap": [False]}

rf = RandomForestRegressor(n_estimators=1000, random_state=999, criterion='mae')

# Materialize the splits into a list. gkf.split() returns a one-shot generator;
# handing it to GridSearchCV directly means the first fit consumes it, and
# re-running the fit cell without re-running this one finds no splits left.
cv_splits = list(gkf.split(X, y, groups))
gs = GridSearchCV(rf, param_grid=param_grid, cv=cv_splits, scoring='neg_mean_absolute_error', n_jobs=1)

In [8]:
# run the grid search; the group-aware CV splits were handed to gs through cv= when it
# was constructed, so groups is not passed to fit() here
gs.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x11961a150>,
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_es...
                                             oob_score=False, random_state=999,
                                             verbose=0, warm_start=False

In [9]:
# report the best parameter combination found by the search
# (only informative when the multi-valued param_grid was the one searched)
print(gs.best_params_)

{'bootstrap': False, 'max_depth': 16, 'max_features': 'sqrt', 'min_samples_split': 0.001, 'min_weight_fraction_leaf': 0.0001}


In [10]:
# scoring is 'neg_mean_absolute_error', so best_score_ is negated to print a positive MAE
print('The best mean absolute error is', round(-gs.best_score_,2))

The best mean absolute error is 723.98


In [11]:
# persist best_estimator_, the forest GridSearchCV refits with best_params_ (refit defaults to True)
joblib.dump(gs.best_estimator_, 'models/bone_summer_ASV_no_metadata.pkl')

['models/bone_summer_ASV_no_metadata.pkl']

In [12]:
# reload the saved model from disk and display its parameters;
# the feature-importance cell below works from this reloaded object
bone_summer_ASV = joblib.load('models/bone_summer_ASV_no_metadata.pkl')
bone_summer_ASV

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                      max_depth=16, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features of summer ASV model

In [14]:
# feature importances reported by the fitted forest, one value per feature
importances = bone_summer_ASV.feature_importances_
# spread of each feature's importance across the individual trees (not used further in this cell)
std = np.std([tree.feature_importances_ for tree in bone_summer_ASV.estimators_],
             axis=0)
# feature positions ordered from most to least important
indices = np.argsort(importances)[::-1]

# save the feature metadata table as-is (table order, not ranked)
feature_metadata = Summer_ASV.feature_metadata
np.savetxt("importances/bone_summer_ASV_no_metadata_features.csv", feature_metadata, delimiter=",", fmt='%s')

# Walk the features in ranked order: print the top 25 and collect every
# (feature id, importance) pair into one flat list [id, value, id, value, ...].
feature_ids = Summer_ASV.feature_metadata.index.values
general_importances = []
print("Feature:\t\t\t\tImportance:")
for rank, i in enumerate(indices, start=1):
    general_importances += (feature_ids[i], importances[i])
    if rank <= 25:
        print(str(rank)+". "+str(feature_ids[i])+"\t"+str(importances[i]))

# One column alternating feature id and importance, the same layout as before.
# reshape(-1, 1) takes the length from the data instead of a hard-coded row
# count, so the cell keeps working if the number of features changes.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

np.savetxt("importances/bone_summer_ASV_no_features_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATCGATCAGTTAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTCTAATACTGTCGATCTCGAGTTCGAGAGAGGTGAGTGGAATTCCGAG	0.013563481045241806
2. TACGAGGGGAGCTAGCGTTATTCGGAATTACTGGGCGTAAAGCGCGCGTAGGCGGTTTTTCAAGTCAGGGGTGAAAGCCCGGAGCTCAACTCCGGAATAGCCTTTGAAACTGGAGGACTTGAATCTTGGAGAGGTCAGTGGAATTCCGAG	0.011904687665606381
3. TACGTAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGTTCGGAAAGAAGGATGTGAAATCCCAGGGCTCAACCTTGGAACTGCATTCTTAACTACCGGGCTAGAGTGTGTCAGAGGGAGGTGGAATTCCGCG	0.011473593164739615
4. TACGAAGGGGGCTAGCGTTGTTCGGATTTACTGGGCGTAAAGCGCACGTAGGCGGATTGGTCAGTTAGGGGTGAAATCCCAGAGCTCAACTCTGGAACTGCCCTTAATACTGCCAGTCTAGAGTCCGGAAGAGGTGAGTGGAACTCCTAG	0.010738739987822222
5. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATCGGTCAGTTAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTTTAATACTGCTGGTCTAGAGTTCGAGAGAGGTGAGTGGAATTCCGAG	0.010263000532674274
6. TACGGAGGGGACTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATCAGAAAGTTGGGGGTGAAATCCCGGGGCTCAACCCCGGAAC

## Import rarefied table collapsed at level 7

In [15]:
# Rarefied feature table collapsed at level 7. The Dropbox location is resolved
# against the current user's home directory rather than a hard-coded
# /Users/<name> prefix, so the cell also runs for collaborators who sync the same folder.
l7_table_path = os.path.expanduser(
    '~/Dropbox/PMI_3_analyses/bone/01_16S/01_qiime2_analysis/feature_tables/'
    'collapsed_tables/frag_ins_noChloMito_17098_L7_table.biom')
# normalize=None: the table is already rarefied, so no normalization is applied at import.
# min_reads=0: presumably disables calour's minimum-read sample filter -- TODO confirm.
exp_L7 = ca.read_amplicon(l7_table_path, '../02_metadata/maps/map3.txt', min_reads=0, normalize=None)




In [16]:
# keep only the samples whose 'season' metadata value is 'summer',
# then count samples per host_subject_id to see which bodies remain
Summer_L7 = exp_L7.filter_samples('season', 'summer')
Summer_L7.sample_metadata.host_subject_id.value_counts()

STAFS2016.065    8
STAFS2016.067    8
STAFS2016.064    7
Name: host_subject_id, dtype: int64

In [17]:
# print the feature metadata; in this collapsed table each feature id is a taxonomy string
print(Summer_L7.feature_metadata)

                                                                                          _feature_id
k__Archaea;p__Crenarchaeota;c__Thaumarchaeota;o...  k__Archaea;p__Crenarchaeota;c__Thaumarchaeota;...
k__Archaea;p__Euryarchaeota;c__Methanobacteria;...  k__Archaea;p__Euryarchaeota;c__Methanobacteria...
k__Bacteria;__;__;__;__;__;__                                           k__Bacteria;__;__;__;__;__;__
k__Bacteria;p__Acidobacteria;c__DA052;o__Ellin6...  k__Bacteria;p__Acidobacteria;c__DA052;o__Ellin...
k__Bacteria;p__Acidobacteria;c__Solibacteres;o_...  k__Bacteria;p__Acidobacteria;c__Solibacteres;o...
k__Bacteria;p__Acidobacteria;c__[Chloracidobact...  k__Bacteria;p__Acidobacteria;c__[Chloracidobac...
k__Bacteria;p__Acidobacteria;c__[Chloracidobact...  k__Bacteria;p__Acidobacteria;c__[Chloracidobac...
k__Bacteria;p__Actinobacteria;c__Acidimicrobiia...  k__Bacteria;p__Actinobacteria;c__Acidimicrobii...
k__Bacteria;p__Actinobacteria;c__Acidimicrobiia...  k__Bacteria;p__Actinobacteria;

## Summer L7 modeling

In [18]:
# GroupKFold with 3 folds over the 3 summer hosts: every fold holds out all
# samples from one body, so a model is never scored on a body it was trained on.
gkf = GroupKFold(3)

# feature matrix and regression target (ADD_0 cast to float)
X = Summer_L7.data
y = Summer_L7.sample_metadata['ADD_0'].astype(float)

# one group label per sample; used only to build the CV splits
groups = Summer_L7.sample_metadata['host_subject_id']

# full search grid used for hyperparameter tuning
param_grid = {"max_depth": [4, 8, 16, None],
              "max_features": ['sqrt', 'log2', 0.1],
              "min_samples_split": [0.001, 0.01, 0.1],
              "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
              "bootstrap": [True, False]}

# single-point grid: swap it in for the grid above to refit with fixed values
# instead of repeating the search
#param_grid = {"max_depth": [8],
#          "max_features": [0.1],
#          "min_samples_split": [0.001],
#          "min_weight_fraction_leaf": [0.0001],
#          "bootstrap": [False]}

rf = RandomForestRegressor(n_estimators=1000, random_state=999, criterion='mae')

# Materialize the splits into a list. gkf.split() returns a one-shot generator;
# handing it to GridSearchCV directly means the first fit consumes it, and
# re-running the fit cell without re-running this one finds no splits left.
cv_splits = list(gkf.split(X, y, groups))
gs = GridSearchCV(rf, param_grid=param_grid, cv=cv_splits, scoring='neg_mean_absolute_error', n_jobs=1)

In [19]:
# run the grid search; the group-aware CV splits were handed to gs through cv= when it
# was constructed, so groups is not passed to fit() here
gs.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x119223250>,
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_es...
                                             oob_score=False, random_state=999,
                                             verbose=0, warm_start=False

In [20]:
# report the best parameter combination found by the search
# (only informative when the multi-valued param_grid was the one searched)
print(gs.best_params_)

{'bootstrap': False, 'max_depth': 4, 'max_features': 'log2', 'min_samples_split': 0.001, 'min_weight_fraction_leaf': 0.0001}


In [21]:
# scoring is 'neg_mean_absolute_error', so best_score_ is negated to print a positive MAE
print('The best mean absolute error is', round(-gs.best_score_,2))

The best mean absolute error is 729.09


In [22]:
# persist best_estimator_, the forest GridSearchCV refits with best_params_ (refit defaults to True)
joblib.dump(gs.best_estimator_, 'models/bone_summer_L7_no_metadata.pkl')

['models/bone_summer_L7_no_metadata.pkl']

In [23]:
# reload the saved model from disk and display its parameters;
# the feature-importance cell below works from this reloaded object
bone_summer_L7 = joblib.load('models/bone_summer_L7_no_metadata.pkl')
bone_summer_L7

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                      max_depth=4, max_features='log2', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features of summer L7 model

In [25]:
# feature importances reported by the fitted forest, one value per feature
importances = bone_summer_L7.feature_importances_
# spread of each feature's importance across the individual trees (not used further in this cell)
std = np.std([tree.feature_importances_ for tree in bone_summer_L7.estimators_],
             axis=0)
# feature positions ordered from most to least important
indices = np.argsort(importances)[::-1]

# save the feature metadata table as-is (table order, not ranked)
feature_metadata = Summer_L7.feature_metadata
np.savetxt("importances/bone_summer_L7_no_metadata_features.csv", feature_metadata, delimiter=",", fmt='%s')

# Walk the features in ranked order: print the top 25 and collect every
# (feature id, importance) pair into one flat list [id, value, id, value, ...].
feature_ids = Summer_L7.feature_metadata.index.values
general_importances = []
print("Feature:\t\t\t\tImportance:")
for rank, i in enumerate(indices, start=1):
    general_importances += (feature_ids[i], importances[i])
    if rank <= 25:
        print(str(rank)+". "+str(feature_ids[i])+"\t"+str(importances[i]))

# One column alternating feature id and importance, the same layout as before.
# reshape(-1, 1) takes the length from the data instead of a hard-coded row
# count, so the cell keeps working if the number of features changes.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

np.savetxt("importances/bone_summer_L7_no_features_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Phyllobacteriaceae;g__;s__	0.016246890224742155
2. k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Sphingomonadales;f__Sphingomonadaceae;g__Sphingopyxis;s__	0.015747700123308072
3. k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Nocardiaceae;g__Rhodococcus;s__	0.014979012629035037
4. k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Nocardioidaceae;g__Aeromicrobium;s__	0.01320175832081318
5. k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Sphingomonadales;f__Sphingomonadaceae;g__Sphingomonas;s__	0.012685685526947493
6. k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Phyllobacteriaceae;g__Pseudaminobacter;s__	0.012563688258466972
7. k__Bacteria;p__Verrucomicrobia;c__[Spartobacteria];o__[Chthoniobacterales];f__[Chthoniobacteraceae];g__heteroC45_4W;s__	0.011646423546858391
8. k__Bacteria;p__Actinobacteria;c__Ther

## Import rarefied table collapsed at level 6

In [26]:
# Rarefied feature table collapsed at level 6. The Dropbox location is resolved
# against the current user's home directory rather than a hard-coded
# /Users/<name> prefix, so the cell also runs for collaborators who sync the same folder.
l6_table_path = os.path.expanduser(
    '~/Dropbox/PMI_3_analyses/bone/01_16S/01_qiime2_analysis/feature_tables/'
    'collapsed_tables/frag_ins_noChloMito_17098_L6_table.biom')
# normalize=None: the table is already rarefied, so no normalization is applied at import.
# min_reads=0: presumably disables calour's minimum-read sample filter -- TODO confirm.
exp_L6 = ca.read_amplicon(l6_table_path, '../02_metadata/maps/map3.txt', min_reads=0, normalize=None)




In [27]:
# keep only the samples whose 'season' metadata value is 'summer',
# then count samples per host_subject_id to see which bodies remain
Summer_L6 = exp_L6.filter_samples('season', 'summer')
Summer_L6.sample_metadata.host_subject_id.value_counts()

STAFS2016.065    8
STAFS2016.067    8
STAFS2016.064    7
Name: host_subject_id, dtype: int64

In [28]:
# print the feature metadata; in this collapsed table each feature id is a taxonomy string
print(Summer_L6.feature_metadata)

                                                                                          _feature_id
k__Archaea;p__Crenarchaeota;c__Thaumarchaeota;o...  k__Archaea;p__Crenarchaeota;c__Thaumarchaeota;...
k__Archaea;p__Euryarchaeota;c__Methanobacteria;...  k__Archaea;p__Euryarchaeota;c__Methanobacteria...
k__Bacteria;__;__;__;__;__                                                 k__Bacteria;__;__;__;__;__
k__Bacteria;p__Acidobacteria;c__DA052;o__Ellin6...  k__Bacteria;p__Acidobacteria;c__DA052;o__Ellin...
k__Bacteria;p__Acidobacteria;c__Solibacteres;o_...  k__Bacteria;p__Acidobacteria;c__Solibacteres;o...
k__Bacteria;p__Acidobacteria;c__[Chloracidobact...  k__Bacteria;p__Acidobacteria;c__[Chloracidobac...
k__Bacteria;p__Acidobacteria;c__[Chloracidobact...  k__Bacteria;p__Acidobacteria;c__[Chloracidobac...
k__Bacteria;p__Actinobacteria;c__Acidimicrobiia...  k__Bacteria;p__Actinobacteria;c__Acidimicrobii...
k__Bacteria;p__Actinobacteria;c__Acidimicrobiia...  k__Bacteria;p__Actinobacteria;

## Summer L6 modeling

In [29]:
# GroupKFold with 3 folds over the 3 summer hosts: every fold holds out all
# samples from one body, so a model is never scored on a body it was trained on.
gkf = GroupKFold(3)

# feature matrix and regression target (ADD_0 cast to float)
X = Summer_L6.data
y = Summer_L6.sample_metadata['ADD_0'].astype(float)

# one group label per sample; used only to build the CV splits
groups = Summer_L6.sample_metadata['host_subject_id']

# full search grid used for hyperparameter tuning
param_grid = {"max_depth": [4, 8, 16, None],
              "max_features": ['sqrt', 'log2', 0.1],
              "min_samples_split": [0.001, 0.01, 0.1],
              "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
              "bootstrap": [True, False]}

# single-point grid: swap it in for the grid above to refit with fixed values
# instead of repeating the search
#param_grid = {"max_depth": [8],
#          "max_features": [0.1],
#          "min_samples_split": [0.001],
#          "min_weight_fraction_leaf": [0.0001],
#          "bootstrap": [False]}

rf = RandomForestRegressor(n_estimators=1000, random_state=999, criterion='mae')

# Materialize the splits into a list. gkf.split() returns a one-shot generator;
# handing it to GridSearchCV directly means the first fit consumes it, and
# re-running the fit cell without re-running this one finds no splits left.
cv_splits = list(gkf.split(X, y, groups))
gs = GridSearchCV(rf, param_grid=param_grid, cv=cv_splits, scoring='neg_mean_absolute_error', n_jobs=1)

In [30]:
# run the grid search; the group-aware CV splits were handed to gs through cv= when it
# was constructed, so groups is not passed to fit() here
gs.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x1191828d0>,
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_es...
                                             oob_score=False, random_state=999,
                                             verbose=0, warm_start=False

In [31]:
# report the best parameter combination found by the search
# (only informative when the multi-valued param_grid was the one searched)
print(gs.best_params_)

{'bootstrap': False, 'max_depth': 8, 'max_features': 'log2', 'min_samples_split': 0.001, 'min_weight_fraction_leaf': 0.0001}


In [32]:
# scoring is 'neg_mean_absolute_error', so best_score_ is negated to print a positive MAE
print('The best mean absolute error is', round(-gs.best_score_,2))

The best mean absolute error is 746.9


In [33]:
# persist best_estimator_, the forest GridSearchCV refits with best_params_ (refit defaults to True)
joblib.dump(gs.best_estimator_, 'models/bone_summer_L6_no_metadata.pkl')

['models/bone_summer_L6_no_metadata.pkl']

In [34]:
# reload the saved model from disk and display its parameters;
# the feature-importance cell below works from this reloaded object
bone_summer_L6 = joblib.load('models/bone_summer_L6_no_metadata.pkl')
bone_summer_L6

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                      max_depth=8, max_features='log2', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features of summer L6 model

In [36]:
# feature importances reported by the fitted forest, one value per feature
importances = bone_summer_L6.feature_importances_
# spread of each feature's importance across the individual trees (not used further in this cell)
std = np.std([tree.feature_importances_ for tree in bone_summer_L6.estimators_],
             axis=0)
# feature positions ordered from most to least important
indices = np.argsort(importances)[::-1]

# save the feature metadata table as-is (table order, not ranked)
feature_metadata = Summer_L6.feature_metadata
np.savetxt("importances/bone_summer_L6_no_metadata_features.csv", feature_metadata, delimiter=",", fmt='%s')

# Walk the features in ranked order: print the top 25 and collect every
# (feature id, importance) pair into one flat list [id, value, id, value, ...].
feature_ids = Summer_L6.feature_metadata.index.values
general_importances = []
print("Feature:\t\t\t\tImportance:")
for rank, i in enumerate(indices, start=1):
    general_importances += (feature_ids[i], importances[i])
    if rank <= 25:
        print(str(rank)+". "+str(feature_ids[i])+"\t"+str(importances[i]))

# One column alternating feature id and importance, the same layout as before.
# reshape(-1, 1) takes the length from the data instead of a hard-coded row
# count, so the cell keeps working if the number of features changes.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

np.savetxt("importances/bone_summer_L6_no_features_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Sphingomonadales;f__Sphingomonadaceae;g__Sphingopyxis	0.015543256801390555
2. k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodospirillales;f__Acetobacteraceae;g__Roseococcus	0.014459423040839732
3. k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Hyphomicrobiaceae;g__Devosia	0.014338026511348944
4. k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Nocardiaceae;g__Rhodococcus	0.01431141340706443
5. k__Bacteria;p__Verrucomicrobia;c__[Spartobacteria];o__[Chthoniobacterales];f__[Chthoniobacteraceae];g__heteroC45_4W	0.013042476480571035
6. k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Phyllobacteriaceae;g__	0.012928112667276654
7. k__Bacteria;p__Actinobacteria;c__Thermoleophilia;o__Solirubrobacterales;f__;g__	0.01272176938404155
8. k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Phyllobacteriaceae;g__Defluvib

## Import rarefied table collapsed at level 5

In [3]:
# Rarefied feature table collapsed at level 5. The Dropbox location is resolved
# against the current user's home directory rather than a hard-coded
# /Users/<name> prefix, so the cell also runs for collaborators who sync the same folder.
l5_table_path = os.path.expanduser(
    '~/Dropbox/PMI_3_analyses/bone/01_16S/01_qiime2_analysis/feature_tables/'
    'collapsed_tables/frag_ins_noChloMito_17098_L5_table.biom')
# normalize=None: the table is already rarefied, so no normalization is applied at import.
# min_reads=0: presumably disables calour's minimum-read sample filter -- TODO confirm.
exp_L5 = ca.read_amplicon(l5_table_path, '../02_metadata/maps/map3.txt', min_reads=0, normalize=None)




In [4]:
# keep only the samples whose 'season' metadata value is 'summer',
# then count samples per host_subject_id to see which bodies remain
Summer_L5 = exp_L5.filter_samples('season', 'summer')
Summer_L5.sample_metadata.host_subject_id.value_counts()

STAFS2016.067    8
STAFS2016.065    8
STAFS2016.064    7
Name: host_subject_id, dtype: int64

In [5]:
# print the feature metadata; in this collapsed table each feature id is a taxonomy string
print(Summer_L5.feature_metadata)

                                                                                          _feature_id
k__Archaea;p__Crenarchaeota;c__Thaumarchaeota;o...  k__Archaea;p__Crenarchaeota;c__Thaumarchaeota;...
k__Archaea;p__Euryarchaeota;c__Methanobacteria;...  k__Archaea;p__Euryarchaeota;c__Methanobacteria...
k__Bacteria;__;__;__;__                                                       k__Bacteria;__;__;__;__
k__Bacteria;p__Acidobacteria;c__DA052;o__Ellin6...  k__Bacteria;p__Acidobacteria;c__DA052;o__Ellin...
k__Bacteria;p__Acidobacteria;c__Solibacteres;o_...  k__Bacteria;p__Acidobacteria;c__Solibacteres;o...
k__Bacteria;p__Acidobacteria;c__[Chloracidobact...  k__Bacteria;p__Acidobacteria;c__[Chloracidobac...
k__Bacteria;p__Acidobacteria;c__[Chloracidobact...  k__Bacteria;p__Acidobacteria;c__[Chloracidobac...
k__Bacteria;p__Actinobacteria;c__Acidimicrobiia...  k__Bacteria;p__Actinobacteria;c__Acidimicrobii...
k__Bacteria;p__Actinobacteria;c__Acidimicrobiia...  k__Bacteria;p__Actinobacteria;

## Summer L5 modeling

In [6]:
# GroupKFold with 3 folds over the 3 summer hosts: every fold holds out all
# samples from one body, so a model is never scored on a body it was trained on.
gkf = GroupKFold(3)

# feature matrix and regression target (ADD_0 cast to float)
X = Summer_L5.data
y = Summer_L5.sample_metadata['ADD_0'].astype(float)

# one group label per sample; used only to build the CV splits
groups = Summer_L5.sample_metadata['host_subject_id']

# full search grid for hyperparameter tuning, disabled here;
# re-enable it (and comment out the grid below) to repeat the search
#param_grid = {"max_depth": [4, 8, 16, None],
#              "max_features": ['sqrt', 'log2', 0.1],
#              "min_samples_split": [0.001, 0.01, 0.1],
#              "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
#              "bootstrap": [True, False]}

# single-point grid: fits exactly one parameter combination
# (presumably the values selected by an earlier run of the full grid -- TODO confirm)
param_grid = {"max_depth": [8],
          "max_features": ['sqrt'],
          "min_samples_split": [0.001],
          "min_weight_fraction_leaf": [0.0001],
          "bootstrap": [False]}

rf = RandomForestRegressor(n_estimators=1000, random_state=999, criterion='mae')

# Materialize the splits into a list. gkf.split() returns a one-shot generator;
# handing it to GridSearchCV directly means the first fit consumes it, and
# re-running the fit cell without re-running this one finds no splits left.
cv_splits = list(gkf.split(X, y, groups))
gs = GridSearchCV(rf, param_grid=param_grid, cv=cv_splits, scoring='neg_mean_absolute_error', n_jobs=1)

In [7]:
# run the grid search; the group-aware CV splits were handed to gs through cv= when it
# was constructed, so groups is not passed to fit() here
gs.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x7fde45a616d0>,
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n...tors=1000, n_jobs=None,
                                             oob_score=False, random_state=999,
                                             verb

In [8]:
# report the best parameter combination found by the search
# (only informative when the multi-valued param_grid was the one searched)
print(gs.best_params_)

{'bootstrap': False, 'max_depth': 8, 'max_features': 'sqrt', 'min_samples_split': 0.001, 'min_weight_fraction_leaf': 0.0001}


In [9]:
# scoring is 'neg_mean_absolute_error', so best_score_ is negated to print a positive MAE
print('The best mean absolute error is', round(-gs.best_score_,2))

The best mean absolute error is 788.1


In [10]:
# persist best_estimator_, the forest GridSearchCV refits with best_params_ (refit defaults to True)
joblib.dump(gs.best_estimator_, 'models/bone_summer_L5_no_metadata.pkl')

['models/bone_summer_L5_no_metadata.pkl']

In [11]:
# reload the saved model from disk and display its parameters;
# the feature-importance cell below works from this reloaded object
bone_summer_L5 = joblib.load('models/bone_summer_L5_no_metadata.pkl')
bone_summer_L5

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                      max_depth=8, max_features='sqrt', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features of summer L5 model

In [47]:
# feature importances reported by the fitted forest, one value per feature
importances = bone_summer_L5.feature_importances_
# spread of each feature's importance across the individual trees (not used further in this cell)
std = np.std([tree.feature_importances_ for tree in bone_summer_L5.estimators_],
             axis=0)
# feature positions ordered from most to least important
indices = np.argsort(importances)[::-1]

# save the feature metadata table as-is (table order, not ranked)
feature_metadata = Summer_L5.feature_metadata
np.savetxt("importances/bone_summer_L5_no_metadata_features.csv", feature_metadata, delimiter=",", fmt='%s')

# Walk the features in ranked order: print the top 25 and collect every
# (feature id, importance) pair into one flat list [id, value, id, value, ...].
feature_ids = Summer_L5.feature_metadata.index.values
general_importances = []
print("Feature:\t\t\t\tImportance:")
for rank, i in enumerate(indices, start=1):
    general_importances += (feature_ids[i], importances[i])
    if rank <= 25:
        print(str(rank)+". "+str(feature_ids[i])+"\t"+str(importances[i]))

# One column alternating feature id and importance, the same layout as before.
# reshape(-1, 1) takes the length from the data instead of a hard-coded row
# count, so the cell keeps working if the number of features changes.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

np.savetxt("importances/bone_summer_L5_no_features_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Nocardiaceae	0.03308882868389938
2. k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Phyllobacteriaceae	0.029684497761835335
3. k__Bacteria;p__Actinobacteria;c__Thermoleophilia;o__Solirubrobacterales;f__	0.029392396552183554
4. k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodobacterales;f__Rhodobacteraceae	0.024450871538491997
5. k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Hyphomicrobiaceae	0.02337507586005258
6. k__Bacteria;p__Proteobacteria;c__Betaproteobacteria;o__Burkholderiales;f__Comamonadaceae	0.022743085007181182
7. k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__	0.022306965171448378
8. k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Sphingomonadales;f__Sphingomonadaceae	0.021901253752666012
9. k__Bacteria;p__Actinobacteria;c__Acidimicrobiia;o__Acidimicrobiales;f__	0.02170786925522077
10. k_

## Import rarefied table collapsed at level 4

In [12]:
# Rarefied feature table collapsed at level 4. The Dropbox location is resolved
# against the current user's home directory rather than a hard-coded
# /Users/<name> prefix, so the cell also runs for collaborators who sync the same folder.
l4_table_path = os.path.expanduser(
    '~/Dropbox/PMI_3_analyses/bone/01_16S/01_qiime2_analysis/feature_tables/'
    'collapsed_tables/frag_ins_noChloMito_17098_L4_table.biom')
# normalize=None: the table is already rarefied, so no normalization is applied at import.
# min_reads=0: presumably disables calour's minimum-read sample filter -- TODO confirm.
exp_L4 = ca.read_amplicon(l4_table_path, '../02_metadata/maps/map3.txt', min_reads=0, normalize=None)




In [13]:
# keep only the samples whose 'season' metadata value is 'summer',
# then count samples per host_subject_id to see which bodies remain
Summer_L4 = exp_L4.filter_samples('season', 'summer')
Summer_L4.sample_metadata.host_subject_id.value_counts()

STAFS2016.067    8
STAFS2016.065    8
STAFS2016.064    7
Name: host_subject_id, dtype: int64

In [14]:
# print the feature metadata; in this collapsed table each feature id is a taxonomy string
print(Summer_L4.feature_metadata)

                                                                                          _feature_id
k__Archaea;p__Crenarchaeota;c__Thaumarchaeota;o...  k__Archaea;p__Crenarchaeota;c__Thaumarchaeota;...
k__Archaea;p__Euryarchaeota;c__Methanobacteria;...  k__Archaea;p__Euryarchaeota;c__Methanobacteria...
k__Bacteria;__;__;__                                                             k__Bacteria;__;__;__
k__Bacteria;p__Acidobacteria;c__DA052;o__Ellin6513  k__Bacteria;p__Acidobacteria;c__DA052;o__Ellin...
k__Bacteria;p__Acidobacteria;c__Solibacteres;o_...  k__Bacteria;p__Acidobacteria;c__Solibacteres;o...
k__Bacteria;p__Acidobacteria;c__[Chloracidobact...  k__Bacteria;p__Acidobacteria;c__[Chloracidobac...
k__Bacteria;p__Actinobacteria;c__Acidimicrobiia...  k__Bacteria;p__Actinobacteria;c__Acidimicrobii...
k__Bacteria;p__Actinobacteria;c__Actinobacteria...  k__Bacteria;p__Actinobacteria;c__Actinobacteri...
k__Bacteria;p__Actinobacteria;c__Coriobacteriia...  k__Bacteria;p__Actinobacteria;

## Summer L4 modeling

In [15]:
# GroupKFold with 3 folds over the 3 summer hosts: every fold holds out all
# samples from one body, so a model is never scored on a body it was trained on.
gkf = GroupKFold(3)

# feature matrix and regression target (ADD_0 cast to float)
X = Summer_L4.data
y = Summer_L4.sample_metadata['ADD_0'].astype(float)

# one group label per sample; used only to build the CV splits
groups = Summer_L4.sample_metadata['host_subject_id']

# full search grid for hyperparameter tuning, disabled here;
# re-enable it (and comment out the grid below) to repeat the search
#param_grid = {"max_depth": [4, 8, 16, None],
#              "max_features": ['sqrt', 'log2', 0.1],
#              "min_samples_split": [0.001, 0.01, 0.1],
#              "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
#              "bootstrap": [True, False]}

# single-point grid: fits exactly one parameter combination
# (presumably the values selected by an earlier run of the full grid -- TODO confirm)
param_grid = {"max_depth": [8],
          "max_features": [0.1],
          "min_samples_split": [0.001],
          "min_weight_fraction_leaf": [0.0001],
          "bootstrap": [False]}

rf = RandomForestRegressor(n_estimators=1000, random_state=999, criterion='mae')

# Materialize the splits into a list. gkf.split() returns a one-shot generator;
# handing it to GridSearchCV directly means the first fit consumes it, and
# re-running the fit cell without re-running this one finds no splits left.
cv_splits = list(gkf.split(X, y, groups))
gs = GridSearchCV(rf, param_grid=param_grid, cv=cv_splits, scoring='neg_mean_absolute_error', n_jobs=1)

In [16]:
# Run the grid search on the summer level-4 data: every parameter combination
# is scored on the grouped folds configured in the previous cell.
gs.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x7fde460e2350>,
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n...imators=1000, n_jobs=None,
                                             oob_score=False, random_state=999,
                                             v

In [17]:
# Report the winning parameter combination. This is informative when the full
# (commented-out) grid is searched; with the single-point grid it simply
# echoes that grid back.
print(gs.best_params_)

{'bootstrap': False, 'max_depth': 8, 'max_features': 0.1, 'min_samples_split': 0.001, 'min_weight_fraction_leaf': 0.0001}


In [18]:
# The search is scored with 'neg_mean_absolute_error', so best_score_ is the
# negated MAE; flip the sign to report the cross-validated MAE itself.
print('The best mean absolute error is', round(-gs.best_score_,2))

The best mean absolute error is 834.82


In [19]:
# Persist the best estimator found by the search to disk.
# Assumes the 'models/' directory already exists relative to the notebook.
joblib.dump(gs.best_estimator_, 'models/bone_summer_L4_no_metadata.pkl')

['models/bone_summer_L4_no_metadata.pkl']

In [20]:
# Reload the saved level-4 model so the cells below work from the persisted
# file; the bare name on the last line displays the estimator's parameters.
# joblib.load unpickles the file, so only load model files you trust.
bone_summer_L4 = joblib.load('models/bone_summer_L4_no_metadata.pkl')
bone_summer_L4

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                      max_depth=8, max_features=0.1, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features of summer L4 model

In [58]:
# Feature importances of the reloaded level-4 forest.
importances = bone_summer_L4.feature_importances_
# Spread of each feature's importance across the individual trees.
# Not used further in this cell.
std = np.std([tree.feature_importances_ for tree in bone_summer_L4.estimators_],
             axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Save the feature metadata table (in its original, unsorted order).
feature_metadata = Summer_L4.feature_metadata
np.savetxt("importances/bone_summer_L4_no_metadata_features.csv", feature_metadata, delimiter=",", fmt='%s')

# Walk the features in descending importance: collect every
# (feature id, importance) pair and print the top few.
feature_ids = Summer_L4.feature_metadata.index.values
n_top_printed = 25
general_importances = []
print("Feature:\t\t\t\tImportance:")
for rank, i in enumerate(indices):
    # The pair is flattened into the list, so ids and importances alternate.
    general_importances += (feature_ids[i], importances[i])
    if rank < n_top_printed:
        print(str(rank+1)+". "+str(feature_ids[i])+"\t"+str(importances[i]))

# One value per row (id, importance, id, importance, ...). reshape(-1, 1)
# derives the row count from the data instead of hard-coding it, so the cell
# keeps working if the number of features changes.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

# NOTE(review): this file name says "no_features" while the other outputs use
# "no_metadata"; left unchanged in case downstream code reads this exact path.
np.savetxt("importances/bone_summer_L4_no_features_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. k__Bacteria;p__Actinobacteria;c__Thermoleophilia;o__Solirubrobacterales	0.05077120829664414
2. k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales	0.049376261097152534
3. k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodobacterales	0.041984544754360024
4. k__Bacteria;p__Actinobacteria;c__Acidimicrobiia;o__Acidimicrobiales	0.03845661144479225
5. k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Sphingomonadales	0.037337651554120194
6. k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Caulobacterales	0.03564867194786332
7. k__Bacteria;p__Bacteroidetes;c__Flavobacteriia;o__Flavobacteriales	0.035319211631802946
8. k__Bacteria;p__Verrucomicrobia;c__[Spartobacteria];o__[Chthoniobacterales]	0.03423314959719621
9. k__Bacteria;p__Planctomycetes;c__Planctomycetia;o__Planctomycetales	0.03184929459571207
10. k__Bacteria;p__Bacteroidetes;c__[Saprospirae];o__[Saprospirales]	0.028218089629226555
11. k__Bacteria;p__Proteobacteria;c__Alp

## Import rarefied table collapsed at level 3

In [21]:
# Load the rarefied feature table collapsed at taxonomic level 3 together with
# the sample map. normalize=None: no normalization is applied at import.
# NOTE(review): unlike the level-4 and level-2 imports, the sample map is given
# here as an absolute path; presumably it is the same map3.txt - confirm, and
# consider a configurable data directory instead of user-specific paths.
exp_L3 = ca.read_amplicon(
    '/Users/heatherdeel/Dropbox/PMI_3_analyses/bone/01_16S/01_qiime2_analysis/feature_tables/collapsed_tables/frag_ins_noChloMito_17098_L3_table.biom',
    '/Users/heatherdeel/Dropbox/PMI_3_analyses/bone/01_16S/02_metadata/maps/map3.txt',
    min_reads=0,
    normalize=None,
)




In [22]:
# Restrict the level-3 table to samples whose 'season' field is 'summer',
# then show how many samples remain for each host_subject_id.
Summer_L3 = exp_L3.filter_samples('season', 'summer')
Summer_L3.sample_metadata['host_subject_id'].value_counts()

STAFS2016.067    8
STAFS2016.065    8
STAFS2016.064    7
Name: host_subject_id, dtype: int64

In [23]:
# Print the feature metadata table of the summer level-3 experiment as plain
# text; its index holds the collapsed taxonomy strings used as feature ids.
print(Summer_L3.feature_metadata)

                                                                                          _feature_id
k__Archaea;p__Crenarchaeota;c__Thaumarchaeota           k__Archaea;p__Crenarchaeota;c__Thaumarchaeota
k__Archaea;p__Euryarchaeota;c__Methanobacteria         k__Archaea;p__Euryarchaeota;c__Methanobacteria
k__Bacteria;__;__                                                                   k__Bacteria;__;__
k__Bacteria;p__Acidobacteria;c__DA052                           k__Bacteria;p__Acidobacteria;c__DA052
k__Bacteria;p__Acidobacteria;c__Solibacteres             k__Bacteria;p__Acidobacteria;c__Solibacteres
k__Bacteria;p__Acidobacteria;c__[Chloracidobact...  k__Bacteria;p__Acidobacteria;c__[Chloracidobac...
k__Bacteria;p__Actinobacteria;c__Acidimicrobiia       k__Bacteria;p__Actinobacteria;c__Acidimicrobiia
k__Bacteria;p__Actinobacteria;c__Actinobacteria       k__Bacteria;p__Actinobacteria;c__Actinobacteria
k__Bacteria;p__Actinobacteria;c__Coriobacteriia       k__Bacteria;p__Actinobacteri

## Summer L3 modeling

In [24]:
# Grouped 3-fold cross-validation keyed on host_subject_id: the summer subset
# contains three bodies, so each fold holds out all samples from one body.
gkf = GroupKFold(3)

# Feature matrix of the summer level-3 experiment.
X = Summer_L3.data
# Regression target: the 'ADD_0' sample-metadata column, cast to float.
y = Summer_L3.sample_metadata['ADD_0'].astype(float)

# Group label for every sample, used to keep a body's samples in one fold.
groups = Summer_L3.sample_metadata['host_subject_id']

# Full grid used for the initial hyperparameter search. Uncomment it (and
# comment out the single-point grid below) to repeat the tuning; the best
# parameters it reports are what the single-point grid below encodes.
#param_grid = {"max_depth": [4, 8, 16, None],
#              "max_features": ['sqrt', 'log2', 0.1],
#              "min_samples_split": [0.001, 0.01, 0.1],
#              "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
#              "bootstrap": [True, False]}

# Single-point grid holding the tuned values.
param_grid = {"max_depth": [8],
          "max_features": [0.1],
          "min_samples_split": [0.001],
          "min_weight_fraction_leaf": [0.0001],
          "bootstrap": [False]}

rf = RandomForestRegressor(n_estimators=1000, random_state=999, criterion='mae')
# gkf.split() returns a one-shot generator. Materialize it so the same folds
# are still available if the fit cell is re-run without re-running this cell
# (an exhausted generator would leave the search with no splits).
gs = GridSearchCV(rf, param_grid=param_grid, cv=list(gkf.split(X, y, groups)), scoring='neg_mean_absolute_error', n_jobs=1)

In [25]:
# Run the grid search on the summer level-3 data: every parameter combination
# is scored on the grouped folds configured in the previous cell.
gs.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x7fde460e2850>,
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n...imators=1000, n_jobs=None,
                                             oob_score=False, random_state=999,
                                             v

In [26]:
# Report the winning parameter combination. This is informative when the full
# (commented-out) grid is searched; with the single-point grid it simply
# echoes that grid back.
print(gs.best_params_)

{'bootstrap': False, 'max_depth': 8, 'max_features': 0.1, 'min_samples_split': 0.001, 'min_weight_fraction_leaf': 0.0001}


In [27]:
# The search is scored with 'neg_mean_absolute_error', so best_score_ is the
# negated MAE; flip the sign to report the cross-validated MAE itself.
print('The best mean absolute error is', round(-gs.best_score_,2))

The best mean absolute error is 853.38


In [28]:
# Persist the best estimator found by the search to disk.
# Assumes the 'models/' directory already exists relative to the notebook.
joblib.dump(gs.best_estimator_, 'models/bone_summer_L3_no_metadata.pkl')

['models/bone_summer_L3_no_metadata.pkl']

In [29]:
# Reload the saved level-3 model so the cells below work from the persisted
# file; the bare name on the last line displays the estimator's parameters.
# joblib.load unpickles the file, so only load model files you trust.
bone_summer_L3 = joblib.load('models/bone_summer_L3_no_metadata.pkl')
bone_summer_L3

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                      max_depth=8, max_features=0.1, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features of summer L3 model

In [69]:
# Feature importances of the reloaded level-3 forest.
importances = bone_summer_L3.feature_importances_
# Spread of each feature's importance across the individual trees.
# Not used further in this cell.
std = np.std([tree.feature_importances_ for tree in bone_summer_L3.estimators_],
             axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Save the feature metadata table (in its original, unsorted order).
feature_metadata = Summer_L3.feature_metadata
np.savetxt("importances/bone_summer_L3_no_metadata_features.csv", feature_metadata, delimiter=",", fmt='%s')

# Walk the features in descending importance: collect every
# (feature id, importance) pair and print the top few.
feature_ids = Summer_L3.feature_metadata.index.values
n_top_printed = 25
general_importances = []
print("Feature:\t\t\t\tImportance:")
for rank, i in enumerate(indices):
    # The pair is flattened into the list, so ids and importances alternate.
    general_importances += (feature_ids[i], importances[i])
    if rank < n_top_printed:
        print(str(rank+1)+". "+str(feature_ids[i])+"\t"+str(importances[i]))

# One value per row (id, importance, id, importance, ...). reshape(-1, 1)
# derives the row count from the data instead of hard-coding it, so the cell
# keeps working if the number of features changes.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

# NOTE(review): this file name says "no_features" while the other outputs use
# "no_metadata"; left unchanged in case downstream code reads this exact path.
np.savetxt("importances/bone_summer_L3_no_features_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria	0.07929831389694235
2. k__Bacteria;p__Actinobacteria;c__Thermoleophilia	0.06651285742817459
3. k__Bacteria;p__Verrucomicrobia;c__[Spartobacteria]	0.05466499984212487
4. k__Bacteria;p__Bacteroidetes;c__Flavobacteriia	0.05242951547689456
5. k__Bacteria;p__Actinobacteria;c__Acidimicrobiia	0.05185902601489251
6. k__Bacteria;p__Planctomycetes;c__Planctomycetia	0.05034574657113114
7. k__Bacteria;p__Bacteroidetes;c__Sphingobacteriia	0.04949065850050336
8. k__Bacteria;p__Bacteroidetes;c__[Saprospirae]	0.044200862363166185
9. k__Bacteria;p__Proteobacteria;c__Deltaproteobacteria	0.03703220952472006
10. k__Bacteria;p__Proteobacteria;c__Betaproteobacteria	0.03549949983034941
11. k__Bacteria;p__Chloroflexi;c__Thermomicrobia	0.03142125767802066
12. k__Bacteria;p__Fusobacteria;c__Fusobacteriia	0.03138290061676391
13. k__Bacteria;p__Firmicutes;c__Clostridia	0.0299922066244084
14. k__Bacteria;p__Firmicutes;c__Bacilli	0.0282

## Import rarefied table collapsed at level 2

In [30]:
# Load the rarefied feature table collapsed at taxonomic level 2 together with
# the sample map. normalize=None: no normalization is applied at import.
# NOTE(review): the table path is an absolute, user-specific path; consider a
# configurable data directory so the notebook runs on other machines.
exp_L2 = ca.read_amplicon(
    '/Users/heatherdeel/Dropbox/PMI_3_analyses/bone/01_16S/01_qiime2_analysis/feature_tables/collapsed_tables/frag_ins_noChloMito_17098_L2_table.biom',
    '../02_metadata/maps/map3.txt',
    min_reads=0,
    normalize=None,
)




In [31]:
# Restrict the level-2 table to samples whose 'season' field is 'summer',
# then show how many samples remain for each host_subject_id.
Summer_L2 = exp_L2.filter_samples('season', 'summer')
Summer_L2.sample_metadata['host_subject_id'].value_counts()

STAFS2016.067    8
STAFS2016.065    8
STAFS2016.064    7
Name: host_subject_id, dtype: int64

In [32]:
# Print the feature metadata table of the summer level-2 experiment as plain
# text; its index holds the collapsed taxonomy strings used as feature ids.
print(Summer_L2.feature_metadata)

                                                     _feature_id
k__Archaea;p__Crenarchaeota          k__Archaea;p__Crenarchaeota
k__Archaea;p__Euryarchaeota          k__Archaea;p__Euryarchaeota
k__Bacteria;__                                    k__Bacteria;__
k__Bacteria;p__Acidobacteria        k__Bacteria;p__Acidobacteria
k__Bacteria;p__Actinobacteria      k__Bacteria;p__Actinobacteria
k__Bacteria;p__Armatimonadetes    k__Bacteria;p__Armatimonadetes
k__Bacteria;p__BRC1                          k__Bacteria;p__BRC1
k__Bacteria;p__Bacteroidetes        k__Bacteria;p__Bacteroidetes
k__Bacteria;p__Chlorobi                  k__Bacteria;p__Chlorobi
k__Bacteria;p__Chloroflexi            k__Bacteria;p__Chloroflexi
k__Bacteria;p__Cyanobacteria        k__Bacteria;p__Cyanobacteria
k__Bacteria;p__Deferribacteres    k__Bacteria;p__Deferribacteres
k__Bacteria;p__FBP                            k__Bacteria;p__FBP
k__Bacteria;p__Firmicutes              k__Bacteria;p__Firmicutes
k__Bacteria;p__Fusobacter

## Summer L2 modeling

In [33]:
# Grouped 3-fold cross-validation keyed on host_subject_id: the summer subset
# contains three bodies, so each fold holds out all samples from one body.
gkf = GroupKFold(3)

# Feature matrix of the summer level-2 experiment.
X = Summer_L2.data
# Regression target: the 'ADD_0' sample-metadata column, cast to float.
y = Summer_L2.sample_metadata['ADD_0'].astype(float)

# Group label for every sample, used to keep a body's samples in one fold.
groups = Summer_L2.sample_metadata['host_subject_id']

# Full grid used for the initial hyperparameter search. Uncomment it (and
# comment out the single-point grid below) to repeat the tuning; the best
# parameters it reports are what the single-point grid below encodes.
#param_grid = {"max_depth": [4, 8, 16, None],
#              "max_features": ['sqrt', 'log2', 0.1],
#              "min_samples_split": [0.001, 0.01, 0.1],
#              "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
#              "bootstrap": [True, False]}

# Single-point grid holding the tuned values.
param_grid = {"max_depth": [16],
          "max_features": ['log2'],
          "min_samples_split": [0.001],
          "min_weight_fraction_leaf": [0.0001],
          "bootstrap": [False]}

rf = RandomForestRegressor(n_estimators=1000, random_state=999, criterion='mae')
# gkf.split() returns a one-shot generator. Materialize it so the same folds
# are still available if the fit cell is re-run without re-running this cell
# (an exhausted generator would leave the search with no splits).
gs = GridSearchCV(rf, param_grid=param_grid, cv=list(gkf.split(X, y, groups)), scoring='neg_mean_absolute_error', n_jobs=1)

In [34]:
# Run the grid search on the summer level-2 data: every parameter combination
# is scored on the grouped folds configured in the previous cell.
gs.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x7fde460033d0>,
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n...ors=1000, n_jobs=None,
                                             oob_score=False, random_state=999,
                                             verbo

In [35]:
# Report the winning parameter combination. This is informative when the full
# (commented-out) grid is searched; with the single-point grid it simply
# echoes that grid back.
print(gs.best_params_)

{'bootstrap': False, 'max_depth': 16, 'max_features': 'log2', 'min_samples_split': 0.001, 'min_weight_fraction_leaf': 0.0001}


In [36]:
# The search is scored with 'neg_mean_absolute_error', so best_score_ is the
# negated MAE; flip the sign to report the cross-validated MAE itself.
print('The best mean absolute error is', round(-gs.best_score_,2))

The best mean absolute error is 778.17


In [77]:
# Persist the best estimator found by the search to disk.
# Assumes the 'models/' directory already exists relative to the notebook.
joblib.dump(gs.best_estimator_, 'models/bone_summer_L2_no_metadata.pkl')

['models/bone_summer_L2_no_metadata.pkl']

In [78]:
# Reload the saved level-2 model so the cells below work from the persisted
# file; the bare name on the last line displays the estimator's parameters.
# joblib.load unpickles the file, so only load model files you trust.
# NOTE(review): the output recorded for this cell reports bootstrap=True, while
# the level-2 grid above fixes bootstrap to False. The pickle may have been
# written from a different search run; re-run the notebook top to bottom to
# confirm the saved model matches the grid shown.
bone_summer_L2 = joblib.load('models/bone_summer_L2_no_metadata.pkl')
bone_summer_L2

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mae',
                      max_depth=16, max_features='log2', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features of summer L2 model

In [79]:
# Feature importances of the reloaded level-2 forest.
importances = bone_summer_L2.feature_importances_
# Spread of each feature's importance across the individual trees.
# Not used further in this cell.
std = np.std([tree.feature_importances_ for tree in bone_summer_L2.estimators_],
             axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Save the feature metadata table (in its original, unsorted order).
feature_metadata = Summer_L2.feature_metadata
np.savetxt("importances/bone_summer_L2_no_metadata_features.csv", feature_metadata, delimiter=",", fmt='%s')

# Walk the features in descending importance: collect every
# (feature id, importance) pair and print the top few.
feature_ids = Summer_L2.feature_metadata.index.values
n_top_printed = 25
general_importances = []
print("Feature:\t\t\t\tImportance:")
for rank, i in enumerate(indices):
    # The pair is flattened into the list, so ids and importances alternate.
    general_importances += (feature_ids[i], importances[i])
    if rank < n_top_printed:
        print(str(rank+1)+". "+str(feature_ids[i])+"\t"+str(importances[i]))

# One value per row (id, importance, id, importance, ...). reshape(-1, 1)
# derives the row count from the data instead of hard-coding it, so the cell
# keeps working if the number of features changes.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

# NOTE(review): this file name says "no_features" while the other outputs use
# "no_metadata"; left unchanged in case downstream code reads this exact path.
np.savetxt("importances/bone_summer_L2_no_features_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. k__Bacteria;p__Verrucomicrobia	0.11302707395665576
2. k__Bacteria;p__Bacteroidetes	0.11087063889610961
3. k__Bacteria;p__Planctomycetes	0.09218155827219474
4. k__Bacteria;p__Chloroflexi	0.07972517911974596
5. k__Bacteria;p__Proteobacteria	0.073166346704778
6. k__Bacteria;p__Firmicutes	0.06769671381101959
7. k__Bacteria;p__Fusobacteria	0.06009915672337473
8. k__Bacteria;p__Actinobacteria	0.05922858771220375
9. k__Bacteria;p__Gemmatimonadetes	0.048902943795039115
10. k__Bacteria;p__FBP	0.03940457445530486
11. k__Bacteria;p__BRC1	0.036401595000867225
12. k__Bacteria;p__[Thermi]	0.036196602508893366
13. k__Bacteria;p__Acidobacteria	0.035618782115436586
14. k__Bacteria;p__TM7	0.03505468807333406
15. k__Bacteria;p__Tenericutes	0.022797329658920753
16. k__Bacteria;p__Armatimonadetes	0.021541747266305263
17. k__Bacteria;p__Chlorobi	0.016164648530056273
18. k__Bacteria;p__Cyanobacteria	0.014061163094108611
19. k__Bacteria;p__Synergistetes	0.010716159870423021
20. k__A