## Description

Bone models of 16S rRNA data at all levels, using the ml_new environment, rarefied data with no normalization during import, 1000 estimators (n_estimators = 1000), 6-fold cross-validation (regular, non-nested), and hyperparameter tuning. Incorporates the metadata features season, accumulated humidity, and accumulated precipitation in all combinations. 

In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing, svm, metrics
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold, KFold, GridSearchCV, GroupKFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVR
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_error, roc_auc_score, precision_score, make_scorer
from sklearn.linear_model import LassoCV, Lasso, LassoLarsIC, ElasticNet, LassoLarsCV
from sklearn.externals import joblib
from scipy.stats import randint as sp_randint
from matplotlib import pyplot as plt
from sklearn.feature_selection import RFECV
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier



In [2]:
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sns
%matplotlib inline
import biom
import calour as ca
from scipy import stats
import scipy
import pickle
import time
import math
import inspect
pd.set_option('display.max_rows', 10000)

  import pandas.util.testing as pdt


## Import rarefied data

In [3]:
# Load the rarefied feature table together with its sample map.
# normalize=None: no normalization is applied on import.
# min_reads=0 presumably disables the per-sample read-count filter -- confirm against calour docs.
exp_ASV = ca.read_amplicon(
    '../01_qiime2_analysis/feature_tables/exported-feature-tables/frag_ins_filtered_noChloMito_17098_table.biom',
    '../02_metadata/maps/map3.txt',
    min_reads=0,
    normalize=None,
)




In [4]:
# Number of samples per body (host_subject_id); this column is used as the CV grouping variable below.
exp_ASV.sample_metadata['host_subject_id'].value_counts()

STAFS2016.065    8
STAFS2016.067    8
STAFS2016.011    8
STAFS2016.024    7
STAFS2016.064    7
STAFS2016.007    7
Name: host_subject_id, dtype: int64

In [5]:
# Inspect the ADD_0 sample-metadata column, which is the regression target (y) for every model below.
exp_ASV.sample_metadata['ADD_0']

#SampleID
11553.SHSU.STAFS2016.024.R09      708.0555556
11553.STAFS.2016.065.L12          2268.055556
11553.STAFS.2016.067.R12          2414.444444
11553.SHSU.STAFS2016.007.L09      1911.111111
11553.SHSU.STAFS2016.011.L08      1301.666667
11553.STAFS.2016.065.R11          4756.388889
11553.STAFS.2016.065.R09          3456.111111
11553.SHSU.STAFS2016.024.L12      4285.833333
11553.SHSU.STAFS2016.007.L11      2584.444444
11553.SHSU.STAFS2016.011.L10      3777.222222
11553.STAFS.2016.064.R10          3456.111111
11553.SHSU.STAFS2016.024.R11      2584.444444
11553.STAFS.2016.067.L11.june     5201.388889
11553.SHSU.STAFS2016.007.R12      1301.666667
11553.STAFS.2016.065.R10          2804.722222
11553.STAFS.2016.064.R09          592.7777778
11553.STAFS.2016.067.L10          3623.055556
11553.SHSU.STAFS2016.011.R12      708.0555556
11553.STAFS.2016.064.R11          1300.555556
11553.SHSU.STAFS2016.011.L11      4285.833333
11553.SHSU.STAFS2016.007.L10      4821.388889
11553.STAFS.2016.065.R08

## Running the ASV no metadata model

Testing the param grid to expand hyperparameter tuning - which parameters give the best model?

In [6]:
# Grouped 6-fold CV: all samples sharing a host_subject_id stay on the same side
# of every split, so each fold leaves out one body.
gkf = GroupKFold(6)

# Feature matrix and regression target (ADD_0 cast to float).
X = exp_ASV.data
y = exp_ASV.sample_metadata['ADD_0'].astype(float)

groups = exp_ASV.sample_metadata['host_subject_id']

# Full search grid used for hyperparameter tuning. After a first run, the values
# printed from gs.best_params_ can be plugged into a single-candidate grid like the
# commented-out example below to refit only that configuration.
param_grid = {"max_depth": [4, 8, 16, None],
              "max_features": ['sqrt', 'log2', 0.1],
              "min_samples_split": [0.001, 0.01, 0.1],
              "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
              "bootstrap": [True, False]}

#param_grid = {"max_depth": [8],
#          "max_features": [0.1],
#          "min_samples_split": [0.001],
#          "min_weight_fraction_leaf": [0.0001],
#          "bootstrap": [False]}

rf = RandomForestRegressor(n_estimators=1000, random_state=999, criterion='mae')

# GroupKFold.split() returns a one-shot generator. Materialize the folds so `gs`
# can be fitted more than once (e.g. re-running only the fit cell) without being
# handed an exhausted iterator; the folds themselves are unchanged.
cv_splits = list(gkf.split(X, y, groups))
gs = GridSearchCV(rf, param_grid=param_grid, cv=cv_splits, scoring='neg_mean_absolute_error', n_jobs=1)

In [None]:
# Run the grid search: 216 parameter combinations x 6 CV folds, 1000 trees per fit -- expect a long run.
gs.fit(X, y)

In [12]:
# Show the parameter combination with the best cross-validated score from the full grid;
# these are the values to use when narrowing the grid for tuning.
print(gs.best_params_)

{'bootstrap': False, 'max_depth': 16, 'max_features': 0.1, 'min_samples_split': 0.001, 'min_weight_fraction_leaf': 0.0001}


In [13]:
# The search scores with 'neg_mean_absolute_error', so negate best_score_ to report a positive MAE.
best_mae = round(-gs.best_score_, 2)
print('The best mean absolute error is', best_mae)

The best mean absolute error is 804.88


In [14]:
# Save the best estimator found by the grid search to disk (the models/ directory must already exist).
joblib.dump(gs.best_estimator_, 'models/bone_ASV_no_metadata.pkl')

['models/bone_ASV_no_metadata.pkl']

In [4]:
# Reload the saved no-metadata model from disk and display its configuration.
bone_ASV = joblib.load('models/bone_ASV_no_metadata.pkl')
bone_ASV

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                      max_depth=16, max_features=0.1, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features of ASV no metadata model

In [26]:
# Feature importances of the fitted forest, their spread across the individual
# trees, and the feature positions ordered from most to least important.
importances = bone_ASV.feature_importances_
std = np.std([tree.feature_importances_ for tree in bone_ASV.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Export the feature metadata table of the experiment.
feature_metadata = exp_ASV.feature_metadata
np.savetxt("importances/bone_ASV_no_metadata_features.csv", feature_metadata, delimiter=",", fmt='%s')

# Collect (feature id, importance) in ranked order and print the top entries.
feature_ids = exp_ASV.feature_metadata.index.values
n_top = 25
general_importances = []
print("Feature:\t\t\t\tImportance:")
for rank, i in enumerate(indices, start=1):
    general_importances.extend((feature_ids[i], importances[i]))
    if rank <= n_top:
        print(str(rank)+". "+str(feature_ids[i])+"\t"+str(importances[i]))

# Single-column layout: rows alternate feature id, importance. reshape(-1, 1) infers
# the row count, so the cell no longer depends on a hard-coded 2 * n_features.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

# NOTE(review): this file name says "no_features" while the sibling export above uses
# "no_metadata" -- likely a typo, left unchanged in case downstream code reads this path.
np.savetxt("importances/bone_ASV_no_features_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATTGGTCAGTTAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTTTAATACTGCCGGTCTAGAGTTCGAGAGAGGTGAGTGGAATTCCGAG	0.04195649140072618
2. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATCGATCAGTTAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTCTAATACTGTCGATCTCGAGTTCGAGAGAGGTGAGTGGAATTCCGAG	0.03894779833031093
3. TACGTAGGGGGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGGGTACGTAGGCGGTTTAGAAAGTCAGATGTTAAAGCGTGAGGCTCAACCTCATAAAGCATTTGAAACTTCTAGACTTGAGTAGTGGAGAGGAAAGTGGAATTCCTAGT	0.03468390673507264
4. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATCGATCAGTTAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTCTAATACTGTCGATCTCGAGTTCGAGAGAGGTGAGTGGAATTCCGAG	0.031517240074916145
5. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATATTTAAGTCAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTTTGATACTGGGTATCTGGAGTATGGAAGAGGTAAGTGGAATTCCGAG	0.029463431785990393
6. TACGTAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGTCATGCAAGACAGATGTGAAATCCCCGGGCTCAACCTGGGAACTGC

## Run the ASV model with season

In [6]:
# Experiment variant with the 'season' sample-metadata field added to the feature matrix.
exp_ASV_season = exp_ASV.add_sample_metadata_as_features(['season'])

In [18]:
# Grouped 6-fold CV: all samples sharing a host_subject_id stay on the same side
# of every split, so each fold leaves out one body.
gkf = GroupKFold(6)

# Feature matrix (ASVs + season) and regression target (ADD_0 cast to float).
X = exp_ASV_season.data
y = exp_ASV_season.sample_metadata['ADD_0'].astype(float)

groups = exp_ASV_season.sample_metadata['host_subject_id']

# Full search grid used for hyperparameter tuning. After a first run, the values
# printed from gs.best_params_ can be plugged into a single-candidate grid like the
# commented-out example below to refit only that configuration.
param_grid = {"max_depth": [4, 8, 16, None],
              "max_features": ['sqrt', 'log2', 0.1],
              "min_samples_split": [0.001, 0.01, 0.1],
              "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
              "bootstrap": [True, False]}

#param_grid = {"max_depth": [8],
#          "max_features": [0.1],
#          "min_samples_split": [0.001],
#          "min_weight_fraction_leaf": [0.0001],
#          "bootstrap": [False]}

rf = RandomForestRegressor(n_estimators=1000, random_state=999, criterion='mae')

# GroupKFold.split() returns a one-shot generator. Materialize the folds so `gs`
# can be fitted more than once (e.g. re-running only the fit cell) without being
# handed an exhausted iterator; the folds themselves are unchanged.
cv_splits = list(gkf.split(X, y, groups))
gs = GridSearchCV(rf, param_grid=param_grid, cv=cv_splits, scoring='neg_mean_absolute_error', n_jobs=1)

In [19]:
# Run the grid search: 216 parameter combinations x 6 CV folds, 1000 trees per fit -- expect a long run.
gs.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x11151aed0>,
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_es...
                                             oob_score=False, random_state=999,
                                             verbose=0, warm_start=False

In [20]:
# Show the parameter combination with the best cross-validated score from the full grid;
# these are the values to use when narrowing the grid for tuning.
print(gs.best_params_)

{'bootstrap': False, 'max_depth': 8, 'max_features': 0.1, 'min_samples_split': 0.001, 'min_weight_fraction_leaf': 0.0001}


In [21]:
# The search scores with 'neg_mean_absolute_error', so negate best_score_ to report a positive MAE.
best_mae = round(-gs.best_score_, 2)
print('The best mean absolute error is', best_mae)

The best mean absolute error is 804.3


In [22]:
# Save the best estimator found by the grid search to disk (the models/ directory must already exist).
joblib.dump(gs.best_estimator_, 'models/bone_ASV_season.pkl')

['models/bone_ASV_season.pkl']

In [5]:
# Reload the saved season model from disk and display its configuration.
bone_ASV_season = joblib.load('models/bone_ASV_season.pkl')
bone_ASV_season

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                      max_depth=8, max_features=0.1, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features for ASV season model

In [11]:
# Feature importances of the fitted forest, their spread across the individual
# trees, and the feature positions ordered from most to least important.
importances = bone_ASV_season.feature_importances_
std = np.std([tree.feature_importances_ for tree in bone_ASV_season.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Export the feature metadata table of the experiment.
feature_metadata = exp_ASV_season.feature_metadata
np.savetxt("importances/bone_ASV_season_features.csv", feature_metadata, delimiter=",", fmt='%s')

# Collect (feature id, importance) in ranked order and print the top entries.
feature_ids = exp_ASV_season.feature_metadata.index.values
n_top = 25
general_importances = []
print("Feature:\t\t\t\tImportance:")
for rank, i in enumerate(indices, start=1):
    general_importances.extend((feature_ids[i], importances[i]))
    if rank <= n_top:
        print(str(rank)+". "+str(feature_ids[i])+"\t"+str(importances[i]))

# Single-column layout: rows alternate feature id, importance. reshape(-1, 1) infers
# the row count, so the cell no longer depends on a hard-coded 2 * n_features.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

np.savetxt("importances/bone_ASV_season_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. TACGTAGGGGGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGGGTACGTAGGCGGTTTAGAAAGTCAGATGTTAAAGCGTGAGGCTCAACCTCATAAAGCATTTGAAACTTCTAGACTTGAGTAGTGGAGAGGAAAGTGGAATTCCTAGT	0.037859251293875816
2. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATTGGTCAGTTAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTTTAATACTGCCGGTCTAGAGTTCGAGAGAGGTGAGTGGAATTCCGAG	0.03732798395045147
3. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATATTTAAGTCAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTTTGATACTGGGTATCTGGAGTATGGAAGAGGTAAGTGGAATTCCGAG	0.034523517412016244
4. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATCGATCAGTTAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTCTAATACTGTCGATCTCGAGTTCGAGAGAGGTGAGTGGAATTCCGAG	0.03269390452558569
5. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATCGATCAGTTAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTCTAATACTGTCGATCTCGAGTTCGAGAGAGGTGAGTGGAATTCCGAG	0.02960124401244732
6. TACGTAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGTGCGCAGGCGGTCATGCAAGACAGATGTGAAATCCCCGGGCTCAACCTGGGAACTGC

## Run the ASV model with precipitation

In [12]:
# Experiment variant with accumulated precipitation added to the feature matrix.
exp_ASV_precip = exp_ASV.add_sample_metadata_as_features(['accumulated_precipitation_inches_base0'])

In [13]:
# Grouped 6-fold CV: all samples sharing a host_subject_id stay on the same side
# of every split, so each fold leaves out one body.
gkf = GroupKFold(6)

# Feature matrix (ASVs + precipitation) and regression target (ADD_0 cast to float).
X = exp_ASV_precip.data
y = exp_ASV_precip.sample_metadata['ADD_0'].astype(float)

groups = exp_ASV_precip.sample_metadata['host_subject_id']

# Full search grid used for hyperparameter tuning. After a first run, the values
# printed from gs.best_params_ can be plugged into a single-candidate grid like the
# commented-out example below to refit only that configuration.
param_grid = {"max_depth": [4, 8, 16, None],
              "max_features": ['sqrt', 'log2', 0.1],
              "min_samples_split": [0.001, 0.01, 0.1],
              "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
              "bootstrap": [True, False]}

#param_grid = {"max_depth": [8],
#          "max_features": [0.1],
#          "min_samples_split": [0.001],
#          "min_weight_fraction_leaf": [0.0001],
#          "bootstrap": [False]}

rf = RandomForestRegressor(n_estimators=1000, random_state=999, criterion='mae')

# GroupKFold.split() returns a one-shot generator. Materialize the folds so `gs`
# can be fitted more than once (e.g. re-running only the fit cell) without being
# handed an exhausted iterator; the folds themselves are unchanged.
cv_splits = list(gkf.split(X, y, groups))
gs = GridSearchCV(rf, param_grid=param_grid, cv=cv_splits, scoring='neg_mean_absolute_error', n_jobs=1)

In [14]:
# Run the grid search: 216 parameter combinations x 6 CV folds, 1000 trees per fit -- expect a long run.
gs.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x11a417250>,
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_es...
                                             oob_score=False, random_state=999,
                                             verbose=0, warm_start=False

In [15]:
# Show the parameter combination with the best cross-validated score from the full grid;
# these are the values to use when narrowing the grid for tuning.
print(gs.best_params_)

{'bootstrap': False, 'max_depth': 16, 'max_features': 0.1, 'min_samples_split': 0.001, 'min_weight_fraction_leaf': 0.0001}


In [16]:
# The search scores with 'neg_mean_absolute_error', so negate best_score_ to report a positive MAE.
best_mae = round(-gs.best_score_, 2)
print('The best mean absolute error is', best_mae)

The best mean absolute error is 689.63


In [17]:
# Save the best estimator found by the grid search to disk (the models/ directory must already exist).
joblib.dump(gs.best_estimator_, 'models/bone_ASV_precip.pkl')

['models/bone_ASV_precip.pkl']

In [18]:
# Reload the saved precipitation model from disk and display its configuration.
bone_ASV_precip = joblib.load('models/bone_ASV_precip.pkl')
bone_ASV_precip

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                      max_depth=16, max_features=0.1, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features for ASV precipitation model

In [21]:
# Feature importances of the fitted forest, their spread across the individual
# trees, and the feature positions ordered from most to least important.
importances = bone_ASV_precip.feature_importances_
std = np.std([tree.feature_importances_ for tree in bone_ASV_precip.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Export the feature metadata table of the experiment.
feature_metadata = exp_ASV_precip.feature_metadata
np.savetxt("importances/bone_ASV_precip_features.csv", feature_metadata, delimiter=",", fmt='%s')

# Collect (feature id, importance) in ranked order and print the top entries.
feature_ids = exp_ASV_precip.feature_metadata.index.values
n_top = 25
general_importances = []
print("Feature:\t\t\t\tImportance:")
for rank, i in enumerate(indices, start=1):
    general_importances.extend((feature_ids[i], importances[i]))
    if rank <= n_top:
        print(str(rank)+". "+str(feature_ids[i])+"\t"+str(importances[i]))

# Single-column layout: rows alternate feature id, importance. reshape(-1, 1) infers
# the row count, so the cell no longer depends on a hard-coded 2 * n_features.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

np.savetxt("importances/bone_ASV_precip_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. accumulated_precipitation_inches_base0	0.086767526783665
2. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATTGGTCAGTTAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTTTAATACTGCCGGTCTAGAGTTCGAGAGAGGTGAGTGGAATTCCGAG	0.036484539652717855
3. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATCGATCAGTTAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTCTAATACTGTCGATCTCGAGTTCGAGAGAGGTGAGTGGAATTCCGAG	0.03457456453679246
4. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATCGATCAGTTAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTCTAATACTGTCGATCTCGAGTTCGAGAGAGGTGAGTGGAATTCCGAG	0.03445563128574849
5. TACGTAGGGGGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGGGTACGTAGGCGGTTTAGAAAGTCAGATGTTAAAGCGTGAGGCTCAACCTCATAAAGCATTTGAAACTTCTAGACTTGAGTAGTGGAGAGGAAAGTGGAATTCCTAGT	0.03164648922991007
6. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATATTTAAGTCAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTTTGATACTGGGTATCTGGAGTATGGAAGAGGTAAGTGGAATTCCGAG	0.025034826354821654
7. TACGTAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAA

## Run the ASV model with humidity

In [22]:
# Experiment variant with accumulated humidity added to the feature matrix.
exp_ASV_hum = exp_ASV.add_sample_metadata_as_features(['accumulated_percent_humidity_days_base0'])

In [23]:
# Grouped 6-fold CV: all samples sharing a host_subject_id stay on the same side
# of every split, so each fold leaves out one body.
gkf = GroupKFold(6)

# Feature matrix (ASVs + humidity) and regression target (ADD_0 cast to float).
X = exp_ASV_hum.data
y = exp_ASV_hum.sample_metadata['ADD_0'].astype(float)

groups = exp_ASV_hum.sample_metadata['host_subject_id']

# Full search grid used for hyperparameter tuning. After a first run, the values
# printed from gs.best_params_ can be plugged into a single-candidate grid like the
# commented-out example below to refit only that configuration.
param_grid = {"max_depth": [4, 8, 16, None],
              "max_features": ['sqrt', 'log2', 0.1],
              "min_samples_split": [0.001, 0.01, 0.1],
              "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
              "bootstrap": [True, False]}

#param_grid = {"max_depth": [8],
#          "max_features": [0.1],
#          "min_samples_split": [0.001],
#          "min_weight_fraction_leaf": [0.0001],
#          "bootstrap": [False]}

rf = RandomForestRegressor(n_estimators=1000, random_state=999, criterion='mae')

# GroupKFold.split() returns a one-shot generator. Materialize the folds so `gs`
# can be fitted more than once (e.g. re-running only the fit cell) without being
# handed an exhausted iterator; the folds themselves are unchanged.
cv_splits = list(gkf.split(X, y, groups))
gs = GridSearchCV(rf, param_grid=param_grid, cv=cv_splits, scoring='neg_mean_absolute_error', n_jobs=1)

In [24]:
# Run the grid search: 216 parameter combinations x 6 CV folds, 1000 trees per fit -- expect a long run.
gs.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x11a4178d0>,
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_es...
                                             oob_score=False, random_state=999,
                                             verbose=0, warm_start=False

In [25]:
# Show the parameter combination with the best cross-validated score from the full grid;
# these are the values to use when narrowing the grid for tuning.
print(gs.best_params_)

{'bootstrap': False, 'max_depth': 8, 'max_features': 0.1, 'min_samples_split': 0.001, 'min_weight_fraction_leaf': 0.0001}


In [26]:
# The search scores with 'neg_mean_absolute_error', so negate best_score_ to report a positive MAE.
best_mae = round(-gs.best_score_, 2)
print('The best mean absolute error is', best_mae)

The best mean absolute error is 673.81


In [27]:
# Save the best estimator found by the grid search to disk (the models/ directory must already exist).
joblib.dump(gs.best_estimator_, 'models/bone_ASV_hum.pkl')

['models/bone_ASV_hum.pkl']

In [28]:
# Reload the saved humidity model from disk and display its configuration.
bone_ASV_hum = joblib.load('models/bone_ASV_hum.pkl')
bone_ASV_hum

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                      max_depth=8, max_features=0.1, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features for ASV humidity model

In [30]:
# Feature importances of the fitted forest, their spread across the individual
# trees, and the feature positions ordered from most to least important.
importances = bone_ASV_hum.feature_importances_
std = np.std([tree.feature_importances_ for tree in bone_ASV_hum.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Export the feature metadata table of the experiment.
feature_metadata = exp_ASV_hum.feature_metadata
np.savetxt("importances/bone_ASV_hum_features.csv", feature_metadata, delimiter=",", fmt='%s')

# Collect (feature id, importance) in ranked order and print the top entries.
feature_ids = exp_ASV_hum.feature_metadata.index.values
n_top = 25
general_importances = []
print("Feature:\t\t\t\tImportance:")
for rank, i in enumerate(indices, start=1):
    general_importances.extend((feature_ids[i], importances[i]))
    if rank <= n_top:
        print(str(rank)+". "+str(feature_ids[i])+"\t"+str(importances[i]))

# Single-column layout: rows alternate feature id, importance. reshape(-1, 1) infers
# the row count, so the cell no longer depends on a hard-coded 2 * n_features.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

np.savetxt("importances/bone_ASV_hum_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. accumulated_percent_humidity_days_base0	0.10477181189981431
2. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATTGGTCAGTTAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTTTAATACTGCCGGTCTAGAGTTCGAGAGAGGTGAGTGGAATTCCGAG	0.03547648392613599
3. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATCGATCAGTTAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTCTAATACTGTCGATCTCGAGTTCGAGAGAGGTGAGTGGAATTCCGAG	0.03539385746620913
4. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATCGATCAGTTAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTCTAATACTGTCGATCTCGAGTTCGAGAGAGGTGAGTGGAATTCCGAG	0.03464028381333024
5. TACGTAGGGGGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGGGTACGTAGGCGGTTTAGAAAGTCAGATGTTAAAGCGTGAGGCTCAACCTCATAAAGCATTTGAAACTTCTAGACTTGAGTAGTGGAGAGGAAAGTGGAATTCCTAGT	0.031104233266285992
6. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATATTTAAGTCAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTTTGATACTGGGTATCTGGAGTATGGAAGAGGTAAGTGGAATTCCGAG	0.025474846843413192
7. TACGTAGGGTGCAAGCGTTAATCGGAATTACTGGGCGT

## Run the ASV model with season and precipitation

In [31]:
# Experiment variant with season and accumulated precipitation added to the feature matrix.
exp_ASV_season_precip = exp_ASV.add_sample_metadata_as_features(['season', 'accumulated_precipitation_inches_base0'])

In [32]:
# Grouped 6-fold CV: all samples sharing a host_subject_id stay on the same side
# of every split, so each fold leaves out one body.
gkf = GroupKFold(6)

# Feature matrix (ASVs + season + precipitation) and regression target (ADD_0 cast to float).
X = exp_ASV_season_precip.data
y = exp_ASV_season_precip.sample_metadata['ADD_0'].astype(float)

groups = exp_ASV_season_precip.sample_metadata['host_subject_id']

# Full search grid used for hyperparameter tuning. After a first run, the values
# printed from gs.best_params_ can be plugged into a single-candidate grid like the
# commented-out example below to refit only that configuration.
param_grid = {"max_depth": [4, 8, 16, None],
              "max_features": ['sqrt', 'log2', 0.1],
              "min_samples_split": [0.001, 0.01, 0.1],
              "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
              "bootstrap": [True, False]}

#param_grid = {"max_depth": [8],
#          "max_features": [0.1],
#          "min_samples_split": [0.001],
#          "min_weight_fraction_leaf": [0.0001],
#          "bootstrap": [False]}

rf = RandomForestRegressor(n_estimators=1000, random_state=999, criterion='mae')

# GroupKFold.split() returns a one-shot generator. Materialize the folds so `gs`
# can be fitted more than once (e.g. re-running only the fit cell) without being
# handed an exhausted iterator; the folds themselves are unchanged.
cv_splits = list(gkf.split(X, y, groups))
gs = GridSearchCV(rf, param_grid=param_grid, cv=cv_splits, scoring='neg_mean_absolute_error', n_jobs=1)

In [33]:
# Run the grid search: 216 parameter combinations x 6 CV folds, 1000 trees per fit -- expect a long run.
gs.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x119f292d0>,
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_es...
                                             oob_score=False, random_state=999,
                                             verbose=0, warm_start=False

In [34]:
# Show the parameter combination with the best cross-validated score from the full grid;
# these are the values to use when narrowing the grid for tuning.
print(gs.best_params_)

{'bootstrap': False, 'max_depth': 16, 'max_features': 0.1, 'min_samples_split': 0.001, 'min_weight_fraction_leaf': 0.0001}


In [35]:
# The search scores with 'neg_mean_absolute_error', so negate best_score_ to report a positive MAE.
best_mae = round(-gs.best_score_, 2)
print('The best mean absolute error is', best_mae)

The best mean absolute error is 688.03


In [36]:
# Save the best estimator found by the grid search to disk (the models/ directory must already exist).
joblib.dump(gs.best_estimator_, 'models/bone_ASV_season_precip.pkl')

['models/bone_ASV_season_precip.pkl']

In [37]:
# Reload the saved season + precipitation model from disk and display its configuration.
bone_ASV_season_precip = joblib.load('models/bone_ASV_season_precip.pkl')
bone_ASV_season_precip

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                      max_depth=16, max_features=0.1, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features for ASV season and precipitation model

In [39]:
# Feature importances of the fitted forest, their spread across the individual
# trees, and the feature positions ordered from most to least important.
importances = bone_ASV_season_precip.feature_importances_
std = np.std([tree.feature_importances_ for tree in bone_ASV_season_precip.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Export the feature metadata table of the experiment.
feature_metadata = exp_ASV_season_precip.feature_metadata
np.savetxt("importances/bone_ASV_season_precip_features.csv", feature_metadata, delimiter=",", fmt='%s')

# Collect (feature id, importance) in ranked order and print the top entries.
feature_ids = exp_ASV_season_precip.feature_metadata.index.values
n_top = 25
general_importances = []
print("Feature:\t\t\t\tImportance:")
for rank, i in enumerate(indices, start=1):
    general_importances.extend((feature_ids[i], importances[i]))
    if rank <= n_top:
        print(str(rank)+". "+str(feature_ids[i])+"\t"+str(importances[i]))

# Single-column layout: rows alternate feature id, importance. reshape(-1, 1) infers
# the row count, so the cell no longer depends on a hard-coded 2 * n_features.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

np.savetxt("importances/bone_ASV_season_precip_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. accumulated_precipitation_inches_base0	0.08698606513717953
2. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATTGGTCAGTTAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTTTAATACTGCCGGTCTAGAGTTCGAGAGAGGTGAGTGGAATTCCGAG	0.03640905009361764
3. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATCGATCAGTTAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTCTAATACTGTCGATCTCGAGTTCGAGAGAGGTGAGTGGAATTCCGAG	0.03589650546038424
4. TACGTAGGGGGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGGGTACGTAGGCGGTTTAGAAAGTCAGATGTTAAAGCGTGAGGCTCAACCTCATAAAGCATTTGAAACTTCTAGACTTGAGTAGTGGAGAGGAAAGTGGAATTCCTAGT	0.032089408195751394
5. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATCGATCAGTTAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTCTAATACTGTCGATCTCGAGTTCGAGAGAGGTGAGTGGAATTCCGAG	0.026090360273687118
6. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATATTTAAGTCAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTTTGATACTGGGTATCTGGAGTATGGAAGAGGTAAGTGGAATTCCGAG	0.021301211727290808
7. TACGTAGGGTGCAAGCGTTAATCGGAATTACTGGGCGT

## Run the ASV model with season and humidity

In [40]:
# Experiment variant with season and accumulated humidity added to the feature matrix.
exp_ASV_season_hum = exp_ASV.add_sample_metadata_as_features(['season', 'accumulated_percent_humidity_days_base0'])

In [41]:
# Grouped 6-fold CV: all samples sharing a host_subject_id stay on the same side
# of every split, so each fold leaves out one body.
gkf = GroupKFold(6)

# Feature matrix (ASVs + season + humidity) and regression target (ADD_0 cast to float).
X = exp_ASV_season_hum.data
y = exp_ASV_season_hum.sample_metadata['ADD_0'].astype(float)

groups = exp_ASV_season_hum.sample_metadata['host_subject_id']

# Full search grid used for hyperparameter tuning. After a first run, the values
# printed from gs.best_params_ can be plugged into a single-candidate grid like the
# commented-out example below to refit only that configuration.
param_grid = {"max_depth": [4, 8, 16, None],
              "max_features": ['sqrt', 'log2', 0.1],
              "min_samples_split": [0.001, 0.01, 0.1],
              "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
              "bootstrap": [True, False]}

#param_grid = {"max_depth": [8],
#          "max_features": [0.1],
#          "min_samples_split": [0.001],
#          "min_weight_fraction_leaf": [0.0001],
#          "bootstrap": [False]}

rf = RandomForestRegressor(n_estimators=1000, random_state=999, criterion='mae')

# GroupKFold.split() returns a one-shot generator. Materialize the folds so `gs`
# can be fitted more than once (e.g. re-running only the fit cell) without being
# handed an exhausted iterator; the folds themselves are unchanged.
cv_splits = list(gkf.split(X, y, groups))
gs = GridSearchCV(rf, param_grid=param_grid, cv=cv_splits, scoring='neg_mean_absolute_error', n_jobs=1)

In [42]:
# Run the grid search: 216 parameter combinations x 6 CV folds, 1000 trees per fit -- expect a long run.
gs.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x119f29f50>,
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_es...
                                             oob_score=False, random_state=999,
                                             verbose=0, warm_start=False

In [43]:
# Show the parameter combination with the best cross-validated score from the full grid;
# these are the values to use when narrowing the grid for tuning.
print(gs.best_params_)

{'bootstrap': False, 'max_depth': 16, 'max_features': 0.1, 'min_samples_split': 0.1, 'min_weight_fraction_leaf': 0.0001}


In [44]:
# The search scores with 'neg_mean_absolute_error', so negate best_score_ to report a positive MAE.
best_mae = round(-gs.best_score_, 2)
print('The best mean absolute error is', best_mae)

The best mean absolute error is 670.33


In [45]:
# Save the best estimator found by the grid search to disk (the models/ directory must already exist).
joblib.dump(gs.best_estimator_, 'models/bone_ASV_season_hum.pkl')

['models/bone_ASV_season_hum.pkl']

In [46]:
# Reload the saved season + humidity model from disk and display its configuration.
bone_ASV_season_hum = joblib.load('models/bone_ASV_season_hum.pkl')
bone_ASV_season_hum

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                      max_depth=16, max_features=0.1, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.1, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features for ASV season and humidity model

In [49]:
# Feature importances of the fitted forest, their spread across the individual
# trees, and the feature positions ordered from most to least important.
importances = bone_ASV_season_hum.feature_importances_
std = np.std([tree.feature_importances_ for tree in bone_ASV_season_hum.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Export the feature metadata table of the experiment.
feature_metadata = exp_ASV_season_hum.feature_metadata
np.savetxt("importances/bone_ASV_season_hum_features.csv", feature_metadata, delimiter=",", fmt='%s')

# Collect (feature id, importance) in ranked order and print the top entries.
feature_ids = exp_ASV_season_hum.feature_metadata.index.values
n_top = 25
general_importances = []
print("Feature:\t\t\t\tImportance:")
for rank, i in enumerate(indices, start=1):
    general_importances.extend((feature_ids[i], importances[i]))
    if rank <= n_top:
        print(str(rank)+". "+str(feature_ids[i])+"\t"+str(importances[i]))

# Single-column layout: rows alternate feature id, importance. reshape(-1, 1) infers
# the row count, so the cell no longer depends on a hard-coded 2 * n_features.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

np.savetxt("importances/bone_ASV_season_hum_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. accumulated_percent_humidity_days_base0	0.11216395343735554
2. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATTGGTCAGTTAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTTTAATACTGCCGGTCTAGAGTTCGAGAGAGGTGAGTGGAATTCCGAG	0.0395010430874544
3. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATCGATCAGTTAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTCTAATACTGTCGATCTCGAGTTCGAGAGAGGTGAGTGGAATTCCGAG	0.03908837797598783
4. TACGTAGGGGGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGGGTACGTAGGCGGTTTAGAAAGTCAGATGTTAAAGCGTGAGGCTCAACCTCATAAAGCATTTGAAACTTCTAGACTTGAGTAGTGGAGAGGAAAGTGGAATTCCTAGT	0.035560352357163344
5. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATCGATCAGTTAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTCTAATACTGTCGATCTCGAGTTCGAGAGAGGTGAGTGGAATTCCGAG	0.028189149775976495
6. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATATTTAAGTCAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTTTGATACTGGGTATCTGGAGTATGGAAGAGGTAAGTGGAATTCCGAG	0.023983240167204198
7. TACGTAGGGTGCAAGCGTTAATCGGAATTACTGGGCGT

## Run the ASV model with precipitation and humidity

In [50]:
# Add the accumulated precipitation and accumulated humidity sample-metadata fields
# to the ASV table as model features; the result is stored under a new name.
exp_ASV_precip_hum = exp_ASV.add_sample_metadata_as_features(['accumulated_precipitation_inches_base0', 'accumulated_percent_humidity_days_base0'])


In [51]:
# GroupKFold with 6 folds, grouped by host_subject_id: all samples from one body
# stay in the same fold, so a body is never in both the training and the
# validation split of a given fold.
gkf = GroupKFold(6)

# Feature matrix (ASVs + precipitation + humidity) and the regression target ADD_0 as float.
X = exp_ASV_precip_hum.data
y = exp_ASV_precip_hum.sample_metadata['ADD_0'].astype(float)

groups = exp_ASV_precip_hum.sample_metadata['host_subject_id']

# used to test the param grid for parameter tuning
# use the output of this estimator below for input into new estimator (not commented out)
param_grid = {"max_depth": [4, 8, 16, None],
              "max_features": ['sqrt', 'log2', 0.1],
              "min_samples_split": [0.001, 0.01, 0.1],
              "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
              "bootstrap": [True, False]}

# Single-point grid holding the best_params_ printed for this model by the full
# search above; uncomment (and comment out the full grid) to refit without searching.
#param_grid = {"max_depth": [16],
#          "max_features": [0.1],
#          "min_samples_split": [0.001],
#          "min_weight_fraction_leaf": [0.0001],
#          "bootstrap": [False]}

rf = RandomForestRegressor(n_estimators=1000, random_state=999, criterion='mae')
# gkf.split() returns a one-shot generator; materialise it as a list so that `gs`
# can be fit more than once (e.g. when the fit cell is re-run) with the same splits.
cv_splits = list(gkf.split(X, y, groups))
gs = GridSearchCV(rf, param_grid=param_grid, cv=cv_splits, scoring='neg_mean_absolute_error', n_jobs=1)

In [52]:
# Run the grid search over param_grid. `groups` is not passed here because the
# group-aware train/validation splits were already built from gkf when `gs` was created.
gs.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x104309650>,
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_es...
                                             oob_score=False, random_state=999,
                                             verbose=0, warm_start=False

In [53]:
# Show the hyperparameter combination that the grid search scored best.
# This is only informative when `gs` was fit with the full exploratory param grid;
# the printed values are what the tuned (single-point) grid should then be set to.
print(gs.best_params_)

{'bootstrap': False, 'max_depth': 16, 'max_features': 0.1, 'min_samples_split': 0.001, 'min_weight_fraction_leaf': 0.0001}


In [54]:
# The search was set up with scoring='neg_mean_absolute_error', so best_score_ is
# negated to print the error as a positive number, rounded to 2 decimals.
print('The best mean absolute error is', round(-gs.best_score_,2))

The best mean absolute error is 582.63


In [55]:
# Persist the best estimator found by the grid search so it can be reloaded
# later without re-running the search.
joblib.dump(gs.best_estimator_, 'models/bone_ASV_precip_hum.pkl')

['models/bone_ASV_precip_hum.pkl']

In [56]:
# Reload the saved precipitation + humidity model from disk and display its configuration.
# NOTE: joblib.load unpickles the file, so only load model files you created or trust.
bone_ASV_precip_hum = joblib.load('models/bone_ASV_precip_hum.pkl')
bone_ASV_precip_hum

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                      max_depth=16, max_features=0.1, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features for ASV precipitation and humidity model

In [58]:
# Feature importances of the fitted forest, plus their spread across the individual
# trees (std is computed here but not used further in this cell).
importances = bone_ASV_precip_hum.feature_importances_
std = np.std([tree.feature_importances_ for tree in bone_ASV_precip_hum.estimators_],
             axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Save the feature metadata table of the experiment the model was trained on.
feature_metadata = exp_ASV_precip_hum.feature_metadata
np.savetxt("importances/bone_ASV_precip_hum_features.csv", feature_metadata, delimiter=",", fmt='%s')

# The model and the experiment must describe the same features; this replaces the
# implicit check that the former hard-coded reshape size provided.
feature_ids = exp_ASV_precip_hum.feature_metadata.index.values
assert len(feature_ids) == len(importances), "model and experiment feature counts differ"

#print the important ids in order
general_importances = []
print("Feature:\t\t\t\tImportance:")
for count, i in enumerate(indices):
    # extends the flat list with two entries per feature: id, then importance
    general_importances += (feature_ids[i], importances[i])
    if count < 25:
        print(str(count+1)+". "+str(feature_ids[i])+"\t"+str(importances[i]))

# One value per row, alternating feature id and importance (same layout as before);
# -1 lets numpy infer the row count instead of hard-coding 2 * n_features.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

np.savetxt("importances/bone_ASV_precip_hum_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. accumulated_percent_humidity_days_base0	0.10644753326490276
2. accumulated_precipitation_inches_base0	0.08665101699121121
3. TACGTAGGGGGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGGGTACGTAGGCGGTTTAGAAAGTCAGATGTTAAAGCGTGAGGCTCAACCTCATAAAGCATTTGAAACTTCTAGACTTGAGTAGTGGAGAGGAAAGTGGAATTCCTAGT	0.029249518550170404
4. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATATTTAAGTCAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTTTGATACTGGGTATCTGGAGTATGGAAGAGGTAAGTGGAATTCCGAG	0.02912662014476594
5. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATTGGTCAGTTAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTTTAATACTGCCGGTCTAGAGTTCGAGAGAGGTGAGTGGAATTCCGAG	0.02872784810268148
6. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATCGATCAGTTAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTCTAATACTGTCGATCTCGAGTTCGAGAGAGGTGAGTGGAATTCCGAG	0.025771318836747895
7. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATCGATCAGTTAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTCTAATACTGTCGATCTCGAGTTCGAGAGAGGTGAGTGGAATTCCGAG

## Run the ASV model with season, precipitation, and humidity

In [59]:
# Add the season, accumulated precipitation and accumulated humidity sample-metadata
# fields to the ASV table as model features; the result is stored under a new name.
exp_ASV_season_precip_hum = exp_ASV.add_sample_metadata_as_features(['season', 'accumulated_precipitation_inches_base0', 'accumulated_percent_humidity_days_base0'])


In [60]:
# GroupKFold with 6 folds, grouped by host_subject_id: all samples from one body
# stay in the same fold, so a body is never in both the training and the
# validation split of a given fold.
gkf = GroupKFold(6)

# Feature matrix (ASVs + season + precipitation + humidity) and the regression target ADD_0 as float.
X = exp_ASV_season_precip_hum.data
y = exp_ASV_season_precip_hum.sample_metadata['ADD_0'].astype(float)

groups = exp_ASV_season_precip_hum.sample_metadata['host_subject_id']

# used to test the param grid for parameter tuning
# use the output of this estimator below for input into new estimator (not commented out)
param_grid = {"max_depth": [4, 8, 16, None],
              "max_features": ['sqrt', 'log2', 0.1],
              "min_samples_split": [0.001, 0.01, 0.1],
              "min_weight_fraction_leaf": [0.0001, 0.001, 0.01],
              "bootstrap": [True, False]}

# Single-point grid for refitting with already-tuned parameters;
# uncomment (and comment out the full grid) to refit without searching.
#param_grid = {"max_depth": [8],
#          "max_features": [0.1],
#          "min_samples_split": [0.001],
#          "min_weight_fraction_leaf": [0.0001],
#          "bootstrap": [False]}

rf = RandomForestRegressor(n_estimators=1000, random_state=999, criterion='mae')
# gkf.split() returns a one-shot generator; materialise it as a list so that `gs`
# can be fit more than once (e.g. when the fit cell is re-run) with the same splits.
cv_splits = list(gkf.split(X, y, groups))
gs = GridSearchCV(rf, param_grid=param_grid, cv=cv_splits, scoring='neg_mean_absolute_error', n_jobs=1)

In [61]:
# Run the grid search over param_grid. `groups` is not passed here because the
# group-aware train/validation splits were already built from gkf when `gs` was created.
gs.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x1043092d0>,
             error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mae', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_es...
                                             oob_score=False, random_state=999,
                                             verbose=0, warm_start=False

In [62]:
# Show the hyperparameter combination that the grid search scored best.
# This is only informative when `gs` was fit with the full exploratory param grid;
# the printed values are what the tuned (single-point) grid should then be set to.
print(gs.best_params_)

{'bootstrap': False, 'max_depth': 8, 'max_features': 0.1, 'min_samples_split': 0.001, 'min_weight_fraction_leaf': 0.0001}


In [63]:
# The search was set up with scoring='neg_mean_absolute_error', so best_score_ is
# negated to print the error as a positive number, rounded to 2 decimals.
print('The best mean absolute error is', round(-gs.best_score_,2))

The best mean absolute error is 583.48


In [64]:
# Persist the best estimator found by the grid search so it can be reloaded
# later without re-running the search.
joblib.dump(gs.best_estimator_, 'models/bone_ASV_season_precip_hum.pkl')

['models/bone_ASV_season_precip_hum.pkl']

In [65]:
# Reload the saved season + precipitation + humidity model from disk and display its configuration.
# NOTE: joblib.load unpickles the file, so only load model files you created or trust.
bone_ASV_season_precip_hum = joblib.load('models/bone_ASV_season_precip_hum.pkl')
bone_ASV_season_precip_hum

RandomForestRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mae',
                      max_depth=8, max_features=0.1, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=0.001, min_weight_fraction_leaf=0.0001,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=999, verbose=0, warm_start=False)

## Determine important features for ASV season, precipitation, humidity model

In [67]:
# Feature importances of the fitted forest, plus their spread across the individual
# trees (std is computed here but not used further in this cell).
importances = bone_ASV_season_precip_hum.feature_importances_
std = np.std([tree.feature_importances_ for tree in bone_ASV_season_precip_hum.estimators_],
             axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Save the feature metadata table of the experiment the model was trained on.
feature_metadata = exp_ASV_season_precip_hum.feature_metadata
np.savetxt("importances/bone_ASV_season_precip_hum_features.csv", feature_metadata, delimiter=",", fmt='%s')

# The model and the experiment must describe the same features; this replaces the
# implicit check that the former hard-coded reshape size provided.
feature_ids = exp_ASV_season_precip_hum.feature_metadata.index.values
assert len(feature_ids) == len(importances), "model and experiment feature counts differ"

#print the important ids in order
general_importances = []
print("Feature:\t\t\t\tImportance:")
for count, i in enumerate(indices):
    # extends the flat list with two entries per feature: id, then importance
    general_importances += (feature_ids[i], importances[i])
    if count < 25:
        print(str(count+1)+". "+str(feature_ids[i])+"\t"+str(importances[i]))

# One value per row, alternating feature id and importance (same layout as before);
# -1 lets numpy infer the row count instead of hard-coding 2 * n_features.
general_importances_df = pd.DataFrame(np.array(general_importances).reshape(-1, 1))

np.savetxt("importances/bone_ASV_season_precip_hum_importances.csv", general_importances_df, delimiter=",", fmt='%s')

Feature:				Importance:
1. accumulated_percent_humidity_days_base0	0.10836240458683657
2. accumulated_precipitation_inches_base0	0.08394406484883161
3. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATTGGTCAGTTAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTTTAATACTGCCGGTCTAGAGTTCGAGAGAGGTGAGTGGAATTCCGAG	0.038227264067744994
4. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATCGATCAGTTAGGGGTGAAATCCCAGGGCTCAACCCTGGAACTGCCTCTAATACTGTCGATCTCGAGTTCGAGAGAGGTGAGTGGAATTCCGAG	0.034057543148500555
5. TACGTAGGGGGCGAGCGTTGTCCGGAATTATTGGGCGTAAAGGGTACGTAGGCGGTTTAGAAAGTCAGATGTTAAAGCGTGAGGCTCAACCTCATAAAGCATTTGAAACTTCTAGACTTGAGTAGTGGAGAGGAAAGTGGAATTCCTAGT	0.02717547784154946
6. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATCGATCAGTTAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTCTAATACTGTCGATCTCGAGTTCGAGAGAGGTGAGTGGAATTCCGAG	0.02512692653479867
7. TACGAAGGGGGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGATATTTAAGTCAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTGCCTTTGATACTGGGTATCTGGAGTATGGAAGAGGTAAGTGGAATTCCGAG