In [1]:
import numpy as np
import pandas as pd
import sklearn as skl
import statsmodels
import statsmodels.api as sm

# Report the versions of the libraries this notebook was run with.
# Single-argument, parenthesised print behaves the same on Python 2 and 3.
print('numpy = %s' % np.__version__)
print('pandas = %s' % pd.__version__)
print('scikit-learn = %s' % skl.__version__)
print('statsmodels = %s' % statsmodels.__version__)

numpy = 1.10.2
pandas = 0.22.0
scikit-earn = 0.19.1
statsmodels = 0.8.0


  from pandas.core import datetools


In [2]:
# List the files in this run's model-output directory (long format, human-readable sizes).
!ls -lh ../generate_data/output_data/S100_cv0_59024160/

total 5672
-rw-r--r--  1 firasmidani  staff   3.9K Feb 15 22:08 auc_scores.txt
-rw-r--r--  1 firasmidani  staff   2.2M Feb 15 22:08 coefficients.txt
-rw-r--r--  1 firasmidani  staff   563K Feb 15 22:08 decision_scores.txt
-rw-r--r--  1 firasmidani  staff    53K Feb 15 22:08 features.txt


In [3]:
def txt_to_df(filename,header=0):
    """Load a tab-separated text file into a pandas DataFrame.

    filename : path (or file-like object) handed to ``pd.read_csv``.
    header   : row number(s) holding the column labels; a list of row
               numbers yields multi-level columns. Defaults to the first row.

    The first column of the file becomes the index of the returned frame.
    """
    read_options = dict(sep='\t', header=header, index_col=0)
    return pd.read_csv(filename, **read_options)

def getSummaryStatistics(otu,biom_df,labels_df):
    """Summarise the abundance and prevalence of one OTU, overall and per group.

    otu       : column label of the OTU in ``biom_df``.
    biom_df   : DataFrame of abundances, one row per subject, one column per OTU.
    labels_df : DataFrame indexed by subject with a ``color`` column; rows with
                ``color == 0`` and ``color == 1`` define the two groups.

    Returns a single-column DataFrame (column named ``otu``) indexed by
    'mean', 'mean_0', 'mean_1', 'mlr', 'prev', 'prev_0', 'prev_1', 'plr'.
    The 'prev*' rows hold the fraction of subjects in which the OTU is
    present (abundance > 0), not the raw counts.
    """

    # subjects belonging to each group
    group_1_ids = labels_df[labels_df.color==1].index
    group_0_ids = labels_df[labels_df.color==0].index

    # abundance of this OTU in everyone and within each group
    abund_all = biom_df.loc[:,otu]
    abund_0 = biom_df.loc[group_0_ids,otu]
    abund_1 = biom_df.loc[group_1_ids,otu]

    # mean relative abundance
    avg_all = abund_all.mean()
    avg_0 = abund_0.mean()
    avg_1 = abund_1.mean()

    # log10 ratio of the group means, with a 1e-6 pseudocount added to both
    mean_log_ratio = np.log10((avg_1+1e-6)/(avg_0+1e-6))

    # prevalence as a number of subjects carrying the OTU
    n_all = np.sum(abund_all>0)
    n_0 = np.sum(abund_0>0)
    n_1 = np.sum(abund_1>0)

    # prevalence as a fraction of subjects
    frac_all = float(n_all)/biom_df.shape[0]
    frac_0 = float(n_0)/len(group_0_ids)
    frac_1 = float(n_1)/len(group_1_ids)

    # log10 ratio of prevalence, with one extra carrier added to each group
    smoothed_1 = float(n_1+1)/len(group_1_ids)
    smoothed_0 = float(n_0+1)/len(group_0_ids)
    prev_log_ratio = np.log10(smoothed_1/smoothed_0)

    # collect everything in a one-column pandas.DataFrame
    row_labels = ['mean','mean_0','mean_1','mlr',
                  'prev','prev_0','prev_1','plr']
    row_values = [avg_all,avg_0,avg_1,mean_log_ratio,
                  frac_all,frac_0,frac_1,prev_log_ratio]

    summary = pd.DataFrame(index=row_labels, columns=[otu])
    summary.loc[:,otu] = row_values

    return summary

def normalize(df):
    """Standardise every column of ``df`` with scikit-learn's StandardScaler.

    df : DataFrame of numeric values (rows are samples, columns are features).

    Returns a new DataFrame carrying the same index and columns as ``df``
    and holding the scaler's output (each column centred and scaled to
    unit variance).
    """
    # Imported locally and used directly, so the function no longer relies on
    # a module-level ``skl`` alias having ``preprocessing`` loaded on it.
    from sklearn.preprocessing import StandardScaler

    scaled = StandardScaler().fit_transform(df)

    return pd.DataFrame(data=scaled,index=df.index,columns=df.columns)

def log_transform(ii, pseudocount=1e-6):
    """Return ``log10(ii + pseudocount)`` without modifying ``ii``.

    ii          : scalar, numpy array or pandas object of abundances.
    pseudocount : value added before taking the logarithm so that zeros map
                  to a finite number; defaults to 1e-6.

    The sum is computed out of place: an in-place ``+=`` would alter an
    array/DataFrame argument in the caller and fails on integer arrays.
    """
    return np.log10(ii + pseudocount)

# Import data

In [4]:
# microbiota model results: feature table, AUC scores (two header rows)
# and model coefficients
parent_path = '../generate_data/output_data/S100_cv0_59024160/'

features_df = txt_to_df('%s/features.txt' % parent_path)
aucs_df = txt_to_df('%s/auc_scores.txt' % parent_path, header=[0,1])
coef_df = txt_to_df('%s/coefficients.txt' % parent_path)

for model_table in (features_df, aucs_df, coef_df):
    print(model_table.shape)
print('')

# OTU table and outcome labels
parent_path = '../generate_data/input_data/suscpetibility/'
biom_df = txt_to_df('%s/otus.ygbr.day.2.txt' % parent_path)
labels_df = txt_to_df('%s/outcomes.ygbr.day.2.txt' % parent_path)

for input_table in (biom_df, labels_df):
    print(input_table.shape)

(501, 1)
(501, 1)
(501, 501)

(76, 4181)
(76, 1)


# Clean up data

In [5]:
## grab top 100 OTUs (index labels 1..100 of features_df), then re-index
## them by feature name; the former index becomes an ordinary column
otus_df = features_df.loc[list(range(1,101))]
otus_df = otus_df.reset_index().set_index('Feature')

## grab top 100 OTU model coefficients, keeping only non-zero entries
coef_100 = pd.DataFrame(coef_df.loc[:,'100_f'])
coef_100 = coef_100[coef_100.any(axis=1)]  # keyword axis: positional form is rejected by pandas 2
coef_100 = coef_100.sort_values(['100_f'])

## subset biom table for top 100 OTUs
biom_df = biom_df.loc[:,otus_df.index]
print(biom_df.shape)

(76, 100)


# add OTU model coefficients

In [6]:
# one row per selected OTU: its model rank, its coefficient and the
# coefficient's absolute value
summary_columns = ['rank','coef','abs_coef']
summary_df = pd.DataFrame(index=otus_df.index, columns=summary_columns)

for otu in otus_df.index:

    otu_coef = coef_100.loc[otu,'100_f']

    summary_df.loc[otu,'rank'] = otus_df.loc[otu,'Rank']
    summary_df.loc[otu,'coef'] = otu_coef
    summary_df.loc[otu,'abs_coef'] = np.abs(otu_coef)

# add OTU summary statistics

In [7]:
# abundance / prevalence statistics, one row per selected OTU
stat_names = ['mean','mean_0','mean_1','mlr',
              'prev','prev_0','prev_1','plr']
stats_df = pd.DataFrame(index=otus_df.index, columns=stat_names)

for otu in otus_df.index:

    # single-column frame for this OTU, indexed by statistic name
    otu_stats = getSummaryStatistics(otu,biom_df,labels_df)

    # copy its values into the matching columns of this OTU's row
    stats_df.loc[otu,otu_stats.index] = np.ravel(otu_stats.values)

# delineate OTU taxonomy

In [8]:
# taxonomic levels, in the order the fields appear in an OTU name
taxa_levels = ['kingdom','phylum','class','order',
               'family','genus','species','otu_id']
taxa_df = pd.DataFrame(index=otus_df.index, columns=taxa_levels)

for otu in summary_df.index:

    # an OTU name is a ';'-separated list of '<prefix>__<name>' fields;
    # keep the part that follows the '__'
    level_names = []
    for field in otu.split(';'):
        level_names.append(field.split('__')[1])

    taxa_df.loc[otu,taxa_levels] = np.array(level_names, dtype=object)

# compute log odds ratios (per-OTU univariate logistic regression coefficients)

In [9]:
# Per-OTU univariate logistic regression of the outcome on abundance.
# For each OTU the table stores the fitted coefficient, the bounds of its
# confidence interval and its p-value.
odds_df = pd.DataFrame(index=otus_df.index,
                       columns=['i_coef','i_coef_l','i_coef_h',
                                'i_pvalue'])

dgmatrix = biom_df.loc[labels_df.index,otus_df.index] # design matrix

# log-transform (with pseudocount), then standardise every OTU column
input_df = dgmatrix.applymap(log_transform)
input_df = normalize(input_df)

# the outcome vector is identical for every OTU, so build it once
response = np.ravel(labels_df.loc[input_df.index,:].color)

for otu in dgmatrix.keys():
    print(otu)

    observed = np.ravel(input_df.loc[:,otu].values)

    # NOTE(review): sm.Logit does not add an intercept and none is supplied
    # here, so each model is fit through the origin -- confirm this is intended.
    results = sm.Logit(response,observed).fit()

    # compute the confidence interval once; row 0 belongs to the only predictor
    interval = results.conf_int()

    odds_df.loc[otu,'i_coef'] = results.params[0]
    odds_df.loc[otu,'i_coef_l'] = interval[0][0]
    odds_df.loc[otu,'i_coef_h'] = interval[0][1]
    odds_df.loc[otu,'i_pvalue'] = results.pvalues[0]

k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Corynebacteriaceae;g__Corynebacterium;s__;otu__282360
Optimization terminated successfully.
         Current function value: 0.671404
         Iterations 4
k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__;s__;otu__738351
Optimization terminated successfully.
         Current function value: 0.667231
         Iterations 5
k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Veillonellaceae;g__Dialister;s__;otu__174638
Optimization terminated successfully.
         Current function value: 0.675816
         Iterations 4
k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__Clostridium;s__;otu__69664
Optimization terminated successfully.
         Current function value: 0.661712
         Iterations 4
k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Clostridiaceae;g__Clostridium;s__;otu__97301
Optimization terminated successfully.
         Current fu

         Current function value: 0.650051
         Iterations 5
k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__uniformis;otu__328617
Optimization terminated successfully.
         Current function value: 0.678717
         Iterations 5
k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Ruminococcus;s__;otu__4375860
Optimization terminated successfully.
         Current function value: 0.689752
         Iterations 4
k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__;g__;s__;otu__308386
Optimization terminated successfully.
         Current function value: 0.672168
         Iterations 4
k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Oscillospira;s__;otu__4437359
Optimization terminated successfully.
         Current function value: 0.683813
         Iterations 4
k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__Prevotella;s__;otu__333042
O

# join tables

In [10]:
# join the per-OTU tables column-wise, starting from the model summary
supp_df = summary_df
for extra_table in (stats_df, odds_df, taxa_df):
    supp_df = supp_df.join(extra_table)

# write the combined table as tab-separated text, with header row and index
supp_df.to_csv('./tables/supp_table_2.txt', sep='\t', header=True, index=True)

In [11]:
# dimensions of the combined table: (rows, columns)
supp_df.shape

(100, 23)