In [2]:
import numpy as np
import pandas as pd
import sklearn as skl
import statsmodels
import statsmodels.api as sm

# Record the library versions this notebook was executed with.
# Single-argument print(...) behaves identically under Python 2 and Python 3.
print('numpy = %s' % np.__version__)
print('pandas = %s' % pd.__version__)
print('scikit-learn = %s' % skl.__version__)
print('statsmodels = %s' % statsmodels.__version__)

numpy = 1.10.2
pandas = 0.22.0
scikit-earn = 0.19.1
statsmodels = 0.8.0


  from pandas.core import datetools


In [3]:
# list the model output files that are read in further down
!ls -lh ../generate_data/output_data/O100_cv0_10991483/

total 1392
-rw-r--r--  1 firasmidani  staff   1.8K Feb 16 13:16 auc_scores.txt
-rw-r--r--  1 firasmidani  staff   456K Feb 16 13:16 coefficients.txt
-rw-r--r--  1 firasmidani  staff   211K Feb 16 13:16 decision_scores.txt
-rw-r--r--  1 firasmidani  staff    23K Feb 16 13:16 features.txt


In [4]:
def txt_to_df(filename,header=0):
    """Read a tab-delimited text file into a pandas.DataFrame.

    filename : path (or file-like object) handed to pandas.read_csv
    header   : row number(s) to use as column labels (default: first row)

    The first column of the file is used as the index.
    """
    read_options = dict(sep='\t', header=header, index_col=0)
    table = pd.read_csv(filename, **read_options)
    return table

def getSummaryStatistics(otu,biom_df,labels_df):
    """Summarize abundance and prevalence of one OTU, overall and per cohort.

    otu       : column label of the OTU in biom_df
    biom_df   : abundance table whose rows are subjects and columns are OTUs
    labels_df : table indexed by subject with a `color` column; rows with
                color == 1 and color == 0 define the two cohorts

    Returns a one-column DataFrame (column label = otu) with the rows
    mean, mean_0, mean_1, mlr, prev, prev_0, prev_1, plr.  The three
    `prev*` rows hold the fraction of subjects in which the OTU is present
    (abundance > 0), not the raw counts.
    """
    pseudo = 1e-6  # pseudocount guarding the log ratio of means

    def count_present(values):
        # number of subjects with a non-zero abundance
        return np.sum(values > 0)

    # subject IDs of each cohort
    cohort_1 = labels_df[labels_df.color==1].index
    cohort_0 = labels_df[labels_df.color==0].index
    size_1 = len(cohort_1)
    size_0 = len(cohort_0)

    # abundance of this OTU: all subjects, then each cohort
    abund_all = biom_df.loc[:,otu]
    abund_0 = biom_df.loc[cohort_0,otu]
    abund_1 = biom_df.loc[cohort_1,otu]

    # mean relative abundance and its log10 ratio (cohort 1 over cohort 0)
    avg = abund_all.mean()
    avg_0 = abund_0.mean()
    avg_1 = abund_1.mean()
    mlr = np.log10((avg_1+pseudo)/(avg_0+pseudo))

    # prevalence as a count of subjects ...
    hits = count_present(abund_all)
    hits_0 = count_present(abund_0)
    hits_1 = count_present(abund_1)

    # ... and as a fraction of subjects
    frac = float(hits)/biom_df.shape[0]
    frac_0 = float(hits_0)/size_0
    frac_1 = float(hits_1)/size_1

    # log10 ratio of prevalence, with a pseudocount of one subject per cohort
    plr = np.log10((float(hits_1+1)/size_1)/(float(hits_0+1)/size_0))

    # assemble the single-column result
    row_labels = ['mean','mean_0','mean_1','mlr',
                  'prev','prev_0','prev_1','plr']
    summary = pd.DataFrame(index = row_labels, columns = [otu])
    summary.loc[:,otu] = [avg,avg_0,avg_1,mlr,frac,frac_0,frac_1,plr]

    return summary

def normalize(df):
    """Standardize every column of a DataFrame with sklearn's StandardScaler.

    df : pandas.DataFrame of numeric values (rows are observations)

    Returns a new DataFrame with the same index and column labels holding
    the scaled values; the input frame is not modified.
    """
    # Function-scope import, used directly: the function no longer relies on
    # the module-level `skl` alias nor on `sklearn.preprocessing` having been
    # loaded as a side effect of this import.
    from sklearn.preprocessing import StandardScaler

    scaled_values = StandardScaler().fit_transform(df)

    return pd.DataFrame(data=scaled_values,index=df.index,columns=df.columns)

def log_transform(ii, pseudocount=1e-6):
    """Return log10(ii + pseudocount).

    ii          : scalar or array-like of non-negative values
    pseudocount : small offset added before the log so zeros stay finite
                  (default 1e-6)

    The input is never modified: the sum is computed into a new object
    instead of using an in-place `+=`, which would alter a caller's numpy
    array and fails outright on integer arrays.
    """
    return np.log10(ii + pseudocount)

# Import data

In [5]:
# --- microbiota model results: ranked features, AUC scores, coefficients ---
parent_path = '../generate_data/output_data/O100_cv0_10991483/'

features_df = txt_to_df('%s/features.txt' % parent_path)
aucs_df = txt_to_df('%s/auc_scores.txt' % parent_path,[0,1])  # two header rows
coef_df = txt_to_df('%s/coefficients.txt' % parent_path)

# report the dimensions of each model table, then a blank separator line
print(features_df.shape)
print(aucs_df.shape)
print(coef_df.shape)
print('')

# --- OTU abundance table and outcome labels ---
parent_path = '../generate_data/input_data/onset/'
biom_df = txt_to_df('%s/features.S.no.vc.infected.day.2.txt' % parent_path)
labels_df = txt_to_df('%s/outcomes.infected.batch.1.day.2.txt' % parent_path)

print(biom_df.shape)
print(labels_df.shape)

(226, 1)
(226, 1)
(226, 226)

(82, 551)
(58, 1)


# Clean up data

In [6]:
## keep the features ranked 1-16 (index labels 1..16 of features_df)
## and re-index them by feature (OTU) name
otus_df = features_df.loc[list(range(1,17))]
otus_df = otus_df.reset_index().set_index('Feature')

## coefficients of the '16_f' model: keep only rows with a non-zero
## coefficient and sort them in ascending order
coef_100 = pd.DataFrame(coef_df.loc[:,'16_f'])
coef_100 = coef_100[coef_100.any(axis=1)]
coef_100 = coef_100.sort_values(['16_f'])

## subset the abundance table to the selected OTUs
biom_df = biom_df.loc[:,otus_df.index]
print(biom_df.shape)

(82, 16)


# add OTU model coefficients

In [7]:
# one row per selected OTU: its rank, model coefficient and |coefficient|
summary_df = pd.DataFrame(index=otus_df.index,
                          columns=['rank','coef','abs_coef'])

for otu in summary_df.index:
    # coefficient of this OTU in the '16_f' model, looked up once
    otu_coef = coef_100.loc[otu,'16_f']

    summary_df.loc[otu,'rank'] = otus_df.loc[otu,'Rank']
    summary_df.loc[otu,'coef'] = otu_coef
    summary_df.loc[otu,'abs_coef'] = np.abs(otu_coef)

print(summary_df.shape)

(16, 3)


# add OTU summary statistics

In [8]:
# one row per selected OTU, one column per summary statistic
stats_df = pd.DataFrame(index=otus_df.index,
                        columns=['mean','mean_0','mean_1','mlr',
                                 'prev','prev_0','prev_1','plr'])

for otu in stats_df.index:
    # single-column frame whose row labels are the statistic names
    df = getSummaryStatistics(otu,biom_df,labels_df)
    # write the statistics into this OTU's row, matched by statistic name
    stats_df.loc[otu,df.index] = df.values.ravel()

print(stats_df.shape)

(16, 8)


# delineate OTU taxonomy

In [9]:
# taxonomic ranks encoded, in this order, in each ';'-separated OTU label
# (e.g. 'k__Bacteria;p__Firmicutes;...;g__Megasphaera;s__')
taxa_levels = ['kingdom','phylum','class','order',
               'family','genus','species']

taxa_df = pd.DataFrame(index=otus_df.index,
                       columns=taxa_levels)

# iterate over taxa_df's own index so this cell does not depend on any
# other summary table having been built first
for otu in taxa_df.index:

    # each field looks like '<prefix>__<name>'; keep the name, which is an
    # empty string for unassigned ranks
    taxa_df.loc[otu,taxa_levels] = [ii.split('__')[1] for ii in otu.split(';')]

print(taxa_df.shape)

(16, 7)


# compute odds ratio (per-OTU logistic regression coefficients, i.e. log-odds)

In [10]:
# per-OTU logistic regression results: coefficient, its confidence bounds
# and its p-value
odds_df = pd.DataFrame(index=otus_df.index,
                       columns=['i_coef','i_coef_l','i_coef_h',
                                'i_pvalue'])

dgmatrix = biom_df.loc[labels_df.index,otus_df.index] # design matrix

# log-transform every abundance (with a pseudocount), then standardize
# each OTU column
input_df = dgmatrix
input_df = input_df.applymap(log_transform)
input_df = normalize(input_df)

# fit one single-predictor logistic regression per OTU
for otu in dgmatrix.keys():
    print(otu)

    df = input_df.loc[:,otu]

    # a single column comes back as a Series; wrap it in a one-column frame
    if len(df.shape)==1:
        df = pd.DataFrame(df.values,index=df.index,columns=['abundance'])

    response = np.ravel(labels_df.loc[df.index,:].color)
    observed = np.ravel(df.values)

    # NOTE(review): no constant column is added to `observed`, so the model
    # is fit without an intercept -- confirm this is intended.
    results = sm.Logit(response,observed).fit()

    # confidence interval computed once per fit: row 0 is the only
    # parameter, columns are (lower, upper)
    conf_int = results.conf_int()

    odds_df.loc[otu,'i_coef'] = results.params[0]
    odds_df.loc[otu,'i_coef_l'] = conf_int[0][0]
    odds_df.loc[otu,'i_coef_h'] = conf_int[0][1]
    odds_df.loc[otu,'i_pvalue'] = results.pvalues[0]

print(odds_df.shape)

k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Bifidobacteriales;f__Bifidobacteriaceae;g__Bifidobacterium;s__bifidum
Optimization terminated successfully.
         Current function value: 0.578758
         Iterations 6
k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__Klebsiella;s__
Optimization terminated successfully.
         Current function value: 0.676617
         Iterations 5
k__Bacteria;p__Proteobacteria;c__Alphaproteobacteria;o__Rhodobacterales;f__Rhodobacteraceae;g__Paracoccus;s__aminovorans
Optimization terminated successfully.
         Current function value: 0.637159
         Iterations 6
k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__;g__;s__
Optimization terminated successfully.
         Current function value: 0.676718
         Iterations 4
k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Veillonellaceae;g__Megasphaera;s__
Optimization terminated successfully.
         Current functio

# join tables

In [11]:
# stitch the per-OTU tables together column-wise, one join at a time;
# all of them are indexed by OTU label
supp_df = summary_df.join(stats_df)
supp_df = supp_df.join(odds_df)
supp_df = supp_df.join(taxa_df)

# write the combined table as tab-separated text, keeping header and index
supp_df.to_csv('./tables/supp_table_4.txt',
               sep='\t',
               header=True,
               index=True)

In [12]:
# dimensions of the joined table (rows, columns)
supp_df.shape

(16, 22)