In [None]:
from os import listdir,mkdir
from os.path import isfile, join, isdir,exists
import pandas as pd
import numpy as np
from scipy import stats
import re
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from myplots import roundup, rounddown, find_decimal_fold, percentile_cut_off, rarefaction_calc, rarefaction_plot,draw_correlation_scatter
from matplotlib.ticker import FormatStrFormatter
import cPickle as pickle
from Bio.SeqUtils import GC
import seaborn as sns
import random
from scipy.stats import pearsonr
from skbio.diversity.alpha import shannon, simpson, berger_parker_d

from pop_organize import get_sample_data, get_sample_with_dfs
from SufficientStatistics import *
from MyFunctionsShani import *
import math
from myplots import roundup, rounddown, find_decimal_fold
from skbio.stats.distance import mantel
from scipy.spatial.distance import braycurtis, pdist


In [None]:
# Today's date formatted as DDMMYYYY; shown as the cell output.
import time

cdate = time.strftime("%d%m%Y")
cdate

In [None]:
# Load the pickled 'real_result_df' table stored under MyPath and display it.
# NOTE(review): MyPath is not defined in the visible cells - presumably it is
# provided by one of the star imports at the top of the notebook; confirm.
file1 = ('%s/TCR_real_data/SubSampled15000data_rep2/TCR_mb_results/'
         'permFisherTest_NoneNone5050/real_result_df') % MyPath
df = pd.read_pickle(file1)
df

# Define general functions to calculate diversity correlations between a shared-sequence matrix and a microbiome matrix:

In [None]:
def reject_outliers(data, m):
    """Flag the values of ``data`` lying within ``m`` standard deviations of its mean.

    Despite the function's name, the returned boolean mask is True for the
    values to KEEP (|value - mean| < m * std) and False for the outliers.
    """
    center = np.mean(data)
    max_distance = m * np.std(data)
    return abs(data - center) < max_distance


def plot_corr_diversity(measure,ax,stdToReject,divDF_seqs,divDF_mb):
    """Scatter one diversity measure of the TCR table against the microbiome table.

    Draws the scatter, a least-squares line and an annotation with the Pearson
    correlation on ``ax``.

    Parameters
    ----------
    measure : str
        Column name present in both ``divDF_seqs`` and ``divDF_mb``.
    ax : matplotlib axes
        Axes to draw on.
    stdToReject : float or None
        If not None, a sample is dropped when its x or its y value is not
        within ``stdToReject`` standard deviations of the respective mean
        (computed after NaN removal). None keeps every non-NaN sample.
    divDF_seqs, divDF_mb : pandas.DataFrame
        Per-sample diversity tables. The two ``measure`` columns are paired
        element by element for the fit and the correlation, so both frames
        must list the samples in the same order (the head of each column is
        printed so this can be checked by eye).

    Returns
    -------
    tuple
        ``(nsamples, r, p)``: number of plotted samples, Pearson r and its
        p-value. ``r`` and ``p`` are NaN when fewer than two samples remain.
    """
    # Imported inside the function, as in the original cell, so the name cannot
    # be shadowed by the star imports at the top of the notebook.
    from scipy.stats import pearsonr

    x=divDF_seqs[measure]
    y=divDF_mb[measure]

    print('checking TCR and MB df order...')
    print(x.head())
    print(y.head())

    # clean data (1): drop every sample that is NaN in either table
    hasNan=np.isnan(x) | np.isnan(y)
    newx=x[~hasNan]
    newy=y[~hasNan]

    # clean data (2): drop outliers.
    # reject_outliers returns True for values INSIDE the m*std band, so a
    # sample is kept only when it is inside the band on both axes.
    if stdToReject is not None:
        keep=reject_outliers(newx, m=stdToReject) & reject_outliers(newy, m=stdToReject)
        finalx=newx[keep]
        finaly=newy[keep]
    else:
        finalx=newx
        finaly=newy

    nsamples=len(finalx)

    ax.scatter(finalx,finaly, alpha=0.4)
    ax.set_xlabel('TCR sequences')
    ax.set_ylabel('Microbiome Species')
    ax.set_title('%s' %measure,fontsize=16)

    if nsamples<2:
        # neither a line fit nor a correlation is defined for fewer than two points
        r=p=np.nan
    else:
        xs=np.unique(finalx)
        ax.plot(xs, np.poly1d(np.polyfit(finalx, finaly, 1))(xs),c='blue',linewidth=1)
        r,p = pearsonr(finalx,finaly)

    ax.annotate("r=%.4f p=%.6f,n=%s" %(r,p,nsamples),  xy=(0.02, 0.96), xycoords='axes fraction', fontsize=11,
        horizontalalignment='left', verticalalignment='top')

    return nsamples,r,p



In [None]:
def _clean_sample_name(sample):
    """Return ``sample`` cut at its first '_' and then at its first 'b'."""
    return sample.split('_')[0].split('b')[0]


def _to_binary(countsDF):
    """Return a 0/1 integer frame marking, column by column, the cells of ``countsDF`` that are > 0."""
    return countsDF.apply(lambda column: (column>0).astype(int))


def calc_corr_between_TCR_and_microbiome_preprocessing(TCRfile,microbiomeFile):
    """Clean the sample names of both tables and derive their presence/absence versions.

    Parameters
    ----------
    TCRfile, microbiomeFile : pandas.DataFrame
        Tables indexed by sample name. Both are modified IN PLACE: every index
        label is cut at its first '_' and then at its first 'b', and a column
        whose name equals one of the original sample names is renamed the
        same way.

    Returns
    -------
    tuple
        ``(TCRfile_binary, TCRfile, microbiomeFile_binary, microbiomeFile)``
        where the ``*_binary`` frames hold 1 for values > 0 and 0 otherwise
        and the other two are the (renamed) input objects themselves.
    """
    #(1)process sample names - one rename call per table instead of one per sample:
    for dfFile in [TCRfile,microbiomeFile]:
        print('processing sample names...')
        newNames=dict((sample,_clean_sample_name(sample)) for sample in dfFile.index)
        dfFile.rename(index=newNames,columns=newNames,inplace=True)

    #(2)transform RA files to binary:
    print('now converting TCR counts to binary indications...')
    TCRfile_binary=_to_binary(TCRfile)

    print('now converting mb counts to binary indications...')
    microbiomeFile_binary=_to_binary(microbiomeFile)

    return TCRfile_binary,TCRfile,microbiomeFile_binary,microbiomeFile


def _truncate_to_widely_shared(binaryDF,raDF,cutoff,label):
    """Keep, in both frames, only the columns present in more than ``cutoff`` samples of ``binaryDF``."""
    if cutoff is None:
        print('%s files doesnt need truncation...' %label)
        return binaryDF,raDF

    print('truncating %s File to include only sequences shared by more than %s' %(label,cutoff))
    print('number of sequences before truncation=%s' %len(binaryDF.columns.values))
    # column sums of a 0/1 table = number of samples containing each column
    nSamplesPerColumn=binaryDF.sum(axis=0)
    columnList=list(binaryDF.columns[(nSamplesPerColumn>cutoff).values])
    print('number of sequences after truncation=%s' %len(columnList))
    return binaryDF[columnList],raDF[columnList]


def _diversity_measures(binaryDF,raDF):
    """Return a per-sample table of diversity measures computed from a binary / RA pair of frames."""
    divDF=pd.DataFrame(index=binaryDF.index)
    for df,RA in ((binaryDF,'binary'),(raDF,'RA')):
        isRA=(RA=='RA')
        if isRA:
            # RA values are rounded to 5 decimals and scaled by 1e5 into
            # integer counts before being handed to the skbio measures
            df=df.round(5)*100000
            df=df.astype(int,errors='ignore')

        for sample in df.index:
            counts=df.loc[sample,:]
            divDF.loc[sample, 'shannon_%s' %RA]=shannon(counts,base=2)
            divDF.loc[sample, 'simpson_%s' %RA]=simpson(counts)
            divDF.loc[sample, 'berger_parker_d_%s' %RA]=berger_parker_d(counts)
            if isRA:
                divDF.loc[sample, 'maxFreq_%s' %RA]=np.max(counts)
                divDF.loc[sample, 'meanFreq_%s' %RA]=np.mean(counts)
            else:
                divDF.loc[sample, 'nUnique']=np.sum(counts)
    return divDF


def calc_corr_between_TCR_and_microbiome(TCRfile_binary,TCRfile,microbiomeFile_binary,microbiomeFile,TCRcutoff,mbCutoff,
                                         outputDir='/net/mraid08/export/genie/Lab/Personal/ShaniBAF/TCR_real_data/PhenotypicData/Diversity analysis',
                                         stdToRejectList=(None,0.25,0.5)):
    """Correlate per-sample TCR diversity with microbiome diversity and save plots + summaries.

    Steps: drop the 'FD' column from the binary tables if present, truncate
    each data set to the columns present in more than its cutoff number of
    samples, compute the diversity measures per sample (shannon, simpson,
    berger_parker_d for both versions, nUnique for binary, maxFreq/meanFreq
    for RA), then - for every value in ``stdToRejectList`` - plot each measure
    of the TCR table against the same measure of the microbiome table.

    Parameters
    ----------
    TCRfile_binary, TCRfile : pandas.DataFrame
        Binary and relative-abundance (RA) TCR tables, samples x sequences.
    microbiomeFile_binary, microbiomeFile : pandas.DataFrame
        Binary and RA microbiome tables, samples x species.
    TCRcutoff, mbCutoff : number or None
        A column is kept only if it is present in MORE than this number of
        samples; None disables the truncation for that data set.
    outputDir : str
        Folder receiving the figures; the summary pickles go to its 'sumDFs'
        sub-folder. Defaults to the location that was hard-coded before.
    stdToRejectList : sequence
        Outlier thresholds passed one by one to ``plot_corr_diversity``.

    Side effects
    ------------
    For every threshold one figure is saved and one summary frame (columns
    TCRcutoff, mbCutoff, stdToReject, measure, nsamples, r, p) is pickled.
    The figure is laid out on a fixed 3x3 grid.
    """
    #(3)truncate files to include only sequences/species shared by more than a cutoff number of samples:
    if 'FD' in TCRfile_binary.columns.values:
        TCRfile_binary=TCRfile_binary.drop('FD',axis=1)
        print('FD column was dropped from TCR file')
    if 'FD' in microbiomeFile_binary.columns.values:
        microbiomeFile_binary=microbiomeFile_binary.drop('FD',axis=1)
        print('FD column was dropped from MB file')

    TCRfile_binary_truncated,TCRfile_RA_truncated=_truncate_to_widely_shared(
        TCRfile_binary,TCRfile,TCRcutoff,'TCR')
    microbiomeFile_binary_truncated,microbiomeFile_RA_truncated=_truncate_to_widely_shared(
        microbiomeFile_binary,microbiomeFile,mbCutoff,'MB')

    #(4)calculate diversity measures:
    print('calculating diversity measures for TCR...')
    divDF_seqs=_diversity_measures(TCRfile_binary_truncated,TCRfile_RA_truncated)

    print('calculating diversity measures for MB...')
    divDF_mb=_diversity_measures(microbiomeFile_binary_truncated,microbiomeFile_RA_truncated)

    #(5)plot correlation scatters between TCR and microbiome, with different stdToReject:
    print('plotting correlation scatter for stdToRejectList=...')
    sumColumns=['TCRcutoff','mbCutoff','stdToReject','measure','nsamples','r','p']

    for stdToReject in stdToRejectList:
        print(stdToReject)

        fig1=plt.figure(figsize=(12,12))
        fig1.suptitle('Correlations between TCR and microbiome diversities\nRemoved outliers=%s' %stdToReject,
                     fontsize=18)

        records=[]
        for n, measure in enumerate(divDF_seqs.columns.values):
            print('%s %s' %(n,measure))
            ax=fig1.add_subplot(3,3,n+1)
            nsamples,r,p=plot_corr_diversity(measure,ax,stdToReject,divDF_seqs,divDF_mb)
            records.append({'TCRcutoff':TCRcutoff,'mbCutoff':mbCutoff,'stdToReject':stdToReject,
                            'measure':measure,'nsamples':nsamples,'r':r,'p':p})

        # build the summary once from the collected rows instead of growing it cell by cell
        sumDF=pd.DataFrame(records,columns=sumColumns)
        file1='%s/sumDFs/DF_sharedSeqMoreThan%s_mbMoreThan%s_rejectMoreThan%s' %(outputDir,TCRcutoff,mbCutoff,stdToReject)
        sumDF.to_pickle(file1)

        fig1.subplots_adjust(left=0.09, right=0.98, top=0.9, bottom=0.02, wspace=0.25,hspace=0.30)

        # the figure name carries the threshold without its decimal point (0.25 -> '025')
        stdToRejectName=str(stdToReject).replace('.','')
        file2='%s/sharedSeqMoreThan%s_mbMoreThan%s_rejectMoreThan%s' %(outputDir,TCRcutoff,mbCutoff,stdToRejectName)
        fig1.savefig(file2, dpi=200)

        
        
        
        
    
    
  
    

    
        

### load files:

In [None]:
# Load the two input tables. NOTE: both pickles are the relative-abundance
# (RA) versions, not the binary ones.

# TCR sharing matrix (RA):
print('loading TCR file')
file1=('/net/mraid08/export/genie/Lab/Personal/ShaniBAF/TCR_real_data/'
       'PublicSeqAnalysis/sharingMatrix_moreThan1_434Samples_RA')
TCRfile=pd.read_pickle(file1)

# microbiome species table (RA):
print('loading MB file')
file2=('/net/mraid08/export/genie/Lab/Personal/ShaniBAF/TCR_real_data/'
       'PhenotypicData/SampleSpeciesDFgroupedByBD_only434')
microbiomeFile=pd.read_pickle(file2)

### run preprocessing:

In [None]:
# Clean the sample names of both tables and derive their 0/1 presence versions.
(TCRfile_binary,
 TCRfile,
 microbiomeFile_binary,
 microbiomeFile) = calc_corr_between_TCR_and_microbiome_preprocessing(TCRfile, microbiomeFile)

### save results:

In [None]:
# Pickle the four preprocessed tables so the preprocessing does not have to be
# repeated.
file1=('/net/mraid08/export/genie/Lab/Personal/ShaniBAF/TCR_real_data/'
       'PublicSeqAnalysis/organized_TCR_and_mb_files_for434samples/TCRfile_binary')
TCRfile_binary.to_pickle(file1)

file2=('/net/mraid08/export/genie/Lab/Personal/ShaniBAF/TCR_real_data/'
       'PublicSeqAnalysis/organized_TCR_and_mb_files_for434samples/TCRfile')
TCRfile.to_pickle(file2)

file3=('/net/mraid08/export/genie/Lab/Personal/ShaniBAF/TCR_real_data/'
       'PublicSeqAnalysis/organized_TCR_and_mb_files_for434samples/microbiomeFile_binary')
microbiomeFile_binary.to_pickle(file3)

file4=('/net/mraid08/export/genie/Lab/Personal/ShaniBAF/TCR_real_data/'
       'PublicSeqAnalysis/organized_TCR_and_mb_files_for434samples/microbiomeFile')
microbiomeFile.to_pickle(file4)



### load results:

In [None]:
# Read the four preprocessed tables back from their pickles.
file1=('/net/mraid08/export/genie/Lab/Personal/ShaniBAF/TCR_real_data/'
       'PublicSeqAnalysis/organized_TCR_and_mb_files_for434samples/TCRfile_binary')
TCRfile_binary=pd.read_pickle(file1)

file2=('/net/mraid08/export/genie/Lab/Personal/ShaniBAF/TCR_real_data/'
       'PublicSeqAnalysis/organized_TCR_and_mb_files_for434samples/TCRfile')
TCRfile=pd.read_pickle(file2)

file3=('/net/mraid08/export/genie/Lab/Personal/ShaniBAF/TCR_real_data/'
       'PublicSeqAnalysis/organized_TCR_and_mb_files_for434samples/microbiomeFile_binary')
microbiomeFile_binary=pd.read_pickle(file3)

file4=('/net/mraid08/export/genie/Lab/Personal/ShaniBAF/TCR_real_data/'
       'PublicSeqAnalysis/organized_TCR_and_mb_files_for434samples/microbiomeFile')
microbiomeFile=pd.read_pickle(file4)



### run with different cutoffs:

In [None]:
# Sequences and species each present in more than 50 samples.
TCRcutoff, mbCutoff = 50, 50

calc_corr_between_TCR_and_microbiome(TCRfile_binary, TCRfile,
                                     microbiomeFile_binary, microbiomeFile,
                                     TCRcutoff, mbCutoff)


In [None]:
# Sequences and species each present in more than 10 samples.
TCRcutoff, mbCutoff = 10, 10

calc_corr_between_TCR_and_microbiome(TCRfile_binary, TCRfile,
                                     microbiomeFile_binary, microbiomeFile,
                                     TCRcutoff, mbCutoff)


In [None]:
# Sequences present in more than 10 samples, species in more than 1 sample.
TCRcutoff, mbCutoff = 10, 1

calc_corr_between_TCR_and_microbiome(TCRfile_binary, TCRfile,
                                     microbiomeFile_binary, microbiomeFile,
                                     TCRcutoff, mbCutoff)


In [None]:
# Sequences and species each present in more than 1 sample.
TCRcutoff, mbCutoff = 1, 1

calc_corr_between_TCR_and_microbiome(TCRfile_binary, TCRfile,
                                     microbiomeFile_binary, microbiomeFile,
                                     TCRcutoff, mbCutoff)


In [None]:
# Sequences and species each present in more than 100 samples.
TCRcutoff, mbCutoff = 100, 100

calc_corr_between_TCR_and_microbiome(TCRfile_binary, TCRfile,
                                     microbiomeFile_binary, microbiomeFile,
                                     TCRcutoff, mbCutoff)


# view results:

In [None]:
# Gather the summary tables from the folder that
# calc_corr_between_TCR_and_microbiome pickles them into.
# NOTE(review): concat_summarizing_dfs is not defined in the visible cells -
# presumably it comes from one of the star imports; confirm.
diversityFolder=('/net/mraid08/export/genie/Lab/Personal/ShaniBAF/TCR_real_data/'
                 'PhenotypicData/Diversity analysis/sumDFs')
diversityResults=concat_summarizing_dfs(diversityFolder)

In [None]:
# Display the table returned by concat_summarizing_dfs in the previous cell.
diversityResults

# Exploring the relation between binary phenotypes and shared sequences:

## load files:

### MoreThan10_binary

In [None]:
# Load the binary shared-sequence table and preview its first rows.
file1=('/net/mraid08/export/genie/Lab/Personal/ShaniBAF/TCR_real_data/'
       'PublicSeqAnalysis/sharingMatrix_moreThan10_434Samples')
sharingMatrix_moreThan10_434Samples=pd.read_pickle(file1)

sharingMatrix_moreThan10_434Samples.head()

In [None]:
# Process sample names: edit them to match those in the phenotype files by
# keeping the part before the first '_' and then before the first 'b'.
# Index labels - and any columns carrying the same names - are renamed in a
# single call instead of one in-place rename per sample.
# NOTE: no rows or columns with NaN values are removed in this cell.
newNames=dict((sample, sample.split('_')[0].split('b')[0])
              for sample in sharingMatrix_moreThan10_434Samples.index)
sharingMatrix_moreThan10_434Samples.rename(index=newNames,columns=newNames,inplace=True)

sharingMatrix_moreThan10_434Samples.head()

### MoreThan10_RA

In [None]:
# Load the pickled sharingMatrix_moreThan10_434Samples_RA table and preview
# its first rows.
file1=('/net/mraid08/export/genie/Lab/Personal/ShaniBAF/TCR_real_data/'
       'PublicSeqAnalysis/sharingMatrix_moreThan10_434Samples_RA')
sharingMatrix_moreThan10_434Samples_RA=pd.read_pickle(file1)

sharingMatrix_moreThan10_434Samples_RA.head()

In [None]:
# Process sample names: edit them to match those in the phenotype files by
# keeping the part before the first '_' and then before the first 'b'.
# Index labels - and any columns carrying the same names - are renamed in a
# single call instead of one in-place rename per sample.
# NOTE: no rows or columns with NaN values are removed in this cell.
newNames=dict((sample, sample.split('_')[0].split('b')[0])
              for sample in sharingMatrix_moreThan10_434Samples_RA.index)
sharingMatrix_moreThan10_434Samples_RA.rename(index=newNames,columns=newNames,inplace=True)

sharingMatrix_moreThan10_434Samples_RA.head()

### MicrobiomeSpecies_RA

In [None]:
# Load the pickled SampleSpeciesDFgroupedByBD_only434 table (kept under an
# '_RA' name) and preview its first rows.
file1=('/net/mraid08/export/genie/Lab/Personal/ShaniBAF/TCR_real_data/'
       'PhenotypicData/SampleSpeciesDFgroupedByBD_only434')
SampleSpeciesDFgroupedByBD_only434_RA=pd.read_pickle(file1)

SampleSpeciesDFgroupedByBD_only434_RA.head()

### MicrobiomeSpecies_binary:

In [None]:
# Presence/absence version of the species table: 1 where the value is > 0,
# 0 otherwise. Built in one pass over the columns (index and column order are
# inherited from the RA table) instead of growing an empty frame column by
# column and resetting its index on every iteration.
print('now converting counts to binary indications...')
SampleSpeciesDFgroupedByBD_only434_binary=SampleSpeciesDFgroupedByBD_only434_RA.apply(
    lambda speciesColumn: (speciesColumn>0).astype(int))
print('DONE!')
SampleSpeciesDFgroupedByBD_only434_binary.head()

### drop species that appear in 10 samples or fewer:

In [None]:
# Keep only the species (columns) that are present in more than 10 samples.
print(len(SampleSpeciesDFgroupedByBD_only434_binary.columns.values))

# column sums of the 0/1 table = number of samples containing each species
nSamplesPerSpecies=SampleSpeciesDFgroupedByBD_only434_binary.sum(axis=0)
columnList10=list(SampleSpeciesDFgroupedByBD_only434_binary.columns[(nSamplesPerSpecies>10).values])

# 'FD' is excluded from the species list (presumably a non-species column -
# confirm); guarded so the cell does not raise when 'FD' did not pass the filter.
if 'FD' in columnList10:
    columnList10.remove('FD')
print(len(columnList10))


In [None]:
# Restrict both species tables to the columns collected in columnList10.
SampleSpeciesDFgroupedByBD_only434_binary_more10 = SampleSpeciesDFgroupedByBD_only434_binary.loc[:, columnList10]
SampleSpeciesDFgroupedByBD_only434_RA_more10 = SampleSpeciesDFgroupedByBD_only434_RA.loc[:, columnList10]

In [None]:
# Save the two '_more10' species tables, each as a pickle and as an Excel file.

# binary table:
file1=('/net/mraid08/export/genie/Lab/Personal/ShaniBAF/TCR_real_data/'
       'PhenotypicData/SampleSpeciesDFgroupedByBD_only434_binary_more10')
SampleSpeciesDFgroupedByBD_only434_binary_more10.to_pickle(file1)

file2=('/net/mraid08/export/genie/Lab/Personal/ShaniBAF/TCR_real_data/'
       'PhenotypicData/SampleSpeciesDFgroupedByBD_only434_binary_more10.xlsx')
SampleSpeciesDFgroupedByBD_only434_binary_more10.to_excel(file2)

# RA table:
file3=('/net/mraid08/export/genie/Lab/Personal/ShaniBAF/TCR_real_data/'
       'PhenotypicData/SampleSpeciesDFgroupedByBD_only434_RA_more10')
SampleSpeciesDFgroupedByBD_only434_RA_more10.to_pickle(file3)

file4=('/net/mraid08/export/genie/Lab/Personal/ShaniBAF/TCR_real_data/'
       'PhenotypicData/SampleSpeciesDFgroupedByBD_only434_RA_more10.xlsx')
SampleSpeciesDFgroupedByBD_only434_RA_more10.to_excel(file4)

# calculate diversity measures for shared sequences and microbiome species:

## calculate diversity measures for sharing sequence matrix (moreThan10-binary+RA): 

In [None]:
# Per-sample diversity measures of the shared-sequence tables: the binary
# table first, then the RA table.
dfList=[sharingMatrix_moreThan10_434Samples,sharingMatrix_moreThan10_434Samples_RA]
dfName='sharedSequences'  # label of this data set (not used in the visible cells)
isRAList=[False,True]

# Index the result by the samples of the tables processed here, not by
# whatever `df` happens to be left over from a previously executed cell.
divDF_seqs=pd.DataFrame(index=dfList[0].index)

for n,df in enumerate(dfList):

    if 'FD' in df.columns.values:
        df=df.drop('FD',axis=1)

    isRA=isRAList[n]
    if isRA:
        RA='RA'
        # RA values are rounded to 5 decimals and scaled by 1e5 into integer
        # counts before being handed to the skbio measures
        df=df.round(5)*100000
        df=df.astype(int,errors='ignore')
    else:
        RA='binary'

    for sample in df.index:
        counts=df.loc[sample,:]
        divDF_seqs.loc[sample, 'shannon_%s' %RA]=shannon(counts,base=2)
        divDF_seqs.loc[sample, 'simpson_%s' %RA]=simpson(counts)
        divDF_seqs.loc[sample, 'berger_parker_d_%s' %RA]=berger_parker_d(counts)
        if isRA:
            divDF_seqs.loc[sample, 'maxFreq_%s' %RA]=np.max(counts)
            divDF_seqs.loc[sample, 'meanFreq_%s' %RA]=np.mean(counts)
        else:
            divDF_seqs.loc[sample, 'nUnique']=np.sum(counts)

divDF_seqs.head()
    
    

### calculate correlations between measures to reduce number of measures to correlate with microbiome diversity measures:

In [None]:
# Pearson correlation (via MyPearsonr) between every pair of differently named
# diversity measures of the shared-sequence table; each pair is printed once.
seqMeasures=divDF_seqs.columns.values
for i in range(len(seqMeasures)):
    for j in range(i,len(seqMeasures)):
        column1=seqMeasures[i]
        column2=seqMeasures[j]
        if column1==column2:
            continue
        r,p=MyPearsonr(divDF_seqs[column1],divDF_seqs[column2])
        print('%s %s %s %s' %(column1,column2,r,p))
    

## calculate diversity measures for microbiome species matrix (moreThan10-binary+RA): 

In [None]:
# Per-sample diversity measures of the microbiome species tables: the binary
# table first, then the RA table.
dfList=[SampleSpeciesDFgroupedByBD_only434_binary_more10,SampleSpeciesDFgroupedByBD_only434_RA_more10]
dfName='microbiomeSpecies'
isRAList=[False,True]

# Index the result by the samples of the tables processed here, not by
# whatever `df` happens to be left over from a previously executed cell.
divDF_mb=pd.DataFrame(index=dfList[0].index)

for n,df in enumerate(dfList):

    if 'FD' in df.columns.values:
        df=df.drop('FD',axis=1)

    isRA=isRAList[n]
    if isRA:
        RA='RA'
        # RA values are rounded to 5 decimals and scaled by 1e5 into integer
        # counts before being handed to the skbio measures
        df=df.round(5)*100000
        df=df.astype(int,errors='ignore')
    else:
        RA='binary'

    for sample in df.index:
        counts=df.loc[sample,:]
        divDF_mb.loc[sample, 'shannon_%s' %RA]=shannon(counts,base=2)
        divDF_mb.loc[sample, 'simpson_%s' %RA]=simpson(counts)
        divDF_mb.loc[sample, 'berger_parker_d_%s' %RA]=berger_parker_d(counts)
        if isRA:
            divDF_mb.loc[sample, 'maxFreq_%s' %RA]=np.max(counts)
            divDF_mb.loc[sample, 'meanFreq_%s' %RA]=np.mean(counts)
        else:
            divDF_mb.loc[sample, 'nUnique']=np.sum(counts)

divDF_mb.head()
    

### calculate correlations between measures to reduce number of measures to correlate with shared sequences diversity measures:

In [None]:
# Pearson correlation (via MyPearsonr) between every pair of differently named
# diversity measures of the microbiome table; each pair is printed once.
mbMeasures=divDF_mb.columns.values
for i in range(len(mbMeasures)):
    for j in range(i,len(mbMeasures)):
        column1=mbMeasures[i]
        column2=mbMeasures[j]
        if column1==column2:
            continue
        r,p=MyPearsonr(divDF_mb[column1],divDF_mb[column2])
        print('%s %s %s %s' %(column1,column2,r,p))
    

# explore associations between TCR and microbiome diversity:

1. calculate diversity measures for the microbiome species
2. correlate with TCR diversity measures (including plots, p value corrections)

consider analysing adaptive cohort for age, gender

In [None]:
# Number of diversity measures (columns) in the shared-sequence table.
print(len(divDF_seqs.columns))

In [None]:
# One scatter per diversity measure (TCR vs. microbiome), without outlier removal.
stdToReject=None

fig1=plt.figure(figsize=(12,12))
fig1.suptitle('Correlations between TCR and microbiome diversities\nRemoved outliers=%s' %stdToReject,
             fontsize=18)

for n, measure in enumerate(divDF_seqs.columns.values):
    print('%s %s' %(n,measure))
    ax=fig1.add_subplot(3,3,n+1)
    # plot_corr_diversity takes the two diversity tables explicitly
    plot_corr_diversity(measure,ax,stdToReject,divDF_seqs,divDF_mb)

fig1.subplots_adjust(left=0.09, right=0.98, top=0.9, bottom=0.02, wspace=0.25,hspace=0.30)

plt.show()

In [None]:
# One scatter per diversity measure (TCR vs. microbiome), with stdToReject=0.25
# passed to plot_corr_diversity as the outlier threshold.
stdToReject=0.25

fig1=plt.figure(figsize=(12,12))
fig1.suptitle('Correlations between TCR and microbiome diversities\nRemoved outliers=%s' %stdToReject,
             fontsize=18)

for n, measure in enumerate(divDF_seqs.columns.values):
    print('%s %s' %(n,measure))
    ax=fig1.add_subplot(3,3,n+1)
    # plot_corr_diversity takes the two diversity tables explicitly
    plot_corr_diversity(measure,ax,stdToReject,divDF_seqs,divDF_mb)

fig1.subplots_adjust(left=0.09, right=0.98, top=0.9, bottom=0.02, wspace=0.25,hspace=0.30)

plt.show()