In [1]:
import os
import pandas as pd
from Bio import SeqIO
import numpy as np

# This makes the notebook as wide as the screen
#from IPython.core.display import display, HTML
#display(HTML("<style>.container { width:100% !important; }</style>"))

In [6]:
## Function to get the assembly of origin for a bin

def bin_to_assembly(BIN):
    '''Return the assembly directory (under 'Multimap/') that a bin originates from.

    Bin names and assembly directory names do not share one naming scheme, so
    both are reduced to a bare sample token before they are compared:
    dashes become underscores, the 'MH_' prefix is removed and everything from
    the first underscore onwards is dropped. Bin names additionally lose
    'twin.', assembly names additionally have '_pat' rewritten to '_Pat'.

    The returned name is also the name of the directory holding the mapping
    data for that assembly.

    Parameters
    ----------
    BIN : str
        Name of the bin (file name of the bin FASTA).

    Returns
    -------
    str or None
        The first entry of 'Multimap/' (in os.listdir order) whose sample token
        equals the bin's sample token, or None when nothing matches.
    '''
    # The bin's sample token does not depend on the assembly, so compute it once.
    bin_sample = BIN.replace('-', '_').replace('twin.', '').replace('MH_', '').split('_')[0]

    # One directory listing supplies both the directory name and its sample
    # token, so the two can never get out of step with each other.
    for ASSEMBLY in os.listdir('Multimap/'):
        assembly_sample = (
            ASSEMBLY.replace('-', '_').replace('_pat', '_Pat').replace('MH_', '').split('_')[0]
        )
        if assembly_sample == bin_sample:
            return ASSEMBLY

    # No assembly matches this bin.
    return None
        
    

## Function to produce a dictionary of dataframes from the covstat files in each assembly mapping dir

def covstat_to_dfdict(ASSEMBLY_DIR):
    '''Load every covstat file of one assembly mapping directory into a dataframe.

    ASSEMBLY_DIR is the name of a directory located under 'Multimap/'. Every
    entry of that directory is read as a tab-separated table, keeping only the
    "#ID", "Avg_fold" and "Covered_percent" columns.

    Returns a dictionary {file name: dataframe}.
    '''
    wanted_columns = ["#ID", "Avg_fold", "Covered_percent"]
    assembly_path = os.path.join(os.path.expanduser("Multimap/"), ASSEMBLY_DIR)

    return {
        covstat_name: pd.read_csv(
            os.path.join(assembly_path, covstat_name), sep='\t', usecols=wanted_columns
        )
        for covstat_name in os.listdir(assembly_path)
    }


## Get the contigs of a bin from the total pool of bins
def bin_to_contigs(BIN):
    '''Return the IDs of the contigs that make up a bin.

    BIN is the bin's file name (string) inside 'bins_dir/MH-total_bins_input/';
    the file is parsed as FASTA and the record IDs are returned as a list, in
    the order the records are read.
    '''
    fasta_path = "bins_dir/MH-total_bins_input/{0}".format(BIN)

    contig_ids = []
    for seq_record in SeqIO.parse(fasta_path, "fasta"):
        contig_ids.append(seq_record.id)
    return contig_ids


## Nested lookup of every covstat dataframe, built by running 'covstat_to_dfdict'
## on each assembly mapping directory and collating the results.
## {Assembly: {assembly_covstat_name: dataframe}}

list_ass_dirs = os.listdir('Multimap/')
list_ass_dirs.sort()
list_dict_df = map(covstat_to_dfdict, list_ass_dirs)
assembly_dict_df = {ass_dir: dfs for ass_dir, dfs in zip(list_ass_dirs, list_dict_df)}


In [11]:
# with open('s1_maxbin2_bins_bin.10.list', 'w') as f:
#     for item in bin_to_contigs('s1_maxbin2_bins_bin.10.fasta'):
#         f.write("%s\n" % item)


# #bin_to_contigs('s1_maxbin2_bins_bin.10.fasta')


In [12]:
## Produce a dataframe of unnormalized cov values  

def bin_to_avg_coverage(BIN, min_covered_percent=65):
    '''Return the average coverage of a bin in every sample mapped to its assembly.

    For each covstat dataframe of the assembly the bin comes from, the rows of
    the bin's contigs whose 'Covered_percent' is strictly greater than
    `min_covered_percent` are selected and their 'Avg_fold' values averaged
    (NaN values are ignored). No filter is applied to 'Avg_fold' itself.

    Parameters
    ----------
    BIN : str
        Name of the bin (file name of the bin FASTA).
    min_covered_percent : float, optional
        Contigs must have a 'Covered_percent' above this value to contribute
        to the average. Defaults to 65.

    Returns
    -------
    dict
        {sample: average coverage} with one entry per covstat file, where the
        sample name is the covstat file name up to its first underscore, plus
        a final 'Bin Id' entry holding BIN. A sample in which no contig of the
        bin passes the filter gets NaN.
    '''
    # Dictionary of covstat dataframes for the assembly that the bin comes from.
    df_dict = assembly_dict_df[bin_to_assembly(BIN)]

    # Contigs that make up the bin.
    contigs = bin_to_contigs(BIN)

    coverage = {}
    for covstat_name, covstat_df in df_dict.items():
        well_covered = covstat_df['Covered_percent'] > min_covered_percent
        in_bin = covstat_df['#ID'].isin(contigs)
        avg_fold = np.asarray(covstat_df.loc[well_covered & in_bin, 'Avg_fold'], dtype=float)

        # np.nanmean warns ("Mean of empty slice") when there is nothing to
        # average; the result is NaN either way, so return it without the warning.
        if avg_fold.size == 0 or np.isnan(avg_fold).all():
            bin_coverage = np.nan
        else:
            bin_coverage = np.nanmean(avg_fold)

        # Sample name is the covstat file name up to the first underscore.
        coverage[covstat_name.split('_')[0]] = bin_coverage

    # Record which bin this coverage profile belongs to.
    coverage['Bin Id'] = BIN
    return coverage

In [14]:
# Spot-check: coverage profile of a single bin (one value per sample, plus the bin name).
bin_to_avg_coverage('sr_bins.23.fasta')



{'MH-s3': 9.7048,
 'MH-s2': 7.0321,
 'MH-s1': 6.4793,
 'MH-Pat': 34.1108,
 'MH-s5': nan,
 'Bin Id': 'sr_bins.23.fasta'}

In [15]:
## This is where the bins are read in.
############################################################################################################

## Get the list of bin names to run the bin_to_avg_coverage function on

# List of dereplicated genome bins (with Bac and 1 Archaea).
# os.listdir returns entries in arbitrary order, so the names are sorted to keep
# the row order of the resulting tables reproducible between runs.
bin_names = sorted(os.listdir('bins_dir/final_bins/'))

# List of all genome bins, taken from the "Bin Id" column of the Chdb table
Chdb = pd.read_csv("data_tables/Chdb.csv", usecols=["Bin Id","Completeness","Contamination"])
all_bins = Chdb["Bin Id"]

#############################################################################################################

In [16]:
## Build the coverage table: one row per bin, from the dictionaries returned by bin_to_avg_coverage
df = pd.DataFrame(list(map(bin_to_avg_coverage, bin_names)))

## Tidy up the table
# Strip the '.fasta' suffix and turn 'twin.' into 'twin_' in the bin names
# (needed for 16S tree heatmap compatibility).
df['Bin Id'] = df['Bin Id'].map(lambda name: name.replace('.fasta', '').replace('twin.', 'twin_'))
# Index the table by bin name and use 0 as the placeholder for missing (NaN) coverage.
df = df.set_index('Bin Id').fillna(0)



##Ignore warning 



In [17]:
## Dataframe for all bins - This will take a while to compute, but it will get there
#df_all = pd.DataFrame([bin_to_avg_coverage(BIN) for BIN in all_bins])


#df_all.head()
##Ignore warning

In [18]:
#all_bins_complete = pd.merge(df_all, Chdb, on='Bin Id')
#all_bins_complete.head()



In [19]:
#all_bins_complete.to_csv(r'all_bins_abundace_comp_score.tsv', sep = '\t')

In [20]:
## Normalization functions

def col_sum(dataframe):
    '''Scale every column by its own total, so that each column sums to 1.'''
    column_totals = dataframe.sum(axis=0)
    return dataframe.div(column_totals, axis='columns')

def log_by_n(dataframe, n ):
    '''Multiply every value by n, then take the natural logarithm.

    Values that are 0 after scaling come out as -inf.
    '''
    scaled = dataframe * n
    return np.log(scaled)

def min_max(dataframe, n):
    '''Min-max rescale every column to the range [0, n].

    The column minimum maps to 0 and the column maximum to n. A column whose
    values are all equal has a zero range and yields NaN.
    '''
    column_min = dataframe.min()
    column_range = dataframe.max() - column_min
    rescaled = (dataframe - column_min) / column_range
    return rescaled * n

    


In [21]:
## Explore normalization of data

## Sanity check: show the sum of each column after col_sum normalization (each should be 1)
col_sum(df).sum(axis =0)

#log_by_n(col_sum(df), 10000)

#log_by_n(df, 100)

#col_sum(log_by_n(df, 100))

#min_max(df, 100)


MH-Pat    1.0
MH-s1     1.0
MH-s2     1.0
MH-s3     1.0
MH-s5     1.0
dtype: float64

In [28]:
##Other output options

#df.to_csv(r'raw.tsv', sep='\t')
#col_sum(df).to_csv(r'col_sum.tsv', sep='\t')
#log_by_n(col_sum(df), 10000).to_csv(r'col_sum-log_10000-75pc.tsv', sep='\t')
#log_by_n(df,100).to_csv(r'log_100.tsv',sep='\t')
#min_max(df,100).to_csv(r'min_max-100.tsv',sep='\t')
#min_max(col_sum(df),100).to_csv(r'col_sum-min_max-100',sep='\t')
#min_max(log_by_n(col_sum(df), 10000),100).to_csv(r'min_max-col_sum-log_10000-75pc.tsv', sep='\t')


In [23]:
## Export the unnormalized abundance profile dataframe to .tsv
#df_summed_logged.to_csv(r'abundance_bac.tsv', sep='\t')

# NOTE(review): 'abundace' in the file name looks like a typo for 'abundance';
# it is left unchanged because anything that reads this file by name would break.
df.to_csv(r'final_bins_abundace.tsv', sep = '\t')

#df_all.to_csv(r'all_bins_abundace.tsv', sep = '\t')