In [1]:
## Load in required libraries 
from Bio import SeqIO 
import numpy as np
import glob
import pandas as pd
import pickle as pk
import matplotlib.pylab as plt
import seaborn as sns
import os.path
import math

In [2]:
from os.path import abspath, join

In [3]:
## Define paths & variables
species_list=['s__Acinetobacter_johnsonii.fna'
,'s__Akkermansia_muciniphila.fna'
,'s__Alistipes_shahii.fna'
,'s__Bacillaceae_bacterium_EAG3.fna'
,'s__Bacteroides_dorei.fna'
,'s__Bacteroides_fragilis.fna'
,'s__Bacteroides_thetaiotaomicron.fna'
,'s__Bacteroides_uniformis.fna'
,'s__Bacteroides_vulgatus.fna'
,'s__Clostridium_perfringens.fna'
,'s__Clostridium_ventriculi.fna'
,'s__Collinsella_massiliensis.fna'
,'s__Collinsella_stercoris.fna'
,'s__Enterococcus_faecalis.fna'
,'s__Erysipelatoclostridium_ramosum.fna'
,'s__Escherichia_coli.fna'
,'s__Lactobacillus_apodemi.fna'
,'s__Lactobacillus_murinus.fna'
,'s__Lactobacillus_reuteri.fna'
,'s__Lactobacillus_rodentium.fna'
,'s__Parabacteroides_distasonis.fna'
,'s__Plesiomonas_shigelloides.fna'
,'s__Prevotella_copri.fna'
,'s__Pseudomonas_lundensis.fna'
,'s__Pseudomonas_yamanorum.fna'
,'s__Ruthenibacterium_lactatiformans.fna'
,'s__Streptococcus_gallolyticus.fna']




clade_dir="/panfs/panfs1.ucsd.edu/panscratch/jhc103/VertMetaphlan-frmerged/db_markers"
path_to_metadata="/panfs/panfs1.ucsd.edu/panscratch/jhc103/VertMetaphlan-frmerged/metadata/vert_metadata_new.txt"
pkl_dir="/panfs/panfs1.ucsd.edu/panscratch/jhc103/VertMetaphlan-frmerged/consensus_markers/*.pkl"

In [4]:
## These are changeable variables! 
sample_group = "" ## options "" or "_mammallianHost" or "_nonMammallianHost"
marker_in_n_samples=0.5
tree_dir = "/panfs/panfs1.ucsd.edu/panscratch/jhc103/VertMetaphlan-frmerged/strain-trees-output"


In [5]:
## Set up for calculation
## 2 things to build 
## 1. The length of each markers for each bacteria
## 2. Build a panda dataframe contain information for each bacteria interested 
##   rows: will be the unique markers for each bacteria
##   columns: will be the samples 
master_dict={} ## first key should be species whose value is a dictionary which keys are the markers and value 
               ## is a dictionary whose keys are the samples whose values is the number of times that marker appeared in the 
               ## sample 
marker_len_dict={}
for species in species_list: 
    # Extract all the header ids or markers for each bacteria 
    header_set=set(())
    marker_dict={}
    for record in SeqIO.parse(clade_dir + "/" + species, "fasta"):
        header_set.add(record.id)
        marker_len_dict.update({record.id:len(record.seq)})
        marker_dict.update({record.id:0})
 
    ## Loop through files and check if the markers are present in the markers for that bacteria  
    sample_dict={}
    for filename in glob.iglob(pkl_dir):            
        infile=open(filename,'rb')
        marker_list=pk.load(infile) ## The dictionary in the file 
        ## count the number of times that marker appeared 
        for marker_info in marker_list: ## Loop through each marker informatio
            if marker_info['marker'] in header_set: ## If the marker id in that marker key associated value is present in our set then increment our dict by 1
                marker_dict[marker_info['marker']]+=1
        filename=os.path.basename(filename)
        sample_dict.update({filename:marker_dict}) ## Add the key (file name) and the marker_dict to our sample dict
        infile.close()
        marker_dict=marker_dict.fromkeys(marker_dict,0)## reset the marker_dict to have 0 values again

        
    master_dict.update({species:sample_dict})

In [6]:
## Create function that finds that find the number of marker and subsequently nucleotide that is shared across 
## all the samples in a dataframe
def calculate_marker_score(df, length_for_markers, marker_threshold):

    ## Subset the dataframe to only include markers present in all of the sample 
    ## The value  in this case represents the threshold we are willing to keep a marker
    df=df.loc[:, df.sum()>=marker_threshold*len(df)]
    
    ## replace 1 in dataframe with corresponding read length stored in the marker_len_dict
    ## the loop structure is critical in not making a copy ==> iteritems() help with changing the dataframe directly
    for (columnName, columnData) in df.iteritems():
        columnData=np.where((columnData == 1), length_for_markers[columnName], columnData)
    ## Other ways to change values in column 
#         m = df.column == 1
#         df.where(~m, other=length_for_markers[column])
        #df[column].replace(1, length_for_markers[column],inplace=True) ## This shoot an warning but checked for validity and it's doing what I want it to do
#         df.loc[(df.columnName==1,column)]=marker_len_dict[column]
        
    return df.sum().sum(), len(df), df

def find_best_marker_score(df, length_for_markers, min_marker_per_sample, marker_threshold):
    
    ## Convert the df back to 0 & 1 
    df[df != 0] = 1

    ## Set up variable before for loop
    #len(df) ## number of rows of dataframe represents # of samples present in our dataframe
    score=-1
    best_marker_count=-1
    sample_count=-1
    series_marker_sum=df.sum(axis=1).sort_values()
    best_df = -1
    ## If somehow the max value is NaN then return negative values of -1
    if math.isnan(series_marker_sum.max()):
        return 0, 0, score, best_df
    else:
        for marker_count in range(min_marker_per_sample, int(series_marker_sum.max())):
            ## update the df by taking away samples that do not have the marker_count
            df=df[df.sum(axis=1)>=marker_count] 

            ## Run the function to find the corresponding score
            tmpScore, num_of_sample, tmpDf =calculate_marker_score(df, length_for_markers, marker_threshold)

            ## Only update score and marker_count if we have tmpScore higher than score! 
            if( tmpScore > score):
                score = tmpScore
                best_marker_count=marker_count
                sample_count = num_of_sample
                best_df = tmpDf

    return best_marker_count, sample_count, score, best_df
    
def unpack_dict_make_df(bacteria_marker_dict, length_for_markers):
    dict_return={}
    for bacteria in bacteria_marker_dict:
        print(bacteria)
        df_bac=pd.DataFrame.from_dict(bacteria_marker_dict[bacteria],orient='index')
        marker_count, sample_count, score, df = find_best_marker_score(df_bac, length_for_markers, 5, marker_in_n_samples)
        
        tmp_dict={"marker_count":marker_count, "sample_count": sample_count, "score": score, "df": df}
        dict_return.update({bacteria:tmp_dict})
    
    return dict_return

In [7]:
dict_strainphlan_optim=unpack_dict_make_df(master_dict, marker_len_dict)

s__Acinetobacter_johnsonii.fna
s__Akkermansia_muciniphila.fna
s__Alistipes_shahii.fna
s__Bacillaceae_bacterium_EAG3.fna
s__Bacteroides_dorei.fna
s__Bacteroides_fragilis.fna
s__Bacteroides_thetaiotaomicron.fna
s__Bacteroides_uniformis.fna
s__Bacteroides_vulgatus.fna
s__Clostridium_perfringens.fna
s__Clostridium_ventriculi.fna
s__Collinsella_massiliensis.fna
s__Collinsella_stercoris.fna
s__Enterococcus_faecalis.fna
s__Erysipelatoclostridium_ramosum.fna
s__Escherichia_coli.fna
s__Lactobacillus_apodemi.fna
s__Lactobacillus_murinus.fna
s__Lactobacillus_reuteri.fna
s__Lactobacillus_rodentium.fna
s__Parabacteroides_distasonis.fna
s__Plesiomonas_shigelloides.fna
s__Prevotella_copri.fna
s__Pseudomonas_lundensis.fna
s__Pseudomonas_yamanorum.fna
s__Ruthenibacterium_lactatiformans.fna
s__Streptococcus_gallolyticus.fna


In [None]:
dict_strainphlan_optim

In [10]:
df_best_bacteria = -1
for bacteria in dict_strainphlan_optim.keys():
    
    pre_bacteria_trimmed=bacteria[3:].strip("fna").strip(".") ## variable used in output_dir identification
    bacteria_trimmed=pre_bacteria_trimmed[0]+pre_bacteria_trimmed[pre_bacteria_trimmed.find("_"):len(pre_bacteria_trimmed)]
    output_dir = join(tree_dir, "output_"+str(int(marker_in_n_samples*100))+"_"+bacteria_trimmed+sample_group+"/GAMMA")
    ## Make sure the output_tree directory exists first 
    if os.path.isdir(output_dir):

        ## Extract dataframe for bacteria interested 
        df_best_bacteria = dict_strainphlan_optim[bacteria]['df']

        ## Make new df with sample and percent marker detected information
        array_sample_marker = np.empty((0,2)) ## empty np to build df on top of 
        for index, row in df_best_bacteria.iterrows():
            percent_of_markers_detected = int(row.sum()/len(row)*100)
            row_to_add=np.array([index.strip(".pkl"), percent_of_markers_detected ])
            array_sample_marker = np.vstack((array_sample_marker, row_to_add))
        
        print("Saving percent marker count for "+bacteria +" ....")
        df_bacteria_marker = pd.DataFrame(array_sample_marker, columns = ['sample', 'percent_marker_detected'])
        df_bacteria_marker.to_csv(join(output_dir,pre_bacteria_trimmed+"_sample_marker_percent.tsv"))
        print("done\n")
        
    else: ## Directory does not exists 
        print("For "+ bacteria +":")
        print(output_dir)
        print("directory DOES NOT exist\n")


For s__Acinetobacter_johnsonii.fna:
/panfs/panfs1.ucsd.edu/panscratch/jhc103/VertMetaphlan-frmerged/strain-trees-output/output_50_A_johnsonii/GAMMA
directory DOES NOT exist

For s__Akkermansia_muciniphila.fna:
/panfs/panfs1.ucsd.edu/panscratch/jhc103/VertMetaphlan-frmerged/strain-trees-output/output_50_A_muciniphil/GAMMA
directory DOES NOT exist

For s__Alistipes_shahii.fna:
/panfs/panfs1.ucsd.edu/panscratch/jhc103/VertMetaphlan-frmerged/strain-trees-output/output_50_A_shahii/GAMMA
directory DOES NOT exist

For s__Bacillaceae_bacterium_EAG3.fna:
/panfs/panfs1.ucsd.edu/panscratch/jhc103/VertMetaphlan-frmerged/strain-trees-output/output_50_B_bacterium_EAG3/GAMMA
directory DOES NOT exist

Saving percent marker count for s__Bacteroides_dorei.fna ....
done

Saving percent marker count for s__Bacteroides_fragilis.fna ....
done

For s__Bacteroides_thetaiotaomicron.fna:
/panfs/panfs1.ucsd.edu/panscratch/jhc103/VertMetaphlan-frmerged/strain-trees-output/output_50_B_thetaiotaomicro/GAMMA
directo