# Microbiota analysis
## Author: Tijs van Lieshout

### Import statements and setting up[ config:

In [1]:
import pandas as pd
import glob
import yaml
from IPython.display import display
from bokeh.plotting import figure, show
from bokeh.layouts import gridplot
from bokeh.models import Range1d
from bokeh.io import output_notebook
output_notebook()

In [2]:
def get_config():
    '''
    Function that gets the configuration of personalized variable definitions
    '''
    with open("config.yaml", 'r') as stream:
        config = yaml.safe_load(stream)
    return config

### Loading the data:

In [3]:
def load_data_microbiota(PATH):
    """Load microbiota data (Gut Feeling Knowledge Base and metaphlann output) into pandas 
    dataframes

    Keyword arguments:
    PATH -- The path which contains the Gut Feeling Knowledge Base and metaphlann output dir
    
    Returns:
    gfkb -- A pandas dataframe containing the Gut Feeling Knowledge Base
    tax_profiles -- A pandas dataframe containing the taxonomic profile per barcode for all barcodes"""
    
    # Gut Feeling Knowledge Base
    gfkb = pd.read_csv(f"{PATH}/GutFeelingKnowledgeBase-v4-Master_List.csv")
    gfkb = gfkb.drop(columns=["Present in GFKB v3 (Y/N)",
                              "Present in GFKB_epilepsy v3 (Y/N)"])
    gfkb = pd.concat([gfkb.drop(columns=["Genome Size (Mb)"]).apply(lambda x: x.astype(str)), 
                      gfkb["Genome Size (Mb)"]], axis=1)

    tax_profiles = pd.DataFrame()
    
    # Dictionary containing barcode ID as key and a list with subject ID and bool if vegan as values
    barcode2subject_sample = {"barcode_01":["A", True],
                              "barcode_02":["B", True],
                              "barcode_03":["C", True],
                              "barcode_04":["D", True],
                              "barcode_05":["E", True],
                              "barcode_06":["A", False],
                              "barcode_07":["B", False],
                              "barcode_08":["C", False],
                              "barcode_09":["D", False],
                              "barcode_10":["E", False]}
    
    # Concatenate all taxonomic profiles of all barcodes to one dataframe
    tax_profiles = concat_tax_profiles(PATH, tax_profiles, barcode2subject_sample)
    
    return gfkb, tax_profiles

In [4]:
def concat_tax_profiles(PATH, tax_profiles, barcode2subject_sample):
    """Concatenate all taxonomic profiles of all given barcodes in the metaphlan output dir
    to one dataframe and reset the index on multi-index (subject, is_vegan).

    Keyword arguments:
    PATH -- The path which contains the Gut Feeling Knowledge Base and metaphlann output dir
    tax_profiles -- An empty pandas dataframe containing the column names in which all taxonomic
    profiles will be concatenated.
    barcode2subject_sample -- Dictionary containing barcode ID as key and a list with subject ID 
    and bool if sample is vegan as values
    
    Returns:
    tax_profiles -- A pandas dataframe containing the taxonomic profile per barcode for all barcodes"""
    
    for file in glob.glob(f"{PATH}/metaphlan_output/*.txt"):
        tax_profile = pd.read_csv(file, 
                                  comment="#", 
                                  sep="\t", 
                                  names=["clade_name", 
                                         "NCBI_tax_id", 
                                         "relative_abundance",
                                         "additional_species"])
        # Indexing
        barcode = file.split('metaphlan_output/')[1].split("_all")[0]
        tax_profile["subject"] = barcode2subject_sample[barcode][0]
        tax_profile["is_vegan"] = barcode2subject_sample[barcode][1]
        tax_profile = tax_profile.set_index([tax_profile.subject, tax_profile.is_vegan]).sort_index()
        
        # Splitting clade_name into taxonomic levels
        tax_profile = tax_profile.join(tax_profile["clade_name"].str.split('|', expand=True).rename(columns={0:'kingdom', 1:'phylum', 2:'class', 3:'order', 4:'family', 5:'genus', 6:'species'}))
        
        tax_profiles = pd.concat([tax_profiles,
                                  tax_profile])
    return tax_profiles

In [5]:
def select_on_tax_level(tax_profiles, tax_level = "species"):
    if tax_level == "kingdom":
        included = r"k__"
        not_included = r"p__"
    elif tax_level == "phylum":
        included = r"p__"
        not_included = r"c__"
    elif tax_level == "class":
        included = r"c__"
        not_included = r"o__"
    elif tax_level == "order":
        included = r"o__"
        not_included = r"f__"
    elif tax_level == "family":
        included = r"f__"
        not_included = r"g__"
    elif tax_level == "genus":
        included = r"g__"
        not_included = "s__"
    elif tax_level == "species":
        included = r"s__"
        not_included = r"x__"
    else:
        print("no valid tax level selected, returning original dataframe.")
        return tax_profiles
    
    return tax_profiles[(tax_profiles['clade_name'].str.contains(included, regex=True) == True) &
                        (tax_profiles['clade_name'].str.contains(not_included, regex=True) == False)]

### Recreating the plot from Zimmer et al. 2012
<img src="../microbiota_tax_data/zimmer_species_abundance_plot.png" alt="Zimmer et al. 2012" width="400"/>

In [6]:
def recreate_zimmer(tax_profiles):
    """recreate a comparison of taxa that have been routinely analysed by Zimmer et al. 2012
    Keyword arguments:
    tax_profiles -- A pandas datatax_profilesframe containing the taxonomic profile per barcode for all barcodes
    
    Returns:
    zimmer_subset -- A pandas dataframe containing a subset of taxa of interest of the taxonomic profile per barcode for all barcodes 
    dataframes containing only the taxa analyzed by zimmer et al. 2012 as values
    """
    # Subset for the Zimmer et al. 2012 bar plot
    bacteroides_subset = tax_profiles[tax_profiles['clade_name'].str.endswith("g__Bacteroides")]
    bifidobacteria_subset = tax_profiles[tax_profiles['clade_name'].str.endswith("g__Bifidobacterium")]
    ecoli_subset = tax_profiles[tax_profiles['clade_name'].str.endswith("s__Escherichia_coli")]
    enterobacter_subset = tax_profiles[tax_profiles['clade_name'].str.endswith("f__Enterobacteriaceae")]

    # Other taxa Zimmer et al. 2012 deemed of interest
    clostridium_subset = tax_profiles[tax_profiles['clade_name'].str.contains("Clostridium")]

    zimmer_subset = pd.concat([bacteroides_subset,
                               bifidobacteria_subset,
                               ecoli_subset,
                               enterobacter_subset]).sort_index()
    
    return zimmer_subset

In [7]:
def main():
    config = get_config()
    gfkb, tax_profiles = load_data_microbiota(config['microbiota_files_path'])

    zimmer_subset = recreate_zimmer(tax_profiles)
    with open("tmp.html",'w') as f:
        f.write(zimmer_subset.to_html(justify="left"))
    display(zimmer_subset)
    
if __name__ == '__main__':
    main()

Unnamed: 0_level_0,Unnamed: 1_level_0,clade_name,NCBI_tax_id,relative_abundance,additional_species,subject,is_vegan,kingdom,phylum,class,order,family,genus,species
subject,is_vegan,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
A,False,k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o_...,2|976|200643|171549|815|816,11.504069,,A,False,UNKNOWN,,,,,,
A,False,k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o_...,2|976|200643|171549|815|816,11.504069,,A,False,k__Bacteria,,,,,,
A,False,k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o_...,2|976|200643|171549|815|816,11.504069,,A,False,k__Viruses,,,,,,
A,False,k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o_...,2|976|200643|171549|815|816,11.504069,,A,False,k__Bacteria,p__Bacteroidetes,,,,,
A,False,k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o_...,2|976|200643|171549|815|816,11.504069,,A,False,k__Bacteria,p__Firmicutes,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
E,True,k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o_...,2|976|200643|171549|815|816,4.673976,,E,True,k__Bacteria,p__Bacteroidetes,c__Bacteroidia,o__Bacteroidales,,,
E,True,k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o_...,2|976|200643|171549|815|816,4.673976,,E,True,k__Bacteria,p__Bacteroidetes,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,,
E,True,k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o_...,2|976|200643|171549|815|816,4.673976,,E,True,k__Bacteria,p__Bacteroidetes,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Bacteroides,
E,True,k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o_...,2|976|200643|171549|815|816,4.673976,,E,True,k__Bacteria,p__Bacteroidetes,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Bacteroides,s__Bacteroides_vulgatus
