In [1]:
import yaml
from glob import glob
import pandas as pd
from IPython.display import display
from tqdm import tqdm
import yaml
import os
from ete3 import NCBITaxa
import numpy as np

In [2]:
def get_config():
    """Read the notebook settings from ``config.yaml`` in the working directory.

    Returns:
        The object produced by ``yaml.safe_load`` for that file.
    """
    with open("config.yaml", 'r') as handle:
        return yaml.safe_load(handle)

config = get_config()
# Load the merged BLAST table (tab-separated) and index it by (subject, is_vegan)
# so later cells can select rows per subject and per diet condition.
merged_blast_df = pd.read_csv(
    config["molded_merged_blast_output"], sep="\t"
).set_index(["subject", "is_vegan"])
# Cell result: number of rows in the "Tax ID" column.
len(merged_blast_df["Tax ID"])

790471

In [3]:
# Shared NCBITaxa handle; created lazily so that defining the function stays cheap.
_NCBI_TAXA = None


def _get_ncbi_taxa():
    """Return one shared NCBITaxa handle, creating it on first use."""
    global _NCBI_TAXA
    if _NCBI_TAXA is None:
        _NCBI_TAXA = NCBITaxa()
    return _NCBI_TAXA


def get_desired_ranks(taxid, ncbi=None):
    """ Author: Andre de la Rambelje
    Map a taxon ID to the taxon IDs of its ancestors at the main ranks.

    Keyword arguments
    taxid -- the taxon ID whose lineage is looked up
    ncbi -- optional object exposing ``get_lineage`` and ``get_rank``
            (e.g. an existing ``NCBITaxa``); when omitted, a single shared
            ``NCBITaxa`` instance is reused instead of building a new one
            on every call

    Returns a dict with the keys ``kingdom_id``, ``phylum_id``, ``class_id``,
    ``order_id``, ``family_id``, ``genus_id`` and ``species_id``. A rank that
    does not occur in the lineage is reported as 0.
    """
    desired_ranks = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
    if ncbi is None:
        # This function is called once per BLAST hit; re-creating the handle
        # each time is wasted work, so reuse the shared one.
        ncbi = _get_ncbi_taxa()
    lineage = ncbi.get_lineage(taxid)
    lineage2ranks = ncbi.get_rank(lineage)
    # Invert {taxid: rank} into {rank: taxid}; if a rank name repeats in the
    # lineage the later entry wins, as with any dict built from pairs.
    ranks2lineage = {rank: lineage_id for lineage_id, rank in lineage2ranks.items()}

    return {'{}_id'.format(rank): ranks2lineage.get(rank, 0) for rank in desired_ranks}

def make_taxa_df(taxids):
    """Build a table of rank-level taxon IDs, one row per entry of ``taxids``.

    Keyword arguments
    taxids -- iterable of taxon IDs; each entry is converted with ``int``

    Returns a DataFrame with the int64 columns ``kingdom_id``, ``phylum_id``,
    ``class_id``, ``order_id``, ``family_id``, ``genus_id`` and ``species_id``
    and a fresh 0..n-1 index. Empty input gives an empty frame with the same
    columns.
    """
    order_taxa = ['kingdom_id', 'phylum_id', 'class_id', 'order_id', 'family_id', 'genus_id', 'species_id']
    # Many hits share a taxon, so resolve each distinct ID only once.
    ranks_by_taxid = {}
    records = []
    for taxid in taxids:
        key = int(taxid)
        if key not in ranks_by_taxid:
            ranks_by_taxid[key] = get_desired_ranks(key)
        records.append(ranks_by_taxid[key])
    # Build the frame in one go: growing it row by row is quadratic, and
    # DataFrame.append no longer exists in pandas 2.x.
    return pd.DataFrame(records, columns=order_taxa).astype(np.int64)

In [4]:
# Preview the vegan-condition rows of subject "A".
display(
    merged_blast_df.loc[merged_blast_df.index.get_level_values("is_vegan") == True].loc["A"]
)

Unnamed: 0_level_0,Read ID,Scientific Name,Tax ID,e-value,bitscore,length,qcov
is_vegan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
True,1,Prevotella copri,165179,6.610000e-157,566.0,449,76
True,2,Bacteroides uniformis,820,3.620000e-128,470.0,292,72
True,3,uncultured organism,155900,0.000000e+00,678.0,458,14
True,4,Alistipes sp. 5CBH24,2585118,0.000000e+00,900.0,716,35
True,6,Lachnospiraceae bacterium GAM79,2109691,5.710000e-29,139.0,104,47
...,...,...,...,...,...,...,...
True,100185,Bacteroides salanitronis DSM 18170,667015,6.920000e-100,375.0,230,65
True,100186,Callithrix jacchus,9483,3.640000e-13,89.8,48,4
True,100187,Bacteroides dorei,357276,0.000000e+00,1155.0,741,87
True,100189,synthetic construct,32630,6.000000e-15,95.3,67,7


In [5]:
def get_subject_or_pooled_condition_data(is_vegan, merged_blast_df,subject="C"):
    """Build the rank-level taxa table for one subject, or for a whole condition.

    Keyword arguments
    is_vegan -- value matched against the "is_vegan" index level
    merged_blast_df -- BLAST table indexed by ("subject", "is_vegan") with a
                       "Tax ID" column
    subject -- subject label to select; ``None`` keeps every subject of the
               condition (pooled)

    Returns the DataFrame produced by ``make_taxa_df`` for the selected rows.
    """
    print(subject)
    condition_rows = merged_blast_df[merged_blast_df.index.get_level_values("is_vegan") == is_vegan]
    if subject is not None:
        condition_rows = condition_rows.loc[subject]
    # A hit may list several IDs separated by ";" -- keep the first one.
    # Cast to str first so the .str accessor also works when pandas parsed
    # the column (or part of it) as numbers.
    tax_ids = condition_rows["Tax ID"].astype(str).str.split(";").str[0]

    return make_taxa_df(tax_ids)

# Control (is_vegan=False) tables: subjects A-E in order, then the pooled set (subject=None).
(a_control, b_control, c_control,
 d_control, e_control, pooled_control) = (
    get_subject_or_pooled_condition_data(is_vegan=False, merged_blast_df=merged_blast_df, subject=label)
    for label in ("A", "B", "C", "D", "E", None)
)
# Vegan (is_vegan=True) tables, built in the same subject order.
(a_vegan, b_vegan, c_vegan,
 d_vegan, e_vegan, pooled_vegan) = (
    get_subject_or_pooled_condition_data(is_vegan=True, merged_blast_df=merged_blast_df, subject=label)
    for label in ("A", "B", "C", "D", "E", None)
)

A




B




C
D
E
None
A




B
C
D
E
None


In [19]:
# Order matters: merge_output treats the first five entries as control
# subjects A-E and the remaining entries as vegan subjects A-E.
list_for_taxafiles = [
    a_control, b_control, c_control, d_control, e_control,
    a_vegan, b_vegan, c_vegan, d_vegan, e_vegan,
]
def merge_output(list_for_taxafiles, output_path):
    """Label the per-subject taxa tables, stack them and write the result.

    Keyword arguments
    list_for_taxafiles -- DataFrames ordered as five control tables
                          (subjects A-E) followed by the vegan tables
                          (subjects A-E)
    output_path -- destination of the tab-separated output file

    Each input frame gets "subject" and "is_vegan" columns set in place.
    Returns the concatenated DataFrame that was written to ``output_path``.
    """
    subjects = ["A","B","C","D","E"]
    # (is_vegan flag, frames belonging to that condition)
    groups = (
        (False, list_for_taxafiles[:5]),
        (True, list_for_taxafiles[5:]),
    )
    labelled = []
    for vegan_flag, frames in groups:
        for position, frame in tqdm(enumerate(frames)):
            frame.loc[:, "subject"] = subjects[position]
            frame.loc[:, "is_vegan"] = vegan_flag
            labelled.append(frame)
    merged_taxa = pd.concat(labelled)
    merged_taxa.to_csv(output_path, sep="\t")
    return merged_taxa
# Write the per-subject table (control and vegan, subjects A-E).
merge_output(list_for_taxafiles, output_path="/home/madelarambelje/data/subjects_taxa_profile.txt")
# Label the pooled tables before stacking them. The vegan pool has to be
# flagged True; with both pools set to False the two conditions cannot be
# told apart in the concatenated output.
pooled_control.loc[:,"is_vegan"] = False
pooled_vegan.loc[:, "is_vegan"] = True
pooled_taxa = pd.concat([pooled_control, pooled_vegan])
pooled_taxa.to_csv("/home/madelarambelje/data/pooled_taxa.txt", sep="\t")
#get_count_per_level(merged_blast_df=merged_blast_df, is_vegan= True, subject="A")

5it [00:00, 400.60it/s]
5it [00:00, 335.39it/s]
