In [1]:
import pandas as pd
import os
import copy
from tqdm import tqdm
import json

In [2]:
## Parameters
# Prevalence cut-offs handed to prevalence_extractor: taxa with prevalence above
# bounds[0] are reported as "core", taxa between bounds[1] and bounds[0] as "sub-core".
bounds = [0.97, 0.8] 
# Upper prevalence limit for "peripheral" taxa; used by prevalence_extractor.
peri = 0.15
# Cities with fewer rows than this after the metadata merge are skipped.
MIN_SAMPLES = 12

In [3]:
# (abundance threshold, output sub-folder name) pairs. Values below the threshold are
# zeroed before prevalence is computed; the folder string is also appended to "0." to
# label the matching column of the supplementary table.
prevalence_list = [(0.1,"1"),(0.01,"01"),(0.001,"001"),(0.0001,"0001")]

In [4]:
# Input and output locations, expressed relative to the current working directory.
DATASET = os.path.join("..", "..", "data")
CHENNAI_DATASET = os.path.join(DATASET, "Chennai_data")
CORE_TAXA = os.path.join(DATASET, "MetaSUB_data")
RESULTS = os.path.join("..", "..", "results")


## Helper Functions

In [5]:
def prevalence_extractor(df, bounds=None, col_not_to_include=None, peri=None):
    """Compute per-taxon prevalence and split taxa into core / sub-core / peripheral.

    Prevalence of a taxon is the fraction of rows (samples) in which its value is
    strictly greater than zero.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample, one column per taxon (plus optional non-taxon columns).
        The frame is not modified.
    bounds : sequence of two floats
        ``[core_cutoff, sub_core_cutoff]``. Required; a missing or too-short value
        raises ``ValueError``.
    col_not_to_include : iterable, optional
        Column names to ignore (identifiers, metadata, ...).
    peri : float, optional
        Upper prevalence limit for peripheral taxa. When omitted, the module-level
        ``peri`` variable is used if it exists, otherwise 0.15.

    Returns
    -------
    tuple of four pandas.DataFrame
        ``(all_taxa, core, sub_core, peripheral)``; each has the columns
        ``Species`` and ``prevalence`` and is sorted by descending prevalence.

        * core:        prevalence >  bounds[0]
        * sub-core:    bounds[1] < prevalence <= bounds[0]
        * peripheral:  0.00001 < prevalence < peri

    Raises
    ------
    ValueError
        If ``bounds`` does not provide two cut-offs.
    """
    if bounds is None or len(bounds) < 2:
        raise ValueError(
            "bounds must be a pair [core_cutoff, sub_core_cutoff], e.g. [0.97, 0.8]"
        )
    core_cutoff, sub_core_cutoff = bounds[0], bounds[1]

    if peri is None:
        # Existing callers rely on the notebook-level `peri`; keep honouring it.
        peri = globals().get("peri", 0.15)

    excluded = tuple(col_not_to_include) if col_not_to_include is not None else ()
    taxa_list = [col for col in df.columns if col not in excluded]

    # Presence/absence is computed for all taxa at once instead of rewriting the
    # frame column by column (which also required a full copy of the input).
    prevalence = (df[taxa_list] > 0).sum() / df.shape[0]

    v = prevalence.rename_axis("Species").reset_index(name="prevalence")
    v = v.sort_values("prevalence", ascending=False)

    p = v["prevalence"]
    core_df = v[p > core_cutoff]
    # The upper edge is inclusive so that a taxon sitting exactly on bounds[0]
    # is classified as sub-core rather than silently dropped from both groups.
    sub_core_df = v[(p > sub_core_cutoff) & (p <= core_cutoff)]
    peripheral_df = v[(p < peri) & (p > 0.00001)]
    return v, core_df, sub_core_df, peripheral_df

def merger_function(taxa_df, metadata_df):
    """Attach sample metadata to the taxa table.

    Rows are matched with an inner join between the ``Samples`` column of
    ``taxa_df`` and the ``uuid`` column of ``metadata_df``; samples missing from
    either side are dropped. Neither input frame is modified.
    """
    # DataFrame.merge always builds a new frame, so the inputs stay untouched.
    return taxa_df.merge(
        metadata_df,
        how="inner",
        left_on="Samples",
        right_on="uuid",
    )

def filter_all_zero(df1, col_not_to_include=["Samples"]):
    """Return a copy of ``df1`` without the columns whose values sum to (almost) zero.

    Columns named in ``col_not_to_include`` are never examined or removed. A
    column is dropped when its sum is at most 0.000001, i.e. no reads were
    mapped to that species in any row. The input frame is left unchanged.
    """
    candidate_cols = [name for name in df1.columns if name not in col_not_to_include]
    column_totals = df1[candidate_cols].sum()

    # Species without any mapped read across all rows can be discarded.
    unmapped_species = column_totals[column_totals <= 0.000001].index
    return df1.drop(columns=list(unmapped_species))


In [6]:
## Load the Chennai bracken results, indexed by sample id
chennai_bracken_csv = os.path.join(CHENNAI_DATASET, 'bracken_species_non_human_processed.csv')
chennai_species_df = pd.read_csv(chennai_bracken_csv).set_index('Samples')

# Analysis
## Chennai analysis

In [7]:
## For every abundance threshold: classify taxa per city, store the per-city lists,
## find the Chennai core taxa that are neither core nor sub-core anywhere else, and
## collect the Chennai counts for the supplementary table.
supplementary_s3 = {"Species Category":["Core","Sub-core","Peripheral", "Unique-core"],}

## Non-taxon columns of the merged MetaSUB table; excluded from prevalence computation.
col_not_to_include = ["Samples","uuid",
                      "core_project",
                      "project",
                      "city",
                      "surface_material",
                      "continent",
                      "surface_ontology_fine",
                      "control_type",]


def _species_lists(core_df, sub_core_df, peripheral_df):
    """Pack the three category frames into the JSON-friendly per-city record."""
    return {
        'core': core_df["Species"].tolist(),
        'sub_core': sub_core_df["Species"].tolist(),
        'peripheral': peripheral_df["Species"].tolist(),
    }


## ROW (MetaSUB cities): the inputs do not depend on the threshold, so read them once.
METADATA = os.path.join(CORE_TAXA,"Filtered_complete_metadata.csv",)
metadata_df = pd.read_csv(METADATA)
metasub_species_raw = pd.read_csv(os.path.join(CORE_TAXA, 'Filtered_bracken_species_non_human_processed.csv'))
metasub_taxa_cols = [i for i in metasub_species_raw.columns if i != 'Samples']

for threshold,folder in prevalence_list:
    print(f"Working with Threshold : {threshold}")
    ## Chennai
    # Threshold a fresh frame each time. Overwriting chennai_species_df here would make
    # every later (smaller) threshold run on data already truncated by an earlier one,
    # so the results would depend on the order of prevalence_list and on re-runs.
    chennai_thresholded_df = chennai_species_df.mask(chennai_species_df < threshold, 0)
    chennai_thresholded_df = filter_all_zero(chennai_thresholded_df)
    full_prevalence_df, chennai_core_df, chennai_sub_core_df, chennai_peripheral_df = prevalence_extractor(
        chennai_thresholded_df, bounds=bounds, col_not_to_include=['Samples']
    )

    core_species_dict = {
        'Chennai': _species_lists(chennai_core_df, chennai_sub_core_df, chennai_peripheral_df)
    }

    ## ROW
    print('Core apply threshold')
    metasub_taxa = metasub_species_raw[metasub_taxa_cols]
    metasub_species = metasub_species_raw.copy()
    # mask() keeps NaN untouched, exactly like the previous element-wise comparison.
    metasub_species[metasub_taxa_cols] = metasub_taxa.mask(metasub_taxa < threshold, 0)
    del metasub_taxa
    print('Core filter all zero')
    metasub_species = filter_all_zero(metasub_species)
    print('Core merge metadata')
    metasub_species = merger_function(metasub_species, metadata_df)

    ## Filter the cities based on number of samples
    vc = metasub_species['city'].value_counts()
    filtered_vc = vc[vc >= MIN_SAMPLES]

    print("Iterate through each cities")
    for city in tqdm(filtered_vc.index):
        city_species_df = metasub_species.query('city == @city')
        _, _core_df, _sub_core_df, _peripheral_df = prevalence_extractor(
            city_species_df, bounds=bounds, col_not_to_include=col_not_to_include
        )
        core_species_dict[city] = _species_lists(_core_df, _sub_core_df, _peripheral_df)

    ## Make the directories
    RESULT_threshold = os.path.join(RESULTS, 'microbial_signatures', folder)
    os.makedirs(RESULT_threshold, exist_ok = True)
    with open(os.path.join(RESULT_threshold,'all_city_core.json'), 'w') as fp:
        json.dump(core_species_dict, fp)

    ## Get the unique core for Chennai
    city_of_int = 'Chennai'
    other_city = [i for i in core_species_dict if i != city_of_int]

    # Everything that is core or sub-core in at least one other city.
    all_other_city_core_subcore_set = set()
    for city in other_city:
        all_other_city_core_subcore_set.update(
            core_species_dict[city]["core"],
            core_species_dict[city]["sub_core"],
        )

    # Keep the prevalence-sorted order of the core list so the CSV is reproducible
    # (iterating a set difference gives an arbitrary order between runs).
    c_difference = [
        species
        for species in core_species_dict[city_of_int]["core"]
        if species not in all_other_city_core_subcore_set
    ]
    unique_taxas_dict = {city_of_int: c_difference}
    print(
        f" Total {city_of_int.capitalize()} core",
        len(core_species_dict[city_of_int]["core"]),
        "Where",
        len(c_difference),
        "are unique to the city",
    )
    temp = pd.DataFrame.from_dict({"Chennai_spl": unique_taxas_dict["Chennai"]})
    temp.to_csv(os.path.join(RESULT_threshold, "chennai_spl_core_considered_core_subcore.csv"), index=False)

    supplementary_s3[f"Relative abundance > 0.{folder}"] = [len(core_species_dict["Chennai"]["core"]),
                                                            len(core_species_dict["Chennai"]["sub_core"]),
                                                            len(core_species_dict["Chennai"]["peripheral"]),
                                                            len(unique_taxas_dict["Chennai"]),]

# The summary lives one level above the per-threshold folders; make sure it exists
# even if the loop above did not create it.
os.makedirs(os.path.join(RESULTS, 'microbial_signatures'), exist_ok=True)
pd.DataFrame.from_dict(supplementary_s3).to_csv(os.path.join(RESULTS, 'microbial_signatures', "supplementary_s3.csv"), index=False)
        

Working with Threshold : 0.0001
Core apply threshold
Core filter all zero
Core merge metadata
Iterate through each cities


100%|██████████| 31/31 [04:35<00:00,  8.88s/it]


 Total Chennai core 464 Where 11 are unique to the city
Working with Threshold : 0.001
Core apply threshold
Core filter all zero
Core merge metadata
Iterate through each cities


100%|██████████| 31/31 [02:12<00:00,  4.26s/it]


 Total Chennai core 27 Where 8 are unique to the city
Working with Threshold : 0.01
Core apply threshold
Core filter all zero
Core merge metadata
Iterate through each cities


100%|██████████| 31/31 [00:45<00:00,  1.47s/it]


 Total Chennai core 2 Where 0 are unique to the city
Working with Threshold : 0.1
Core apply threshold
Core filter all zero
Core merge metadata
Iterate through each cities


100%|██████████| 31/31 [00:11<00:00,  2.59it/s]

 Total Chennai core 0 Where 0 are unique to the city



