In [31]:
import os
import pandas as pd
import numpy as np
from scipy.stats import norm
from scipy.stats import chi2
import json
import sys
import math
import matplotlib.pyplot as plt

# Load the DB file
# df_db : Data frame of accumulated Experimental result information - Abundance
path_db = os.path.abspath('') + "/input/db_abundance.csv"
df_db = pd.read_csv(path_db)

# Load the Experiment result file
# df_exp : Data frame of Experimental result information - Abundance
path_exp = os.path.abspath('') + "/input/experiment_result_abundance.csv"
df_exp = pd.read_csv(path_exp)

# Load the merged Valencia output file
# df_valencia : Data frame of merged Valencia output
path_valencia = os.path.abspath('') + "/input/VALENCIA_output_merged.csv"
df_valencia = pd.read_csv(path_valencia)

# Insert data into DB - Merge the data frame df_db & df_exp

try: 
    df_db = pd.merge(df_db, df_exp, how='outer',on='taxa', suffixes=['', '_right']) 
    df_db = df_db.fillna(0)
    df_db = df_db.filter(regex='^(?!.*_right).*') # Eliminate duplicate columns

    df_db_rev = df_db.set_index(keys=['taxa'], inplace=False, drop=True)    
    df_db_rev.to_csv(path_db)
    
except:
    print("Check the Experiment result file")
    sys.exit()

    
# Delete the diversity, observed rows
if (list(df_exp['taxa'][0:2]) == ['diversity', 'observed']) & (list(df_db['taxa'][0:2]) == ['diversity', 'observed']):
    df_exp = df_exp.iloc[2:,:]
    df_db = df_db.iloc[2:,:]
else:
    print("Check the diversity & observed rows in the exp file or db file")
    sys.exit()


# Load the Phenotype-Microbiome file
# df_beta : Data frame of of Phenotype-Microbiome information
path_beta = os.path.abspath('') + "/input/phenotype_microbiome.xlsx"
df_beta = pd.read_excel(path_beta)
df_beta.rename(columns = {"Disease": "phenotype", "NCBI name": "ncbi_name", "MIrROR name": "microbiome", "Health sign": "beta", "subtract": "microbiome_subtract"}, inplace=True)
df_beta = df_beta[["phenotype", "ncbi_name", "microbiome", "beta","microbiome_subtract"]]
df_beta['beta'] = df_beta['beta'].replace({'증가': 1, '감소': -1})

li_new_sample_name = list(df_exp.columns)[1:]  
li_phenotype = list(dict.fromkeys(df_beta['phenotype']))

## Top 5 NCBI name 
li_phenotype_ncbi_name = []

for idx, row in df_beta.iterrows(): 
    if [row['phenotype'], row['ncbi_name']] not in li_phenotype_ncbi_name:
        li_phenotype_ncbi_name.append([row['phenotype'], row['ncbi_name']])

json_abundance = []

for i in range(len(li_new_sample_name)):
    for j in range(len(li_phenotype_ncbi_name)):
        
        condition_phen = (df_beta.phenotype == li_phenotype_ncbi_name[j][0]) & (df_beta.ncbi_name == li_phenotype_ncbi_name[j][1]) & (df_beta.beta == 1) 

        abundance = 0 
        for idx_beta, row_beta in df_beta[condition_phen].iterrows(): 
            if row_beta['microbiome'][:3] in ['s__', 'g__']:
                condition = (df_exp.taxa == row_beta['microbiome'])
                if len(df_exp[condition]) > 0:
                    abundance += df_exp[condition][li_new_sample_name[i]].values[0]

                    if (pd.isna(row_beta['microbiome_subtract']) is False):
                        li_micro_sub = row_beta['microbiome_subtract'].split('\n')
                        for micro_sub in li_micro_sub:
                            condition_sub = (df_exp.taxa == micro_sub)
                            if len(df_exp[condition_sub]) > 0:
                                 abundance -= df_exp[condition_sub][li_new_sample_name[i]].values[0]
                            
                json_abundance.append({"sample_name" : li_new_sample_name[i], "phenotype" : li_phenotype_ncbi_name[j][0], "ncbi_name" : li_phenotype_ncbi_name[j][1], "abundance" : abundance})
                
df_abundance = pd.DataFrame.from_dict(json_abundance)   






        

In [32]:
df_abundance = df_abundance.drop_duplicates(['sample_name', 'phenotype', 'ncbi_name'], keep='first')


In [33]:
df_abundance

Unnamed: 0,sample_name,phenotype,ncbi_name,abundance
0,20230116_BC05,Pelvic Inflammatory Diseases,Neisseria gonorrhoeae,0.000000
1,20230116_BC05,Pelvic Inflammatory Diseases,Chlamydia trachomatis,0.000000
3,20230116_BC05,Endometritis,Ureaplasma,0.000631
4,20230116_BC05,Endometritis,Dialister,0.000000
5,20230116_BC05,Endometritis,Bifidobacterium,0.000000
...,...,...,...,...
7937,20230215_BC08,Gestational Diabetes,Veillonella,0.000000
7938,20230215_BC08,Gestational Diabetes,Klebsiella,0.000000
7942,20230215_BC08,Gestational Diabetes,Escherichia/Shigella,0.000030
7943,20230215_BC08,Gestational Diabetes,Enterococcus,0.000000
