In [52]:
def percentile_rank_interpolation(data, value):
    """
    Estimate the percentile rank of a value within non-normally distributed data.

    The data is sorted and ``value`` is located in it: the first exact match is
    used, otherwise the position of the nearest element.  The mid-rank
    percentile ``100 * (rank - 0.5) / n`` of that position is computed.  When
    it is not a whole number, the result is linearly interpolated between the
    data values found at the two surrounding whole-number percentiles.
    The result is always clamped to [5, 95] and rounded to one decimal place.

    Parameters:
    -----------
    data : array_like
        Input data. Must be 1-dimensional and non-empty.
    value : float
        Value for which to calculate the percentile rank.

    Returns:
    --------
    result : float
        Percentile rank of the given value in the dataset, within [5.0, 95.0].

    Raises:
    -------
    ValueError
        If ``data`` is empty.
    """
    # Sort the data in ascending order
    sorted_data = np.sort(data)
    n = len(sorted_data)
    if n == 0:
        raise ValueError("data must contain at least one element")

    # Index of the first exact match, or of the nearest value when there is none
    matches = np.where(sorted_data == value)[0]
    if len(matches) > 0:
        index = matches[0]
    else:
        index = np.abs(sorted_data - value).argmin()

    # Rank (1-based) and the mid-rank percentile corresponding to it
    rank = 1 + index
    percentile = float(100 * (rank - 0.5) / n)

    if percentile.is_integer():
        result = percentile
    else:
        # Interpolate between the adjacent whole-number percentiles
        lower_percentile = int(percentile)
        upper_percentile = lower_percentile + 1
        lower_rank = int(np.floor((lower_percentile / 100) * n))
        upper_rank = int(np.ceil((upper_percentile / 100) * n))

        # Clamp the positions into [0, n - 1]: a rank of 0 must map to the
        # smallest element, not wrap around to the largest one via index -1.
        lower_value = sorted_data[min(max(lower_rank - 1, 0), n - 1)]
        upper_value = sorted_data[min(max(upper_rank - 1, 0), n - 1)]

        if upper_value != lower_value:
            result = lower_percentile + (value - lower_value) / (upper_value - lower_value) * (upper_percentile - lower_percentile)
        else:
            result = lower_percentile

    # Outlier handling applies to every result, including whole-number percentiles
    result = min(max(float(result), 5.0), 95.0)

    # Round
    return round(result, 1)


In [53]:
import os
import pandas as pd
import numpy as np
from scipy.stats import norm
from scipy.stats import chi2
import json
import sys
import math
import matplotlib.pyplot as plt

# Load the DB file
# df_db : Data frame of accumulated Experimental result information - Abundance
path_db = os.path.abspath('') + "/input/db_abundance.csv"
df_db = pd.read_csv(path_db)

# Load the Experiment result file
# df_exp : Data frame of Experimental result information - Abundance
path_exp = os.path.abspath('') + "/input/experiment_result_abundance.csv"
df_exp = pd.read_csv(path_exp)

# Load the merged Valencia output file
# df_valencia : Data frame of merged Valencia output
path_valencia = os.path.abspath('') + "/input/VALENCIA_output_merged.csv"
df_valencia = pd.read_csv(path_valencia)

# Insert data into DB - Merge the data frame df_db & df_exp on 'taxa'.
# Columns present on both sides keep the DB version; the experiment copy gets
# the '_right' suffix and is dropped by the regex filter below.
try:
    df_db = pd.merge(df_db, df_exp, how='outer', on='taxa', suffixes=['', '_right'])
    df_db = df_db.fillna(0)  # cells missing after the outer merge become 0
    df_db = df_db.filter(regex='^(?!.*_right).*') # Eliminate duplicate columns

    # Write the merged table back to the DB file, using 'taxa' as the index column
    df_db_rev = df_db.set_index(keys=['taxa'], inplace=False, drop=True)
    df_db_rev.to_csv(path_db)

except Exception as exc:
    # Catch only real errors (not KeyboardInterrupt/SystemExit) and show the
    # underlying cause so the failure can actually be diagnosed.
    print("Check the Experiment result file")
    print(type(exc).__name__ + ": " + str(exc))
    sys.exit()


# Delete the diversity, observed rows
# Both tables are expected to start with these two summary rows; abort otherwise.
if list(df_exp['taxa'][0:2]) == ['diversity', 'observed'] and list(df_db['taxa'][0:2]) == ['diversity', 'observed']:
    # .copy() detaches the slices from the frames they were cut from, so the
    # in-place updates applied to df_exp afterwards do not act on a view.
    df_exp = df_exp.iloc[2:,:].copy()
    df_db = df_db.iloc[2:,:].copy()
else:
    print("Check the diversity & observed rows in the exp file or db file")
    sys.exit()


# Load the Phenotype-Microbiome file
# df_beta : Data frame of Phenotype-Microbiome information, restricted to the
#           columns used below and renamed to short snake_case names
path_beta = os.path.abspath('') + "/input/phenotype_microbiome.xlsx"
df_beta = pd.read_excel(path_beta).rename(
    columns={
        "Disease": "phenotype",
        "NCBI name": "ncbi_name",
        "MIrROR name": "microbiome",
        "Health sign": "beta",
        "subtract": "microbiome_subtract",
    }
)
df_beta = df_beta[["phenotype", "ncbi_name", "microbiome", "beta","microbiome_subtract"]]
# Encode the health sign as a numeric weight: '증가' -> 1, '감소' -> -1
df_beta = df_beta.assign(beta=df_beta['beta'].replace({'증가': 1, '감소': -1}))

# Sample names are every column of df_exp after the first one
li_new_sample_name = df_exp.columns[1:].tolist()
# Unique phenotypes, in order of first appearance
li_phenotype = list(dict.fromkeys(df_beta['phenotype']))

## Top 5 NCBI name
# Unique [phenotype, ncbi_name] pairs, in order of first appearance
li_phenotype_ncbi_name = [
    list(pair)
    for pair in dict.fromkeys(zip(df_beta['phenotype'], df_beta['ncbi_name']))
]

# Abundance per (sample, phenotype, ncbi_name):
#   + abundance of every beta == 1 taxon whose name starts with 's__' or 'g__'
#   - abundance of every taxon listed (one per line) in 'microbiome_subtract'
# Taxa that are absent from df_exp contribute nothing.

# The df_beta rows of a (phenotype, ncbi_name) pair do not depend on the
# sample, so select them once instead of once per sample.
li_pair_rows = [
    (phenotype, ncbi_name, df_beta[(df_beta.phenotype == phenotype) & (df_beta.ncbi_name == ncbi_name)])
    for phenotype, ncbi_name in li_phenotype_ncbi_name
]

json_abundance = []

for sample_name in li_new_sample_name:
    for phenotype, ncbi_name, df_pair in li_pair_rows:

        abundance = 0
        for idx_beta, row_beta in df_pair.iterrows():

            # Only rows with beta == 1 contribute to the abundance
            if not (row_beta['beta'] == 1):
                continue

            microbiome = row_beta['microbiome']
            # The isinstance guard keeps a missing (NaN) taxon name from raising on slicing
            if isinstance(microbiome, str) and microbiome[:3] in ['s__', 'g__']:
                condition = (df_exp.taxa == microbiome)
                if condition.any():
                    abundance += df_exp.loc[condition, sample_name].values[0]

            if pd.isna(row_beta['microbiome_subtract']) is False:
                for micro_sub in row_beta['microbiome_subtract'].split('\n'):
                    condition_sub = (df_exp.taxa == micro_sub)
                    if condition_sub.any():
                        abundance -= df_exp.loc[condition_sub, sample_name].values[0]

        json_abundance.append({"sample_name" : sample_name, "phenotype" : phenotype, "ncbi_name" : ncbi_name, "abundance" : abundance})

# Explicit columns keep the frame usable even when there are no records
df_abundance = pd.DataFrame(json_abundance, columns=["sample_name", "phenotype", "ncbi_name", "abundance"])

# Top five NCBI names by abundance for every (sample, phenotype) combination.
# The pieces are collected in a list and concatenated once: growing a frame
# with pd.concat inside the loop is quadratic and relies on concatenating
# onto an empty frame.
li_top_five = []

for sample_name in li_new_sample_name:
    for phenotype in li_phenotype:

        condition = (df_abundance.sample_name == sample_name) & (df_abundance.phenotype == phenotype)
        li_top_five.append(df_abundance[condition].sort_values(by=['abundance'], ascending=False).head(5))

if li_top_five:
    df_top_five = pd.concat(li_top_five)
else:
    df_top_five = pd.DataFrame(columns = ["sample_name", "phenotype", "ncbi_name","abundance"])

df_top_five = df_top_five.set_index(keys=['sample_name'], inplace=False, drop=True)
# NOTE(review): absolute, user-specific output path, unlike the inputs that are
# resolved relative to the working directory - confirm before running elsewhere.
df_top_five.to_excel('/home/kbkim/vaginal_microbiome/output/top5.xlsx')


# Subtract the abundance - df_exp
# For every df_beta row that lists taxa in 'microbiome_subtract' (one per line),
# the abundance of each listed taxon found in df_exp is subtracted, sample by
# sample, from the row(s) of the taxon named in 'microbiome'. df_exp is updated
# in place, in df_beta row order.

for _, beta_row in df_beta.iterrows():
    subtract_field = beta_row['microbiome_subtract']

    # Nothing to subtract for this row
    if pd.isna(subtract_field) is not False:
        continue

    target_mask = (df_exp.taxa == beta_row['microbiome'])

    for taxon_to_subtract in subtract_field.split('\n'):
        source_mask = (df_exp.taxa == taxon_to_subtract)

        # Listed taxon is not present in the experiment table
        if not source_mask.any():
            continue

        for sample_name in li_new_sample_name:
            df_exp.loc[target_mask, sample_name] -= df_exp.loc[source_mask, sample_name].values[0]

# Calculate the GRS
# li_phenotype : Phenotype list
# df_grs : Data frame of grs corresponding to specific phenotype and sample
#
# grs(phenotype, sample) = mean over the phenotype's df_beta rows of
#                          beta * ln(x + 1e-15),
# where x is the sample's abundance of the row's taxon (0 when the taxon is
# not present in df_exp). The 1e-15 offset keeps ln() defined for x == 0.

# Start from a float frame: the scores are floats, so an integer/object frame
# would need an implicit dtype change on the first assignment.
df_grs = pd.DataFrame(0.0, index = li_phenotype, columns = li_new_sample_name)

for phenotype in li_phenotype:
    # Rows of this phenotype do not depend on the sample; select them once
    df_beta_phen = df_beta[df_beta.phenotype == phenotype]

    for sample_name in li_new_sample_name:
        grs = 0

        for idx_beta, row_beta in df_beta_phen.iterrows():
            condition_micro = (df_exp.taxa == row_beta['microbiome'])

            # Abundance of the taxon in this sample; a missing taxon counts as 0
            if condition_micro.any():
                x_i = df_exp.loc[condition_micro, sample_name].values[0]
            else:
                x_i = 0

            # NOTE(review): math.log raises ValueError when x_i + 1e-15 <= 0;
            # presumably abundances stay non-negative after the subtraction
            # step - confirm against the input data.
            ln_x_i = math.log(x_i + 1e-15)
            grs += ln_x_i * row_beta['beta']

        grs /= len(df_beta_phen)
        df_grs.loc[phenotype, sample_name] = grs

In [54]:
# GRS of every sample for the 'Gestational Diabetes' phenotype, as a plain list
list(df_grs.loc['Gestational Diabetes'])

[-12.971991950728935,
 -15.141609839315965,
 -17.293292388448073,
 -12.93679703076642,
 -16.131875625679683,
 -13.927609517251785,
 -19.25159793871679,
 -14.2851507615931,
 -14.249138956454113,
 -14.959456311670051,
 -16.368265932671843,
 -16.366257452433757,
 -16.454928487226713,
 -15.745726332518766,
 -15.281595337494686,
 -15.161693517068215,
 -13.311184603386078,
 -15.620383265173864,
 -14.0641460491618,
 -15.724772568868167,
 -12.968723496623593,
 -16.6573996218846,
 -14.836930532187926,
 -12.08926642720251,
 -12.282922555352643,
 -14.143920225277325,
 -15.476141425350447,
 -15.002047236302126,
 -15.353401211956117,
 -13.168967229540744,
 -14.258193200079145,
 -12.931137110752053,
 -14.126115768879671,
 -15.837719215732427,
 -14.166088213674616,
 -16.35332511669916,
 -12.225736761939006,
 -12.965784292954192,
 -15.09550265891955,
 -15.934130625797444,
 -15.157828443639701,
 -13.843420229389972,
 -14.150335700018264,
 -12.870397119520339,
 -15.494686293878262,
 -18.424128637560734,

In [55]:
# Display the GRS table (rows: phenotypes, columns: samples)
df_grs

Unnamed: 0,20230116_BC05,20230116_BC02,20230116_BC03,20230116_BC12,20230116_BC21,20230116_BC07,20230116_BC11,20230116_BC01,20230116_BC04,20230116_BC06,...,20230215_BC05,20230215_BC02,20230215_BC03,20230215_BC10,20230215_BC07,20230215_BC11,20230215_BC01,20230215_BC04,20230215_BC06,20230215_BC08
Pelvic Inflammatory Diseases,-34.538776,-34.538776,-34.538776,-34.538776,-34.538776,-34.538776,-34.538776,-34.538776,-34.538776,-34.538776,...,-34.538776,-34.538776,-34.538776,-34.538776,-34.538776,-34.538776,-34.538776,-34.538776,-34.538776,-34.538776
Endometritis,-24.535127,-25.511043,-24.026066,-23.920989,-26.112207,-25.316118,-25.591302,-24.871322,-23.817224,-25.564766,...,-22.968149,-24.421896,-23.826575,-24.723929,-24.885049,-23.847495,-23.506767,-24.960792,-24.673163,-24.253436
Miscarriage,-19.007868,-25.906646,-21.529225,-34.538776,-28.467116,-28.381822,-26.908524,-21.219019,-19.834583,-28.328659,...,-21.880003,-28.135738,-18.65604,-28.094646,-19.959625,-21.633698,-21.13765,-28.17913,-19.807751,-18.78217
Ovarian Cancer,0.0,0.0,-4.110846,-5.304035,-4.316013,0.0,-5.328096,-8.810681,-0.667889,0.0,...,-5.153289,-4.231502,-4.907349,0.0,-4.034895,-5.440789,-5.646118,-0.298627,-4.121632,-4.809164
Cervical Intraepithelial Neoplasia,-6.813467,-13.817561,-10.270767,-25.710802,-9.110455,-10.73351,-4.630893,1.241988,3.464132,-15.755172,...,-5.624367,-12.466042,-0.93197,-17.204785,-5.010272,-5.579353,-4.630136,-9.144683,-1.867132,-1.501082
Preterm Prelabor Rupture of Membranes (PPROM),-22.837514,-30.708326,-15.094988,-21.625525,-23.801275,-26.758045,-16.832237,-18.272095,-13.587188,-22.778143,...,-13.451352,-23.644971,-13.896711,-23.429376,-23.395436,-17.482893,-21.263455,-23.686843,-23.751472,-23.000842
Dysmenorrhea (Menstrual pain),-16.478956,-18.817418,-17.726801,-21.916208,-20.640097,-19.366613,-13.859194,-13.838798,-13.50981,-19.300545,...,-11.900752,-18.10281,-11.003346,-18.122903,-14.737276,-14.570039,-17.027448,-18.186754,-13.82011,-13.881666
Adenomyosis,-30.834825,-31.650078,-31.555045,-33.79201,-33.864147,-32.293063,-33.805009,-33.853933,-32.475707,-30.847761,...,-31.052293,-31.706803,-31.693393,-33.830753,-31.043378,-30.980948,-29.428998,-31.109901,-32.466697,-31.757403
Vaginal Dryness,-11.382317,-18.596881,-8.031523,-13.872296,-9.129453,-13.907652,-8.991965,-14.66741,-11.295602,-6.511699,...,-2.598056,-9.250614,-10.627305,-12.610375,-8.952478,-9.590651,-14.458698,-12.706744,-18.973147,-12.55562
Gestational Diabetes,-12.971992,-15.14161,-17.293292,-12.936797,-16.131876,-13.92761,-19.251598,-14.285151,-14.249139,-14.959456,...,-17.125144,-13.114704,-16.579177,-16.16919,-15.200542,-16.925769,-14.25601,-15.209468,-15.209468,-14.22955


In [71]:
# Percentile rank of a GRS of -17 among the 'Cervical Intraepithelial Neoplasia' scores of all samples
percentile_rank_interpolation(list(df_grs.loc['Cervical Intraepithelial Neoplasia']), -17)

22.7