In [5]:
import pandas as pd
import plotly.express as px
from tqdm.notebook import tqdm
from sklearn.metrics import auc

In [16]:
# Disease identifiers, kept in two groups ("old" and "new") and then merged.
# Each identifier starts with the code that DISEASE_CODES maps it to.
DISEASE_NAMES_OLD = [
    "C0006142_Malignant_neoplasm_of_breast",
    "C0009402_Colorectal_Carcinoma",
    "C0023893_Liver_Cirrhosis_Experimental",
    "C0376358_Malignant_neoplasm_of_prostate",
    "C0036341_Schizophrenia",
]

DISEASE_NAMES_NEW = [
    "C0001973_Alcoholic_Intoxication_Chronic",
    "C0011581_Depressive_disorder",
    "C0860207_Drug_Induced_Liver_Disease",
    "C3714756_Intellectual_Disability",
    "C0005586_Bipolar_Disorder",
]

DISEASE_NAMES = [*DISEASE_NAMES_OLD, *DISEASE_NAMES_NEW]

# Full disease identifier -> bare code (used to build some file names).
DISEASE_CODES = {
    "C0006142_Malignant_neoplasm_of_breast": "C0006142",
    "C0009402_Colorectal_Carcinoma": "C0009402",
    "C0023893_Liver_Cirrhosis_Experimental": "C0023893",
    "C0036341_Schizophrenia": "C0036341",
    "C0376358_Malignant_neoplasm_of_prostate": "C0376358",
    "C0001973_Alcoholic_Intoxication_Chronic": "C0001973",
    "C0011581_Depressive_disorder": "C0011581",
    "C0860207_Drug_Induced_Liver_Disease": "C0860207",
    "C3714756_Intellectual_Disability": "C3714756",
    "C0005586_Bipolar_Disorder": "C0005586",
}

# Method groups. ALL_METHODS is the GUILD methods followed by the XAI methods;
# COMPARE_METHODS is the subset/superset that is actually compared downstream.
GUILD_METHODS = [
    "fFlow",
    "NetScore",
    "NetZcore",
    "NetShort",
    "NetCombo",
    "NetRank",
]
COMPARE_METHODS = [
    "XGDAG - GNNExplainer",
    "XGDAG - GraphSVX",
    "NIAPU",
    "DIAMOnD",
    "MCL",
    "RWR",
    "fFlow",
    "NetCombo",
    "NetRank",
]
XAI_METHODS = [
    "GNNExplainer",
    "XGDAG - GNNExplainer",
    "GraphSVX",
    "XGDAG - GraphSVX",
    "SubgraphX",
    "XGDAG - SubgraphX",
]
ALL_METHODS = [*GUILD_METHODS, *XAI_METHODS]

# Quick sanity summary of the configuration.
print('Total disease used:', len(DISEASE_NAMES))
print('Total methods used:', len(ALL_METHODS))

Total disease used: 10
Total methods used: 12


In [3]:
# Cutoffs applied to each ranking: despite the name, every value is used as the
# number of top-ranked candidate genes kept (ranking[:round(value)]).
ratios_to_validate = [
    25, 50, 100, 200, 500, 750,
    1000, 1500, 2000, 2500, 3000,
]

In [19]:
# Nested lookup holding the metric achieved by each method on each disease:
#   disease_method_metric_d[disease][method][metric] -> value
# Metrics are 'P', 'R' and 'F1'; every entry starts at 0.

disease_method_metric_d = {}

# Initialization
for disease in DISEASE_NAMES:
    metrics_by_method = disease_method_metric_d.setdefault(disease, {})
    for method in ALL_METHODS:
        metric_values = metrics_by_method.setdefault(method, {})
        for metric in ('P', 'R', 'F1'):
            metric_values[metric] = 0

In [22]:
def _precision_recall_f1(ranked_genes, positives, top_k):
    """Score the first ``top_k`` entries of a ranking against a set of positives.

    Parameters
    ----------
    ranked_genes : list
        Candidate genes in the order they should be cut.
    positives : set
        Genes counted as hits (in this cell: the held-out test seeds).
    top_k : int or float
        Cutoff; it is passed through ``round`` before slicing.

    Returns
    -------
    tuple
        ``(precision, recall, f1)`` where precision = hits / number of genes
        actually selected and recall = hits / ``len(positives)``. A ratio whose
        denominator is zero is reported as 0.0 instead of raising
        ``ZeroDivisionError``. ``f1`` is the integer 0 when precision + recall
        is 0.
    """
    selected = ranked_genes[:round(top_k)]
    hits = sum(1 for gene in selected if gene in positives)

    recall = hits / len(positives) if positives else 0.0
    precision = hits / len(selected) if selected else 0.0

    f1 = 0
    if (precision + recall) != 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    return precision, recall, f1


def _read_ranking_lines(path):
    """Return the lines of a text ranking file with the trailing newline removed."""
    with open(path, "r", encoding="utf-8") as ranking_file:
        return [line.strip("\n") for line in ranking_file]


def _load_method_ranking(method, disease_name, train_seeds):
    """Load the full (uncut) gene ranking produced by ``method`` for one disease.

    XAI methods and the remaining non-GUILD methods are read line by line from a
    text file and returned in file order. GUILD methods are read as a
    tab-separated (name, score) table, sorted by descending score, and stripped
    of ``train_seeds``.
    NOTE(review): only the GUILD branch removes training seeds; presumably the
    text rankings already exclude them - confirm against how they are produced.
    """
    if method in XAI_METHODS:
        method_tag = method.lower().replace("-", "_").replace(" ", "")
        return _read_ranking_lines(
            "Rankings/" + DISEASE_CODES[disease_name] + "_all_positives_new_ranking_" + method_tag + ".txt"
        )

    if method in GUILD_METHODS:
        guild_path = "Rankings/other_methods/GUILD/" + method + "/" + disease_name + "_" + method + ".txt"
        guild_scores = pd.read_csv(guild_path, header=None, sep="\t")
        guild_scores.columns = ["name", "score"]
        guild_scores = guild_scores.sort_values(by="score", ascending=False)
        guild_not_seeds = guild_scores[~guild_scores["name"].isin(train_seeds)]
        return guild_not_seeds["name"].values.tolist()

    return _read_ranking_lines(
        "Rankings/other_methods/" + method + "/" + method.lower() + "_output_" + disease_name + ".txt"
    )


# One list of AUC values per compared method; one value is appended per disease.
auc_scores = {method_name: [] for method_name in COMPARE_METHODS}

for DISEASE_NAME in tqdm(DISEASE_NAMES):
    # NIAPU-only curves (one point per cutoff).
    recall_folds = []
    precision_folds = []
    F1_folds = []

    # Per-method curves (one point per cutoff), keyed in COMPARE_METHODS order.
    recall_folds_compare_methods = {method_name: [] for method_name in COMPARE_METHODS}
    precision_folds_compare_methods = {method_name: [] for method_name in COMPARE_METHODS}
    F1_folds_compare_methods = {method_name: [] for method_name in COMPARE_METHODS}

    # ---- Load everything that does not depend on the cutoff, once per disease ----
    GENE_APU_SCORES_PATH = "Rankings/other_methods/NIAPU/" + DISEASE_NAME + "/" + DISEASE_NAME + "_ranking"
    TRAIN_SEEDS_PATH = "Datasets_v2/" + DISEASE_CODES[DISEASE_NAME] + "_seed_genes.txt"
    FILE_NAME_ALL_SEEDS = "Datasets_v2/all_seed_genes/" + DISEASE_NAME + "_all_seed_genes.txt"

    APU_scores_df = pd.read_csv(GENE_APU_SCORES_PATH, header=None, sep=" ")
    APU_scores_df.columns = ["name", "score", "label"]
    APU_ranking_df = APU_scores_df.sort_values(by="score", ascending=False)

    # Training seed genes: removed from the NIAPU and GUILD rankings below.
    # (Per the original note, 20% of the seed genes were held out to check
    # robustness; those held-out genes form the test set.)
    train_seeds_df = pd.read_csv(TRAIN_SEEDS_PATH, header=None, sep=" ")
    train_seeds_df.columns = ["name", "GDA Score"]
    train_seeds_list = train_seeds_df["name"].values.tolist()

    APU_ranking_df_not_seeds = APU_ranking_df[~APU_ranking_df['name'].isin(train_seeds_list)]
    APU_ranking_candidate_genes = APU_ranking_df_not_seeds["name"].values.tolist()

    # Test seeds = all seed genes minus the training seeds; these are the
    # positives every ranking is scored against.
    all_seed_genes_df = pd.read_csv(FILE_NAME_ALL_SEEDS, sep=" ", header=None)
    all_seed_genes = all_seed_genes_df[0].values
    test_seeds = list(set(all_seed_genes).difference(set(train_seeds_list)))
    test_seed_set = set(test_seeds)  # constant-time membership while counting hits

    N = len(all_seed_genes)

    # Full rankings of every non-NIAPU method, read once instead of per cutoff.
    method_rankings = {}
    for METHOD in COMPARE_METHODS:
        if METHOD != "NIAPU":
            method_rankings[METHOD] = _load_method_ranking(METHOD, DISEASE_NAME, train_seeds_list)

    # ---- Score every ranking at every cutoff ----
    for ratio_to_validate in ratios_to_validate:
        precision, recall, F1_score = _precision_recall_f1(
            APU_ranking_candidate_genes, test_seed_set, ratio_to_validate
        )

        recall_folds.append(recall)
        precision_folds.append(precision)
        F1_folds.append(F1_score)

        recall_folds_compare_methods["NIAPU"].append(recall)
        precision_folds_compare_methods["NIAPU"].append(precision)
        F1_folds_compare_methods["NIAPU"].append(F1_score)

        for METHOD in COMPARE_METHODS:
            if METHOD == "NIAPU":
                continue

            precision, recall, F1_score = _precision_recall_f1(
                method_rankings[METHOD], test_seed_set, ratio_to_validate
            )

            recall_folds_compare_methods[METHOD].append(recall)
            precision_folds_compare_methods[METHOD].append(precision)
            F1_folds_compare_methods[METHOD].append(F1_score)

    # Area under the precision-recall points collected above (x = recall, y = precision).
    for METHOD in COMPARE_METHODS:
        auc_score = auc(recall_folds_compare_methods[METHOD], precision_folds_compare_methods[METHOD])
        auc_scores[METHOD].append(auc_score)

    print('R')
    print(recall_folds_compare_methods)
    print('P')
    print(precision_folds_compare_methods)
    print('F1')
    print(F1_folds_compare_methods)
    print('AUC')
    print(auc_scores)
    # NOTE(review): this stops after the first disease, so auc_scores holds a
    # single value per method. Looks like a debugging leftover - confirm before
    # relying on auc_scores for all diseases.
    break



  0%|          | 0/10 [00:00<?, ?it/s]

R
{'XGDAG - GNNExplainer': [0.003930817610062893, 0.007075471698113208, 0.01474056603773585, 0.02790880503144654, 0.06269654088050315, 0.09099842767295598, 0.11556603773584906, 0.1611635220125786, 0.2034198113207547, 0.24371069182389937, 0.28714622641509435], 'XGDAG - GraphSVX': [0.003930817610062893, 0.007075471698113208, 0.01474056603773585, 0.028105345911949686, 0.06328616352201258, 0.09099842767295598, 0.11556603773584906, 0.1611635220125786, 0.20400943396226415, 0.24449685534591195, 0.29068396226415094], 'NIAPU': [0.0005896226415094339, 0.0025550314465408804, 0.009040880503144654, 0.021816037735849055, 0.0514937106918239, 0.08038522012578617, 0.10790094339622641, 0.1566430817610063, 0.19929245283018868, 0.24194182389937108, 0.27849842767295596], 'DIAMOnD': [0.0047169811320754715, 0.009237421383647798, 0.014544025157232705, 0.022602201257861634, 0.04559748427672956, 0.07114779874213836, 0.09040880503144653, 0.1324685534591195, 0.16705974842767296, 0.20931603773584906, 0.24665880503