# Methods Comparisons on the OMIM Dataset

In [1]:
import pickle
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.metrics import auc
import plotly.graph_objects as go

from Paths import PATH_TO_DATASETS

In [3]:
# Diseases under study; each identifier is "<code>_<readable name>".
DISEASE_NAMES = [
    'C0006142_Malignant_neoplasm_of_breast',
    'C0009402_Colorectal_Carcinoma',
    'C0023893_Liver_Cirrhosis_Experimental',
]

# Map every disease identifier to its code, i.e. the part before the first underscore.
DISEASE_CODES = {disease_name: disease_name.split("_", 1)[0] for disease_name in DISEASE_NAMES}

# Method groups: they decide where each ranking file is looked up later on.
GUILD_METHODS   = []
COMPARE_METHODS = [
    "XGDAG - GNNExplainer",
    "XGDAG - GraphSVX",
    "NIAPU",
    "DIAMOnD",
]
XAI_METHODS     = [
    "GNNExplainer",
    "XGDAG - GNNExplainer",
    "GraphSVX",
    "XGDAG - GraphSVX",
    "SubgraphX",
    "XGDAG - SubgraphX",
]

# Cut-offs (number of top-ranked genes) at which the rankings are evaluated.
ratios_to_validate = [25, 50, 100, 200, 500, 750, 1000]

print('Total disease used:', len(DISEASE_NAMES))
print('Total methods used:', len(COMPARE_METHODS))

Total disease used: 3
Total methods used: 4


## Initialize Dictionary

In [4]:
# Nested dictionary holding the metrics achieved by each method on each disease:
#   disease_method_metric_d = {disease: {method: {metric: value(s)}}}
# Every metric ('P', 'R', 'F1', 'AUC') starts at 0 and is overwritten once the
# metrics are computed.

# Initialization: a fresh inner dict is built for every (disease, method) pair,
# so no two entries share the same object.
disease_method_metric_d = {
    disease: {
        method: {metric: 0 for metric in ['P', 'R', 'F1', 'AUC']}
        for method in COMPARE_METHODS
    }
    for disease in DISEASE_NAMES
}

## Compute Metrics

In [6]:
# Evaluate every method in COMPARE_METHODS on every disease in DISEASE_NAMES.
#
# For each (disease, method) pair the method's gene ranking is truncated at each
# cut-off in `ratios_to_validate`; precision, recall and F1 are computed against
# the held-out seed genes, and the area under the resulting precision-recall
# points is stored as 'AUC'.
#
# Results are written to:
#   - auc_scores[method]                       -> list with one AUC per disease
#   - disease_method_metric_d[disease][method] -> {'P': [...], 'R': [...], 'F1': [...], 'AUC': float}


def _read_ranking_file(path):
    """Return the lines of a ranking file (newline stripped), in file order.

    The caller treats the first lines as the top-ranked genes.
    """
    with open(path, "r", encoding="utf-8") as ranking_file:
        return [line.strip("\n") for line in ranking_file]


def _load_ranking(method, disease_name, train_seeds_list):
    """Load the full (untruncated) gene ranking produced by `method` for `disease_name`.

    Parameters
    ----------
    method : str
        One of the method names used in COMPARE_METHODS.
    disease_name : str
        Key of DISEASE_CODES.
    train_seeds_list : list
        Genes used as training seeds; they are removed from score-based
        rankings (NIAPU and GUILD) before the ranking is returned.

    Returns
    -------
    list
        Gene names ordered from best to worst. An empty list is returned when
        the DIAMOnD output file for the disease does not exist.
    """
    disease_code = DISEASE_CODES[disease_name]

    if method == "NIAPU":
        apu_scores_df = pd.read_csv(
            PATH_TO_DATASETS + "/Diamond_dataset/" + disease_code + "_diamond_ranking",
            header=None, sep=" ")
        apu_scores_df.columns = ["name", "score", "label"]
        apu_ranking_df = apu_scores_df.sort_values(by="score", ascending=False)
        apu_ranking_df_not_seeds = apu_ranking_df[~apu_ranking_df["name"].isin(train_seeds_list)]
        return apu_ranking_df_not_seeds["name"].values.tolist()

    if method in XAI_METHODS:
        # e.g. "XGDAG - GNNExplainer" -> "xgdag_gnnexplainer"
        method_tag = method.lower().replace("-", "_").replace(" ", "")
        return _read_ranking_file(
            "Rankings/" + disease_code + "_all_positives_diamond_" + method_tag + ".txt")

    if method in GUILD_METHODS:
        guild_scores_df = pd.read_csv(
            "Rankings/other_methods/GUILD/" + method + "/" + disease_name + "_" + method + ".txt",
            header=None, sep="\t")
        guild_scores_df.columns = ["name", "score"]
        guild_scores_df = guild_scores_df.sort_values(by="score", ascending=False)
        guild_not_seeds_df = guild_scores_df[~guild_scores_df["name"].isin(train_seeds_list)]
        return guild_not_seeds_df["name"].values.tolist()

    if method == 'DIAMOnD':
        try:
            return _read_ranking_file(
                'Rankings/other_methods/' + method + '/diamond_output_' + disease_code + '_diamond_new.txt')
        except FileNotFoundError:
            # Best effort: a missing file is reported and scored as an empty ranking.
            # Any other error (permissions, encoding, ...) is no longer silenced.
            print('[!] Disease not available in the DIAMOnD dataset')
            return []

    return _read_ranking_file(
        "Rankings/other_methods/" + method + "/" + method.lower() + "_output_" + disease_name + ".txt")


def _top_k_metrics(ranking, positives, k, normalize_names=False):
    """Compute precision, recall and F1 of the first `k` genes of `ranking`.

    Parameters
    ----------
    ranking : list
        Gene names ordered from best to worst.
    positives : set
        Genes counted as true positives when they appear in the top `k`.
    k : int or float
        Cut-off; `round(k)` entries are kept.
    normalize_names : bool
        When True, a gene that is not found verbatim is looked up again with
        '-' and '.' replaced by '_'. The previous code called
        `gene.replace(...)` without keeping the result, so this normalisation
        never took effect; using it only as a fallback can add matches but
        never removes one that was found before.

    Returns
    -------
    tuple
        (precision, recall, F1). Precision is 0.0 for an empty ranking and
        recall is 0.0 when `positives` is empty, instead of raising
        ZeroDivisionError.
    """
    top_k = ranking[:round(k)]

    TP = 0
    for gene in top_k:
        if gene in positives:
            TP += 1
        elif normalize_names and gene.replace('-', '_').replace('.', '_') in positives:
            TP += 1
    FP = len(top_k) - TP
    P = len(positives)  # all positives = TP + FN

    recall = TP / P if P else 0.0
    precision = TP / (TP + FP) if (TP + FP) else 0.0

    F1_score = 0
    if (precision + recall) != 0:
        F1_score = 2 * (precision * recall) / (precision + recall)

    return precision, recall, F1_score


auc_scores = {METHOD: [] for METHOD in COMPARE_METHODS}

# The list of genes available in the DIAMOnD dataset is the same for every
# disease, so it is read once.
diamond_all_genes_df = pd.read_csv(PATH_TO_DATASETS + '/Diamond_dataset/diamond_all_genes.txt', header=None)
diamond_all_genes = set(diamond_all_genes_df[0].values)

for DISEASE_NAME in tqdm(DISEASE_NAMES):
    # None of the inputs below depends on the cut-off, so they are loaded once
    # per disease rather than once per cut-off.

    # Seed genes used for diffusion, considered as the P class in this scenario
    # (20% of seed genes were removed to check for robustness).
    TRAIN_SEEDS_PATH = PATH_TO_DATASETS + "/Diamond_dataset/" + DISEASE_CODES[DISEASE_NAME] + "_seed_genes_scores_diamond.txt"
    train_seeds_df = pd.read_csv(TRAIN_SEEDS_PATH, header=None, sep=" ")
    train_seeds_df.columns = ["name", "GDA Score"]
    train_seeds_list = train_seeds_df["name"].values.tolist()

    # All known seed genes of the disease, restricted to the genes present in
    # the DIAMOnD dataset.
    FILE_NAME_ALL_SEEDS = PATH_TO_DATASETS + "/all_seed_genes/" + DISEASE_NAME + "_all_seed_genes.txt"
    all_seed_genes_df = pd.read_csv(FILE_NAME_ALL_SEEDS, sep=" ", header=None)
    all_seed_genes = set(all_seed_genes_df[0].values).intersection(diamond_all_genes)

    # Held-out seeds: the positives each ranking is expected to recover.
    # Kept as a set so that the membership tests are O(1).
    test_seeds = all_seed_genes.difference(train_seeds_list)

    for METHOD in COMPARE_METHODS:
        ranking_method = _load_ranking(METHOD, DISEASE_NAME, train_seeds_list)

        precision_folds = []
        recall_folds = []
        F1_folds = []
        for ratio_to_validate in ratios_to_validate:
            precision, recall, F1_score = _top_k_metrics(
                ranking_method,
                test_seeds,
                ratio_to_validate,
                # The NIAPU ranking was never passed through the name clean-up.
                normalize_names=(METHOD != "NIAPU"),
            )
            precision_folds.append(precision)
            recall_folds.append(recall)
            F1_folds.append(F1_score)

        # Area under the precision-recall points (one point per cut-off).
        auc_score = auc(recall_folds, precision_folds)
        auc_scores[METHOD].append(auc_score)

        disease_method_metric_d[DISEASE_NAME][METHOD]['P']      = precision_folds
        disease_method_metric_d[DISEASE_NAME][METHOD]['R']      = recall_folds
        disease_method_metric_d[DISEASE_NAME][METHOD]['F1']     = F1_folds
        disease_method_metric_d[DISEASE_NAME][METHOD]['AUC']    = auc_score



  0%|          | 0/3 [00:00<?, ?it/s]

## Line Plots

In [10]:
# LINEPLOTS
# One figure per disease: the selected metric of every method in COMPARE_METHODS
# plotted against the number of top-ranked genes considered.

# Named `image_format` rather than `format` so the builtin `format()` is not shadowed.
image_format = 'eps'
metric = 'F1'

metric_name_d = {
    'F1': 'F1 Score',
    'P': 'Precision',
    'R': 'Recall'
}

colors  = ['skyblue', 'tomato', 'brown', 'orange', 'green', 'seagreen']
markers = ['circle', 'square', 'cross', 'diamond', 'x', 'triangle-up', 'triangle-down', 'star']

for DISEASE_NAME in DISEASE_NAMES:
    print(DISEASE_NAME)

    fig = go.Figure()

    # A method and its "XGDAG - " variant share a colour; every other method
    # gets its own. Previously the colour index advanced only after an XGDAG
    # method, so unrelated methods listed after the last XGDAG entry were all
    # drawn in the same colour.
    family_color_idx = {}
    for marker_idx, METHOD in enumerate(COMPARE_METHODS):
        family = METHOD.replace('XGDAG - ', '')
        color_idx = family_color_idx.setdefault(family, len(family_color_idx))

        fig.add_trace(go.Scatter(
            y = disease_method_metric_d[DISEASE_NAME][METHOD][metric],
            # Cut-offs are passed as strings so they are evenly spaced on a categorical axis.
            x = [str(x) for x in ratios_to_validate],
            name = METHOD,
            line = dict(
                # Wrap around instead of raising IndexError when there are more
                # method families than colours.
                color = colors[color_idx % len(colors)],
                dash = 'solid'
            ),
            marker = dict(
                symbol = markers[marker_idx % len(markers)]
            )
        ))

    fig.update_layout(xaxis_title='Top K Genes',
                   yaxis_title=metric_name_d[metric],
                   template='plotly_white')

    fig.show()
    # uncomment to save images (also requires `import plotly.io as pio`)
    # pio.write_image(fig, 'Images/XAI/'+DISEASE_NAME+'_'+metric+'.'+image_format, scale=1)

C0006142_Malignant_neoplasm_of_breast


C0009402_Colorectal_Carcinoma


C0023893_Liver_Cirrhosis_Experimental


## Save Dictionary

In [None]:
# Persist the metrics dictionary to disk with pickle, using the highest
# protocol available in the running interpreter.
with open('diamond_disease_method_metrics.pickle', 'wb') as metrics_file:
    pickle.dump(
        disease_method_metric_d,
        metrics_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )