In [None]:
import helper
import pickle

In [None]:
# Fetch the embedding-related objects from the helper module in one call.
# NOTE(review): the exact structure of each returned object is defined in
# helper.get_embeddings and is not visible here.
(
    reindexed_embeddings,
    gene_lists,
    reference_node2index,
    reference_genes,
) = helper.get_embeddings()

In [None]:
# Load the OMIM annotation files for the reference genes, then derive the
# hold-out split and the three cross-validation folds from them.
(
    doid_prop_use,
    doid_slim_use,
    doid_to_slim,
    graph,
    filtered_doid_to_slim,
) = helper.load_annotations(
    reference_genes,
    gmt_direct='data/omim.20231030.direct.gmt',
    gmt_prop='data/omim.20231030.prop.gmt',
    slim='data/omim_slim.txt',
    obo='/doid.obo',
)
holdout_dict, cv_fold1_dict, cv_fold2_dict, cv_fold3_dict = helper.fold_split(
    doid_prop_use,
    doid_slim_use,
    doid_to_slim,
)

# Output paths and the split objects they receive, kept in matching order.
file_names = [
    "omim_cv_fold1_dict.pkl",
    "omim_cv_fold2_dict.pkl",
    "omim_cv_fold3_dict.pkl",
    "omim_holdout_dict.pkl",
]
data_dicts = [cv_fold1_dict, cv_fold2_dict, cv_fold3_dict, holdout_dict]

# Pickle every split to its own file in the current working directory.
for out_path, split in zip(file_names, data_dicts):
    with open(out_path, 'wb') as handle:
        pickle.dump(split, handle)

# Display the list of files that were written.
file_names

In [None]:
# Run the benchmark on the embeddings using the three CV folds and the
# hold-out split currently in scope.
all_fold_results, all_holdout_results = helper.run_benchmark(
    reindexed_embeddings,
    cv_fold1_dict,
    cv_fold2_dict,
    cv_fold3_dict,
    holdout_dict,
)

# Persist the raw results (fold results first, then hold-out) before reshaping.
for out_path, payload in (
    ('omim_all_fold_results.pkl', all_fold_results),
    ('omim_all_holdout_results.pkl', all_holdout_results),
):
    with open(out_path, 'wb') as handle:
        pickle.dump(payload, handle)

# Reshape the raw results into the summary and per-metric tables used by the
# plotting cell.
(
    df_summary_with_meta,
    fold_auc_df,
    fold_auprc_df,
    fold_pr10_df,
    holdout_auc_df,
    holdout_auprc_df,
    holdout_pr10_df,
    holdout_time_df,
) = helper.reformat_results(all_fold_results, all_holdout_results)

In [None]:
helper.plot_scatter(holdout_auc_df)
helper.anova(df_summary_with_meta)

# Draw one clustermap per metric table. Rows are reordered by descending
# row mean before plotting; `g` ends up holding the last clustermap.
for metric_df, title in (
    (fold_auc_df, 'OMIM Fold AUC'),
    (fold_auprc_df, 'OMIM Fold AUPRC'),
    (fold_pr10_df, 'OMIM Fold PR@10'),
    (holdout_auc_df, 'OMIM Holdout AUC'),
    (holdout_auprc_df, 'OMIM Holdout AUPRC'),
    (holdout_pr10_df, 'OMIM Holdout PR@10'),
):
    row_order = metric_df.mean(axis=1).sort_values(ascending=False).index
    g = helper.plot_slim_clustermap(
        df=metric_df.loc[row_order],
        graph=graph,
        filtered_annot_to_slim=filtered_doid_to_slim,
        title=title,
        legend=True,
    )


In [None]:
# Load the GO biological-process annotation files for the reference genes.
(
    go_prop_use,
    go_slim_use,
    go_to_slim,
    graph,
    filtered_go_to_slim,
) = helper.load_annotations(
    reference_genes,
    gmt_direct='/hsa_ALL_BP_direct.gmt',
    gmt_prop='/hsa_ALL_BP_prop.gmt',
    slim='/goslim_agr.tsv',
    obo='/go.obo',
)

# Build the hold-out split and CV folds from the GO annotations loaded above.
# (Previously this call was given the OMIM `doid_*` variables, so the files
# named "go_*" actually contained OMIM splits.)
holdout_dict, cv_fold1_dict, cv_fold2_dict, cv_fold3_dict = helper.fold_split(
    go_prop_use,
    go_slim_use,
    go_to_slim,
)

# Output paths and the split objects they receive, kept in matching order.
file_names = [
    "go_cv_fold1_dict.pkl",
    "go_cv_fold2_dict.pkl",
    "go_cv_fold3_dict.pkl",
    "go_holdout_dict.pkl",
]
data_dicts = [cv_fold1_dict, cv_fold2_dict, cv_fold3_dict, holdout_dict]

# Pickle every split to its own file in the current working directory.
for file_name, data_dict in zip(file_names, data_dicts):
    with open(file_name, 'wb') as f:
        pickle.dump(data_dict, f)

# Display the list of files that were written.
file_names

In [None]:
# Run the benchmark on the embeddings using the three CV folds and the
# hold-out split currently in scope.
all_fold_results, all_holdout_results = helper.run_benchmark(
    reindexed_embeddings,
    cv_fold1_dict,
    cv_fold2_dict,
    cv_fold3_dict,
    holdout_dict,
)

# Persist the raw results (fold results first, then hold-out) before reshaping.
for out_path, payload in (
    ('go_all_fold_results.pkl', all_fold_results),
    ('go_all_holdout_results.pkl', all_holdout_results),
):
    with open(out_path, 'wb') as handle:
        pickle.dump(payload, handle)

# Reshape the raw results into the summary and per-metric tables used by the
# plotting cell.
(
    df_summary_with_meta,
    fold_auc_df,
    fold_auprc_df,
    fold_pr10_df,
    holdout_auc_df,
    holdout_auprc_df,
    holdout_pr10_df,
    holdout_time_df,
) = helper.reformat_results(all_fold_results, all_holdout_results)

In [None]:
helper.plot_scatter(holdout_auc_df)
helper.anova(df_summary_with_meta)

# Draw one clustermap per metric table. Rows are reordered by descending
# row mean before plotting; `g` ends up holding the last clustermap.
# The slim mapping passed here is the GO one (`filtered_go_to_slim`) returned
# by the GO load_annotations call, so it matches the GO `graph`; the OMIM
# mapping (`filtered_doid_to_slim`) was previously passed by mistake.
for metric_df, title in (
    (fold_auc_df, 'GO Fold AUC'),
    (fold_auprc_df, 'GO Fold AUPRC'),
    (fold_pr10_df, 'GO Fold PR@10'),
    (holdout_auc_df, 'GO Holdout AUC'),
    (holdout_auprc_df, 'GO Holdout AUPRC'),
    (holdout_pr10_df, 'GO Holdout PR@10'),
):
    row_order = metric_df.mean(axis=1).sort_values(ascending=False).index
    g = helper.plot_slim_clustermap(
        df=metric_df.loc[row_order],
        graph=graph,
        filtered_annot_to_slim=filtered_go_to_slim,
        title=title,
        legend=True,
    )


In [None]:
# Load the 2020 GO biological-process annotation files for the reference genes.
# The GMT path literals previously ended with a stray '"' character inside the
# string, which made both file names invalid.
# NOTE(review): the prop path also lacked the leading '/' that the direct path
# has; it was added on the assumption both files live in the same directory —
# confirm against the actual data layout.
(
    go_prop_use,
    go_slim_use,
    go_to_slim,
    graph,
    filtered_go_to_slim,
) = helper.load_annotations(
    reference_genes,
    gmt_direct='/2020_hsa_ALL_BP_direct_fixed.gmt',
    gmt_prop='/2020_hsa_ALL_BP_prop_fixed.gmt',
    slim='/goslim_agr.tsv',
    obo='/go.obo',
)

# Build the hold-out split and CV folds from the GO 2020 annotations loaded
# above. (Previously this call was given the OMIM `doid_*` variables, so the
# files named "go2020_*" actually contained OMIM splits.)
holdout_dict, cv_fold1_dict, cv_fold2_dict, cv_fold3_dict = helper.fold_split(
    go_prop_use,
    go_slim_use,
    go_to_slim,
)

# Output paths and the split objects they receive, kept in matching order.
file_names = [
    "go2020_cv_fold1_dict.pkl",
    "go2020_cv_fold2_dict.pkl",
    "go2020_cv_fold3_dict.pkl",
    "go2020_holdout_dict.pkl",
]
data_dicts = [cv_fold1_dict, cv_fold2_dict, cv_fold3_dict, holdout_dict]

# Pickle every split to its own file in the current working directory.
for file_name, data_dict in zip(file_names, data_dicts):
    with open(file_name, 'wb') as f:
        pickle.dump(data_dict, f)

# Display the list of files that were written.
file_names

In [None]:
# Run the benchmark on the embeddings using the three CV folds and the
# hold-out split currently in scope.
all_fold_results, all_holdout_results = helper.run_benchmark(
    reindexed_embeddings,
    cv_fold1_dict,
    cv_fold2_dict,
    cv_fold3_dict,
    holdout_dict,
)

# Persist the raw results (fold results first, then hold-out) before reshaping.
for out_path, payload in (
    ('go2020_all_fold_results.pkl', all_fold_results),
    ('go2020_all_holdout_results.pkl', all_holdout_results),
):
    with open(out_path, 'wb') as handle:
        pickle.dump(payload, handle)

# Reshape the raw results into the summary and per-metric tables used by the
# plotting cell.
(
    df_summary_with_meta,
    fold_auc_df,
    fold_auprc_df,
    fold_pr10_df,
    holdout_auc_df,
    holdout_auprc_df,
    holdout_pr10_df,
    holdout_time_df,
) = helper.reformat_results(all_fold_results, all_holdout_results)

In [None]:
helper.plot_scatter(holdout_auc_df)
helper.anova(df_summary_with_meta)

# Draw one clustermap per metric table. Rows are reordered by descending
# row mean before plotting; `g` ends up holding the last clustermap.
# The slim mapping passed here is the GO one (`filtered_go_to_slim`) returned
# by the GO 2020 load_annotations call, so it matches the GO `graph`; the OMIM
# mapping (`filtered_doid_to_slim`) was previously passed by mistake.
for metric_df, title in (
    (fold_auc_df, 'GO 2020 Fold AUC'),
    (fold_auprc_df, 'GO 2020 Fold AUPRC'),
    (fold_pr10_df, 'GO 2020 Fold PR@10'),
    (holdout_auc_df, 'GO 2020 Holdout AUC'),
    (holdout_auprc_df, 'GO 2020 Holdout AUPRC'),
    (holdout_pr10_df, 'GO 2020 Holdout PR@10'),
):
    row_order = metric_df.mean(axis=1).sort_values(ascending=False).index
    g = helper.plot_slim_clustermap(
        df=metric_df.loc[row_order],
        graph=graph,
        filtered_annot_to_slim=filtered_go_to_slim,
        title=title,
        legend=True,
    )
