# Import libraries & Load Data

In [None]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np



In [None]:
# Output directory for the figures written by the plotting cells.
# NOTE(review): the value ends in "/" and the savefig calls in this notebook
# join it with another "/", so the generated paths contain "//". The value is
# left as-is because other consumers of this variable may rely on it.
plots_path = "../../plots/initial_run/"

In [None]:
# Directory holding the MultiQC export; the parsed report is kept in
# `multiqc_data` for the cells below.
multiqc_files_path = "../../data/multiqc_data"
# Open through a context manager so the file handle is closed as soon as the
# JSON is parsed (the bare open() call left it to the garbage collector), and
# read it as UTF-8 rather than with the platform's locale encoding.
with open(f"{multiqc_files_path}/multiqc_data.json", encoding="utf-8") as multiqc_json:
    multiqc_data = json.load(multiqc_json)

In [None]:
# One row per key of the saved 'multiqc_input' raw-data section.
raw_input_section = multiqc_data['report_saved_raw_data']['multiqc_input']
multiqc_input_df = pd.DataFrame.from_dict(raw_input_section, orient='index')

# 'report_general_stats_data' is iterated as a sequence of mappings; flatten
# them into a single mapping (a key seen again later overwrites the earlier
# value) and build one row per key.
general_stats_by_key = {}
for stats_section in multiqc_data['report_general_stats_data']:
    for entry_key, entry_stats in stats_section.items():
        general_stats_by_key[entry_key] = entry_stats

multiqc_general_stats_df = pd.DataFrame.from_dict(general_stats_by_key, orient='index')

In [None]:
# Add the 'Not in network' values to multiqc_general_stats_df as the column
# 'seeds_not_in_network'.
not_in_network = multiqc_input_df['Not in network'].rename('seeds_not_in_network')

# The lookup key is the general-stats index with its last dot-separated field
# removed (presumably the algorithm suffix -- TODO confirm against the report).
input_keys = pd.Series(
    multiqc_general_stats_df.index.str.rsplit('.', n=1).str[0],
    index=multiqc_general_stats_df.index,
)

# Look the values up with map() instead of merge(): merging on an array-like
# `left_on` together with `right_index=True` adds an extra `key_0` column to
# the result. map() keeps the row order and index untouched and, like the
# former left join, leaves NaN where a key has no match.
multiqc_general_stats_df['seeds_not_in_network'] = input_keys.map(not_in_network)

In [None]:
# Display the 'Not in network' lookup series to inspect its index and values.
not_in_network

In [None]:
# Display the general-stats table after 'seeds_not_in_network' has been added.
multiqc_general_stats_df

In [None]:
# Turn the dotted identifier stored in the index into one column per field.
# The field names are defined once and reused for the split and the reorder.
run_key_cols = ['sample', 'ppi', 'reviewed_proteins_exp', 'namespace', 'algorithm']

# 1. bring the index into a column (an unnamed index becomes the column 'index')
multiqc_general_stats_df.reset_index(inplace=True)

# 2. split into one column per field; splitting from the right keeps any
#    additional dots inside the leading (sample) field intact
multiqc_general_stats_df[run_key_cols] = \
    multiqc_general_stats_df['index'].str.rsplit('.', n=len(run_key_cols) - 1, expand=True)

# 3. drop the temporary index column and move the key columns to the front
#    (this reorder used to be executed twice; once is sufficient)
multiqc_general_stats_df.drop(columns='index', inplace=True)
multiqc_general_stats_df = multiqc_general_stats_df[
    run_key_cols + [c for c in multiqc_general_stats_df.columns if c not in run_key_cols]
]

In [None]:
# Names of the six dot-separated fields encoded in each row identifier.
prio_key_cols = [
    'sample', 'ppi', 'reviewed_proteins_exp',
    'namespace', 'algorithm', 'prioritization_algorithm',
]

# Build one row per identifier and expose the identifier as the column 'index'.
multiqc_prioritizationevaluation_df = pd.DataFrame.from_dict(
    multiqc_data['report_saved_raw_data']['multiqc_prioritizationevaluation'],
    orient='index',
).reset_index()

# Split the identifier from the right into its six fields.
multiqc_prioritizationevaluation_df[prio_key_cols] = (
    multiqc_prioritizationevaluation_df['index'].str.rsplit('.', n=5, expand=True)
)

# Leave out the helper 'index' column, put the key columns first and shorten
# the names of the two empirical p-value columns.
prio_metric_cols = [
    col for col in multiqc_prioritizationevaluation_df.columns
    if col != 'index' and col not in prio_key_cols
]
multiqc_prioritizationevaluation_df = (
    multiqc_prioritizationevaluation_df[prio_key_cols + prio_metric_cols]
    .rename(columns={
        'empirical_p_value_without_considering_ranks': 'p_value_without_ranks',
        'empirical_DCG_based_p_value': 'p_value_DCG',
    })
)

## Preview Data

In [None]:
# First rows of the table built from the 'multiqc_input' raw-data section.
multiqc_input_df.head()

In [None]:
# First rows of the general-stats table, key columns first.
multiqc_general_stats_df.head()

In [None]:
# First rows of the prioritization-evaluation table, key columns first.
multiqc_prioritizationevaluation_df.head()

## Combine Data

In [None]:
# Key columns present in both tables; the evaluation table additionally
# carries 'prioritization_algorithm'.
shared_key_cols = ['sample', 'ppi', 'reviewed_proteins_exp', 'namespace', 'algorithm']
leading_cols = shared_key_cols + ['prioritization_algorithm']

# Inner join: only key combinations found in both tables are kept.
multiqc_combined_df = pd.merge(
    multiqc_general_stats_df,
    multiqc_prioritizationevaluation_df,
    how='inner',
    on=shared_key_cols,
)

# Put the key columns first, with 'prioritization_algorithm' right after 'algorithm'.
trailing_cols = [col for col in multiqc_combined_df.columns if col not in leading_cols]
multiqc_combined_df = multiqc_combined_df[leading_cols + trailing_cols]

# Show the first rows of the merged table.
multiqc_combined_df.head()

In [None]:
## Load the NedrexDB tables
nedrex_db_path = "../../data/nedrexDB"

# Table names double as CSV file stems; they are read in this order.
nedrex_table_names = (
    "disorder",
    "drug",
    "drug_has_indication",
    "drug_has_target",
    "gene",
    "gene_associated_with_disorder",
)
(
    disorder_df,
    drug_df,
    drug_has_indication_df,
    drug_has_target_df,
    gene_df,
    gene_associated_with_disorder_df,
) = [pd.read_csv(f"{nedrex_db_path}/{table_name}.csv") for table_name in nedrex_table_names]

# Print the header and the first row of every table as a quick schema check.
for nedrex_frame in (
    disorder_df,
    drug_df,
    drug_has_indication_df,
    drug_has_target_df,
    gene_df,
    gene_associated_with_disorder_df,
):
    print(nedrex_frame.head(1))

In [None]:
# Add drug-count columns to the combined table.
# assumes `sample` holds disorder identifiers in the same format as
# `targetDomainId` in drug_has_indication.csv -- TODO confirm.

# Number of distinct drugs indicated for each disorder.
drug_counts = drug_has_indication_df.groupby('targetDomainId')['sourceDomainId'].nunique()

# Drugs whose `drugGroups` entry contains the standalone token 'approved'.
# - astype(str) makes missing cells harmless: the previous `'approved' in grp`
#   raised TypeError on NaN.
# - The word-boundary pattern does not fire inside longer tokens, whereas the
#   previous substring test did.
# NOTE(review): if the export contains a group such as 'vet_approved', drugs
# carrying only that group are no longer counted as approved -- confirm this
# is the intended definition.
is_approved = drug_df['drugGroups'].astype(str).str.contains(r'\bapproved\b', regex=True)
approved_drugs = drug_df.loc[is_approved, 'primaryDomainId']

# Number of distinct approved drugs indicated for each disorder.
approved_counts = (
    drug_has_indication_df[drug_has_indication_df['sourceDomainId'].isin(approved_drugs)]
    .groupby('targetDomainId')['sourceDomainId']
    .nunique()
)

# Map the counts onto the combined table; disorders without any entry get 0.
multiqc_combined_df['num_drugs'] = (
    multiqc_combined_df['sample'].map(drug_counts).fillna(0).astype(int)
)
multiqc_combined_df['num_approved_drugs'] = (
    multiqc_combined_df['sample'].map(approved_counts).fillna(0).astype(int)
)


In [None]:
# First rows of the combined table, now including the drug-count columns.
multiqc_combined_df.head()

# Analysis
## Prioritization Algorithms

In [None]:
# Correlation heatmap of the numeric metrics.

# Columns left out of the correlation analysis.
drop_cols = ['dcg_exceed_count', 'observed_DCG', 'observed_overlap', 'overlap_exceed_count']

# Numeric columns only, minus the excluded ones, restricted to the rows whose
# DCG p-value is below 1.
has_dcg_p_below_one = multiqc_combined_df['p_value_DCG'] < 1
num_df = multiqc_combined_df.select_dtypes(include='number').drop(columns=drop_cols)
num_df = num_df[has_dcg_p_below_one]

# Pairwise correlation between the remaining columns.
corr = num_df.corr()
labels = corr.columns
n_features = len(labels)

# Heatmap with a diverging colormap fixed to the range [-1, 1].
fig, ax = plt.subplots(figsize=(14, 12))
cax = ax.imshow(corr, cmap='RdBu_r', vmin=-1, vmax=1)

cb = fig.colorbar(cax, ax=ax, fraction=0.046, pad=0.04)
cb.set_label('Pearson correlation', rotation=270, labelpad=15)

# Major ticks carry the feature names.
tick_positions = np.arange(n_features)
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(labels, rotation=45, ha='right')
ax.set_yticklabels(labels)

# Minor ticks sit on the cell borders; their grid draws the cell outlines.
cell_edges = np.arange(-.5, n_features, 1)
ax.set_xticks(cell_edges, minor=True)
ax.set_yticks(cell_edges, minor=True)
ax.grid(which='minor', color='black', linewidth=1)
ax.grid(False, which='major')
ax.tick_params(which='minor', bottom=False, left=False)

# Write the value into every cell; white text on strongly colored cells.
for i, j in np.ndindex(n_features, n_features):
    cell_value = corr.iat[i, j]
    text_color = 'white' if abs(cell_value) > 0.5 else 'black'
    ax.text(j, i, f"{cell_value:.2f}",
            ha='center', va='center', fontsize=8, color=text_color)

ax.set_title('Correlation Heatmap of Numeric Metrics', pad=20)
ax.set_xlabel('Numeric Features')
ax.set_ylabel('Numeric Features')

plt.tight_layout()
plt.savefig(f"{plots_path}/prioritization_evaluation_correlation_heatmap.pdf", bbox_inches='tight')
plt.show()

In [None]:
# --- 1) Overall KDE of p_value_DCG ---
fig, ax = plt.subplots(figsize=(8, 6))
multiqc_combined_df['p_value_DCG'].plot.kde(ax=ax)
ax.set_xlim(0, 1)
ax.set_title('KDE of DCG p-values', pad=15)
ax.set_xlabel('DCG p-value')
ax.set_ylabel('Density')
plt.tight_layout()
plt.savefig(f"{plots_path}/prioritization_evaluation_kde_p_value_DCG.pdf", bbox_inches='tight')
plt.show()


# --- 2) KDE of p_value_DCG, stratified by one grouping column at a time ---
# The two stratified figures differ only in the grouping column, so they are
# produced by one loop; the column name also forms the file-name suffix.
for to_stratify_by in ('algorithm', 'prioritization_algorithm'):
    fig, ax = plt.subplots(figsize=(10, 7))
    for algo in multiqc_combined_df[to_stratify_by].unique():
        subset = multiqc_combined_df.loc[
            multiqc_combined_df[to_stratify_by] == algo, 'p_value_DCG'
        ]
        # A kernel density estimate needs at least two distinct values;
        # empty, single-value or constant groups would make plot.kde raise,
        # so they are left out of the figure.
        if subset.nunique() < 2:
            continue
        subset.plot.kde(ax=ax, label=algo)

    ax.set_xlim(0, 1)
    ax.set_title(f'KDE of DCG p-values by {to_stratify_by}', pad=15)
    ax.set_xlabel('DCG p-value')
    ax.set_ylabel('Density')
    ax.legend(title=to_stratify_by)
    plt.tight_layout()
    plt.savefig(f"{plots_path}/prioritization_evaluation_kde_p_value_DCG_by_{to_stratify_by}.pdf", bbox_inches='tight')
    plt.show()

In [None]:

# KDE of p_value_DCG for every (algorithm, prioritization algorithm) pair:
# color encodes the algorithm, line style the prioritization algorithm.

# unique values
algos = multiqc_combined_df['algorithm'].unique()
prios = multiqc_combined_df['prioritization_algorithm'].unique()

# choose a color palette for algorithms; wrap around so that more algorithms
# than palette entries cycle through the colors instead of all receiving the
# last one
cmap = plt.get_cmap('tab10')
colors = {alg: cmap(i % cmap.N) for i, alg in enumerate(algos)}

# define dash styles for prioritization algorithms (cycled when there are
# more prioritization algorithms than styles)
dash_styles = ['solid', 'dashed', 'dashdot', 'dotted']
style_map = {prio: dash_styles[i % len(dash_styles)] for i, prio in enumerate(prios)}

fig, ax = plt.subplots(figsize=(12, 8))

for alg in algos:
    for prio in prios:
        subset = multiqc_combined_df.loc[
            (multiqc_combined_df['algorithm'] == alg) &
            (multiqc_combined_df['prioritization_algorithm'] == prio),
            'p_value_DCG'
        ]
        # Skip combinations that cannot be plotted: besides empty subsets,
        # a single value or a constant column also makes the density
        # estimate fail, so require at least two distinct values.
        if subset.nunique() < 2:
            continue
        subset.plot.kde(
            ax=ax,
            color=colors[alg],
            linestyle=style_map[prio],
            label=f"{alg} | {prio}",
        )

ax.set_xlim(0, 1)
ax.set_title('KDE of DCG p-values by Algorithm & Prioritization Algorithm', pad=15)
ax.set_xlabel('DCG p-value')
ax.set_ylabel('Density')

# build a legend that shows both color and dash
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles, labels, title='Algorithm | Prioritization Algorithm', loc='best', fontsize='small')

plt.tight_layout()
plt.savefig(f"{plots_path}/prioritization_evaluation_kde_p_value_DCG_by_algorithm_and_prioritization_algorithm.pdf", bbox_inches='tight')
plt.show()

In [None]:
# p_value_DCG plotted against num_drugs; point color follows the 'seeds' column.
fig, ax = plt.subplots(figsize=(10, 6))
scatter_style = {'c': 'seeds', 'colormap': 'viridis', 'alpha': 0.7}
multiqc_combined_df.plot.scatter(x='num_drugs', y='p_value_DCG', ax=ax, **scatter_style)