# Imports & Setup

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from venn import venn

In [None]:
# Directory that the plt.savefig calls in this notebook write their PDF figures to.
# NOTE(review): the value already ends in "/", and those calls insert another "/"
# before the file name, so the resulting paths contain "//".
plots_path = "../../plots/nedrexdb/"

In [None]:
# Folder holding the CSV tables that are loaded below.
nedrex_db_path = "../../data/nedrexDB"


def _read_nedrex_table(table_name):
    """Read ``<nedrex_db_path>/<table_name>.csv`` into a DataFrame."""
    return pd.read_csv(f"{nedrex_db_path}/{table_name}.csv")


# One DataFrame per CSV file; each variable is named after its file.
disorder_df = _read_nedrex_table("disorder")
drug_df = _read_nedrex_table("drug")
drug_has_indication_df = _read_nedrex_table("drug_has_indication")
drug_has_target_df = _read_nedrex_table("drug_has_target")
gene_df = _read_nedrex_table("gene")
gene_associated_with_disorder_df = _read_nedrex_table("gene_associated_with_disorder")

In [None]:
# Quick sanity check: show the column header and the first row of every table.
for _table in (
    disorder_df,
    drug_df,
    drug_has_indication_df,
    drug_has_target_df,
    gene_df,
    gene_associated_with_disorder_df,
):
    print(_table.head(1))

# Disorder Sets Analysis

In [None]:


import re

# Sets of disorder IDs: every disorder, those targeted by a drug indication,
# and those that are the target of a gene-disorder association.
all_disorders_set = set(disorder_df['primaryDomainId'])
disorders_with_indications_set = set(drug_has_indication_df['targetDomainId'])
disorders_with_gene_seeds_set = set(gene_associated_with_disorder_df['targetDomainId'])


def _has_approved_group(groups):
    """Return True when *groups* contains the exact group label 'approved'.

    The column is loaded with ``pd.read_csv``, so a cell is normally a string
    (or NaN when empty). A plain ``'approved' in groups`` on a string is a
    substring test and would also match longer labels that merely contain
    the word (e.g. 'vet_approved'), and it raises TypeError on NaN. Here the
    string is split into word tokens and compared exactly; missing values
    count as "not approved".
    NOTE(review): confirm how drugGroups is serialized in drug.csv.
    """
    if isinstance(groups, str):
        # \w+ keeps underscores, so 'vet_approved' stays a single token.
        return 'approved' in re.findall(r"\w+", groups)
    if isinstance(groups, (list, tuple, set, frozenset)):
        return 'approved' in groups
    return False


# Set of drugs marked as approved
approved_drugs_set = set(
    drug_df.loc[
        drug_df['drugGroups'].apply(_has_approved_group).astype(bool),
        'primaryDomainId'
    ]
)

# Set of disorders with at least one approved drug indication
disorders_with_approved_drugs_set = set(
    drug_has_indication_df.loc[
        drug_has_indication_df['sourceDomainId'].isin(approved_drugs_set),
        'targetDomainId'
    ]
)

# Print counts for reference
print(f"Total disorders: {len(all_disorders_set)}")
print(f"Disorders with drug indications: {len(disorders_with_indications_set)}")
print(f"Disorders with gene seeds: {len(disorders_with_gene_seeds_set)}")
print(f"Disorders with approved drug indications: {len(disorders_with_approved_drugs_set)}")

In [None]:
# Named disorder-ID sets to compare; the keys become the legend labels.
venn_sets = {
    "All Disorders": all_disorders_set,
    "Drug Indications": disorders_with_indications_set,
    "Gene Seeds": disorders_with_gene_seeds_set,
    "Approved Drug Indications": disorders_with_approved_drugs_set
}
# Create the axes explicitly and hand them to venn(): when no axes are given,
# venn() opens a figure of its own, so a bare plt.figure() call beforehand
# would leave an empty extra figure behind.
fig, ax = plt.subplots(figsize=(8, 8))
venn(venn_sets, ax=ax)
ax.set_title("Overlap of Disorder Subsets")
fig.savefig(f"{plots_path}/overlap_of_disorder_subsets.pdf", bbox_inches='tight')
plt.show()

In [None]:
# Disorders that appear both as a drug-indication target and as a
# gene-association target.
disorders_with_drug_and_gene = disorders_with_indications_set.intersection(
    disorders_with_gene_seeds_set
)
print(f"Disorders with both drug and gene indications: {len(disorders_with_drug_and_gene)}")

# Same intersection, restricted to disorders indicated by an approved drug.
disorders_with_approved_drug_and_gene = disorders_with_approved_drugs_set.intersection(
    disorders_with_gene_seeds_set
)
print(f"Disorders with approved drug and gene indications: {len(disorders_with_approved_drug_and_gene)}")

In [None]:

# Directory the disorder-ID lists are written to.
input_disorders_path = "../../data/input"

# Output file name -> set of disorder IDs to store in it.
_disorder_id_exports = {
    "disorders_with_drug_and_gene.csv": disorders_with_drug_and_gene,
    "disorders_with_approved_drug_and_gene.csv": disorders_with_approved_drug_and_gene,
}

for _file_name, _disorder_ids in _disorder_id_exports.items():
    # Explicit encoding so the files do not depend on the platform default.
    with open(f'{input_disorders_path}/{_file_name}', 'w', encoding='utf-8') as f:
        # One ID per line, no header; sorted so repeated runs give identical files.
        f.writelines(f"{domain_id}\n" for domain_id in sorted(_disorder_ids))

In [None]:
import seaborn as sns

# Number of drug-indication rows per disorder (only disorders that occur at least once)
indication_counts = drug_has_indication_df['targetDomainId'].value_counts()

# Re-align to the full disorder list so disorders without any indication get 0
all_disorders = disorder_df['primaryDomainId']
full_counts = indication_counts.reindex(all_disorders, fill_value=0)

# Histogram of the per-disorder counts
plt.figure(figsize=(10, 6))
# discrete=True gives every integer count its own unit-wide bar centred on the
# value. With binwidth=1 the last bin is closed on both sides, so the two
# largest counts would be merged into a single bar.
sns.histplot(full_counts, discrete=True)
plt.xlabel('Number of drug indications per disease')
plt.ylabel('Count of diseases')
# NOTE(review): an earlier comment here mentioned a log scale, but none is
# applied; the y-axis is linear.
plt.title('Distribution of drug indications per disease')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(f"{plots_path}/drug_indications_per_disease_histogram.pdf", bbox_inches='tight')

plt.show()

In [None]:
# Cumulative view of the same per-disorder counts: the y-axis is the number of
# disorders whose indication count is at most x.
ecdf_fig, ecdf_ax = plt.subplots(figsize=(10, 6))
sns.ecdfplot(full_counts, stat='count', ax=ecdf_ax)
ecdf_ax.set_xlabel('Number of drug indications per disease')
ecdf_ax.set_ylabel('Cumulative count of diseases')
ecdf_ax.set_title('Cumulative distribution of drug indications per disease')
plt.xticks(rotation=45)
ecdf_fig.tight_layout()
ecdf_fig.savefig(f"{plots_path}/drug_indications_per_disease_ecdf.pdf", bbox_inches='tight')

plt.show()

In [None]:
# Split off the disorders without any indication; their number is shown as an
# annotation instead of as a bar.
zero_count = (full_counts == 0).sum()
positive_counts = full_counts[full_counts > 0]

plt.figure(figsize=(10, 6))
# discrete=True gives every integer count its own unit-wide bar centred on the
# value. With binwidth=1 the last bin is closed on both sides, so the two
# largest counts would be merged into a single bar.
sns.histplot(positive_counts, discrete=True)
plt.xlabel('Number of drug indications per disease')
plt.ylabel('Count of diseases')
plt.title('Distribution of drug indications per disease (excluding zeros)')
plt.xticks(rotation=45)
# Text placed in the top-right corner of the axes (axes-fraction coordinates)
plt.annotate(f'Zero indications: {zero_count}', xy=(0.95, 0.95), xycoords='axes fraction',
             ha='right', va='top')
plt.tight_layout()
plt.savefig(f"{plots_path}/drug_indications_per_disease_histogram_excluding_zeros.pdf", bbox_inches='tight')
plt.show()
