In [1]:
import pandas as pd

# manuallynamed the columns
colnames = [
    "gene_id", "gene_symbol", "disease_id", "disease_name",
    "source", "evidence_type", "confidence_score"
]

df = pd.read_csv("/Volumes/my_expansion/human_disease_experiments_filtered.tsv", sep='\t', names=colnames)

# remove entries with empty gene symbol
#df = df[df["gene_symbol"].notna() & df["gene_symbol"].str.len() > 0]

# print initial # of diseases
initial_disease_count = df["disease_id"].nunique()
print(f"Initial disease count (no filtering): {initial_disease_count}")

# construct disease → gene set
disease_to_genes = df.groupby("disease_id")["gene_symbol"].apply(set)

# filtering: diseases have suitable (20-400) gene number
filtered_disease_to_genes = disease_to_genes[
    disease_to_genes.apply(lambda x: 20 <= len(x) <= 600)
].to_dict()

print(f"Filtered disease count (20–600 genes): {len(filtered_disease_to_genes)}")


Initial disease count (no filtering): 277
Filtered disease count (20–600 genes): 152


In [3]:
doid_to_cui = {}

with open("/Volumes/my_expansion/doid.obo", "r") as f:
    current_doid = None
    for line in f:
        line = line.strip()
        if line.startswith("id: DOID:"):
            current_doid = line.split("id: ")[1]
        elif line.startswith("xref: UMLS_CUI:") and current_doid:
            disease_cui = line.split("xref: UMLS_CUI:")[1]
            doid_to_cui[current_doid] = disease_cui
        elif line == "":
            current_doid = None  # reset, avoid wrong mapping across terms

print(f"Extracted {len(doid_to_cui)} DOID → disease CUI mappings.")

Extracted 6423 DOID → disease CUI mappings.


In [4]:
doids = set(filtered_disease_to_genes.keys())
mapped = {d: doid_to_cui[d] for d in doids if d in doid_to_cui}
unmapped = [d for d in doids if d not in doid_to_cui]

print(f"Found CUI for {len(mapped)} out of {len(doids)} diseases.")
print("Missing CUI for:", unmapped[:10])

Found CUI for 136 out of 152 diseases.
Missing CUI for: ['DOID:0050169', 'DOID:7061', 'DOID:11132', 'DOID:0050748', 'DOID:0060076', 'DOID:0050211', 'DOID:14566', 'DOID:0060250', 'DOID:0050801', 'DOID:0060108']


In [8]:
with open("doid_to_cui.tsv", "w") as f:
    for doid, disease_cui in mapped.items():
        f.write(f"{doid}\t{disease_cui}\n")

In [9]:
#construct global mapping from DOID to parents

from collections import defaultdict

doid_parents = defaultdict(set)
current_id = None
doid_names = {}

with open("/Volumes/my_expansion/doid.obo", "r") as f:
    for line in f:
        line = line.strip()
        if line.startswith("id: DOID:"):
            current_id = line.split("id: ")[1]
        elif line.startswith("is_a:") and current_id:
            parent = line.split("is_a: ")[1].split(" !")[0]
            doid_parents[current_id].add(parent)
        elif line.startswith("name:") and current_id:
            name = line.split("name: ")[1]
            doid_names[current_id] = name

# calculate depth of each DOID
def calculate_depth(doid, doid_parents, depth_cache=None):
    if depth_cache is None:
        depth_cache = {}
    if doid in depth_cache:
        return depth_cache[doid]
    
    parents = doid_parents.get(doid, set())
    if not parents:
        depth = 0 if doid == "DOID:4" else -1  #root:DOID:4, set depth = -1 to isolated DOID
    else:
        depth = max(calculate_depth(p, doid_parents, depth_cache) for p in parents) + 1
    
    depth_cache[doid] = depth
    return depth

# only care about diseases inside mapped set
depth_cache = {}
for doid in mapped:
    calculate_depth(doid, doid_parents, depth_cache)

# filtering
min_depth = 4
passed = []    # diseases that passed filter
filtered_out = [] # diseases that filtered out

for doid in mapped:
    depth = depth_cache.get(doid, -1)
    if depth >= min_depth:
        passed.append(doid)
    else:
        filtered_out.append((doid, depth))

print(f"✅ number of diseases passed: {len(passed)}")
print(f"❌ number of diseases filtered out: {len(filtered_out)}")

print("\n details of thoes filtered out (DOID, disease name, depth):")
for doid, depth in filtered_out:
    name = doid_names.get(doid, "name")
    print(f"- {doid} ({name}), depth: {depth}")

✅ number of diseases passed: 129
❌ number of diseases filtered out: 7

 details of thoes filtered out (DOID, disease name, depth):
- DOID:10892 (hypospadias), depth: 2
- DOID:8536 (herpes zoster), depth: 3
- DOID:8534 (gastroesophageal reflux disease), depth: 3
- DOID:37 (skin disease), depth: 3
- DOID:0060262 (gallbladder disease), depth: 3
- DOID:5295 (intestinal disease), depth: 3
- DOID:178 (vascular disease), depth: 3


In [12]:
filtered_df = df[df["disease_id"].isin(passed)] # the set 'passed' stored the doids that path the depth threshold
name_list = list(filtered_df["disease_name"].unique())
from rapidfuzz import fuzz

similar_pairs = []

for i, name1 in enumerate(name_list):
    for j in range(i+1, len(name_list)):
        name2 = name_list[j]
        # unify names to lowercase letters and remove Spaces/symbols
        name1_clean = name1.lower().replace(" ", "").strip(",-_")
        name2_clean = name2.lower().replace(" ", "").strip(",-_")
        # use token_sort_ratio to ignore the words' order differences
        score = fuzz.token_sort_ratio(name1_clean, name2_clean)
        if score > 71:
            similar_pairs.append((name1, name2, score))

# sort in similarity, reverse
similar_pairs.sort(key=lambda x: x[2], reverse=True)

print(f"discover {len(similar_pairs)} pairs of similar diseases (similarity > 70%):")
for name1, name2, score in similar_pairs:
    print(f"{name1} ≈ {name2} ({score:.1f}%)")

discover 41 pairs of similar diseases (similarity > 70%):
Lung squamous cell carcinoma ≈ Squamous cell carcinoma (91.3%)
Hepatitis B ≈ Hepatitis C (90.0%)
Hypothyroidism ≈ Hyperthyroidism (89.7%)
Lung squamous cell carcinoma ≈ Laryngeal squamous cell carcinoma (87.3%)
Diabetes mellitus ≈ Type 1 diabetes mellitus (86.5%)
Lung carcinoma ≈ Lung adenocarcinoma (83.9%)
Renal cell carcinoma ≈ Basal cell carcinoma (83.3%)
Rheumatoid arthritis ≈ Juvenile rheumatoid arthritis (82.6%)
Squamous cell carcinoma ≈ Esophagus squamous cell carcinoma (82.4%)
Squamous cell carcinoma ≈ Laryngeal squamous cell carcinoma (82.4%)
Esophagus adenocarcinoma ≈ Esophageal carcinoma (81.0%)
Skin melanoma ≈ Melanoma (80.0%)
Lung squamous cell carcinoma ≈ Esophagus squamous cell carcinoma (80.0%)
Testicular germ cell cancer ≈ Testicular cancer (80.0%)
IgA glomerulonephritis ≈ Membranous glomerulonephritis (77.6%)
Lung carcinoma ≈ Skin carcinoma (76.9%)
Basal cell carcinoma ≈ Squamous cell carcinoma (76.9%)
Esophagu

In [13]:
name_to_doid = df.set_index("disease_name")["disease_id"].to_dict()

namelist_to_delete = [
    "Lung carcinoma",
    "Juvenile rheumatoid arthritis",
    "Esophagus adenocarcinoma",
    "Esophagus squamous cell carcinoma",
    "Esophageal carcinoma",
    "Skin melanoma",
    "Testicular germ cell cancer",
    "Alcoholic liver cirrhosis",
    "Alopecia",           ## generic terms (keep Alopecia areata)
    "Diabetes mellitus",  ## generic terms (keep Type 1 diabetes mellitus)
    "Squamous cell carcinoma"  ## generic terms (keep subtypes)
]

remove_similar_DOID = [name_to_doid[name] for name in namelist_to_delete if name in name_to_doid]
passed_similar_depth = [doid for doid in passed if doid not in remove_similar_DOID]
print(f"number of diseases after delete similar terms:: {len(passed_similar_depth)}")

number of diseases after delete similar terms:: 118


In [14]:
final_candidates = {
    doid: genes
    for doid, genes in filtered_disease_to_genes.items()
    if doid in passed_similar_depth
}

sorted_final = sorted(
    final_candidates.items(),
    key=lambda item: len(item[1]),
    reverse=True
)

# pick the top 100 diseases with most related genes
final_100_disease_to_genes = dict(sorted_final[:100])
print(f"✅ Final selected diseases: {len(final_100_disease_to_genes)}")

for i, (doid, genes) in enumerate(sorted_final[:10]):
    print(f"{i+1:2d}. {doid}: {len(genes)} genes")

✅ Final selected diseases: 100
 1. DOID:8577: 585 genes
 2. DOID:0060224: 566 genes
 3. DOID:4481: 531 genes
 4. DOID:1459: 528 genes
 5. DOID:7148: 528 genes
 6. DOID:10763: 474 genes
 7. DOID:5844: 446 genes
 8. DOID:0050589: 433 genes
 9. DOID:0050425: 408 genes
10. DOID:7188: 407 genes


In [17]:
import json

with open("selected_diseases.tsv", "w") as f:
    f.write("DOID\tGeneCount\n")
    for doid, genes in final_100_disease_to_genes.items():
        f.write(f"{doid}\t{len(genes)}\n")

with open("final_100_disease_to_genes.json", "w") as f:
    json_ready_data = {k: list(v) for k, v in final_100_disease_to_genes.items()}
    json.dump(json_ready_data, f, indent=2)