In [1]:
import sqlite3
import pandas as pd

# Match BLAST-detected mutations against pathogenic ClinVar SNVs and save the hits.
#
# Inputs : MUTATIONS_CSV (needs position / ref_base / alt_base columns) and the
#          VARIANT_TABLE table inside the CLINVAR_DB SQLite file.
# Output : OUTPUT_CSV, one row per (detected mutation, ClinVar record) pair that
#          agrees on position, reference base and alternate base.

# --- Configuration: everything a reader might want to tune ---
MUTATIONS_CSV = "../data/results/detected_mutations.csv"
CLINVAR_DB = "../data/clinvar/clinvar.db"
VARIANT_TABLE = "pten_variants"  # Edit this table for different genes
OUTPUT_CSV = "../data/results/final_predicted_diseases.csv"
MATCH_KEYS = ["position", "ref_base", "alt_base"]

# Load detected mutations from BLAST; upper-case the bases so the join below
# does not depend on letter case.
mutations_df = pd.read_csv(MUTATIONS_CSV)
mutations_df["ref_base"] = mutations_df["ref_base"].str.upper()
mutations_df["alt_base"] = mutations_df["alt_base"].str.upper()

# The column aliases rename the ClinVar fields to the names used in MATCH_KEYS.
# The table name cannot be bound as a SQL parameter, so it is interpolated from
# the constant above; it must never come from untrusted input.
# NOTE(review): SQLite's LIKE is case-insensitive for ASCII by default, so this
# pattern also keeps values such as "Likely pathogenic" and anything containing
# "pathogenicity" — confirm that is the intended filter.
query = f"""
SELECT Start AS position, ReferenceAllele AS ref_base, AlternateAllele AS alt_base,
       ClinicalSignificance AS clinical_significance, PhenotypeList AS disease
FROM {VARIANT_TABLE}
WHERE ClinicalSignificance LIKE '%Pathogenic%' AND Type = 'single nucleotide variant'
"""

# Load the variants from clinvar.db. The try/finally guarantees the connection
# is released even when the query raises (e.g. the table does not exist), so a
# failed run does not leave an open handle behind in the kernel.
conn = sqlite3.connect(CLINVAR_DB)
try:
    clinvar_df = pd.read_sql_query(query, conn)
finally:
    conn.close()

# Ensure casing is consistent with the detected mutations.
clinvar_df["ref_base"] = clinvar_df["ref_base"].str.upper()
clinvar_df["alt_base"] = clinvar_df["alt_base"].str.upper()

# Match detected mutations to ClinVar.
# NOTE(review): no de-duplication is applied before the join, so a key that is
# repeated in either frame is repeated in the result; the count printed below is
# a row count, not necessarily the number of distinct mutations.
matched = pd.merge(
    mutations_df,
    clinvar_df,
    how="inner",
    on=MATCH_KEYS,
)

matched.to_csv(OUTPUT_CSV, index=False)
print(f"✅ Matched {len(matched)} mutation(s). Saved to final_predicted_diseases.csv")

✅ Matched 272 mutation(s). Saved to final_predicted_diseases.csv
