# PTEN

In [2]:
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

input_path = "../data/references/chr10_full.fasta"
output_path = "../data/references/pten_reference.fasta"

start = 87863225 - 1
end = 87971930

record = SeqIO.read(input_path, "fasta")
pten_seq = record.seq[start:end]

pten_record = SeqRecord(
    pten_seq,
    id="PTEN_GRCh38",
    description="PTEN region extracted from chromosome 10"
)

SeqIO.write(pten_record, output_path, "fasta")
print(f"✅ Saved PTEN region ({len(pten_seq)} bp) to {output_path}")


✅ Saved PTEN region (108706 bp) to ../data/references/pten_reference.fasta


# TP53

In [2]:
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

input_path = "../data/references/chr17_full.fasta"
output_path = "../data/references/tp53_reference.fasta"

# TP53 location on GRCh38
start = 7661779 - 1 # convert to 0-based index
end = 7687550       # inclusive

record = SeqIO.read(input_path, "fasta")
tp53_seq = record.seq[start:end]

# Save TP53 reference
tp53_record = SeqRecord(
    tp53_seq,
    id="TP53_GRCh38",
    description="TP53 region extracted from chromosome 17"
)

SeqIO.write(tp53_record, output_path, "fasta")
print(f"✅ Saved TP53 region ({len(tp53_seq)} bp) to {output_path}")

✅ Saved TP53 region (25772 bp) to ../data/references/tp53_reference.fasta


# BRCA1

In [1]:
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

input_path = "../data/references/chr17_full.fasta"
output_path = "../data/references/brca1_reference.fasta"

# BRCA1 location on GRCh38
start = 43044295 - 1  # convert to 0-based index
end = 43125482        # inclusive

record = SeqIO.read(input_path, "fasta")
brca1_seq = record.seq[start:end]

# Save BRCA1 reference
brca1_record = SeqRecord(
    brca1_seq,
    id="BRCA1_GRCh38",
    description="BRCA1 region extracted from chromosome 17"
)

SeqIO.write(brca1_record, output_path, "fasta")
print(f"✅ Saved BRCA1 region ({len(brca1_seq)} bp) to {output_path}")

✅ Saved BRCA1 region (81188 bp) to ../data/references/brca1_reference.fasta


# TP53 Patient

In [1]:
import sqlite3
import random

# Load TP53 reference sequence
with open("../data/references/tp53_reference.fasta", "r") as f:
    lines = f.readlines()
    reference = "".join(line.strip() for line in lines if not line.startswith(">"))

# Convert to mutable list
sequence = list(reference)
seq_len = len(sequence)

# Load known mutations from clinvar.db
conn = sqlite3.connect("../data/clinvar/clinvar.db")
query = """
SELECT Start AS position, ReferenceAllele AS ref_base, AlternateAllele AS alt_base
FROM tp53_variants
WHERE ClinicalSignificance LIKE '%Pathogenic%'
  AND Type = 'single nucleotide variant'
"""
mutations = conn.execute(query).fetchall()
conn.close()

# Choose N random mutations to introduce
N = 10
random.shuffle(mutations)

applied_mutations = []
skipped_mutations = []

# Offset of TP53 on chr17 (GRCh38)
TP53_GENOMIC_START = 7661779

for pos, ref, alt in mutations:
    relative_pos = int(pos) - TP53_GENOMIC_START
    if 0 <= relative_pos < len(sequence):
        sequence[relative_pos] = alt.upper()
        applied_mutations.append((int(pos), ref.upper(), alt.upper()))
    else:
        skipped_mutations.append((int(pos), ref.upper(), alt.upper(), "OUT_OF_BOUNDS"))

# Save mutated patient sample
with open("../data/samples/patient_tp53.fasta", "w") as f:
    f.write(">fake_patient_with_known_mutations_tp53\n")
    for i in range(0, len(sequence), 60):
        f.write("".join(sequence[i:i+60]) + "\n")

# Save truth data
import pandas as pd
df_applied = pd.DataFrame(applied_mutations, columns=["position", "ref_base", "alt_base"])
df_applied.to_csv("../data/samples/patient_truth_tp53.csv", index=False)

if skipped_mutations:
    df_skipped = pd.DataFrame(skipped_mutations, columns=["position", "ref_base", "alt_base", "reason"])
    df_skipped.to_csv("../data/samples/skipped_mutations_log_tp53.csv", index=False)

print(f"✅ Applied {len(applied_mutations)} mutation(s) to TP53. Saved mutated sample and ground truth.")


✅ Applied 863 mutation(s) to TP53. Saved mutated sample and ground truth.


# BRCA1 Patient

In [None]:
import sqlite3
import random

# Load BRCA1 reference sequence
with open("../data/references/brca1_reference.fasta", "r") as f:
    lines = f.readlines()
    reference = "".join(line.strip() for line in lines if not line.startswith(">"))

# Convert to mutable list
sequence = list(reference)
seq_len = len(sequence)

# Load known mutations from clinvar.db
conn = sqlite3.connect("../data/clinvar/clinvar.db")
query = """
SELECT Start AS position, ReferenceAllele AS ref_base, AlternateAllele AS alt_base
FROM brca1_variants
WHERE ClinicalSignificance LIKE '%Pathogenic%'
  AND Type = 'single nucleotide variant'
"""
mutations = conn.execute(query).fetchall()
conn.close()

# Choose N random mutations to introduce
N = 10
random.shuffle(mutations)

applied_mutations = []
skipped_mutations = []

# Offset for BRCA1 on chromosome 17 (GRCh38)
BRCA1_GENOMIC_START = 43044295

for pos, ref, alt in mutations:
    relative_pos = int(pos) - BRCA1_GENOMIC_START
    if 0 <= relative_pos < len(sequence):
        original_base = sequence[relative_pos]
        sequence[relative_pos] = alt.upper()
        applied_mutations.append((int(pos), original_base.upper(), alt.upper()))
    else:
        skipped_mutations.append((int(pos), ref.upper(), alt.upper(), "OUT_OF_BOUNDS"))

# Save mutated patient sample
with open("../data/samples/patient_brca1.fasta", "w") as f:
    f.write(">fake_patient_with_known_mutations_brca1\n")
    for i in range(0, len(sequence), 60):
        f.write("".join(sequence[i:i+60]) + "\n")

# Save truth data
import pandas as pd
df_applied = pd.DataFrame(applied_mutations, columns=["position", "ref_base", "alt_base"])
df_applied.to_csv("../data/samples/patient_truth_brca1.csv", index=False)

print(f"✅ Applied {len(applied_mutations)} mutation(s). Saved to brca1_patient.fasta and patient_brca1_truth.csv")

if skipped_mutations:
    df_skipped = pd.DataFrame(skipped_mutations, columns=["position", "ref_base", "alt_base", "reason"])
    df_skipped.to_csv("../data/samples/skipped_mutations_log_brca1.csv", index=False)
    print(f"⚠️ Skipped {len(skipped_mutations)} mutation(s). See skipped_mutations_log.csv for details.")


Loaded reference length: 81188
✅ Applied 3766 mutation(s). Saved to fake_patient.fasta and fake_patient_truth.csv


# PTEN Patient

In [6]:
import sqlite3
import random

with open("../data/references/pten_reference.fasta", "r") as f:
    lines = f.readlines()
    reference = "".join(line.strip() for line in lines if not line.startswith(">"))

sequence = list(reference)
seq_len = len(sequence)

conn = sqlite3.connect("../data/clinvar/clinvar.db")
query = """
SELECT Start AS position, ReferenceAllele AS ref_base, AlternateAllele AS alt_base
FROM pten_variants
WHERE ClinicalSignificance LIKE '%Pathogenic%'
  AND Type = 'single nucleotide variant'
"""
mutations = conn.execute(query).fetchall()
conn.close()

N = 10
random.shuffle(mutations)

applied_mutations = []
skipped_mutations = []

PTEN_GENOMIC_START = 87863225

for pos, ref, alt in mutations:
    relative_pos = int(pos) - PTEN_GENOMIC_START
    if 0 <= relative_pos < len(sequence):
        sequence[relative_pos] = alt.upper()
        applied_mutations.append((int(pos), ref.upper(), alt.upper()))
    else:
        skipped_mutations.append((int(pos), ref.upper(), alt.upper(), "OUT_OF_BOUNDS"))

# Save mutated patient sample
with open("../data/samples/patient_pten.fasta", "w") as f:
    f.write(">fake_patient_with_known_mutations_pten\n")
    for i in range(0, len(sequence), 60):
        f.write("".join(sequence[i:i+60]) + "\n")

# Save truth data
import pandas as pd
df_applied = pd.DataFrame(applied_mutations, columns=["position", "ref_base", "alt_base"])
df_applied.to_csv("../data/samples/patient_truth_pten.csv", index=False)

if skipped_mutations:
    df_skipped = pd.DataFrame(skipped_mutations, columns=["position", "ref_base", "alt_base", "reason"])
    df_skipped.to_csv("../data/samples/skipped_mutations_log_pten.csv", index=False)

print(f"✅ Applied {len(applied_mutations)} mutation(s) to PTEN. Saved mutated sample and ground truth.")


✅ Applied 631 mutation(s) to PTEN. Saved mutated sample and ground truth.
