In [1]:
import pandas as pd

# Gene whose BLAST results are analysed. Changing this single value switches
# both the coordinate offset and the input file, so the two can never end up
# referring to different genes.
GENE = "pten"  # one of: "brca1", "tp53", "pten"

# Per-gene base coordinate. The offset added to BLAST subject positions is
# this value minus 1 (presumably the genomic coordinate of the first base of
# the reference sequence used as BLAST subject -- confirm).
GENE_START = {
    "brca1": 43044295,
    "tp53": 7661779,
    "pten": 87863225,
}

GENOMIC_OFFSET = GENE_START[GENE] - 1

# Column names for the BLAST tabular output, which has no header row.
# The last two columns hold the aligned query and subject sequences.
cols = [
    "qseqid", "sseqid", "pident", "length", "mismatch", "gapopen",
    "qstart", "qend", "sstart", "send", "evalue", "bitscore",
    "qseq", "sseq",
]

# Input files follow the pattern ../data/results/blast_output_<gene>.tsv
df = pd.read_csv(f"../data/results/blast_output_{GENE}.tsv", sep='\t', names=cols)

# Walk every BLAST alignment column by column and record each position where
# the query base differs from the subject (reference) base. Gap columns are
# not reported; they only affect how the subject coordinate advances.
mutations = []

for qseq, sseq, sstart, send in zip(
    df["qseq"], df["sseq"], df["sstart"], df["send"]
):
    sstart = int(sstart)
    send = int(send)

    # sstart > send marks a reverse-strand hit: subject coordinates then
    # decrease along the alignment instead of increasing.
    direction = 1 if send >= sstart else -1

    # The first alignment column corresponds to sstart on either strand.
    # Starting from min(sstart, send) while stepping by -1 would place every
    # reverse-strand position outside the aligned range.
    pos = sstart

    # NOTE(review): bases are recorded exactly as they appear in the
    # alignment. For reverse-strand hits BLAST presumably reports sseq/qseq
    # reverse-complemented relative to the subject -- confirm whether
    # downstream consumers need plus-strand (complemented) bases.
    for q_base, s_base in zip(qseq, sseq):
        if s_base == '-':
            # Insertion in the query: no subject base is consumed, so the
            # subject coordinate does not move.
            continue

        if q_base != '-' and q_base.upper() != s_base.upper():
            mutations.append({
                "position": pos + GENOMIC_OFFSET,
                "ref_base": s_base.upper(),
                "alt_base": q_base.upper()
            })

        # Matches, mismatches and deletions (gap in the query) all consume
        # one subject base.
        pos += direction

# Persist the detected mismatches as CSV and report how many were found.
# The explicit column list keeps the header row in the file even when no
# mismatches were detected; an empty list alone would give a frame with no
# columns and therefore a header-less CSV.
mut_df = pd.DataFrame(mutations, columns=["position", "ref_base", "alt_base"])
mut_df.to_csv("../data/results/detected_mutations.csv", index=False)
print(f"✅ Extracted {len(mut_df)} mutations from BLAST TSV output.")

✅ Extracted 5426 mutations from BLAST TSV output.
