# Look up genomic coordinates for rsIDs listed in a CSV file containing rsID (locus) and chromosome

In [1]:
from __future__ import annotations

import pandas as pd
import gzip
from dataclasses import dataclass
from pathlib import Path
from typing import List, Dict


@dataclass(frozen=True)
class Config:
    """Immutable settings for the rsID coordinate extraction.

    Groups the input/output locations and the layout of the tab-separated
    BED files that are searched. Column indices are 0-based.
    """

    # --- File locations ---------------------------------------------------
    FOLDER: Path = Path("1000genomes/snps/cal_et_al")
    # "{chrom}" is replaced with the chromosome value of each input row.
    BED_FILE_PATTERN: str = "1000genomes/snps/snp_db/bed_chr_{chrom}.bed.gz"
    INPUT_CSV: Path = FOLDER / "locus_chromosome.csv"
    OUTPUT_CSV: Path = FOLDER / "rs_coords_extracted.csv"
    OUTPUT_BED: Path = FOLDER / "rs_coords_extracted.bed"

    # --- BED file layout --------------------------------------------------
    IGNORE_FIRST_LINE: bool = True  # many .bed.gz files have a header
    BED_RSID_COL: int = 3  # rsID column index (0-based)
    BED_START_COL: int = 1
    BED_END_COL: int = 2

    # --- Debugging --------------------------------------------------------
    # For faster testing; set None when running on full dataset
    TEST_LINE_LIMIT: int | None = None


# Single shared configuration instance read by the functions in this file.
CFG = Config()
def find_rsid_in_bed(bed_path: Path, target_rsid: str) -> tuple[str, str] | None:
    """
    Scan a gzip-compressed, tab-separated BED file for an rsID.

    The file is streamed line by line and the scan stops at the first
    matching line.

    Args:
        bed_path: Path to the ``.bed.gz`` file to search.
        target_rsid: rsID to look for; compared for exact equality with the
            value in column ``CFG.BED_RSID_COL``.

    Returns:
        ``(start, end)`` as the raw strings found in columns
        ``CFG.BED_START_COL`` and ``CFG.BED_END_COL`` of the first matching
        line, or ``None`` when no examined line matched (including when the
        scan was cut short by ``CFG.TEST_LINE_LIMIT``).

    Raises:
        OSError: If the file cannot be opened or read (e.g. it is missing).
    """
    rsid_col = CFG.BED_RSID_COL
    start_col = CFG.BED_START_COL
    end_col = CFG.BED_END_COL
    # A usable line must contain every column read below, not just the rsID
    # column; otherwise a match on a short line would raise IndexError.
    min_fields = max(rsid_col, start_col, end_col) + 1
    skip_header = CFG.IGNORE_FIRST_LINE
    line_limit = CFG.TEST_LINE_LIMIT  # None (or 0) means "no limit"

    with gzip.open(bed_path, "rt") as f:
        for line_index, line in enumerate(f):
            # Skip header if needed
            if skip_header and line_index == 0:
                continue

            fields = line.strip().split("\t")
            if len(fields) < min_fields:
                # Malformed/short line: ignore it and keep scanning.
                continue

            if fields[rsid_col] == target_rsid:
                return fields[start_col], fields[end_col]

            # line_index is 0-based, so line_index + 1 lines have been read.
            if line_limit and line_index + 1 >= line_limit:
                break

    return None


def _scan_bed_for_rsids(
    bed_path: Path, targets: set[str]
) -> dict[str, tuple[str, str]]:
    """Collect (start, end) for every rsID in *targets* in one pass over a BED.gz file.

    Uses the header, column and line-limit settings from ``CFG``. For each
    target only its first occurrence in the file is recorded, and the scan
    ends early once every target has been found.
    """
    rsid_col = CFG.BED_RSID_COL
    start_col = CFG.BED_START_COL
    end_col = CFG.BED_END_COL
    # Every column read below must exist on the line.
    min_fields = max(rsid_col, start_col, end_col) + 1
    skip_header = CFG.IGNORE_FIRST_LINE
    line_limit = CFG.TEST_LINE_LIMIT  # None (or 0) means "no limit"

    pending = set(targets)  # copy: the caller's set is left untouched
    found: dict[str, tuple[str, str]] = {}

    with gzip.open(bed_path, "rt") as handle:
        for line_index, line in enumerate(handle):
            if skip_header and line_index == 0:
                continue

            fields = line.strip().split("\t")
            if len(fields) < min_fields:
                continue

            rsid = fields[rsid_col]
            if rsid in pending:
                found[rsid] = (fields[start_col], fields[end_col])
                pending.remove(rsid)
                if not pending:
                    break  # nothing left to look for in this file

            if line_limit and line_index + 1 >= line_limit:
                break

    return found


def extract_coords(df: pd.DataFrame) -> List[Dict[str, str]]:
    """
    For each locus in the input CSV, find its genomic coordinates
    from the appropriate bed_chr_{chrom}.bed.gz file.

    Each chromosome's BED file is decompressed and scanned only once, for
    all loci on that chromosome, instead of once per locus. A file is read
    the first time a row for its chromosome is reached, so results and the
    "Found ..." messages keep the order of the input rows.

    Args:
        df: Table with a ``locus`` column (rsID) and a ``chromosome``
            column; both are converted to ``str`` before use.

    Returns:
        One dict per locus that was found, with the keys ``Locus``,
        ``Chromosome``, ``Start`` and ``End``. Loci that are not found are
        omitted.

    Raises:
        KeyError: If a row lacks the ``locus`` or ``chromosome`` column.
        OSError: If a required BED file cannot be opened or read.
    """
    # Materialise the (locus, chromosome) pairs once, in input order.
    pairs = [(str(row["locus"]), str(row["chromosome"])) for _, row in df.iterrows()]

    # Group the loci per chromosome so each file can be searched in one pass.
    loci_by_chrom: dict[str, set[str]] = {}
    for locus, chrom in pairs:
        loci_by_chrom.setdefault(chrom, set()).add(locus)

    coords_by_chrom: dict[str, dict[str, tuple[str, str]]] = {}
    results: List[Dict[str, str]] = []

    for locus, chrom in pairs:
        if chrom not in coords_by_chrom:
            bed_path = Path(CFG.BED_FILE_PATTERN.format(chrom=chrom))
            coords_by_chrom[chrom] = _scan_bed_for_rsids(
                bed_path, loci_by_chrom[chrom]
            )

        match = coords_by_chrom[chrom].get(locus)
        if match is None:
            continue

        start, end = match
        print(f"Found {locus} on chromosome {chrom} at {start}-{end}")
        results.append(
            {"Locus": locus, "Chromosome": chrom, "Start": start, "End": end}
        )

    return results


def write_bed_file(df: pd.DataFrame, path: Path) -> None:
    """Write the rows of *df* to *path* as tab-separated BED lines.

    Each output line holds the row's Chromosome, Start, End and Locus
    values, in that order. The file at *path* is created or overwritten.
    """
    with open(path, "w") as handle:
        # The generator is consumed inside the ``with`` block, so rows are
        # formatted and written one after another while the file is open.
        handle.writelines(
            f"{record['Chromosome']}\t{record['Start']}\t{record['End']}\t{record['Locus']}\n"
            for _, record in df.iterrows()
        )

# Load the loci to look up; extract_coords reads the "locus" and
# "chromosome" columns of this table.
df = pd.read_csv(CFG.INPUT_CSV)

results = extract_coords(df)

# Name the columns explicitly so that an empty result list still yields a
# table (and a CSV header row) with the expected columns rather than a
# column-less frame. For non-empty results the column order is unchanged.
out_df = pd.DataFrame(results, columns=["Locus", "Chromosome", "Start", "End"])
out_df.to_csv(CFG.OUTPUT_CSV, index=False)

write_bed_file(out_df, CFG.OUTPUT_BED)

print("\n=== Extraction complete! ===")
print(f"→ CSV saved to: {CFG.OUTPUT_CSV}")
print(f"→ BED saved to: {CFG.OUTPUT_BED}")

# Last expression of the notebook cell: displays the first rows.
out_df.head()

Found rs1288367 on chromosome 1 at 53605883-53605884
Found rs7554936 on chromosome 1 at 151122488-151122489
Found rs79200067 on chromosome 1 at 102457869-102457870
Found rs10497191 on chromosome 2 at 158667216-158667217
Found rs1250233 on chromosome 2 at 216307102-216307103
Found rs1371048 on chromosome 2 at 145753165-145753166
Found rs17034666 on chromosome 2 at 109571507-109571508
Found rs4907251 on chromosome 2 at 97484813-97484814
Found rs7596027 on chromosome 2 at 166187533-166187534
Found rs116783706 on chromosome 3 at 152553768-152553769
Found rs12498138 on chromosome 3 at 121459588-121459589
Found rs149768401 on chromosome 3 at 100365527-100365528
Found rs17005847 on chromosome 3 at 69457139-69457140
Found rs2072053 on chromosome 3 at 50197091-50197092
Found rs570435573 on chromosome 3 at 86028381-86028382
Found rs6548616 on chromosome 3 at 79399574-79399575
Found rs10008492 on chromosome 4 at 38765719-38765720
Found rs1229984 on chromosome 4 at 100239318-100239319
Found rs5466

Unnamed: 0,Locus,Chromosome,Start,End
0,rs1288367,1,53605883,53605884
1,rs7554936,1,151122488,151122489
2,rs79200067,1,102457869,102457870
3,rs10497191,2,158667216,158667217
4,rs1250233,2,216307102,216307103
