In [10]:
import pandas as pd
import pyranges as pr
import random
from Bio import SeqIO


In [None]:
# Column layout applied to both replicate files (ten narrowPeak-style fields).
columns = [
    "chrom", "start", "end", "name", "score", "strand",
    "signalValue", "pValue", "qValue", "summit_offset",
]

# Read the two G4P ChIP replicates; the files carry no header row.
replicate_files = (
    "GSE133379_293T-G4P-hg19-rep1.narrowPeak",
    "GSE133379_293T-G4P-hg19-rep2.narrowPeak",
)
g4_rep1, g4_rep2 = (
    pd.read_csv(path, sep="\t", header=None) for path in replicate_files
)
for replicate in (g4_rep1, g4_rep2):
    replicate.columns = columns

# Stack both replicates and drop rows that are identical in every column.
peaks = pd.concat([g4_rep1, g4_rep2], ignore_index=True).drop_duplicates()

# Absolute summit coordinate = peak start + offset of the summit within the peak.
peaks["summit"] = peaks["start"].add(peaks["summit_offset"])


In [None]:
# Mapping between this notebook's lower-case column names and the capitalised
# names PyRanges expects; keys absent from a frame are ignored by `rename`.
pyranges_names = {
    "chrom": "Chromosome",
    "start": "Start",
    "end": "End",
    "summit": "Summit",
}

# G4-seq peaks: header-less BED file with six columns.
g4_seq = pd.read_csv("G4_seq_peaks.bed", sep="\t", header=None)
g4_seq.columns = ["chrom", "start", "end", "name", "score", "strand"]

chip_pr = pr.PyRanges(peaks.rename(columns=pyranges_names))
g4_pr = pr.PyRanges(g4_seq.rename(columns=pyranges_names))

# Keep the ChIP intervals that intersect a G4-seq interval, then switch the
# column names back to the lower-case convention used by the later cells.
active_g4 = chip_pr.intersect(g4_pr).df.rename(
    columns={ranged: plain for plain, ranged in pyranges_names.items()}
)


In [None]:

# ATAC-seq peaks for 293T: header-less narrowPeak file; only the interval
# coordinates are needed to decide whether a G4 peak lies in open chromatin.
atac_293T = pd.read_csv("GSM5321298_R21037745-293-2-293-2_peaks.narrowPeak", sep="\t", header=None)
atac_293T.columns = [
    "chrom", "start", "end", "name", "score", "strand",
    "signal", "pval", "qval", "peak_offset"
]
atac_293T = atac_293T[["chrom", "start", "end"]]

g4_ranges = pr.PyRanges(active_g4.rename(columns={"chrom": "Chromosome", "start": "Start", "end": "End"}))
atac_ranges = pr.PyRanges(atac_293T.rename(columns={"chrom": "Chromosome", "start": "Start", "end": "End"}))

# Intervals of `active_g4` that overlap at least one ATAC peak.
overlapping = g4_ranges.overlap(atac_ranges).df

# Flag rows by key membership instead of a left merge. A merge on
# (chrom, start, end) multiplies rows whenever the same interval occurs more
# than once on both sides, and it produces is_open_x / is_open_y columns (and
# then a KeyError) when this cell is re-run on a frame that already has
# `is_open`.
open_keys = set()
# NOTE(review): an empty PyRanges appears to yield a column-less DataFrame from
# `.df`, so the columns are only selected when there is something to select.
if not overlapping.empty:
    overlapping = overlapping[["Chromosome", "Start", "End"]].drop_duplicates()
    open_keys = set(zip(
        overlapping["Chromosome"].astype(str),
        overlapping["Start"],
        overlapping["End"],
    ))

active_g4 = active_g4.assign(
    is_open=[
        int(key in open_keys)
        for key in zip(
            active_g4["chrom"].astype(str),
            active_g4["start"],
            active_g4["end"],
        )
    ]
)


In [None]:
# Load every record of the hg19 FASTA into memory as {record id: sequence string}.
genome = dict(
    (record.id, str(record.seq))
    for record in SeqIO.parse("hg19.fa", "fasta")
)

def get_sequence(row):
    """Return the upper-cased genomic window centred on a peak summit.

    The window covers 100 bases before the summit, the summit itself and
    100 bases after it (201 bases), clamped to the chromosome bounds, so
    windows near a chromosome end come back shorter.

    Args:
        row: Mapping-like record providing "chrom" and "summit" (plus "start"
            and "end", which are only used in the error message).

    Returns:
        The window as an upper-case string, or None when anything goes wrong
        (missing chromosome, unparsable summit, ...). Failures are reported
        on stdout rather than raised.
    """
    try:
        chrom_name = str(row["chrom"])
        center = int(float(row["summit"]))

        # Accept chromosome names both with and without the "chr" prefix.
        sequence = genome.get(chrom_name, genome.get("chr" + chrom_name))
        if not sequence:
            raise ValueError(f"Chromosome {chrom_name} not found")

        left = max(0, center - 100)
        right = min(center + 101, len(sequence))
        return sequence[left:right].upper()
    except Exception as e:
        print(f"Error: {e} at {row['chrom']}:{row['start']}-{row['end']}")
        return None

# Attach the summit-centred window to every active G4 peak; rows whose
# sequence could not be extracted (get_sequence returned None) are discarded.
active_g4["sequence"] = active_g4.apply(get_sequence, axis=1)
# Explicit copy so the column assignment below acts on an independent frame
# and cannot trigger pandas' chained-assignment warning.
active_g4 = active_g4.dropna(subset=["sequence"]).copy()
active_g4["label"] = 1

# Every input in this notebook is 293T data, so report that cell line.
print(f" Extracted {len(active_g4)} active G4 sequences for 293T")


✅ Extracted 65184 active G4 sequences for H1975


In [None]:
import random

# GC fraction of each positive sequence. get_sequence clamps windows at the
# chromosome bounds, so a sequence can be shorter than 201 bases; normalise by
# the actual length (guarding against an empty string) instead of a fixed 201.
gc_content = active_g4["sequence"].apply(
    lambda seq: (seq.count("G") + seq.count("C")) / max(len(seq), 1)
)
# Mean and spread of the positive GC distribution, used to GC-match negatives.
gc_mean, gc_std = gc_content.mean(), gc_content.std()

def generate_negatives(n, gc_mean, gc_std, genome_seqs=None, seed=None):
    """Sample ``n`` random 201-base windows with GC content close to ``gc_mean``.

    Windows are drawn uniformly from a uniformly chosen sequence whose name
    starts with "chr" and which is longer than 201 bases. A window is kept
    when its GC fraction differs from ``gc_mean`` by less than ``gc_std``.

    Args:
        n: Number of windows to return. Values <= 0 give an empty frame.
        gc_mean: Target GC fraction.
        gc_std: Accepted absolute deviation from ``gc_mean``; must be a
            positive number.
        genome_seqs: Optional mapping of sequence name to sequence string.
            Defaults to the notebook-level ``genome`` dictionary.
        seed: Optional seed for a private random generator, making the
            sampling reproducible. When omitted the shared ``random`` module
            state is used.

    Returns:
        DataFrame with columns chrom, start, end, sequence, is_open (always 0
        here) and label (always 0).

    Raises:
        ValueError: if ``gc_std`` is NaN or not positive while ``n`` > 0.
            No window could ever be accepted in that case and the sampling
            loop would never terminate.
    """
    if n <= 0:
        return pd.DataFrame([])
    # `not (x > 0)` is also True for NaN, e.g. the std of a single value.
    if not gc_std > 0:
        raise ValueError(f"gc_std must be a positive number, got {gc_std!r}")

    sequences = genome if genome_seqs is None else genome_seqs
    rng = random if seed is None else random.Random(seed)
    window = 201

    chromosomes = [
        name for name in sequences
        if name.startswith("chr") and len(sequences[name]) > window
    ]

    negatives = []
    # Rejection sampling: the loop only ends once n windows passed the GC filter.
    while len(negatives) < n:
        chrom = rng.choice(chromosomes)
        start = rng.randint(0, len(sequences[chrom]) - window)
        seq = sequences[chrom][start:start + window].upper()
        gc = (seq.count("G") + seq.count("C")) / window
        if abs(gc - gc_mean) < gc_std:
            negatives.append({
                "chrom": chrom,
                "start": start,
                "end": start + window,
                "sequence": seq,
                "is_open": 0,
                "label": 0
            })
    return pd.DataFrame(negatives)

# One GC-matched random window per positive example.
negatives = generate_negatives(len(active_g4), gc_mean, gc_std)

# Random windows that overlap at least one ATAC peak.
neg_ranges = pr.PyRanges(negatives.rename(columns={"chrom": "Chromosome", "start": "Start", "end": "End"}))
neg_open = neg_ranges.overlap(atac_ranges).df

# `negatives` already carries a placeholder `is_open` column, so merging a
# second `is_open` in would create is_open_x / is_open_y and leave every
# negative flagged as closed. Flag rows by key membership instead.
open_keys = set()
# NOTE(review): an empty PyRanges appears to yield a column-less DataFrame from
# `.df`, so the columns are only selected when there is something to select.
if not neg_open.empty:
    neg_open = neg_open[["Chromosome", "Start", "End"]].drop_duplicates()
    open_keys = set(zip(
        neg_open["Chromosome"].astype(str),
        neg_open["Start"],
        neg_open["End"],
    ))

negatives = negatives.assign(
    is_open=[
        int(key in open_keys)
        for key in zip(
            negatives["chrom"].astype(str),
            negatives["start"],
            negatives["end"],
        )
    ]
)


In [None]:
# Positive examples: active G4 windows with their chromatin-accessibility flag.
positives = active_g4[["sequence", "is_open"]].copy()
positives["label"] = 1

# Negative examples already carry label 0 from generate_negatives.
negatives_df = negatives[["sequence", "is_open", "label"]].copy()

# Combine both classes and shuffle with a fixed seed so the row order is
# reproducible.
dataset = pd.concat([positives, negatives_df], ignore_index=True)
dataset = dataset.sample(frac=1, random_state=42).reset_index(drop=True)

# Save to file
dataset.to_csv("293T_training_dataset.csv", index=False)
# The data and the output file are 293T, so the message names that cell line.
print(f"Final 293T training dataset saved with {len(dataset)} samples")


✅ Final H1975 training dataset saved with 130832 samples
