In [2]:

#output_coords = "/lustre/scratch126/cellgen/behjati/lr26/T2T/mapped_d4z4_coords.tsv"

import pysam

# Lift D4Z4 regions from assembly-contig coordinates onto the reference the
# contigs were aligned to, and write one TSV row per region:
#   <contig>:<start>-<end> \t <ref>:<start>-<end> \t <strand>
#
# Each region is projected through the CIGAR of the alignment that actually
# covers it. This handles clipped alignments (the alignment does not start at
# contig position 0), indels, and contigs split over several alignment records.

bam_path = "/nfs/team274/lr26/PacBio-mom/PacBio-mom.bp.p_ctg_vs_t2t.bam"
regions_path = "/lustre/scratch126/cellgen/behjati/lr26/T2T/d4z4_regions_mom.txt"
mapped_path = "/lustre/scratch126/cellgen/behjati/lr26/T2T/d4z4_regions_mapped_to_T2T.tsv"

# CIGAR operation codes as they appear in pysam's `cigartuples`
# (M=0, I=1, D=2, N=3, S=4, H=5, P=6, '='=7, X=8).
ALIGNED_OPS = frozenset((0, 7, 8))      # consume contig and reference
CONTIG_ONLY_OPS = frozenset((1, 4, 5))  # consume contig only (I, S, H)
REF_ONLY_OPS = frozenset((2, 3))        # consume reference only (D, N)


def lift_region(cigartuples, ref_start, is_reverse, start, end):
    """Project the contig interval [start, end) onto the reference.

    Args:
        cigartuples: list of (operation, length) pairs for one alignment.
        ref_start: 0-based reference position of the first aligned base.
        is_reverse: True when the contig is aligned to the reverse strand.
        start, end: 0-based half-open interval in original contig orientation.

    Returns:
        (ref_lo, ref_hi, aligned_bases) where [ref_lo, ref_hi) is the 0-based
        half-open reference span of the region's aligned bases, or None when no
        base of the region is aligned by this record.
    """
    # Hard- and soft-clipped bases count towards the contig length, so the
    # coordinates below are relative to the full contig.
    # NOTE(review): relies on clipped parts being recorded as S or H operations.
    contig_len = sum(
        length for op, length in cigartuples
        if op in ALIGNED_OPS or op in CONTIG_ONLY_OPS
    )
    # The CIGAR of a reverse-strand record describes the reverse-complemented
    # contig, so flip the interval into that orientation.
    if is_reverse:
        q_lo, q_hi = contig_len - end, contig_len - start
    else:
        q_lo, q_hi = start, end

    q_pos, r_pos = 0, ref_start
    ref_lo = ref_hi = None
    aligned_bases = 0
    for op, length in cigartuples:
        if op in ALIGNED_OPS:
            lo = max(q_lo, q_pos)
            hi = min(q_hi, q_pos + length)
            if lo < hi:
                if ref_lo is None:
                    ref_lo = r_pos + (lo - q_pos)
                ref_hi = r_pos + (hi - q_pos)
                aligned_bases += hi - lo
            q_pos += length
            r_pos += length
        elif op in CONTIG_ONLY_OPS:
            q_pos += length
        elif op in REF_ONLY_OPS:
            r_pos += length
        if q_pos >= q_hi:
            break  # walked past the region; nothing further can overlap

    if ref_lo is None:
        return None
    return ref_lo, ref_hi, aligned_bases


# Read D4Z4 regions (e.g. ptg000123l:100-1375), keeping file order.
# Coordinates are treated as 0-based half-open, exactly as the original
# arithmetic did. NOTE(review): confirm the convention of the regions file.
regions = []
with open(regions_path) as f_in:
    for line in f_in:
        line = line.strip()
        if not line:
            continue  # tolerate blank / trailing empty lines
        contig, span = line.rsplit(":", 1)
        start, end = map(int, span.split("-"))
        regions.append((contig, start, end))

region_ids_by_contig = {}
for region_id, (contig, _, _) in enumerate(regions):
    region_ids_by_contig.setdefault(contig, []).append(region_id)

# contig name -> (ref name, ref start, ref end, strand) of the LAST mapped
# record seen. Only used to tell whether a contig has any alignment at all; it
# must not be used for coordinate conversion when a contig has several records.
contig_to_ref = {}

# region index -> (aligned bases, ref name, ref_lo, ref_hi, strand) of the
# alignment covering the most bases of that region.
best_hit = {}

with pysam.AlignmentFile(bam_path, "rb") as bam:
    for aln in bam.fetch(until_eof=True):
        if aln.is_unmapped:
            continue
        contig = aln.query_name
        ref = bam.get_reference_name(aln.reference_id)
        strand = '-' if aln.is_reverse else '+'
        contig_to_ref[contig] = (ref, aln.reference_start, aln.reference_end, strand)

        cigar = aln.cigartuples
        if not cigar:
            continue  # no CIGAR available, nothing to project through
        for region_id in region_ids_by_contig.get(contig, ()):
            _, start, end = regions[region_id]
            hit = lift_region(cigar, aln.reference_start, aln.is_reverse, start, end)
            if hit is None:
                continue
            ref_lo, ref_hi, aligned_bases = hit
            # Strictly greater: on ties the first record in the BAM wins.
            if region_id not in best_hit or aligned_bases > best_hit[region_id][0]:
                best_hit[region_id] = (aligned_bases, ref, ref_lo, ref_hi, strand)

with open(mapped_path, "w") as f_out:
    for region_id, (contig, start, end) in enumerate(regions):
        if contig not in contig_to_ref:
            print(f"Warning: {contig} not found in BAM alignments.")
            continue
        if region_id not in best_hit:
            print(f"Warning: {contig}:{start}-{end} is not covered by any alignment.")
            continue

        _, ref, ref_d4z4_start, ref_d4z4_end, strand = best_hit[region_id]
        # Reference interval is written as a 1-based inclusive region string
        # (0-based half-open start + 1); this is not BED.
        f_out.write(f"{contig}:{start}-{end}\t{ref}:{ref_d4z4_start+1}-{ref_d4z4_end}\t{strand}\n")


In [16]:
from Bio import SeqIO

# Directory holding the BLAST-search FASTA files, and the record inspected here.
base_dir = "/lustre/scratch126/cellgen/behjati/lr26/blast-search/"
file1 = base_dir + "d4z4c-chr1_rc.fasta"

# Dump the raw file. Every line read from the file still carries its own
# newline, so strip it before printing; otherwise the output is double-spaced.
with open(file1) as file:
    for line in file:
        print(line.rstrip("\n"))

# Only the first FASTA record is used; print its reverse complement in FASTA form.
record = next(SeqIO.parse(file1, "fasta"))
print(f">{record.id}_rc\n{record.seq.reverse_complement()}")

# Translate the sequence (stop codons are kept as '*', translation continues).
prot = record.translate(to_stop=False)
# `prot` is a SeqRecord; printing it directly shows a record summary with a
# truncated sequence repr, so print the protein sequence itself.
print(prot.seq)

>chr1:128421745-128422968_rc

ATGGCCCTCCTGAAACCTTCGGACTgcaccctccccacggaagcccggagactgggacggtgaaggaaactcgtttggaccccgagccaaagcgaggccctgcgagcctgctttgagcggtacccatacccagacatcaccacccgagaacagctcgcccaggccatcggcattccagagcccatggtccagatttggtatcagaatgggaggtcacaccagctgaggcagcaccggcgggaatctcggccctggcctgggagatgcggcctgcaagaacgcaggtgaaagcagtccgccgtcactggatcccagaccgccctgctcctccgagcctttgagaaagatcgctttccaggcatcgctgccagggaagagctggccagagagactgccttcccgagtacaggattcagatctcgtttcagaatcgaagggccaggccccatcgcaggcaggcggcctgtgcaacacggcccctggcaggtgtcaccctgctaacacatgtgtcgcctttgcccacgctggcacgtggggaacggggcttccgcaccccacgtgccctgcgcacctggtgctctcccacaggaggttttcgtgagccagggagcaagggccatccccgtgctccagcccagcaaggccgtgccggcagagtgtatctcccaaccggatccggcacgcgggaatattccctatgctgcccctgctcctccggaaggggcgttctcccaccctcaggctcctcggtggcctccgcagccgggcaaaacccggaagaaccaggacctgcagggcgatgtcctgccgggcccttgcgcggtgggacagactgggcccgctcaagtgcttgcgccacccgcttcccaggggaatccgtggttgggctggggccggggtccccaggtcgacggggtggcgtgggaaccccaagccagggcagctccaccttgccagcccgcgcccccgg

In [8]:
from Bio import SeqIO

fasta_file = base_dir + "d4z4c-chr1_rc.fasta"  # replace with your filename

# Take only the first entry of the FASTA file; later entries are never read.
fasta_records = SeqIO.parse(fasta_file, "fasta")
record = next(fasta_records)
seq = record.seq

# Keep everything from 0-based index 913 through the last base.
subseq = seq[913:]

# FASTA-style header, the extracted bases, then their count: one item per line.
print(
    f">Extract_from_913_to_end_{record.id}",
    subseq,
    f"Length of subsequence from position 913 onwards: {len(subseq)}",
    sep="\n",
)


>Extract_from_913_to_end_chr1:128421745-128422968_rc
gggtggcgtgggaaccccaagccagggcagctccaccttgccagcccgcgcccccggaggtctccgcgcagcaggggcagatgcaaggcatcccggcgccctcccaggagctcaggagccagggcgctcctctgcactcccctccagcttgctgctggatgagctcctggcgagggcggagtttctgcagcaggagcaacctttcctagaaagggaggccccgggggagctggaggccttggaagaggccgtctcactggaaggacccctcagcgaggaagaattccgggctctgctggaggagctttagg
Length of subsequence from position 913 onwards: 311


In [3]:
from Bio.Seq import Seq

# Your DNA sequence
dna_seq = Seq("ccgggtggcgtgggaaccccaagccagggcagctccaccttgccagcccgcgcccccggaggtctccgcgcagcaggggcagatgcaaggcatcccggcgccctcccaggagctcaggagccagggcgctcctctgcactcccctccagcttgctgctggatgagctcctggcgagggcggagtttctgcagcaggagcaacctttcctagaaagggaggccccgggggagctggaggccttggaagaggccgtctcactggaaggacccctcagcgaggaagaattccgggctctgctggaggagctttagg")

# Translate the sequence.
# translate() works on whole codons only. The sequence length is not a multiple
# of three, so drop the trailing partial codon explicitly rather than relying on
# Biopython to ignore it (it warns, and states this may become an error).
whole_codon_len = len(dna_seq) - len(dna_seq) % 3
protein_seq = dna_seq[:whole_codon_len].translate(to_stop=False)

print("Protein sequence:")
print(protein_seq)


Protein sequence:
PGGVGTPSQGSSTLPARAPGGLRAAGADARHPGALPGAQEPGRSSALPSSLLLDELLARAEFLQQEQPFLEREAPGELEALEEAVSLEGPLSEEEFRALLEEL*
