In [19]:
import pysam

def split_bam_by_base(input_bam, output_prefix, chrom="chr1", pos=206588082):
    """
    Splits a BAM file into four separate BAM files based on the base at a given position.

    Parameters:
    - input_bam: Path to input BAM file.
    - output_prefix: Prefix for output BAM files.
    - chrom: Chromosome of interest.
    - pos: 1-based genomic position to check.

    Output:
    - Four BAM files with reads containing A, T, C, or G at the specified position.
    """
    # Open input BAM file
    bamfile = pysam.AlignmentFile(input_bam, "rb")

    # Create output BAM files
    bam_a = pysam.AlignmentFile(f"{output_prefix}_{chrom}_{pos}_A.bam", "wb", template=bamfile)
    bam_t = pysam.AlignmentFile(f"{output_prefix}_{chrom}_{pos}_T.bam", "wb", template=bamfile)
    bam_c = pysam.AlignmentFile(f"{output_prefix}_{chrom}_{pos}_C.bam", "wb", template=bamfile)
    bam_g = pysam.AlignmentFile(f"{output_prefix}_{chrom}_{pos}_G.bam", "wb", template=bamfile)

    # Process reads
    for read in bamfile.fetch(chrom, pos-1, pos):  # pysam uses 0-based positions
        # Get the reference-aligned position of the read
        read_positions = read.get_reference_positions(full_length=True)
        
        if pos-1 in read_positions:  # Check if the read covers the position
            # Get the index of the base in the read
            base_index = read_positions.index(pos-1)
            base = read.query_sequence[base_index]  # Extract base at position

            # Write the read to the corresponding BAM file
            if base == 'A':
                bam_a.write(read)
            elif base == 'T':
                bam_t.write(read)
            elif base == 'C':
                bam_c.write(read)
            elif base == 'G':
                bam_g.write(read)

    # Close all files
    bamfile.close()
    bam_a.close()
    bam_t.close()
    bam_c.close()
    bam_g.close()

    print("Splitting complete. Output BAM files generated.")

# Example usage
# split_bam_by_base("input.bam", "output")



In [20]:
! ls  -lah /home/michalula/data/cas9_nanopore/data/20241226_MR_nCATs_TcellsPrES_unedit_P2R9/passed_fast5/5mCG/to_t2t_v2_0

total 13G
drwxrwxr-x 3 michalula michalula 4.0K Mar 18 05:18 .
drwxrwxrwx 4 michalula michalula 4.0K Mar 17 04:54 ..
-rw-rw-r-- 1 michalula michalula 5.2G Mar 17 05:26 align_t2t_v2_0_trim_20241226_MR_nCATs_TcellsPrES_unedit_P2R9_passed.dna_r9.4.1_e8_sup@v3.3.5mCG.bam
-rw-rw-r-- 1 michalula michalula  24M Mar 17 23:13 chr1_206560169_206614236.sort_chr1_sort_align_t2t_v2_0_trim_20241226_MR_nCATs_TcellsPrES_unedit_P2R9_passed_dna_r9_e8_supv3mCG.bam
-rw-rw-r-- 1 michalula michalula 104K Mar 17 23:13 chr1_206560169_206614236.sort_chr1_sort_align_t2t_v2_0_trim_20241226_MR_nCATs_TcellsPrES_unedit_P2R9_passed_dna_r9_e8_supv3mCG.bam.bai
-rw-r--r-- 1 michalula michalula 482M Mar 17 10:35 chr1.sort_align_t2t_v2_0_trim_20241226_MR_nCATs_TcellsPrES_unedit_P2R9_passed.dna_r9.4.1_e8_sup@v3.3.5mCG.bam
-rw-rw-r-- 1 michalula michalula 482M Mar 17 23:39 chr1_sort_align_t2t_v2_0_trim_20241226_MR_nCATs_TcellsPrES_unedit_P2R9_passed.dna_r9.4.1_e8_sup@v3.3.5mCG.bam
-rw-rw-r-- 1 michalula michalula 482M Mar 

In [21]:
!mkdir /home/michalula/data/cas9_nanopore/data/20241226_MR_nCATs_TcellsPrES_unedit_P2R9/passed_fast5/5mCG/to_t2t_v2_0/haplotyped/manual

mkdir: cannot create directory ‘/home/michalula/data/cas9_nanopore/data/20241226_MR_nCATs_TcellsPrES_unedit_P2R9/passed_fast5/5mCG/to_t2t_v2_0/haplotyped/manual’: File exists


In [22]:
! ls /home/michalula/data/cas9_nanopore/data/20241226_MR_nCATs_TcellsPrES_unedit_P2R9/passed_fast5/5mCG/to_t2t_v2_0/haplotyped/manual

In [23]:
# cd /home/michalula/data/cas9_nanopore/data/20241226_MR_nCATs_TcellsPrES_unedit_P2R9/passed_fast5/5mCG/to_t2t_v2_0/

input_bam_path = "/home/michalula/data/cas9_nanopore/data/20241226_MR_nCATs_TcellsPrES_unedit_P2R9/passed_fast5/5mCG/to_t2t_v2_0/sort_chr1_sort_align_t2t_v2_0_trim_20241226_MR_nCATs_TcellsPrES_unedit_P2R9_passed_dna_r9_e8_supv3mCG.bam"

output_bam_prefix = "/home/michalula/data/cas9_nanopore/data/20241226_MR_nCATs_TcellsPrES_unedit_P2R9/passed_fast5/5mCG/to_t2t_v2_0/haplotyped/manual/haplotyped_sort_chr1_sort_align_t2t_v2_0_trim_20241226_MR_nCATs_TcellsPrES_unedit_P2R9_passed_dna_r9_e8_supv3mCG"


chr1:206,588,080

Total count: 2820

A : 47 (2%, 6+, 41- )

C : 1467 (52%, 755+, 712- )

G : 5 (0%, 0+, 5- )

T : 1301 (46%, 658+, 643- )

N : 0

---------------

DEL: 147

INS: 75

In [None]:
chrom="chr1"
pos=206588080 

split_bam_by_base(input_bam_path, output_bam_prefix, chrom, pos) 

Splitting complete. Output BAM files generated.


In [25]:
! ls -lah /home/michalula/data/cas9_nanopore/data/20241226_MR_nCATs_TcellsPrES_unedit_P2R9/passed_fast5/5mCG/to_t2t_v2_0/haplotyped/manual

total 19M
drwxrwxr-x 2 michalula michalula 4.0K Mar 19 13:36 .
drwxrwxr-x 3 michalula michalula 4.0K Mar 19 13:05 ..
-rw-rw-r-- 1 michalula michalula 329K Mar 19 13:36 haplotyped_sort_chr1_sort_align_t2t_v2_0_trim_20241226_MR_nCATs_TcellsPrES_unedit_P2R9_passed_dna_r9_e8_supv3mCG_chr1_206588080_A.bam
-rw-rw-r-- 1 michalula michalula 9.8M Mar 19 13:36 haplotyped_sort_chr1_sort_align_t2t_v2_0_trim_20241226_MR_nCATs_TcellsPrES_unedit_P2R9_passed_dna_r9_e8_supv3mCG_chr1_206588080_C.bam
-rw-rw-r-- 1 michalula michalula  35K Mar 19 13:36 haplotyped_sort_chr1_sort_align_t2t_v2_0_trim_20241226_MR_nCATs_TcellsPrES_unedit_P2R9_passed_dna_r9_e8_supv3mCG_chr1_206588080_G.bam
-rw-rw-r-- 1 michalula michalula 8.7M Mar 19 13:36 haplotyped_sort_chr1_sort_align_t2t_v2_0_trim_20241226_MR_nCATs_TcellsPrES_unedit_P2R9_passed_dna_r9_e8_supv3mCG_chr1_206588080_T.bam


In [26]:
# # Index all the BAM files
# for bam_file in bam_files.values():
#     pysam.index(bam_file)

# print("Indexing complete.")

In [27]:
import pysam

# Define the paths to the haplotyped BAM files
bam_files = {
    'A': f"{output_bam_prefix}_{chrom}_{pos}_A.bam",
    'T': f"{output_bam_prefix}_{chrom}_{pos}_T.bam",
    'C': f"{output_bam_prefix}_{chrom}_{pos}_C.bam",
    'G': f"{output_bam_prefix}_{chrom}_{pos}_G.bam"
}

# Function to count reads in a BAM file
def count_reads(bam_file):
    pysam.index(bam_file)
    with pysam.AlignmentFile(bam_file, "rb") as bam:
        return bam.count()

# Count reads in each BAM file
read_counts = {base: count_reads(bam_file) for base, bam_file in bam_files.items()}

# Print the read counts
for base, count in read_counts.items():
    print(f"Number of reads in {base} BAM file: {count}")

Number of reads in A BAM file: 47
Number of reads in T BAM file: 1301
Number of reads in C BAM file: 1467
Number of reads in G BAM file: 5


In [None]:
# {base: count_reads(bam_file) for base, bam_file in bam_files.items()}

In [None]:
chr1:206,586,630
Total count: 2725
A : 1407 (52%, 689+, 718- )
C : 28 (1%, 21+, 7- )
G : 1258 (46%, 673+, 585- )
T : 32 (1%, 26+, 6- )
N : 0
---------------
DEL: 255
INS: 40

In [28]:
chrom="chr1"
pos=206586630 

split_bam_by_base(input_bam_path, output_bam_prefix, chrom, pos) 

Splitting complete. Output BAM files generated.


In [29]:
# Define the paths to the haplotyped BAM files
bam_files = {
    'A': f"{output_bam_prefix}_{chrom}_{pos}_A.bam",
    'T': f"{output_bam_prefix}_{chrom}_{pos}_T.bam",
    'C': f"{output_bam_prefix}_{chrom}_{pos}_C.bam",
    'G': f"{output_bam_prefix}_{chrom}_{pos}_G.bam"
}

# Function to count reads in a BAM file
def count_reads(bam_file):
    pysam.index(bam_file)
    with pysam.AlignmentFile(bam_file, "rb") as bam:
        return bam.count()

# Count reads in each BAM file
read_counts = {base: count_reads(bam_file) for base, bam_file in bam_files.items()}

# Print the read counts
for base, count in read_counts.items():
    print(f"Number of reads in {base} BAM file: {count}")

Number of reads in A BAM file: 1407
Number of reads in T BAM file: 32
Number of reads in C BAM file: 28
Number of reads in G BAM file: 1258
