In [1]:
import pysam

def count_reads_by_strain(bam_file, strain_mapping):
    """
    Tally mapped reads per strain according to their BAM tags.

    Every record in the file contributes to the overall total; only mapped
    records are tested against the strain rules. A record is credited to the
    first strain (in ``strain_mapping`` iteration order) whose tag is present
    on the read with a value listed for that strain, and to no other strain.

    Args:
        bam_file (str): Path to the BAM file.
        strain_mapping (dict): Maps each strain name to a dict holding a
            ``"tag"`` entry (the tag name) and a ``"values"`` entry (the
            container of accepted tag values).

    Returns:
        tuple: ``(counts, total_reads)`` where ``counts`` is a dict of reads
        credited to each strain and ``total_reads`` is the number of records
        iterated, mapped or not.
    """
    counts = dict.fromkeys(strain_mapping, 0)
    total_reads = 0

    with pysam.AlignmentFile(bam_file, "rb") as bam:
        # enumerate keeps the running record count; it stays 0 for an empty file
        for total_reads, read in enumerate(bam, start=1):
            if read.is_unmapped:
                continue

            for strain, mapping in strain_mapping.items():
                tag = mapping["tag"]
                if not read.has_tag(tag):
                    continue
                if read.get_tag(tag) in mapping["values"]:
                    counts[strain] += 1
                    # First matching strain wins, so a read is never counted twice
                    break

    return counts, total_reads


In [2]:
# Input BAM file and the per-strain tag rules
bam_file = "final_best_alignments.bam"

# Placeholder rules: each strain names a tag and the tag values that select it.
# NOTE(review): both strains currently use the identical rule (tag "ct" with
# value "U"). count_reads_by_strain credits a read to the first matching strain
# only, so every matching read goes to "CASTEiJ" and "129S1" stays at 0.
# Replace these with the tags/values that actually distinguish the strains.
strain_mapping = {
    "CASTEiJ": {"tag": "ct", "values": ["U"]},
    "129S1": {"tag": "ct", "values": ["U"]},
}

# Run the tally
counts, total_reads = count_reads_by_strain(bam_file, strain_mapping)

# Report the overall total, then each strain's count and share of all reads
print(f"Total reads in BAM file: {total_reads}")
for strain in counts:
    count = counts[strain]
    if total_reads > 0:
        fraction = (count / total_reads) * 100
    else:
        # Empty file: report 0% instead of dividing by zero
        fraction = 0
    print(f"{strain} - Mapped Reads: {count} ({fraction:.2f}%)")


Total reads in BAM file: 2887870
CASTEiJ - Mapped Reads: 2869592 (99.37%)
129S1 - Mapped Reads: 0 (0.00%)


In [3]:
import pysam
from collections import Counter

def analyze_tags(bam_file, tags=("po", "ct")):
    """
    Count the values observed for selected tags in a BAM file.

    Every record in the file is inspected. For each requested tag that is
    present on a record, the tag's value is tallied; records lacking a tag
    simply do not contribute to that tag's counter.

    Args:
        bam_file (str): Path to the BAM file.
        tags (iterable of str): Names of the tags to tally. Defaults to
            ``("po", "ct")``. Pass a tuple/list of names, not a bare string
            (a bare string would be iterated character by character).

    Returns:
        dict: Maps each requested tag name to a ``Counter`` of
        ``{tag value: number of records carrying that value}``. With the
        default ``tags`` the keys are ``"po"`` and ``"ct"``.
    """
    tag_counts = {tag: Counter() for tag in tags}

    with pysam.AlignmentFile(bam_file, "rb") as bam:
        for read in bam:
            for tag, value_counts in tag_counts.items():
                # Only tally tags the record actually carries
                if read.has_tag(tag):
                    value_counts[read.get_tag(tag)] += 1

    return tag_counts

# BAM file path
final_bam = "final_best_alignments.bam"

# Final BAM file's tag analysis
final_tags = analyze_tags(final_bam)

# Output result 
print("Final BAM file - 'po' tag counts:")
print(final_tags["po"])
print("Final BAM file - 'ct' tag counts:")
print(final_tags["ct"])

# Calculate total read numbers and the ratio 
total_reads = sum(final_tags["po"].values())
common_percentage = (final_tags["po"].get(3, 0) / total_reads) * 100
maternal_percentage = (final_tags["po"].get(2, 0) / total_reads) * 100
paternal_percentage = (final_tags["po"].get(1, 0) / total_reads) * 100

print(f"Common reads: {common_percentage:.2f}%")
print(f"Maternal reads: {paternal_percentage:.2f}%")
print(f"Paternal reads: {maternal_percentage:.2f}%")

Final BAM file - 'po' tag counts:
Counter({3: 2240669, 1: 324182, 2: 323019})
Final BAM file - 'ct' tag counts:
Counter({'U': 2869592, 'R': 13728, 'Q': 4550})
Common reads: 77.59%
Maternal reads: 11.23%
Paternal reads: 11.19%


In [4]:
import pysam
from collections import Counter
import re

def count_reads_by_chromosome(bam_file):
    """
    Count primary alignments per chromosome, returned in natural order.

    Secondary and supplementary records are ignored. Records that have no
    reference name (``reference_name`` is ``None``) are skipped as well: they
    belong to no chromosome, and a ``None`` key would make the sort below
    raise ``TypeError``.

    Ordering of the result:
        1. names of the form ``chr<digits>...`` by their numeric value,
        2. names of the form ``chr<letters>...`` alphabetically by the letters,
        3. any other name, alphabetically.

    Args:
        bam_file (str): Path to the BAM file.

    Returns:
        dict: ``{chromosome name: read count}`` in the order described above.
    """
    chr_counts = Counter()

    with pysam.AlignmentFile(bam_file, "rb") as bam:
        for read in bam:
            # Only count primary alignments
            if read.is_secondary or read.is_supplementary:
                continue

            chrom = read.reference_name
            if chrom is None:
                # No reference assigned: nothing to attribute this record to
                continue

            chr_counts[chrom] += 1

    # Custom sorting for chromosome names
    def sort_key(chrom):
        match = re.match(r'chr(\d+|[A-Za-z]+)', chrom)
        if match:
            key = match.group(1)
            # The leading rank keeps ints and strings from being compared
            return (0, int(key)) if key.isdigit() else (1, key)
        return (2, chrom)

    sorted_chr_counts = dict(sorted(chr_counts.items(), key=lambda item: sort_key(item[0])))

    return sorted_chr_counts

# Path to the BAM file
final_bam = "final_best_alignments.bam"

# Count reads per chromosome; the result comes back already sorted
chromosome_counts = count_reads_by_chromosome(final_bam)

# Print one line per chromosome, in the sorted order
print("Chromosome read counts (sorted):")
for chromosome in chromosome_counts:
    count = chromosome_counts[chromosome]
    print(f"{chromosome}: {count} reads")


Chromosome read counts (sorted):
chr1: 57897 reads
chr2: 65206 reads
chr3: 1932539 reads
chr4: 64180 reads
chr5: 59283 reads
chr6: 51023 reads
chr7: 56044 reads
chr8: 51159 reads
chr9: 60331 reads
chr10: 39979 reads
chr11: 73024 reads
chr12: 49336 reads
chr13: 49135 reads
chr14: 42524 reads
chr15: 47278 reads
chr16: 39905 reads
chr17: 46862 reads
chr18: 32651 reads
chr19: 34716 reads
chrX: 34733 reads
chrY: 65 reads
