In [1]:
import sys
import re
import argparse
import pysam
from pyfaidx import Fasta
from Bio.Seq import Seq
from Bio import pairwise2

In [2]:
# Temporary hard-coded input files (all read from the tmp/ directory)
# Reference FASTA files: the baits and the Araport11 representative gene models
baits_file = 'tmp/Bait_Gateway_AtcDNAlibrary.fas'
araport_file = 'tmp/Araport11_cdna_20220914_representative_gene_model.fas'
# Merged alignments (BAM)
bam_file = 'tmp/B1_EKDL230008374.merged.bam'

In [3]:
# Defaults
__version__ = '0.1'
min_length_seq_bait_part = 100
gene_length_default = 9
gene_length = gene_length_default # tmp
accepted_index = ['ACT', 'GGA']
DEBUG = 1
mismatches_default = 2
allowed_hd = mismatches_default # tmp
threads_default = 4

In [4]:
def hammingDist(seq1, seq2):
    """
    Calculate the Hamming distance: the number of positions at which two
    sequences of equal length differ.

    The comparison is case-sensitive ('a' and 'A' count as a difference).
    TODO: do we need upper()?

    Args: seq1 and seq2 (str), must have the same length
    Return: distance (int)
    Raises: ValueError if seq1 and seq2 differ in length
    """
    # The Hamming distance is only defined for equal lengths. Fail loudly
    # instead of silently ignoring the tail of a longer seq2 (or running into
    # an IndexError on a shorter seq2).
    if len(seq1) != len(seq2):
        raise ValueError(
            f"hammingDist requires sequences of equal length "
            f"({len(seq1)} != {len(seq2)})"
        )
    return sum(1 for base1, base2 in zip(seq1, seq2) if base1 != base2)

In [5]:
# Open the BAM file for reading. It needs to be indexed first:
# `samtools index -@ 8 B1_EKDL230008374.merged.bam`
# NOTE(review): threads is hard-coded to 2 here although threads_default (4)
# is defined among the defaults -- confirm which value is intended.
bam = pysam.AlignmentFile(bam_file, mode='rb', threads=2)

In [6]:
# Load the bait sequences (FASTA); sequences are always returned in upper case
baits = Fasta(filename=baits_file, sequence_always_upper=True)

In [7]:
# Build three lookup tables from the bait records. The "gene" of a bait is
# the first gene_length bases of its sequence (e.g. CATCGACAT):
#   baits_gene_seq_dict:   bait name (e.g. SmB-A_AT5G44500.1) -> gene slice
#   baits_gene_name_dict:  gene (as str)                      -> bait name
#   baits_gene_count_dict: bait name                          -> count, starts at 0
baits_gene_seq_dict = {}
baits_gene_name_dict = {}
baits_gene_count_dict = {}

for bait_record in baits:
    bait_name = bait_record.name
    gene_prefix = bait_record[0:gene_length]
    baits_gene_seq_dict[bait_name] = gene_prefix
    baits_gene_name_dict[str(gene_prefix)] = bait_name
    baits_gene_count_dict[bait_name] = 0

In [8]:
# Load the Araport cDNA sequences (FASTA); sequences are always returned in upper case
araport = Fasta(filename=araport_file, sequence_always_upper=True)

In [9]:
# Temporary tallies: number of reads whose bait part passed / failed the
# minimum length check. Both start at zero.
len_seq_bait_part_ok_counter = len_seq_bait_part_not_ok_counter = 0

In [25]:
# Test loop: inspect the first alignment in the BAM file and print the bait
# record stored under its reference name, for a visual comparison.
for first_read in bam.head(1):
    #help(first_read)
    ref_name = first_read.reference_name
    print("query name ", first_read.query_name)
    print("reference name ", ref_name)
    print("reference sequence ", first_read.get_reference_sequence())
    print(baits[ref_name])

query name  AT1G60950.1:887
reference name  SmB-A_AT5G44500.1
reference sequence  CATCGACATCTTCAGACTACTAGTTTGATTCGACCCACCGATACCGTTCGTATAATGTATGCTATACGAACGGTATCACAAGTTTGtACAAAAAAGTTGG
CATCGACATCTTCAGACTACTAGTTTGATTCGACCCACCGATACCGTTCGTATAATGTATGCTATACGAACGGTATCACAAGTTTGTACAAAAAAGTTGG


In [33]:
# Big loop
# Walk over the alignments, take the aligned part of each read (the "bait
# part") and check its components position by position. A read is skipped at
# the first failed check; a read that passes every active check increments the
# count stored under its reference name.
#
# Layout of the bait part (1-based positions), as checked below:
#   1-9    "gene" (first gene_length bases), must be a key of baits_gene_name_dict
#   10-15  Eco57I site, must be CTTCAG
#   16-18  index, must be one of accepted_index
#   19-35  linker            (only reported, not validated)
#   36-39  index             (only reported, not validated)
#   40-41  linker            (only reported, not validated)
#   42-75  lox66/71          (extracted; Hamming check not active yet)
#   76-100 after Gateway recombination (only reported, not validated)
## Get aligned sequence object
#for aso in bam.fetch():
for aso in bam.head(1):
    # Bait part: the slice of the read between query_alignment_start and
    # query_alignment_end. pysam reports query_sequence as None when the
    # record carries no bases; treat that as an empty read so it is rejected
    # by the length check below instead of raising a TypeError on the slice.
    read_seq = aso.query_sequence or ''
    seq_bait_part = Seq(str(read_seq[aso.query_alignment_start:aso.query_alignment_end]))

    # Discard read if length of seq_bait_part is lower than min_length_seq_bait_part (100)
    if len(seq_bait_part) < min_length_seq_bait_part:
        if DEBUG:
            print(f"Length of seq_bait_part was lower than min_length_seq_bait_part ({len(seq_bait_part)})")
        len_seq_bait_part_not_ok_counter += 1
        continue

    # Tmp counter
    len_seq_bait_part_ok_counter += 1

    # Go through the parts by position and check if OK
    # Pos 1-9 is the "gene": CATCGACAT, ...
    # The keys of baits_gene_name_dict are plain str, so look up with a str
    # rather than relying on a Seq object hashing like its string.
    gene_seq = str(seq_bait_part[0:gene_length])
    if gene_seq not in baits_gene_name_dict:
        if DEBUG:
            print(f"DEBUG: did NOT find gene {gene_seq} in dict")
        continue
    if DEBUG:
        print(f"DEBUG: found gene {gene_seq} in dict")

    # Pos 10-15 is Eco57I: CTTCAG
    if seq_bait_part[9:15] != 'CTTCAG':
        if DEBUG:
            print(f"DEBUG: did NOT find Eco57I CTTCAG {seq_bait_part[9:15]}")
        continue
    if DEBUG:
        print(f"DEBUG: found Eco57I {seq_bait_part[9:15]}")

    # Pos 16-18 is the index: ACT or GGA
    if seq_bait_part[15:18] not in accepted_index:
        if DEBUG:
            print(f"DEBUG: index {seq_bait_part[15:18]} NOT accepted.")
        continue
    if DEBUG:
        print(f"DEBUG: found index {seq_bait_part[15:18]}")

    # Pos 19-35 is a linker: ACTAGTTTGATTCGACC
    # (only reported; the sequence is not compared against the expected linker)
    if DEBUG:
        print(f"DEBUG: found linker {seq_bait_part[18:35]}")

    # Pos 36-39 is an index: CACC (only reported, not validated)
    if DEBUG:
        print(f"DEBUG: found index {seq_bait_part[35:39]}")

    # Pos 40-41 is a linker: GA (only reported, not validated)
    if DEBUG:
        print(f"DEBUG: found linker {seq_bait_part[39:41]}")

    # Pos 42-75 is lox66/71: TACCGTTCGTATAATGTATGCTATACGAACGGTA
    # The slice is kept for the planned Hamming-distance check below, which is
    # still commented out, so the lox site does not filter any reads yet.
    read_lox_seq = seq_bait_part[41:75]
    #ref_lox_seq = baits(aso.reference_name)[:] # Get the correct positions from the reference (baits), SmB-A_AT5G44500.1
    #hd = hammingDist(read_lox_seq, ref_lox_seq)
    #if hd <= allowed_hd:
    #    if DEBUG:
    #        print(f"DEBUG: found lox66/71 {seq_bait_part[41:75]}")

    # Pos 76-100 is "after Gateway recombination": TCACAAGTTTGTACAAAAAAGTTGG
    # (only reported, not validated)
    if DEBUG:
        print(f"DEBUG: found after Gateway recombination {seq_bait_part[75:100]}")

    # If All parts of seq_bait_part are OK:
    #   parse Araport part
    #
    #   Check reading frame for seq_araport_part
    #
    #   Add to count gene_count
    # NOTE(review): the count is keyed by the alignment's reference name, not
    # by the bait that the gene found in the read maps to
    # (baits_gene_name_dict[gene_seq]) -- confirm that these always agree.
    baits_gene_count_dict[aso.reference_name] += 1

    #   Add info on in frame/out of frame

    ## Araport part
    #start_araport_part = aso.query_alignment_end
    #seq_araport_part = Seq(str(aso.query_sequence[start_araport_part:]))

    #query_name = aso.query_name.split(':')[0]   # AT1G60950.1
    #seq_araport_db = Seq(str(araport[query_name]))

    #alignments = pairwise2.align.localms(seq_araport_part, seq_araport_db, match=2, mismatch=-1, open=-5, extend=-1, one_alignment_only=True)

    #for match in alignments:
    #    #print(match)
    #    print(match.seqA)  # seq_araport_part
    #    print(match.start) # Start pos on seq_araport_db
    #    print(match.end)   # End pos on seq_araport_db

DEBUG: found gene CATCGACAT in dict
DEBUG: found Eco57I CTTCAG
DEBUG: found index ACT
DEBUG: found linker ACTAGTTTGATTCGACC
DEBUG: found index CACC
DEBUG: found linker GA
DEBUG: found after Gateway recombination TCACAAGTTTGAACAAAAAAGTTGG


In [None]:
# End: close the BAM file handle. The handle is named `bam`; the name `bf`
# is never defined in this notebook, so bf.close() would raise a NameError.
bam.close()

# Report how many reads passed / failed the bait part length check
print(f"nr bait length OK: {len_seq_bait_part_ok_counter}")
print(f"nr bait length not OK: {len_seq_bait_part_not_ok_counter}")