In [121]:
import sys
import re
import argparse
import pysam
from pyfaidx import Fasta
from Bio.Seq import Seq
from Bio import pairwise2

In [None]:
"""
A perfect match would look like this:

bait name:  SmG-B_AT3G11500.1
araport name:  AT1G60950.1
bait (query) seq:   TCGACTCATCTTCAGGGAACTAGTTTGATTCGACCCACCGATACCGTTCGTATAATGTATGCTATACGAACGGTATCACAAGTTTGTACAAAAAAGTTGGGGCTTCCACTGCTCTCTCAAGCGCCATCGTCGGAACTTCATTCATCCGTCGTTCCCCAGCTCCAATCAGTCTCCGTTCCCTTCCATCAGCCAACACACAATCCCTCTTCGGTCTCAAATCAGGCACCGCTCGTGGTGGACGTGTCACAGCCATGG
seg_bait_part:      TCGACTCATCTTCAGGGAACTAGTTTGATTCGACCCACCGATACCGTTCGTATAATGTATGCTATACGAACGGTATCACAAGTTTGTACAAAAAAGTTGG
bait seq (from db): TCGACTCATCTTCAGGGAACTAGTTTGATTCGACCCACCGATACCGTTCGTATAATGTATGCTATACGAACGGTATCACAAGTTTGTACAAAAAAGTTGG
seq_araport_part:                                                                                                       GGCTTCCACTGCTCTCTCAAGCGCCATCGTCGGAACTTCATTCATCCGTCGTTCCCCAGCTCCAATCAGTCTCCGTTCCCTTCCATCAGCCAACACACAATCCCTCTTCGGTCTCAAATCAGGCACCGCTCGTGGTGGACGTGTCACAGCCATGG
araport seq (from db):                                                                                                  GGCTTCCACTGCTCTCTCAAGCGCCATCGTCGGAACTTCATTCATCCGTCGTTCCCCAGCTCCAATCAGTCTCCGTTCCCTTCCATCAGCCAACACACAATCCCTCTTCGGTCTCAAATCAGGCACCGCTCGTGGTGGACGTGTCACAGCCATGG

The regions on the bait sequence:
>SmG-B_AT3G11500.1
TCGACTCATCTTCAGGGAACTAGTTTGATTCGACCCACCGATACCGTTCGTATAATGTATGCTATACGAACGGTATCACAAGTTTGTACAAAAAAGTTGG
||||||||| ("gene" no mismatches)
TCGACTCAT
         |||||| (Common. Allow N mismatches with one parameter -m1)
         CTTCAG
               ||| (ACT or GGA, no other alternatives.)
               GGA
                  ||||||||||||||||| (common. Allow N mismatches with one parameter -m2. Use a variable -m2 for all five downstream parts taken together!)
                  ACTAGTTTGATTCGACC
                                   |||| (common. Allow N mismatches with one parameter -m2)
                                   CACC
                                       || (common. Allow N mismatches with one parameter -m2)
                                       GA
                                         |||||||||||||||||||||||||||||||||| (allow 2 mismatches. Allow N mismatches with one parameter -m2)
                                         TACCGTTCGTATAATGTATGCTATACGAACGGTA
                                                                           ||||||||||||||||||||||||| (allow 2 mismatches. Allow N mismatches with one parameter -m2)
                                                                           TCACAAGTTTGTACAAAAAAGTTGG                                                                           
                                         
"""

In [122]:
## Temporary, hard-coded input paths (to be replaced by proper argument parsing).
## One entry per sample; every file in this list is processed by the counting loop.
bam_files = [
    'tmp/B1_EKDL230008374-1A_HF5CYDSX7_L2.merged.bam',
    'tmp/B2_EKDL230008375-1A_HF5CYDSX7_L2.merged.bam',
    'tmp/Ncre_pool_EKDL230008376-1A_HF5CYDSX7_L2.merged.bam',
]
## Fasta file with the bait sequences
baits_file = 'tmp/Bait_Gateway_AtcDNAlibrary.fas'
## Fasta file with the Araport11 CDS sequences
araport_file = 'tmp/Araport11_cds_20220914_representative_gene_model'

In [123]:
## General defaults
__version__ = '0.1'
DEBUG = 2
threads_default = 4

## Allowed number of mismatches (Hamming distance) for the -m1 and -m2 region groups
mismatches_m1_default, mismatches_m2_default = 1, 4
allowed_hd_m1 = mismatches_m1_default  # Tmp: stand-in until the value comes from the command line
allowed_hd_m2 = mismatches_m2_default  # Tmp: stand-in until the value comes from the command line

## Bait layout. All *_start/*_end pairs below are 0-based, end-exclusive slice bounds
## into the last min_length_seq_bait_part bases preceding the aligned part of a read.
min_length_seq_bait_part = 100
gene_length_default = 9
gene_length = gene_length_default  # Tmp
accepted_eco571 = ['CTTCAG']
accepted_index1 = ['ACT', 'GGA']

### Pos 1-9 is the "gene": CATCGACAT, ...
gene_start, gene_end = 0, gene_length
### Pos 10-15 is Eco57I: CTTCAG
eco571_start, eco571_end = 9, 15
### Pos 16-18 is the index: ACT or GGA
index1_start, index1_end = 15, 18
### Pos 19-100 is linker+index+linker+lox66/71+"after Gateway recombination"
rest_start, rest_end = 18, 100

In [124]:
def hammingDist(seq1, seq2):
    """
    Calculate the Hamming distance, i.e. the number of positions at which
    two equally long sequences differ.
    Args: seq1 and seq2 (str), must have the same length
    Return: distance (int)
    Raises: ValueError if the two sequences differ in length
    Note: we use .upper() to always compare upper case
    """
    # The Hamming distance is only defined for equal lengths. Without this check
    # a shorter seq2 would fail with a bare IndexError, and a longer seq2 would
    # have its tail silently ignored, giving a misleadingly small distance.
    if len(seq1) != len(seq2):
        raise ValueError(
            f"hammingDist requires sequences of equal length (got {len(seq1)} and {len(seq2)})"
        )
    return sum(a.upper() != b.upper() for a, b in zip(seq1, seq2))

In [125]:
## Open the Araport fasta file via pyfaidx; sequences are requested in upper case
araport = Fasta(
    filename=araport_file,
    sequence_always_upper=True,
)

In [126]:
## Open the baits fasta file via pyfaidx; sequences are requested in upper case
baits = Fasta(
    filename=baits_file,
    sequence_always_upper=True,
)

In [128]:
## Loop over bam files. Note: we are only counting presence of the "gene signature" at this stage.
for bam_file in bam_files:

    ## Read baits_file and create bam-specific dictionaries:
    ##   baits_gene_seq_dict:   bait name (e.g. SmB-A_AT5G44500.1) -> first gene_length bases of the bait (the "gene")
    ##   baits_gene_name_dict:  "gene" string (e.g. CATCGACAT)     -> bait name
    ##   baits_gene_count_dict: bait name                          -> read count, reset to 0 for every bam file
    baits_gene_seq_dict = {}
    baits_gene_name_dict = {}
    baits_gene_count_dict = {}
    for seq in baits:
        bait_gene = seq[0:gene_length]  # slice once, reuse for both dictionaries
        baits_gene_seq_dict[seq.name] = bait_gene
        baits_gene_name_dict[str(bait_gene)] = seq.name
        baits_gene_count_dict[seq.name] = 0

    ## Read bam file. Needs to be indexed: `samtools index -@ 8 B1_EKDL230008374.merged.bam`
    ## The context manager guarantees the file is closed even if a read raises mid-loop.
    with pysam.AlignmentFile(bam_file, 'rb', threads=2) as bam:
        for aso in bam.fetch():
            read_seq = aso.query_sequence
            if read_seq is None:
                ## Record carries no sequence; nothing to parse
                continue
            aln_start = aso.query_alignment_start

            ## Bait part: everything in the read before the aligned (araport) part
            bait_name = aso.query_name.split(':')[0]
            seq_bait_part = read_seq[:aln_start]
            if len(seq_bait_part) < min_length_seq_bait_part:
                #print(f"DEBUG: SKIPPING: Length of seq_bait_part was lower than min_length_seq_bait_part ({len(seq_bait_part)})")
                continue

            ## If length of seq_bait_part is longer or equal than min_length_seq_bait_part (100), try to parse from border to araport minus 100
            seq_x = read_seq[aln_start - min_length_seq_bait_part:aln_start]
            gene_seq = seq_x[gene_start:gene_end]
            if gene_seq not in baits_gene_name_dict:
                #print(f"DEBUG: SKIPPING: Did NOT find gene {gene_seq} in dict")
                continue

            ## NOTE(review): the read is accepted when gene_seq matches *any* bait's "gene",
            ## but the count is added to bait_name taken from the read name. The two are not
            ## checked against each other, and a bait_name absent from the baits file raises
            ## KeyError here. Confirm that this is the intended behaviour.
            baits_gene_count_dict[bait_name] += 1

    # End
    print(f"\nBam file: {bam_file}")
    print("Counts:")
    for k, v in sorted(baits_gene_count_dict.items(), key=lambda x: x[1], reverse=True):
        print(k, "=", v)


Bam file: tmp/B1_EKDL230008374-1A_HF5CYDSX7_L2.merged.bam
Counts:
LSM4_AT5G27720.1 = 575
SmG-B_AT3G11500.1 = 269
LSM7_AT2G03870.2 = 260
SmG-A_AT2G23930.1 = 228
LSM6B_AT2G43810.1 = 169
LSM6A_AT3G59810.1 = 41
SmD3-B_AT1G20580.1 = 34
SmD2-A_AT2G47640.1 = 26
SmD2-B_AT3G62840.1 = 26
SmB-A_AT5G44500.1 = 21
SmD3-A_AT1G76300.1 = 12
LSM1B_AT3G14080.1 = 12
LSM3A_AT1G21190.1 = 10
SmB-B_AT4G20440.1 = 9
SmD1-B_AT4G02840.2 = 9
LSM2_AT1G03330.1 = 5
LSM3B_AT1G76860.1 = 4
SmF-A_AT4G30220.2 = 3
SmD1-A_AT3G07590.1 = 2
LSM1A_AT1G19120.1 = 2
LSM5_AT5G48870.1 = 2
SmE-B_AT4G30330.1 = 1
SmE-A_AT2G18740.1 = 0
SmF-B_AT2G14285.1 = 0
LSM8_AT1G65700.3 = 0

Bam file: tmp/B2_EKDL230008375-1A_HF5CYDSX7_L2.merged.bam
Counts:
LSM4_AT5G27720.1 = 450
SmG-A_AT2G23930.1 = 230
LSM6B_AT2G43810.1 = 222
LSM7_AT2G03870.2 = 193
SmG-B_AT3G11500.1 = 158
SmB-A_AT5G44500.1 = 24
LSM6A_AT3G59810.1 = 21
SmD2-A_AT2G47640.1 = 18
SmD2-B_AT3G62840.1 = 17
SmD3-B_AT1G20580.1 = 17
SmB-B_AT4G20440.1 = 4
SmD1-B_AT4G02840.2 = 4
LSM5_AT5G48870.1

In [None]:
## Read baits_file and create hashes with key=SmB-A_AT5G44500.1, value=CATCGACAT ("gene") and value=gene_count
#baits_gene_seq_dict = {}
#baits_gene_name_dict = {}
#baits_gene_count_dict = {}
#
#for seq in baits:
#    baits_gene_seq_dict[seq.name] = seq[0:gene_length]
#    baits_gene_name_dict[str(seq[0:gene_length])] = seq.name
#    baits_gene_count_dict[seq.name] = 0

In [None]:
## Read bam file. Needs to be indexed: `samtools index -@ 8 B1_EKDL230008374.merged.bam`
#bam = pysam.AlignmentFile(bam_file, 'rb', threads=2)

In [None]:
## Loop. Bam file needs to be opened, and dictionaries need to be reset if loop is re-run.
#print("## Begin test loop")
#for aso in bam.head(10):
##for aso in bam.fetch():
#    #help(aso)
#    ## Bait part
#    bait_name = aso.query_name.split(':')[0]
#    seq_bait_part = Seq(str(aso.query_sequence[:aso.query_alignment_start]))
#    if len(seq_bait_part) < min_length_seq_bait_part:
#        #print(f"DEBUG: SKIPPING: Length of seq_bait_part was lower than min_length_seq_bait_part ({len(seq_bait_part)})")
#        continue
#    
#    #print("\nbait (", bait_name, "): ", baits[bait_name])
#    #print("seq_bait_part: ", seq_bait_part)
#    #print("seq_bait_part_minus_100: ", str(aso.query_sequence[aso.query_alignment_start-min_length_seq_bait_part:aso.query_alignment_start]))
#    #print("bait (query) seq: ", aso.query_sequence)
#
#    #break
#
#    ## If length of seq_bait_part is longer or equal than min_length_seq_bait_part (100), try to parse from border to araport minus 100
#    seq_x = str(aso.query_sequence[aso.query_alignment_start-min_length_seq_bait_part:aso.query_alignment_start])
#    #print("seq_x        : ", seq_x)
#    #print("seq_bait_part: ", seq_bait_part)
#    #print("db seq       : ", baits[bait_name], "\n")
#    gene_seq = seq_x[gene_start:gene_end]
#    if baits_gene_name_dict.get(gene_seq) is None:
#        #print(f"DEBUG: SKIPPING: Did NOT find gene {gene_seq} in dict")
#        continue
#    #print(f"DEBUG: found gene in dict: {gene_seq}")
#    
#    baits_gene_count_dict[bait_name] = baits_gene_count_dict[bait_name] + 1
#
#    #hd = hammingDist(str(read_eco571_seq), str(accepted_eco571))
#    #print("hd: ", hd)
#    #if hd <= allowed_hd_m1:
#    #    print(f"DEBUG:       found Eco571: {seq_x[eco571_start:eco571_end]}")
#    #else:
#    #    print(f"DEBUG: SKIPPING: HD higher than {allowed_hd_m1} for Eco571 ({accepted_eco571}): {seq_x[eco571_start:eco571_end]}")
#    #    continue
#
#    ## Araport part
#    #araport_name = aso.reference_name
#    #print("araport name: ", araport_name) # AT1G78300.1
#    #seq_araport_part = Seq(str(aso.query_sequence[aso.query_alignment_start:]))
#    #print("araport (ref) sequence: ", aso.get_reference_sequence())
#    #print("seq_araport_part: ", seq_araport_part)
#    #if str(seq_araport_part) == aso.get_reference_sequence():
#    #    print("identical")
#
#
#    #print(str(araport[query_name])) # 985 bp sequence
#    #seq_araport_db = Seq(str(araport[query_name]))
#    #alignments = pairwise2.align.localms(seq_araport_part, seq_araport_db, match=2, mismatch=-1, open=-5, extend=-1, one_alignment_only=True)
#    #for match in alignments:
#    #    print(match)
#    ## Translation
#    #trsl = seq_araport_part.translate(table=11, cds=True) # Error
#    #fixed_seq = Seq('ACTTTATCTCCCAAAACACAAAACAAAAAAAA-TGGCTTCCACTGCTCTCTCAAGCGCC')
#    #fixed_seq.translate(table=11, cds=True)
#
#print("## End of test loop")


In [None]:
"""
seq_bait_parts longer than 100 bp compared with the fasta db (Baits) entries.
NOTE:
For the match against LSM4_AT5G27720.1, the seq_bait_part matches perfectly but is missing the last base.
Using the "minus 100" index will discard the read (despite perfect match).

bait ( SmF-B_AT2G14285.1 ):                    GTATTCCATCTTCAGGGAACTAGTTTGATTCGACCCACCGATACCGTTCGTATAATGTATGCTATACGAACGGTATCACAAGTTTGTACAAAAAAGTTGG
seq_bait_part:  AGCTGGGCGGCCTGGTGTAAACTGAACGCGAGCCCGTCTAACCCGGGTGCCAGCAATGGCAGCGGCGCGCCAATAACTTCGTATAATGTATGCTATACGAACGGTATCACAAGTTTGTACAAAAAAGTTGG
seq_bait_part_minus_100:                       GCCCGTCTAACCCGGGTGCCAGCAATGGCAGCGGCGCGCCAATAACTTCGTATAATGTATGCTATACGAACGGTATCACAAGTTTGTACAAAAAAGTTGG

bait ( LSM4_AT5G27720.1 ):                                            AGGAAGCATCTTCAGACTACTAGTTTGATTCGACCCACCGATACCGTTCGTATAATGTATGCTATACGAACGGTATCACAAGTTTGTACAAAAAAGTTGG
seq_bait_part:  ATTCTTAAGCTCCACGAGCATAGGATGCCCTTGAGCAGTTTTAAGCAGCGATAGAGGAAGCATCTTCAGACTACTAGTTTGATTCGACCCACCGATACCGTTCGTATAATGTATGCTATACGAACGGTATCACAAGTTTGTACAAAAAAGTTG
seq_bait_part_minus_100:                                             GAGGAAGCATCTTCAGACTACTAGTTTGATTCGACCCACCGATACCGTTCGTATAATGTATGCTATACGAACGGTATCACAAGTTTGTACAAAAAAGTTG

bait ( SmD1-A_AT3G07590.1 ):  GAGCTTCATCTTCAGACTACTAGTTTGATTCGACCCACCGATACCGTTCGTATAATGTATGCTATACGAACGGTATCACAAGTTTGTACAAAAAAGTTGG
seq_bait_part:              GAGCCCGTCTAACCCGGGTGCCAGCAATGGCAGCGGCGCGCCAATAACTTCGTATAATGTATGCTATACGAACGGTATCACAAGTTTGTACAAAAAAGTTGG
seq_bait_part_minus_100:      GCCCGTCTAACCCGGGTGCCAGCAATGGCAGCGGCGCGCCAATAACTTCGTATAATGTATGCTATACGAACGGTATCACAAGTTTGTACAAAAAAGTTGG

bait ( SmE-A_AT2G18740.1 ):  GCTCGCCATCTTCAGACTACTAGTTTGATTCGACCCACCGATACCGTTCGTATAATGTATGCTATACGAACGGTATCACAAGTTTGTACAAAAAAGTTGG
seq_bait_part:       GAACGCGAGCCCGTCTAACCCGGGTGCCAGCAATGGCAGCGGCGCGCCAATAACTTCGTATAATGTATGCTATACGAACGGTATCACAAGTTTGTACAAAAAAGTTGG
seq_bait_part_minus_100:     GCCCGTCTAACCCGGGTGCCAGCAATGGCAGCGGCGCGCCAATAACTTCGTATAATGTATGCTATACGAACGGTATCACAAGTTTGTACAAAAAAGTTGG


"""

In [None]:
## Tmp counters
#len_seq_bait_part_ok_counter = 0
#len_seq_bait_part_not_ok_counter = 0
#genes_in_baits_counter = 0

In [None]:
# End
#bam.close()
#
#print(f"Bam file: {bam_file}")
##print(f"Nr bait length OK: {len_seq_bait_part_ok_counter}")
##print(f"Nr bait length not OK: {len_seq_bait_part_not_ok_counter}")
##print(f"Nr genes found in baits seqs: {genes_in_baits_counter}")
#
#print(f"Counts:")
#for k,v in sorted(baits_gene_count_dict.items(), key=lambda x:x[1], reverse=True):
#    print(k, "=", v)