In [21]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import re

In [2]:
# set path to experimental data
data_dir = '../data/experimental_data/'

In [3]:
# load sam files for bound 8 8, left and right reads
# skip header
# only read in first 10 columns
bound_8_8_left = np.loadtxt(data_dir + '/secondFromProt8/prot8/GAG_BO_8_8.1.sam', dtype=str, delimiter='\t', skiprows=3, usecols=[0,1,3,4,5,9,10])
bound_8_8_right = np.loadtxt(data_dir + '/secondFromProt8/prot8/GAG_BO_8_8.2.sam', dtype=str, delimiter='\t', skiprows=3, usecols=[0,1,3,4,5,9,10])

In [4]:
print(bound_8_8_left.shape)
print(bound_8_8_right.shape)
print(bound_8_8_left[0])
print(bound_8_8_right[0])

(3483872, 7)
(3483872, 7)
['NB500937:402:HY7YGAFXY:1:11101:24170:1067' '0' '21' '70' '6S70M'
 'GACCATATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCGATAAAGCCTGCCTTGAG'
 'A/AAAEEAEEEAEEEEEEEEEEEEAEAEEEAEEEAEEEE//EEE/EAEE/EE/EEEEEEEAEEEEAEEEEAE/EEE']
['NB500937:402:HY7YGAFXY:1:11101:24170:1067' '16' '230' '70' '75M1S'
 'TCTCGACACAGGACTCGGCTTGCTGAAGCGCGCACGGCAAGAGGCGAGGGGCGGCGACTGGTGAGTACGCCAAAAT'
 '<E/EEEEAEEEE/E<EEEE/EEE/E/EAEAEEEEEEEEEAEAEEEEEEEEEEEEEEEEEEE/EEEEEEEEEAAAAA']


In [5]:
# show all unique flag values for column 2
print(np.unique(bound_8_8_left[:,1]))

['0' '16' '4']


In [6]:
# get rid of all reads that have flag value 4 in either left or right read
indices = np.where((bound_8_8_left[:,1] != '4') & (bound_8_8_right[:,1] != '4'))
bound_8_8_left = bound_8_8_left[indices]
bound_8_8_right = bound_8_8_right[indices]
print(bound_8_8_left.shape)
print(bound_8_8_right.shape)

(1327259, 7)
(1327259, 7)


In [7]:
# get rid of all reads that have a mapping quality < 70 in either left or right read
indices = np.where((bound_8_8_left[:,3].astype(int) >= 70) & (bound_8_8_right[:,3].astype(int) >= 70))
bound_8_8_left = bound_8_8_left[indices]
bound_8_8_right = bound_8_8_right[indices]
print(bound_8_8_left.shape)
print(bound_8_8_right.shape)

(1326945, 7)
(1326945, 7)


In [8]:
# get all cigar strings
cigar_strings = np.concatenate([bound_8_8_left[:,4], bound_8_8_right[:,4]])
# get all unique letters that appear in cigar strings
cigar_letters = np.unique(np.concatenate([list(x) for x in cigar_strings]))
print(cigar_letters)

['0' '1' '2' '3' '4' '5' '6' '7' '8' '9' 'D' 'I' 'M' 'S']


In [9]:
# count the number of instances of "I" in the cigar strings
insertions = np.sum([x.count('I') for x in cigar_strings])
print(insertions)

67081


In [10]:
# count the number of instances of "D" in the cigar strings
deletions = np.sum([x.count('D') for x in cigar_strings])
print(deletions)

94655


In [14]:
# get rid of all reads that have an insertion or deletion in either left or right read
# check if cigar string contains "I" or "D"
indices = np.where((np.char.find(bound_8_8_left[:,4], 'I') == -1) & (np.char.find(bound_8_8_left[:,4], 'D') == -1) & (np.char.find(bound_8_8_right[:,4], 'I') == -1) & (np.char.find(bound_8_8_right[:,4], 'D') == -1))
bound_8_8_left = bound_8_8_left[indices]
bound_8_8_right = bound_8_8_right[indices]
print(bound_8_8_left.shape)

(1188258, 7)


In [16]:
cigar_strings = np.concatenate([bound_8_8_left[:,4], bound_8_8_right[:,4]])
insertions = np.sum([x.count('I') for x in cigar_strings])
print(insertions)
deletions = np.sum([x.count('D') for x in cigar_strings])
print(deletions)

0
0


In [17]:
# get all sequences
seqs = np.concatenate([bound_8_8_left[:,5], bound_8_8_right[:,5]])
# get all unique letters that appear in sequences
seq_letters = np.unique(np.concatenate([list(x) for x in seqs]))
print(seq_letters)

['A' 'C' 'G' 'N' 'T']


In [12]:
# count the number of instances of "N" in the sequences
Ns = np.sum([x.count('N') for x in seqs])
print(Ns)

2920


In [13]:
# check if ids are the same for left and right reads
print(np.all(bound_8_8_left[:,0] == bound_8_8_right[:,0]))

True


In [19]:
print(bound_8_8_left[:1])
print(bound_8_8_right[:1])

[['NB500937:402:HY7YGAFXY:1:11101:24170:1067' '0' '21' '70' '6S70M'
  'GACCATATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCGATAAAGCCTGCCTTGAG'
  'A/AAAEEAEEEAEEEEEEEEEEEEAEAEEEAEEEAEEEE//EEE/EAEE/EE/EEEEEEEAEEEEAEEEEAE/EEE']]
[['NB500937:402:HY7YGAFXY:1:11101:24170:1067' '16' '230' '70' '75M1S'
  'TCTCGACACAGGACTCGGCTTGCTGAAGCGCGCACGGCAAGAGGCGAGGGGCGGCGACTGGTGAGTACGCCAAAAT'
  '<E/EEEEAEEEE/E<EEEE/EEE/E/EAEAEEEEEEEEEAEAEEEEEEEEEEEEEEEEEEE/EEEEEEEEEAAAAA']]


In [35]:
# parse cigar string into binary string where 1 indicates a match and 0 indicates a mismatch
def parse_cigar(cigar_string):
    # get only numbers from cigar string
    numbers = re.findall(r'\d+', cigar_string)
    # get only letters from cigar string
    letters = re.findall(r'[A-Z]', cigar_string)
    # iterate through numbers
    sequence = ''
    for i in range(len(numbers)):
        # if letter is not a valid symbol, set number to 0
        if letters[i] == 'M':
            sequence += '1' * int(numbers[i])
        else:
            sequence += '0' * int(numbers[i])
    return sequence



print(parse_cigar('6S70M'))
print(parse_cigar('75M1S'))

0000001111111111111111111111111111111111111111111111111111111111111111111111
1111111111111111111111111111111111111111111111111111111111111111111111111110


In [36]:
# parse quality string into binary string where 1 indicates a high quality base and 0 indicates a low quality base
def parse_quality(quality_string):
    valid_symbols = ('?', "@", 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I')
    sequence = ''
    for i in range(len(quality_string)):
        if quality_string[i] in valid_symbols:
            sequence += '1'
        else:
            sequence += '0'
    return sequence

print(parse_quality('A/AAAEEAEEEAEEEEEEEEEEEEAEAEEEAEEEAEEEE//EEE/EAEE/EE/EEEEEEEAEEEEAEEEEAE/EEE'))
print(parse_quality('<E/EEEEAEEEE/E<EEEE/EEE/E/EAEAEEEEEEEEEAEAEEEEEEEEEEEEEEEEEEE/EEEEEEEEEAAAAA'))

1011111111111111111111111111111111111110011101111011011111111111111111110111
0101111111110101111011101011111111111111111111111111111111111011111111111111


In [41]:
# combine cigar and quality strings into one binary string
def parse_cigar_quality(cigar_string, quality_string):
    quality_seq = parse_quality(quality_string)
    cigar_seq = parse_cigar(cigar_string)
    sequence = ''
    for i in range(len(cigar_seq)):
        if cigar_seq[i] == '1' and quality_seq[i] == '1':
            sequence += '1'
        else:
            sequence += '0'
    return sequence

print(parse_cigar_quality('6S70M', 'A/AAAEEAEEEAEEEEEEEEEEEEAEAEEEAEEEAEEEE//EEE/EAEE/EE/EEEEEEEAEEEEAEEEEAE/EEE'))
print(parse_cigar_quality('75M1S', '<E/EEEEAEEEE/E<EEEE/EEE/E/EAEAEEEEEEEEEAEAEEEEEEEEEEEEEEEEEEE/EEEEEEEEEAAAAA'))

0000001111111111111111111111111111111110011101111011011111111111111111110111
0101111111110101111011101011111111111111111111111111111111111011111111111110


In [42]:
def parse_sequence(sequence, cigar_string, quality_string):
    valid_symbols = ('A', 'C', 'G', 'T')
    checked_seq = parse_cigar_quality(cigar_string, quality_string)
    parsed_sequence = ''
    for i in range(len(checked_seq)):
        if checked_seq[i] == '1' and sequence[i] in valid_symbols:
            parsed_sequence += sequence[i]
        else:
            parsed_sequence += '0'
    return parsed_sequence

print(parse_sequence('GACCATATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCGATAAAGCCTGCCTTGAG', '6S70M', 'A/AAAEEAEEEAEEEEEEEEEEEEAEAEEEAEEEAEEEE//EEE/EAEE/EE/EEEEEEEAEEEEAEEEEAE/EEE'))
print(parse_sequence('TCTCGACACAGGACTCGGCTTGCTGAAGCGCGCACGGCAAGAGGCGAGGGGCGGCGACTGGTGAGTACGCCAAAAT', '75M1S', '<E/EEEEAEEEE/E<EEEE/EEE/E/EAEAEEEEEEEEEAEAEEEEEEEEEEEEEEEEEEE/EEEEEEEEEAAAAA'))

000000ATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGG00CCC0CTGC0TA0GCCTCGATAAAGCCTGCCT0GAG
0C0CGACACAGG0C0CGGC0TGC0G0AGCGCGCACGGCAAGAGGCGAGGGGCGGCGACTGG0GAGTACGCCAAAA0


In [43]:
print(bound_8_8_left[:1])
print(bound_8_8_right[:1])

[['NB500937:402:HY7YGAFXY:1:11101:24170:1067' '0' '21' '70' '6S70M'
  'GACCATATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCGATAAAGCCTGCCTTGAG'
  'A/AAAEEAEEEAEEEEEEEEEEEEAEAEEEAEEEAEEEE//EEE/EAEE/EE/EEEEEEEAEEEEAEEEEAE/EEE']]
[['NB500937:402:HY7YGAFXY:1:11101:24170:1067' '16' '230' '70' '75M1S'
  'TCTCGACACAGGACTCGGCTTGCTGAAGCGCGCACGGCAAGAGGCGAGGGGCGGCGACTGGTGAGTACGCCAAAAT'
  '<E/EEEEAEEEE/E<EEEE/EEE/E/EAEAEEEEEEEEEAEAEEEEEEEEEEEEEEEEEEE/EEEEEEEEEAAAAA']]


In [47]:
# get min value of column 3
# conver to int
left = np.min(bound_8_8_left[:,2].astype(int))
print(left)

right = np.min(bound_8_8_right[:,2].astype(int))
print(right)

1
1


In [51]:
# align fragment
def align_fragment(length_sequence, pos, cigar_string, sequence, quality_string):
    parsed_sequence = parse_sequence(sequence, cigar_string, quality_string)
    aligned_sequence = ''
    # add pos - 1 0s to the beginning of the sequence
    aligned_sequence += '0' * (pos - 1)
    # add parsed sequence to the end of the sequence
    aligned_sequence += parsed_sequence
    # add 0s to the end of the sequence
    aligned_sequence += '0' * (length_sequence - len(aligned_sequence))
    return aligned_sequence

left_seq = align_fragment(400, 21, '6S70M', 'GACCATATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGGAACCCACTGCTTAAGCCTCGATAAAGCCTGCCTTGAG', 'A/AAAEEAEEEAEEEEEEEEEEEEAEAEEEAEEEAEEEE//EEE/EAEE/EE/EEEEEEEAEEEEAEEEEAE/EEE')
right_seq = align_fragment(400, 230, '75M1S', 'TCTCGACACAGGACTCGGCTTGCTGAAGCGCGCACGGCAAGAGGCGAGGGGCGGCGACTGGTGAGTACGCCAAAAT', '<E/EEEEAEEEE/E<EEEE/EEE/E/EAEAEEEEEEEEEAEAEEEEEEEEEEEEEEEEEEE/EEEEEEEEEAAAAA')
print(left_seq)
print(right_seq)

00000000000000000000000000ATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGG00CCC0CTGC0TA0GCCTCGATAAAGCCTGCCT0GAG0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000C0CGACACAGG0C0CGGC0TGC0G0AGCGCGCACGGCAAGAGGCGAGGGGCGGCGACTGG0GAGTACGCCAAAA000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000


In [53]:
# merge aligned sequences
def merge_sequences(left_sequence, right_sequence, length_sequence):
    merged_sequence = ''
    for i in range(length_sequence):
        if left_sequence[i] == right_sequence[i]:
            merged_sequence += left_sequence[i]
        # if left sequence is 0, add right sequence
        elif left_sequence[i] == '0':
            merged_sequence += right_sequence[i]
        # if right sequence is 0, add left sequence
        elif right_sequence[i] == '0':
            merged_sequence += left_sequence[i]
        # if left and right sequences are different, add 0
        else:
            merged_sequence += '0'

    return merged_sequence

print(merge_sequences(left_seq, right_seq, 400))

00000000000000000000000000ATCTGAGCCTGGGAGCTCTCTGGCTAACTAGGG00CCC0CTGC0TA0GCCTCGATAAAGCCTGCCT0GAG00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000C0CGACACAGG0C0CGGC0TGC0G0AGCGCGCACGGCAAGAGGCGAGGGGCGGCGACTGG0GAGTACGCCAAAA000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
