In [1]:
import sys
import re
import time
import timeit
import random
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import statistics
%matplotlib inline

from Bio import SeqIO, pairwise2
from Bio.Seq import Seq
from Bio.SeqIO.FastaIO import FastaIterator
from Bio.pairwise2 import format_alignment
from collections import Counter, OrderedDict
from itertools import islice, groupby
from collections import Counter, namedtuple
from prettytable import PrettyTable
import import_ipynb
from Util import seqInfo, parseFasta, parseSeq, unifragmentList, fragmentLenPlot
from RepeatFinder import parseSeqByCutter, findRepeatSeqs, commonRepeatTable, commonRepeatSeqsPosition, commonPositionListToDic, printCommonPositionDic, topRepeatSeqCounter, printTopSeqPercentageTable
from DataInfo import currDataset, datasetPath, matchPattern, cutter, cutterLen, fragmentN, commonCount

importing Jupyter notebook from Util.ipynb
importing Jupyter notebook from RepeatFinder.ipynb
importing Jupyter notebook from DataInfo.ipynb


In [2]:
# Parse the FASTA dataset, then print its summary (sequence count, total length).
parseFastaSeqs = parseFasta(
    currDataset,
    datasetPath,
    matchPattern,
    False,  # NOTE(review): meaning of this positional flag lives in Util.ipynb — confirm
)
seqInfo(currDataset, parseFastaSeqs)

...start parsing DanioRerio.fasta fasta file ...
...cost19.552297830581665 sec to parse fasta file ...
DanioRerio.fasta dataset
 number of sequence:1922
 total length:1679186873



In [3]:
# Cut the parsed sequences with the configured cutter. The call returns two
# values: the fragment lengths and the fragment sequences.
# NOTE: the helper logs a start/cost line pair over and over, so this cell's
# output is very long.
(fragmentsLenList, fragmentsSeqList) = parseSeqByCutter(parseFastaSeqs)

y cutter GATC ...
...cost 0.0035638809204101562 sec to cut sequence
...start cutting sequence by cutter GATC ...
...cost 0.002920866012573242 sec to cut sequence
...start cutting sequence by cutter GATC ...
...cost 0.0016489028930664062 sec to cut sequence
...start cutting sequence by cutter GATC ...
...cost 0.0024929046630859375 sec to cut sequence
...start cutting sequence by cutter GATC ...
...cost 0.0023279190063476562 sec to cut sequence
...start cutting sequence by cutter GATC ...
...cost 0.002051830291748047 sec to cut sequence
...start cutting sequence by cutter GATC ...
...cost 0.0035419464111328125 sec to cut sequence
...start cutting sequence by cutter GATC ...
...cost 0.004121065139770508 sec to cut sequence
...start cutting sequence by cutter GATC ...
...cost 0.00257110595703125 sec to cut sequence
...start cutting sequence by cutter GATC ...
...cost 0.0016160011291503906 sec to cut sequence
...start cutting sequence by cutter GATC ...
...cost 0.0026581287384033203 sec to 

In [4]:
# Search the fragment-length lists for repeated length patterns.
(commonRepeatInfo, repeatCount, positionDic) = findRepeatSeqs(
    fragmentsLenList,
    cutter,
    fragmentN,
    commonCount,
)

... start finding repeat seq ...
...cost8.735102891921997 sec to finding repeat seq  ...


In [5]:
# Print the table of fragment-length patterns together with their counts.
commonRepeatTable(
    commonRepeatInfo,
    positionDic,
    commonCount,
)

+----------------------+-------+
|     FragmentLen      | Count |
+----------------------+-------+
| (27, 27, 27, 27, 27) |  121  |
| (10, 10, 10, 10, 10) |   87  |
| (14, 14, 14, 14, 14) |   87  |
| (12, 12, 12, 12, 12) |   74  |
| (28, 28, 28, 28, 28) |   54  |
| (40, 40, 40, 40, 40) |   40  |
|  (2, 41, 2, 41, 2)   |   40  |
|  (41, 2, 41, 2, 41)  |   40  |
| (21, 21, 21, 21, 21) |   37  |
|   (2, 2, 2, 2, 2)    |   36  |
+----------------------+-------+


In [6]:
# Positions of the `commonCount` most common repeat patterns, first as a list
# and then converted to a dictionary.
commonPositionList = commonRepeatSeqsPosition(
    fragmentsLenList,
    commonRepeatInfo,
    positionDic,
    commonCount,
)
commonPositionDic = commonPositionListToDic(commonPositionList)
# Optional inspection, left disabled: printCommonPositionDic(commonPositionDic)

In [7]:
# Count the sequences that sit behind the top (first-listed) repeat pattern.
(singleRepeatSeqCount, comRepeatSeqCount) = topRepeatSeqCounter(
    fragmentsSeqList,
    commonRepeatInfo,
    positionDic,
    fragmentN,
)

the top repeat sequence: (27, 27, 27, 27, 27),121


In [8]:
 printTopSeqPercentageTable(comRepeatSeqCount, showPartialSeq=True)

+------------+-------+--------------------+
|    Seq     | Count |   Percentage(%)    |
+------------+-------+--------------------+
| AGTGTAGTGT |   4   | 3.3057851239669422 |
| AGTGTAGTGT |   3   | 2.479338842975207  |
| AGTGTAGTGT |   3   | 2.479338842975207  |
| AGTGTAGTGT |   3   | 2.479338842975207  |
| AGTGTAGTGT |   3   | 2.479338842975207  |
| AGTGTAGTGT |   3   | 2.479338842975207  |
| ACCGTGGAGA |   2   | 1.6528925619834711 |
| AGTGTAGTGT |   2   | 1.6528925619834711 |
| AGTGTAGTGT |   2   | 1.6528925619834711 |
| AGTGTAGTGT |   2   | 1.6528925619834711 |
+------------+-------+--------------------+


In [9]:
# Length tuple of the first (top) entry in commonRepeatInfo.
topRepeatSeqKey = commonRepeatInfo[0][0]

# Tolerance is 5% of the summed fragment lengths of that key.
topKeySum = sum(topRepeatSeqKey)
tolerance = topKeySum * 0.05

topRepeatSeqKey

(27, 27, 27, 27, 27)

In [15]:
# Collect every length-tuple key whose first `fragmentN` entries each differ
# from the top repeat key by strictly less than `tolerance`.
positionKeys = list(positionDic.keys())

# all() replaces the manual flag/break bookkeeping. The previous version set
# its flag inside the inner loop, so with fragmentN == 0 the flag was undefined
# (or stale from the previous key); all() over an empty range is simply True.
matchKeys = [
    candidateKey
    for candidateKey in positionKeys
    if all(
        abs(topRepeatSeqKey[j] - candidateKey[j]) < tolerance
        for j in range(fragmentN)
    )
]

# Preview only the first ten matches to keep the output short.
matchKeys[:10]

[(28, 28, 28, 28, 28),
 (27, 27, 24, 27, 27),
 (27, 24, 27, 27, 27),
 (24, 27, 27, 27, 27),
 (27, 27, 27, 27, 27),
 (27, 27, 27, 27, 26),
 (27, 27, 27, 26, 27),
 (27, 27, 26, 27, 27),
 (27, 26, 27, 27, 27),
 (26, 27, 27, 27, 27)]

In [11]:
# For every matching key, fetch the value stored for it in positionDic
# (order follows matchKeys).
matchLenList = [positionDic[matchKey] for matchKey in matchKeys]

In [12]:
# Flatten the per-key lists into one list of entries.
flattenMatchSeq = [entry for entries in matchLenList for entry in entries]

# Group the second element of each entry under its first element.
# A plain loop with setdefault replaces the side-effect-only list comprehension,
# which built a throwaway list of None and copied the key list on every
# membership test (accidentally quadratic).
# NOTE(review): entry[0] / entry[1] are presumably (sequence index, start
# index), judging by how the next cell consumes them — confirm.
candidatePoiDic = dict()
for entry in flattenMatchSeq:
    candidatePoiDic.setdefault(entry[0], []).append(entry[1])

# Same grouping, ordered by sorted (key, value) items.
sortedCandidatePoiDic = OrderedDict(sorted(candidatePoiDic.items()))

In [13]:
# For every key of sortedCandidatePoiDic, join the `fragmentN` consecutive
# fragments that start at each recorded index into one sequence.
matchSeqDic = dict()
for chrIndex, startIndexes in sortedCandidatePoiDic.items():
    matchSeqDic[chrIndex] = [
        Seq('').join(fragmentsSeqList[chrIndex][startIndex:startIndex + fragmentN])
        for startIndex in startIndexes
    ]

In [17]:
# Globally align the first two joined sequences stored under key 0
# (globalxx: pairwise2's default match scoring, no gap penalties).
alignments = pairwise2.align.globalxx(
    matchSeqDic[0][0],
    matchSeqDic[0][1],
)
# Show the first alignment returned.
alignments[0]

Alignment(seqA='TGCGgtaaacaccctaacaaccaaaaTATG-CGgtaaacaccctaacaaccaaaaTATACG--gtaaacaccctaacaaccaaaaTATGCAgtaaacaccctaacaaccaaaa----TATGCAgtaaacaccctaacaacctAAA----TA', seqB='TGCGgtaaacaccctaacaaccaaaaTAT-ACGgtaaacaccctaacaaccaaaaTAT--GCAgtaaacaccctaacaaccaaaaTATGCAgtaaacaccctaacaacc----tAAATATGCAgtaaacaccctaacaacc----aaaaTA', score=129.0, start=0, end=151)

In [24]:
# Print the first alignment: both gapped sequences (fields 0 and 1), the
# aligned length, and the score (field 2) truncated to an integer.
print(f"Seq1:{alignments[0][0]}\n\nSeq2:{alignments[0][1]}\n\nlength:{len(alignments[0][0])}, score:{int(alignments[0][2])}")

Seq1:TGCGgtaaacaccctaacaaccaaaaTATG-CGgtaaacaccctaacaaccaaaaTATACG--gtaaacaccctaacaaccaaaaTATGCAgtaaacaccctaacaaccaaaa----TATGCAgtaaacaccctaacaacctAAA----TA

Seq2:TGCGgtaaacaccctaacaaccaaaaTAT-ACGgtaaacaccctaacaaccaaaaTAT--GCAgtaaacaccctaacaaccaaaaTATGCAgtaaacaccctaacaacc----tAAATATGCAgtaaacaccctaacaacc----aaaaTA

length:151, score:129


In [None]:
# Runnable version of the tolerance-matching plan. The earlier draft of this
# cell was pseudo-code and could not be parsed (SyntaxError in the list
# expression; undefined candidateLength, alignment and i).
# NOTE: this cell rebinds cutter, tolerance and matchLenList, which earlier
# cells also assign.
N = 5
cutter = "GATC"
fragmentLength = [27, 27, 27, 27, 27]
tolerance = sum(fragmentLength) * 0.05

# Candidate length tuples are the keys of positionDic, as in the matchKeys cell.
# A candidate matches when each of its first N lengths is within `tolerance`
# of the reference. abs() makes the check symmetric, so candidates that are
# too long are rejected as well as ones that are too short.
candidateLengthList = list(positionDic.keys())
matchLenList = [
    candidateLength
    for candidateLength in candidateLengthList
    if all(
        abs(fragmentLength[i] - candidateLength[i]) <= tolerance
        for i in range(N)
    )
]
# e.g. matchLenList = [(28, 28, 28, 28, 28), (24, 27, 27, 27, 27), (27, 27, 27, 27, 26), (27, 27, 27, 26, 27)...]

# TODO: run alignment(fragmentLength, candidateLength) for each entry of
# matchLenList (dynamic programming). No `alignment` function is defined in
# the visible part of this notebook yet.