In [91]:
import sys
import re
import time
import timeit
import random
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import statistics
%matplotlib inline

from Bio import SeqIO, pairwise2
from Bio.Seq import Seq
from Bio.SeqIO.FastaIO import FastaIterator
from Bio.pairwise2 import format_alignment
from collections import Counter, OrderedDict, namedtuple
from itertools import islice, groupby, permutations
from prettytable import PrettyTable

import import_ipynb
from Util import seqInfo, parseFasta, parseSeq, parseSeqByCutter, unifragmentList, fragmentLenPlot
from RepeatFinder import findRepeatSeqs, commonRepeatTable, commonRepeatSeqsPosition, commonPositionListToDic, printCommonPositionDic, topRepeatSeqCounter, printTopSeqPercentageTable
from DataInfo import currDataset, datasetPath, matchPattern, cutter, cutterLen, fragmentN, commonCount

In [2]:
# parse fasta file & data info
parseFastaSeqs = parseFasta(currDataset, datasetPath, matchPattern, False)
seqInfo(currDataset, parseFastaSeqs)

...start parsing DanioRerio.fasta fasta file ...
...cost20.00064492225647 sec to parse fasta file ...
DanioRerio.fasta dataset
 number of sequence:1922
 total length:1679186873



In [3]:
fragmentsLenList, fragmentsSeqList = parseSeqByCutter(parseFastaSeqs)

...start parse seq by cutter: GATC
...cost 22.097705841064453 sec to cut sequence


In [11]:
allRepeatSeqInfo, commonRepeatInfo, repeatCount, positionDic = findRepeatSeqs(fragmentsLenList, cutter , fragmentN, commonCount)

... start finding repeat seq ...
...cost11.525821924209595 sec to finding repeat seq  ...


In [5]:
commonRepeatTable(commonRepeatInfo, positionDic, commonCount)

+-----------------------------+-------+
|         FragmentLen         | Count |
+-----------------------------+-------+
|     (27, 27, 27, 27, 27)    |  121  |
|     (10, 10, 10, 10, 10)    |   87  |
|     (14, 14, 14, 14, 14)    |   87  |
|     (12, 12, 12, 12, 12)    |   74  |
|     (28, 28, 28, 28, 28)    |   54  |
|     (40, 40, 40, 40, 40)    |   40  |
|      (2, 41, 2, 41, 2)      |   40  |
|      (41, 2, 41, 2, 41)     |   40  |
|     (21, 21, 21, 21, 21)    |   37  |
|       (2, 2, 2, 2, 2)       |   36  |
|     (31, 31, 31, 31, 31)    |   36  |
| (124, 1959, 124, 1959, 124) |   34  |
|   (16, 161, 35, 267, 184)   |   33  |
|     (33, 33, 33, 33, 33)    |   33  |
|     (22, 22, 22, 22, 22)    |   33  |
|     (35, 35, 35, 35, 35)    |   33  |
|      (5, 41, 5, 41, 5)      |   32  |
|    (43, 16, 161, 35, 267)   |   32  |
|     (15, 15, 15, 15, 15)    |   30  |
|     (24, 24, 24, 24, 24)    |   30  |
+-----------------------------+-------+


In [26]:
allRepeatSeqInfo = [(k, v) for k, v in allRepeatSeqInfo.items()]

[((2, 8, 32, 8, 2), 2),
 ((389, 358, 21, 103, 2), 4),
 ((232, 179, 52, 733, 505), 2),
 ((179, 52, 733, 505, 341), 2),
 ((52, 733, 505, 341, 212), 2),
 ((733, 505, 341, 212, 89), 2),
 ((505, 341, 212, 89, 383), 2),
 ((341, 212, 89, 383, 638), 2),
 ((212, 89, 383, 638, 1331), 2),
 ((31, 156, 86, 243, 100), 2),
 ((156, 86, 243, 100, 419), 2),
 ((86, 243, 100, 419, 128), 2)]

In [86]:
# return each SeqRepeatInfo of List, SeqRepeatInfo[ fragmentLenList, count, position: (chrIdx, fragmentIdx, baseIdx, seq) ]

def integrateRepeatInfo(fragmentsLenList, allRepeatSeqInfo, positionDic, repeatCount):
    SeqRepeatInfo = namedtuple('SeqRepeatInfo', ['fragmentLenList', 'count', 'position'])
    PositionInfo = namedtuple('PositionInfo', ['chrIdx', 'fragmentIdx', 'baseIdx', 'seq'])
    repeatInfoList = []
    for i in range(repeatCount):
        positionList = []
        targetFragmentLenList = allRepeatSeqInfo[i][0]
        targetFragmentDic = positionDic[targetFragmentLenList]
        for j in range(len(targetFragmentDic)):
            chrIdx = targetFragmentDic[j][0]
            fragmentIdx = targetFragmentDic[j][1]
            baseIdx = sum(fragmentsLenList[chrIndex][:fragmentIdx])+(cutterLen*fragmentIdx)
            seq = Seq('').join(fragmentsSeqList[chrIdx][fragmentIdx:fragmentIdx+fragmentN])
            positionList.append(PositionInfo(chrIdx, fragmentIdx, baseIdx, seq))
        repeatInfoList.append(SeqRepeatInfo(lenData, countData, positionList))
    return repeatInfoList

In [87]:
# From testing
repeatCount = len(allRepeatSeq)
repeatCount = 10
repeatInfoList = integrateRepeatInfo(fragmentsLenList, allRepeatSeqInfo, positionDic, repeatCount)

In [90]:
repeatInfoList[0]

SeqRepeatInfo(fragmentLenList=(33, 146, 88, 580, 260), count=2, position=[PositionInfo(chrIdx=0, fragmentIdx=59, baseIdx=47584, seq=Seq('CTCTGCTCCTCTGCTCCTGCTCTTGCTCCTGCTCTTGCTCCTCTGCTCCTCT')), PositionInfo(chrIdx=5, fragmentIdx=5725, baseIdx=4012956, seq=Seq('TGCAGCTCATTGGCTTGTGCTCTGGTTCAGGCACTTGATATGAGGCTCTCTG'))])

In [115]:
seqPermutationList = []
for i in range (len(repeatInfoList)):
    perm = permutations(repeatInfoList[i].position, 2) 
    seqPermutationList.append(perm)

In [116]:
for i in seqPermutationList:
    print(list(i))

[(PositionInfo(chrIdx=0, fragmentIdx=59, baseIdx=47584, seq=Seq('CTCTGCTCCTCTGCTCCTGCTCTTGCTCCTGCTCTTGCTCCTCTGCTCCTCT')), PositionInfo(chrIdx=5, fragmentIdx=5725, baseIdx=4012956, seq=Seq('TGCAGCTCATTGGCTTGTGCTCTGGTTCAGGCACTTGATATGAGGCTCTCTG'))), (PositionInfo(chrIdx=5, fragmentIdx=5725, baseIdx=4012956, seq=Seq('TGCAGCTCATTGGCTTGTGCTCTGGTTCAGGCACTTGATATGAGGCTCTCTG')), PositionInfo(chrIdx=0, fragmentIdx=59, baseIdx=47584, seq=Seq('CTCTGCTCCTCTGCTCCTGCTCTTGCTCCTGCTCTTGCTCCTCTGCTCCTCT')))]
[(PositionInfo(chrIdx=0, fragmentIdx=241, baseIdx=95776, seq=Seq('AGCTAAACCATTTAGATATCCCAATACAATTAAATGAGTTaacagaaaaattaa...AAT')), PositionInfo(chrIdx=6, fragmentIdx=34246, baseIdx=26093270, seq=Seq('AGCTAAACCATTTAGATATCCCAATACAATTAAATGAGTTaacagaaaaattaa...AAT'))), (PositionInfo(chrIdx=0, fragmentIdx=241, baseIdx=95776, seq=Seq('AGCTAAACCATTTAGATATCCCAATACAATTAAATGAGTTaacagaaaaattaa...AAT')), PositionInfo(chrIdx=7, fragmentIdx=17718, baseIdx=13390355, seq=Seq('AGCTAAACCATTTAGATGTCCCAATACAATTAAATGAGTTaa

In [None]:
# How to use “seqPermutationList”, Permutation to handle compare sequence 


In [15]:
topRepeatSeqKey = commonRepeatInfo[0][0]
topKeySum = sum(list(topRepeatSeqKey))
tolerance = topKeySum *0.05
topRepeatSeqKey

positionKeys = list(positionDic.keys())

matchKeys = []
for i in range(len(positionKeys)):
    temKey = positionKeys[i]
    for j in range(fragmentN):
        flag = 1
        if abs(topRepeatSeqKey[j]- temKey[j]) >= tolerance:
            flag = -1
            break
    if flag == 1:
        matchKeys.append(temKey)

matchKeys[:10]

[(28, 28, 28, 28, 28),
 (27, 27, 24, 27, 27),
 (27, 24, 27, 27, 27),
 (24, 27, 27, 27, 27),
 (27, 27, 27, 27, 27),
 (27, 27, 27, 27, 26),
 (27, 27, 27, 26, 27),
 (27, 27, 26, 27, 27),
 (27, 26, 27, 27, 27),
 (26, 27, 27, 27, 27)]

In [11]:
matchLenList = []
for i in range(len(matchKeys)):
    matchLenList.append(positionDic[matchKeys[i]])

In [12]:
flattenMatchSeq = [ j for subset in matchLenList for j in subset ]
candidatePoiDic = dict()
[ candidatePoiDic [t[0]].append(t[1]) if t [0] in list(candidatePoiDic.keys()) else candidatePoiDic.update({t [0]: [t [1]]}) for t in flattenMatchSeq ]
sortedCandidatePoiDic = OrderedDict(sorted(candidatePoiDic.items()))



In [13]:
matchSeqDic = dict()
for i in sortedCandidatePoiDic:
    chrIndex = i
    matchList = []
    for j in range(len(sortedCandidatePoiDic[chrIndex])):
        startIndex = sortedCandidatePoiDic[chrIndex][j]
        combinedSeq = Seq('').join(fragmentsSeqList[chrIndex][startIndex:startIndex+fragmentN])
        matchList.append(combinedSeq)
    matchSeqDic[chrIndex] = matchList

In [17]:
alignments = pairwise2.align.globalxx(matchSeqDic[0][0], matchSeqDic[0][1])
alignments[0]

Alignment(seqA='TGCGgtaaacaccctaacaaccaaaaTATG-CGgtaaacaccctaacaaccaaaaTATACG--gtaaacaccctaacaaccaaaaTATGCAgtaaacaccctaacaaccaaaa----TATGCAgtaaacaccctaacaacctAAA----TA', seqB='TGCGgtaaacaccctaacaaccaaaaTAT-ACGgtaaacaccctaacaaccaaaaTAT--GCAgtaaacaccctaacaaccaaaaTATGCAgtaaacaccctaacaacc----tAAATATGCAgtaaacaccctaacaacc----aaaaTA', score=129.0, start=0, end=151)

In [24]:
print(f"Seq1:{alignments[0][0]}\n\nSeq2:{alignments[0][1]}\n\nlength:{len(alignments[0][0])}, score:{int(alignments[0][2])}")

Seq1:TGCGgtaaacaccctaacaaccaaaaTATG-CGgtaaacaccctaacaaccaaaaTATACG--gtaaacaccctaacaaccaaaaTATGCAgtaaacaccctaacaaccaaaa----TATGCAgtaaacaccctaacaacctAAA----TA

Seq2:TGCGgtaaacaccctaacaaccaaaaTAT-ACGgtaaacaccctaacaaccaaaaTAT--GCAgtaaacaccctaacaaccaaaaTATGCAgtaaacaccctaacaacc----tAAATATGCAgtaaacaccctaacaacc----aaaaTA

length:151, score:129


In [None]:
N = 5
cutter = "GATC"
fragmentLength = [ 27, 27, 27, 27, 27 ]
tolerance = sum(fragmentLength) *0.05
matchLenList = [ if(fragmentLength[i]-candidateLength[i] <= tolerance) ]
# matchLenList = [(28, 28, 28, 28, 28), (24, 27, 27, 27, 27), (27, 27, 27, 27, 26), (27, 27, 27, 26, 27)...]
alignment(fragmentLength, matchLenList[i]) # dynamic programming 

In [None]:
# commonCount repeat sequences
commonPositionList = commonRepeatSeqsPosition(fragmentsLenList, commonRepeatInfo, positionDic, commonCount)
commonPositionDic = commonPositionListToDic(commonPositionList)
# printCommonPositionDic(commonPositionDic)

# the top common repeat sequence
singleRepeatSeqCount, comRepeatSeqCount = topRepeatSeqCounter(fragmentsSeqList, commonRepeatInfo, positionDic, fragmentN)printTopSeqPercentageTable(comRepeatSeqCount, showPartialSeq=True)