In [1]:
import sys
import re
import time
import timeit
import random
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import statistics
%matplotlib inline

from Bio import SeqIO, pairwise2
from Bio.Seq import Seq
from Bio.SeqIO.FastaIO import FastaIterator
from Bio.pairwise2 import format_alignment
from collections import Counter, OrderedDict, namedtuple
from itertools import islice, groupby, permutations
from prettytable import PrettyTable

import import_ipynb

from Util import seqInfo, parseFasta, parseSeq, parseSeqByCutter, unifragmentList, fragmentLenPlot
from RepeatFinder import findRepeatSeqs, commonRepeatTable, commonRepeatSeqsPosition, commonPositionListToDic, printCommonPositionDic, topRepeatSeqCounter, printTopSeqPercentageTable, integrateRepeatInfo, getSeqPermutation, evaluateRepeat, generateOutputFile
from DataInfo import currDataset, datasetPath, matchPattern, cutter, cutterLen, fragmentN, commonCount
from DataStructure import SeqRepeatInfo, PositionInfo, RepeatEvaInfo

importing Jupyter notebook from Util.ipynb
importing Jupyter notebook from DataInfo.ipynb
importing Jupyter notebook from RepeatFinder.ipynb
importing Jupyter notebook from DataStructure.ipynb


In [2]:
# parse fasta file & data info
# Parse the FASTA dataset named by `currDataset` (located via `datasetPath`,
# filtered/keyed with `matchPattern`) and print a short summary of it
# (number of sequences and total length, per the cell output).
# NOTE(review): the meaning of the positional `False` is defined in
# Util.parseFasta and is not visible here — confirm before changing it.
parseFastaSeqs = parseFasta(currDataset, datasetPath, matchPattern, False)
seqInfo(currDataset, parseFastaSeqs)

...start parsing DanioRerio.fasta fasta file ...
...cost18.099308013916016 sec to parse fasta file ...
DanioRerio.fasta dataset
 number of sequence:1922
 total length:1679186873



In [3]:
# Split the parsed sequences at the cutter site (the cell output reports the
# cutter as GATC). Two results are returned and kept under notebook-global
# names because later cells read both.
# NOTE(review): presumably `fragmentsLenList` holds fragment lengths and
# `fragmentsSeqList` the matching fragment sequences, one inner list per
# input sequence — confirm against Util.parseSeqByCutter.
fragmentsLenList, fragmentsSeqList = parseSeqByCutter(parseFastaSeqs)

...start parse seq by cutter: GATC
...cost 39.80088210105896 sec to cut sequence


In [4]:
# Run the repeat search over the fragment-length lists.
# `positionDic` is later indexed by key and iterated with .keys(), so it is a
# mapping; the other three results are passed on to downstream helpers.
# NOTE(review): presumably the search looks for runs of `fragmentN`
# consecutive fragment lengths that recur, and `commonCount` limits how many
# of the most common ones are returned; the meaning of `allRepeatSeqType=1`
# lives in RepeatFinder.findRepeatSeqs — confirm.
allRepeatSeq, commonRepeatSeq, repeatCount, positionDic = findRepeatSeqs(fragmentsLenList, cutter , fragmentN, commonCount, allRepeatSeqType=1)

... start finding repeat seq ...
...cost14.488226890563965 sec to finding repeat seq  ...


In [5]:
# Combine the fragment sequences/lengths with the repeats found earlier, then
# derive the permutations from the combined records.
# The keyword argument below passes the literal 10; it does not use the
# `repeatCount` value returned by findRepeatSeqs, and the commented-out line
# shows the alternative of covering every entry of `allRepeatSeq`.
# NOTE(review): 10 is a magic number — consider a named constant in DataInfo.
# repeatCount = len(allRepeatSeq)
repeatInfoList = integrateRepeatInfo(fragmentsSeqList, fragmentsLenList, allRepeatSeq, positionDic, repeatCount = 10)
seqPermutation = getSeqPermutation(repeatInfoList)

In [6]:
# Hand the permutation results to the output helper.
# NOTE(review): presumably this writes a file; the destination path and
# format are decided inside RepeatFinder.generateOutputFile and are not
# visible in this notebook — confirm.
generateOutputFile(seqPermutation)

In [None]:
# find repeat seq length list with tolerance
#
# Select every key of `positionDic` whose first `fragmentN` entries each lie
# within `tolerance` of the corresponding entry of the top repeat key, then
# collect the position lists stored under those keys.
#
# NOTE(review): `commonRepeatInfo` is not assigned in any visible cell (the
# repeat-finding cell binds `commonRepeatSeq`). This cell only works if the
# name is still alive in the kernel — confirm its origin so that
# Restart & Run All succeeds.
topRepeatSeqKey = commonRepeatInfo[0][0]
topKeySum = sum(topRepeatSeqKey)
# The per-entry tolerance is 5% of the sum of the top key's entries.
tolerance = topKeySum * 0.05

positionKeys = list(positionDic.keys())

# all() replaces the manual flag/break loop: a key is kept only when every one
# of the compared entries differs from the top key by strictly less than the
# tolerance. This also avoids reading an unset flag when fragmentN is 0.
matchKeys = [
    candidateKey
    for candidateKey in positionKeys
    if all(
        abs(topRepeatSeqKey[idx] - candidateKey[idx]) < tolerance
        for idx in range(fragmentN)
    )
]

# One entry per matching key, in the same order as `matchKeys`.
matchLenList = [positionDic[key] for key in matchKeys]

In [12]:
# Flatten the per-key position lists into one list of records, group the
# second field of each record under its first field, and order the groups by
# that first field.
# NOTE(review): presumably each record is (sequence index, start position, ...)
# — confirm against the PositionInfo structure.
flattenMatchSeq = [record for subset in matchLenList for record in subset]

# setdefault gives an O(1) "create list on first sight, then append"; the
# previous version rebuilt list(dict.keys()) for every record and used a list
# comprehension only for its side effects.
candidatePoiDic = dict()
for record in flattenMatchSeq:
    candidatePoiDic.setdefault(record[0], []).append(record[1])

sortedCandidatePoiDic = OrderedDict(sorted(candidatePoiDic.items()))

In [13]:
# For each index in `sortedCandidatePoiDic`, rebuild every candidate repeat by
# joining the `fragmentN` consecutive fragments that begin at each recorded
# start position. Result: {index: [joined Seq, ...]} in the same key order.
matchSeqDic = {}
for chrIndex, startIndices in sortedCandidatePoiDic.items():
    # Seq('').join concatenates the sliced fragments with no separator.
    matchSeqDic[chrIndex] = [
        Seq('').join(fragmentsSeqList[chrIndex][startIndex:startIndex + fragmentN])
        for startIndex in startIndices
    ]

In [None]:
# commonCount repeat sequences
# Look up the positions of the most common repeats and regroup them into a
# dictionary.
# NOTE(review): `commonRepeatInfo` is not assigned in any visible cell (the
# repeat-finding cell binds `commonRepeatSeq`) — confirm where it is defined
# so this cell runs on a fresh kernel.
commonPositionList = commonRepeatSeqsPosition(fragmentsLenList, commonRepeatInfo, positionDic, commonCount)
commonPositionDic = commonPositionListToDic(commonPositionList)
# printCommonPositionDic(commonPositionDic)

# the top common repeat sequence
# These two calls were fused onto one line, which is a SyntaxError; each is
# now its own statement.
singleRepeatSeqCount, comRepeatSeqCount = topRepeatSeqCounter(fragmentsSeqList, commonRepeatInfo, positionDic, fragmentN)
printTopSeqPercentageTable(comRepeatSeqCount, showPartialSeq=True)