In [196]:
import re
import sys
import time
import copy
import timeit
import random
import statistics
import numpy as np 
import pandas as pd 
import networkx as nx
import matplotlib.pyplot as plt

from Bio import SeqIO, pairwise2
from Bio.Seq import Seq
from Bio.SeqIO.FastaIO import FastaIterator
from Bio.pairwise2 import format_alignment
from collections import Counter, OrderedDict, namedtuple
from itertools import islice, groupby, permutations
from prettytable import PrettyTable

import importlib

# Pick up edits made to RepeatFinder since it was last imported. On a fresh
# kernel the module is not in sys.modules yet, so indexing it directly raised
# KeyError before the RepeatFinder import further down could run; only reload
# when the module is already loaded.
if 'RepeatFinder' in sys.modules:
    importlib.reload(sys.modules['RepeatFinder'])

import import_ipynb
from Util import seqInfo, parseFasta, parseSeq, parseSeqByCutter, unifragmentList, fragmentLenPlot
from RepeatFinder import findRepeatSeqs, commonRepeatTable, commonRepeatSeqsPosition, commonPositionListToDic, printCommonPositionDic, topRepeatSeqCounter, printTopSeqPercentageTable, integrateRepeatInfo, getSeqPermutation, evaluateRepeat, generateOutputFile, longestCommonLength
from DataInfo import currDataset, datasetPath, matchPattern, cutter, cutterLen, fragmentN, commonCount
from DataStructure import SeqRepeatInfo, PositionInfo, RepeatEvaInfo, TandemRepeatInfo
from SuffixTree import STree



importing Jupyter notebook from RepeatFinder.ipynb


In [197]:
# Parse the configured FASTA dataset, then print its summary (the recorded
# run reports the number of sequences and their total length).
# NOTE(review): the meaning of the trailing `False` argument is defined in
# Util.parseFasta — confirm before changing it.
parseFastaSeqs = parseFasta(currDataset, datasetPath, matchPattern, False)
seqInfo(currDataset, parseFastaSeqs)

...start parsing DanioRerio.fasta fasta file ...
...cost21.833519220352173 sec to parse fasta file ...
DanioRerio.fasta dataset
 number of sequence:1922
 total length:1679186873



In [143]:
# Cut the parsed sequences at the cutter site and keep both the fragment
# lengths and the fragment sequences for the repeat search below.
# NOTE(review): both results are presumably indexed per input sequence
# (fragmentsSeqList is later indexed as [sequence][fragment]) — confirm in Util.
fragmentsLenList, fragmentsSeqList = parseSeqByCutter(parseFastaSeqs)

...start parse seq by cutter: GATC
...cost 20.99932289123535 sec to cut sequence


In [144]:
# Search the fragment lengths for repeated runs of fragments.
# Later cells slice allRepeatSeqCount like a list of (length-tuple, count)
# entries and use positionDic as a dict keyed by length tuples.
# NOTE(review): the exact layout of all four results is defined in
# RepeatFinder.findRepeatSeqs — confirm there.
allRepeatSeqCount, commonRepeatSeq, repeatCount, positionDic = findRepeatSeqs(fragmentsLenList, cutter , fragmentN, commonCount, allRepeatSeqType=1)

... start finding repeat seq ...
...cost12.267969131469727 sec to finding repeat seq  ...


In [None]:
# Build repeat info for the first ten entries of allRepeatSeqCount only,
# then derive the sequence permutations from that info.
# repeatCount = len(allRepeatSeqCount)
repeatInfoList = integrateRepeatInfo(fragmentsSeqList, fragmentsLenList, allRepeatSeqCount[:10], positionDic)
seqPermutation = getSeqPermutation(repeatInfoList)

In [7]:
# Write the permutation results out.
# NOTE(review): the output destination and format are decided inside
# RepeatFinder.generateOutputFile — confirm there.
generateOutputFile(seqPermutation)

In [None]:
# Keep only those of the first 30 repeat entries that pass the tandem check.
# NOTE(review): checkTandemRepeatExist is not among the names imported in the
# import cell — confirm where it is defined, otherwise this cell raises
# NameError on a fresh kernel.
tandemRepeatList = list(filter(checkTandemRepeatExist, allRepeatSeqCount[:30]))

In [22]:
# Show the filtered entries; each one is a (fragment-length tuple, count) pair.
tandemRepeatList

[((2, 8, 32, 8, 2), 2),
 ((5, 25, 5, 25, 5), 2),
 ((25, 5, 25, 5, 34), 2),
 ((5, 25, 5, 34, 25), 2),
 ((52, 80, 217, 52, 80), 2),
 ((28, 60, 28, 28, 28), 4),
 ((60, 28, 28, 28, 28), 4),
 ((28, 28, 28, 28, 28), 54)]

In [188]:
# Build repeat info for the tandem-repeat entries and display it.
# Requires tandemRepeatList from the filter cell above; the recorded
# NameError is what happens when this cell runs before that one.
tandemRepeatInfoList = integrateRepeatInfo(fragmentsSeqList, fragmentsLenList, tandemRepeatList, positionDic)
tandemRepeatInfoList

NameError: name 'tandemRepeatList' is not defined

In [193]:
# Toy fragment-length list for trying out longestCommonLength by hand.
# It is kept under its own name so the real fragmentsLenList produced by
# parseSeqByCutter is not overwritten for the cells that still depend on it.
sampleFragmentsLenList = [38, 82, 28, 38, 82, 99, 73, 82, 22, 33, 46, 43, 54, 38, 82, 38, 82, 100]
matchLen, matchPosition = longestCommonLength(sampleFragmentsLenList)

In [None]:
# Find every fragment-length key that matches the top repeat key within a
# tolerance, and collect the positions recorded for those keys.
# NOTE(review): commonRepeatInfo is not assigned in any visible cell — confirm
# which earlier result it refers to before running this on a fresh kernel.
topRepeatSeqKey = commonRepeatInfo[0][0]
topKeySum = sum(topRepeatSeqKey)
# Allowed deviation per fragment: 5% of the summed lengths of the top key.
tolerance = topKeySum * 0.05

positionKeys = list(positionDic.keys())

# A key matches when each of its first fragmentN lengths differs from the top
# key by strictly less than `tolerance`. all() judges every key on its own, so
# no match flag can be left undefined or carried over from the previous key.
matchKeys = [
    candidateKey
    for candidateKey in positionKeys
    if all(
        abs(topRepeatSeqKey[j] - candidateKey[j]) < tolerance
        for j in range(fragmentN)
    )
]

# Position lists stored for the matching keys, in the same order as matchKeys.
matchLenList = [positionDic[matchKey] for matchKey in matchKeys]

In [12]:
# Flatten the per-key position lists into a single list of position entries.
flattenMatchSeq = [entry for subset in matchLenList for entry in subset]

# Group the entries by their first field and collect the second field in
# encounter order. setdefault does the "create the list or append to it" step
# in one dict lookup, instead of rebuilding the key list for every entry and
# running a comprehension purely for its side effects.
candidatePoiDic = dict()
for entry in flattenMatchSeq:
    candidatePoiDic.setdefault(entry[0], []).append(entry[1])

# Same groups, ordered by key.
sortedCandidatePoiDic = OrderedDict(sorted(candidatePoiDic.items()))

In [13]:
# For every sequence index, join each run of fragmentN consecutive fragments
# (starting at the recorded start positions) into one combined sequence.
matchSeqDic = {}
for chrIndex, startPositions in sortedCandidatePoiDic.items():
    chrFragments = fragmentsSeqList[chrIndex]
    matchSeqDic[chrIndex] = [
        Seq('').join(chrFragments[begin:begin + fragmentN])
        for begin in startPositions
    ]

In [None]:
# commonCount repeat sequences
# NOTE(review): commonRepeatInfo is not assigned in any visible cell — confirm
# which earlier result it refers to before running this on a fresh kernel.
commonPositionList = commonRepeatSeqsPosition(fragmentsLenList, commonRepeatInfo, positionDic, commonCount)
commonPositionDic = commonPositionListToDic(commonPositionList)
# printCommonPositionDic(commonPositionDic)

# the top common repeat sequence
singleRepeatSeqCount, comRepeatSeqCount = topRepeatSeqCounter(fragmentsSeqList, commonRepeatInfo, positionDic, fragmentN)
# The counter call and the table printer must be separate statements; fused
# on one line they are a SyntaxError and the whole cell fails to run.
printTopSeqPercentageTable(comRepeatSeqCount, showPartialSeq=True)