In [1]:
import re
import os
import sys
import copy
import random
import numpy as np 
import pandas as pd 
from Bio.Seq import Seq
from collections import Counter, OrderedDict
import importlib
# importlib.reload(sys.modules['RepeatFinder'])

from Util.FragmentUtil import fragmentLenPlot
from Evaluation.RepearEvaluation import getDfamSearchData
from Util.SeqUtil import seqInfo, parseFasta, parseSeq, parseSeqByCutter
from DataInfo import currDataset, datasetPath, matchPattern, cutter, cutterLen, fragmentN, commonCount
from RepeatFinder import findRepeatSeqs, commonRepeatTable, commonRepeatSeqsPosition, commonPositionListToDic, printCommonPositionDic, topRepeatSeqCounter, printTopSeqPercentageTable, integrateRepeatInfo, getIRComb, evaluateRepeat, generateIROutputFile, checkTandemRepeatExist, longestCommonLength, integrateTandemRepeatInfo, generateTROutputFile, generateFragmentOutputFile


In [2]:
# Parse the FASTA file of the configured dataset, then print a short summary of it
# (the recorded output reports the number of sequences and their total length).
# NOTE(review): the effect of matchMode = False is defined in Util.SeqUtil.parseFasta — confirm there.
parseFastaSeqs = parseFasta(currDataset, datasetPath, matchPattern, matchMode = False)
seqInfo(currDataset, parseFastaSeqs)

...start parsing chr1.fa fasta file ...
...cost4.980936765670776 sec to parse fasta file ...
chr1.fa dataset
 number of sequence:1
 total length:248956422



In [5]:
# Cut the parsed sequences at the cutter site (the recorded output shows GATC).
# The call returns two objects that later cells use as per-chromosome fragment
# lengths and fragment sequences — presumably index-aligned; verify in Util.SeqUtil.
fragmentsLenList, fragmentsSeqList = parseSeqByCutter(parseFastaSeqs)

...start parse seq by cutter: GATC
...cost 2.340378999710083 sec to cut sequence


In [6]:
# Look for repeated runs of fragment lengths. Only allRepeatSeqCount and positionDic
# are consumed by the cells below; commonRepeatSeq and repeatCount are not used again here.
# NOTE(review): presumably fragmentN is the number of consecutive fragments per run and
# commonCount limits how many top entries are kept — confirm in RepeatFinder.findRepeatSeqs.
allRepeatSeqCount, commonRepeatSeq, repeatCount, positionDic = findRepeatSeqs(fragmentsLenList, cutter , fragmentN, commonCount, allRepeatSeqType=1)

... start finding repeat seq ...
...cost1.3174898624420166 sec to finding repeat seq  ...


In [14]:
# Tandem repeats within runs of N fragments.
# Keep only the entries of allRepeatSeqCount for which checkTandemRepeatExist is truthy,
# attach position and sequence information to them, then write the tandem-repeat output file.
tandemRepeatList = list(filter(checkTandemRepeatExist, allRepeatSeqCount))
tandemRepeatInfoList = integrateTandemRepeatInfo(fragmentsSeqList, fragmentsLenList, tandemRepeatList, positionDic) # per the author: differs from integrateRepeatInfo only in the format of the returned sequences
generateTROutputFile(tandemRepeatInfoList, matchRatioOfSum=0.4)

In [15]:
# Preview the first two entries only; the recorded output shows SeqRepeatInfo records
# holding a fragment-length tuple, a count and a list of IRSPositionInfo positions.
tandemRepeatInfoList[:2]

[SeqRepeatInfo(fragmentLenList=(1495, 394, 49, 55, 49), count=2, position=[IRSPositionInfo(chrIdx=0, fragmentIdx=261, baseIdx=365287, seqList=[Seq('TCGCTTGGCCCCCACCTGATTCCCGACATACAGCAGAGGAACCTTAGGCTCAGG...TCA'), Seq('CCACGCCTGAGCCTCCGCCTCTCCGTGCAGTCCCGGAGATGGCACACAGCCTTC...TCG'), Seq('ATATACACATTTGTGCACACGTGTTCATATTCACACTCCTACACACCCA'), Seq('ATACACACATACACACACTTGTGCATACACATTCATGCTCACTCCCACACACCCA'), Seq('ATATACACACTCGTGCACACATGCTCACATTCACAATCACTCATACCCA')]), IRSPositionInfo(chrIdx=0, fragmentIdx=425, baseIdx=594351, seqList=[Seq('TCGCTTGGCCCCCACCTGATTCCTGACATACAGCAGAGGAAGCTTAGGCTCAGG...TCA'), Seq('CCACGTCTGAGCCTCCGCCTTTCCGTGCAGTCCCGGAGATGGCACACAGCCTTC...TCG'), Seq('ATATACACATTTGTGCACACGTGTTCATATTCACACTCCTACACACCCA'), Seq('ATACACACATACACACACTTGTGCATACACATTCATGCTCACTCCCACACACCCA'), Seq('ATATACACACTCGTGCACACATGCTCACATTCACAATCACTCATACCCA')])]),
 SeqRepeatInfo(fragmentLenList=(394, 49, 55, 49, 45), count=2, position=[IRSPositionInfo(chrIdx=0, fragmentIdx=262, baseIdx=366786, seqList=[Seq('C

In [None]:
# Interspersed repetitive sequences.
# Attach position and sequence information to every entry of allRepeatSeqCount,
# derive the combinations to compare, then write the interspersed-repeat output file.
# NOTE(review): what getIRComb pairs up is defined in RepeatFinder — confirm there.
repeatInfoList = integrateRepeatInfo(fragmentsSeqList, fragmentsLenList, allRepeatSeqCount, positionDic)
seqPermutation = getIRComb(repeatInfoList)
generateIROutputFile(seqPermutation, matchRatioOfSum=0.4)

In [None]:
# Preview the first two entries only, to check the structure without dumping the whole list.
repeatInfoList[:2]

In [None]:
# Write the fragment-level output file, using the same matchRatioOfSum as the other outputs.
# NOTE(review): this is fed tandemRepeatInfoList, not repeatInfoList — confirm that is intended.
generateFragmentOutputFile(tandemRepeatInfoList, matchRatioOfSum=0.4)

In [64]:
# rf, repeat Fragments


In [None]:
# Find every fragment-length key that matches the top repeat key within a tolerance,
# then collect the positions recorded for those keys.
# NOTE(review): commonRepeatInfo is not defined in any cell above, so this cell fails
# on a fresh kernel — it must be built first (presumably from commonRepeatTable; confirm).
topRepeatSeqKey = commonRepeatInfo[0][0]
topKeySum = sum(topRepeatSeqKey)
# Allowed deviation per fragment: 5% of the summed lengths of the top key.
tolerance = topKeySum * 0.05

positionKeys = list(positionDic.keys())

# A key matches when each of its first fragmentN lengths differs from the top key's
# length at the same index by strictly less than the tolerance. Using all() also makes
# the fragmentN == 0 case well defined instead of relying on a flag that is only
# assigned inside the inner loop.
matchKeys = [
    candidateKey
    for candidateKey in positionKeys
    if all(
        abs(topRepeatSeqKey[fragmentIdx] - candidateKey[fragmentIdx]) < tolerance
        for fragmentIdx in range(fragmentN)
    )
]

# One entry per matching key, in the same order as matchKeys.
matchLenList = [positionDic[matchKey] for matchKey in matchKeys]

In [None]:
# Flatten the per-key position lists into a single list of position records.
flattenMatchSeq = [position for positionGroup in matchLenList for position in positionGroup]

# Group the second element of each record under its first element, keeping encounter order.
# NOTE(review): the next cell uses these as chromosome index -> fragment start indices;
# confirm that matches the record layout stored in positionDic.
candidatePoiDic = dict()
for position in flattenMatchSeq:
    # setdefault does an O(1) lookup instead of rebuilding the key list for every record.
    candidatePoiDic.setdefault(position[0], []).append(position[1])

# Order the groups by key so later cells walk them in ascending order.
sortedCandidatePoiDic = OrderedDict(sorted(candidatePoiDic.items()))

In [None]:
# For each chromosome index, rebuild the sequence of every candidate by joining the
# fragmentN consecutive fragments that start at each recorded start index.
matchSeqDic = {}
for chrIndex, startIndexList in sortedCandidatePoiDic.items():
    matchSeqDic[chrIndex] = [
        Seq('').join(fragmentsSeqList[chrIndex][startIndex:startIndex + fragmentN])
        for startIndex in startIndexList
    ]