In [1]:
import re
import os
import sys
import copy
import random
import numpy as np 
import pandas as pd 
import 
from Bio.Seq import Seq
from collections import Counter, OrderedDict
import importlib
# importlib.reload(sys.modules['Evaluation.RepearEvaluation'])

from Util.FragmentUtil import fragmentLenPlot
from Evaluation.RepearEvaluation import getHitFamilyACIds, getDfamConSeqData
from Util.SeqUtil import seqInfo, parseFasta, parseSeq, parseSeqByCutter
from DataInfo import currDataset, datasetPath, matchPattern, cutter, cutterLen, fragmentN, commonCount
from RepeatFinder import findRepeatSeqs, integrateRepeatInfo, getIRComb, evaluateRepeat, generateIROutputFile, checkTandemRepeatExist, generateTROutputFile, generateFragmentOutputFile


In [2]:
# Parse the FASTA file of the configured dataset, then print a short summary
# (dataset name, number of sequences, total length).
parseFastaSeqs = parseFasta(
    currDataset,
    datasetPath,
    matchPattern,
    matchMode=False,
)
seqInfo(currDataset, parseFastaSeqs)

...start parsing chr1.fa fasta file ...
...cost9.580286264419556 sec to parse fasta file ...
chr1.fa dataset
 number of sequence:1
 total length:248956422



In [3]:
# Split the parsed sequences into fragments at the cutter site.
# Two results come back: fragment lengths and fragment sequences.
# fragmentsSeqList is later indexed as [chromosome index][fragment index].
# NOTE(review): presumably fragmentsLenList has the same layout - confirm in Util.SeqUtil.
fragmentsLenList, fragmentsSeqList = parseSeqByCutter(parseFastaSeqs)

...start parse seq by cutter: GATC
...cost 3.420619010925293 sec to cut sequence


In [4]:
# Search the fragment-length lists for repeated runs of fragment lengths.
# repeatFragNPositionDict is keyed by tuples of fragment lengths; each value is
# a list of position entries whose first two fields are used further down as
# (chromosome index, start fragment index).
# NOTE(review): the exact repeat criterion lives in RepeatFinder.findRepeatSeqs - confirm there.
repeatFragNLenList, repeatFragNPositionDict = findRepeatSeqs(fragmentsLenList)

... start finding repeat seq ...
...cost0.8382689952850342 sec to finding repeat seq  ...


In [5]:
# Tandem repeats: keep only the repeated length keys that pass
# checkTandemRepeatExist, attach their positions/sequences, then emit the
# tandem-repeat output.
tandemRepeatList = [
    lenKey for lenKey in repeatFragNLenList if checkTandemRepeatExist(lenKey)
]
tandemRepeatInfoList = integrateRepeatInfo(
    fragmentsSeqList,
    fragmentsLenList,
    tandemRepeatList,
    repeatFragNPositionDict,
    repeatType=1,
)
generateTROutputFile(tandemRepeatInfoList, matchRatioOfSum=0.4)

In [6]:
# Preview only the first two tandem-repeat records to keep the output short.
tandemRepeatInfoList[:2]

[RepeatFragNInfo(fragmentLenList=(1495, 394, 49, 55, 49), count=2, position=[IRSPositionInfo(chrIdx=0, fragmentIdx=261, baseIdx=365287, seqList=[Seq('TCGCTTGGCCCCCACCTGATTCCCGACATACAGCAGAGGAACCTTAGGCTCAGG...TCA'), Seq('CCACGCCTGAGCCTCCGCCTCTCCGTGCAGTCCCGGAGATGGCACACAGCCTTC...TCG'), Seq('ATATACACATTTGTGCACACGTGTTCATATTCACACTCCTACACACCCA'), Seq('ATACACACATACACACACTTGTGCATACACATTCATGCTCACTCCCACACACCCA'), Seq('ATATACACACTCGTGCACACATGCTCACATTCACAATCACTCATACCCA')]), IRSPositionInfo(chrIdx=0, fragmentIdx=425, baseIdx=594351, seqList=[Seq('TCGCTTGGCCCCCACCTGATTCCTGACATACAGCAGAGGAAGCTTAGGCTCAGG...TCA'), Seq('CCACGTCTGAGCCTCCGCCTTTCCGTGCAGTCCCGGAGATGGCACACAGCCTTC...TCG'), Seq('ATATACACATTTGTGCACACGTGTTCATATTCACACTCCTACACACCCA'), Seq('ATACACACATACACACACTTGTGCATACACATTCATGCTCACTCCCACACACCCA'), Seq('ATATACACACTCGTGCACACATGCTCACATTCACAATCACTCATACCCA')])]),
 RepeatFragNInfo(fragmentLenList=(394, 49, 55, 49, 45), count=2, position=[IRSPositionInfo(chrIdx=0, fragmentIdx=262, baseIdx=366786, seqList=[Se

In [8]:
# Interspersed repetitive sequences: integrate every repeated length key
# (repeatType=2), derive the combinations via getIRComb, then emit the IR output.
repeatInfoList = integrateRepeatInfo(
    fragmentsSeqList,
    fragmentsLenList,
    repeatFragNLenList,
    repeatFragNPositionDict,
    repeatType=2,
)
seqPermutation = getIRComb(repeatInfoList)
generateIROutputFile(seqPermutation, matchRatioOfSum=0.4)

In [10]:
# Preview only the first two interspersed-repeat records to keep the output short.
repeatInfoList[:2]

[SeqRepeatInfo(fragmentLenList=(46, 221, 139, 482, 101), count=2, position=[IRSPositionInfo(chrIdx=0, fragmentIdx=1, baseIdx=12414, seq=Seq('AGGCAGGCCATCGCTGCCACAGAACCCAGTGGATTGGCCTAGGTGGTCTGAGCT...CGA')), IRSPositionInfo(chrIdx=0, fragmentIdx=176, baseIdx=182933, seq=Seq('AGGCAGGCCATCGCTGCCACAGAACCCAGTGGATTGGCCTAGGTGGTCTGAGCT...CGA'))]),
 SeqRepeatInfo(fragmentLenList=(221, 139, 482, 101, 142), count=2, position=[IRSPositionInfo(chrIdx=0, fragmentIdx=2, baseIdx=12464, seq=Seq('TCTGAGCTCAACAAGCCCTCTCTGGGTGGTAGGTGCAGAGACGGGAGGGGCAGA...AGG')), IRSPositionInfo(chrIdx=0, fragmentIdx=177, baseIdx=182983, seq=Seq('TCTGAGCTCAACAAGCCCTCTCTGGGTGGTAGGTGCAGAGACGGGAGGGGCAGA...AGG'))])]

In [None]:
# Fragment-level output for the tandem repeats; uses the same
# matchRatioOfSum (0.4) as the TR and IR output calls above.
# NOTE(review): presumably writes a file, judging by the name - confirm in RepeatFinder.
generateFragmentOutputFile(tandemRepeatInfoList, matchRatioOfSum=0.4)

In [64]:
# rf, repeat Fragments


In [None]:
# Find the fragment-length keys that lie within a tolerance of a reference key.
#
# NOTE(review): `commonRepeatInfo` is not defined in any visible cell, so this
# cell relies on kernel state - confirm where it is built before a clean
# "Restart & Run All".
# The reference key is the first field of the first entry of commonRepeatInfo.
topRepeatSeqKey = commonRepeatInfo[0][0]
topKeySum = sum(topRepeatSeqKey)
# Per-position tolerance: 5 % of the summed lengths of the reference key.
tolerance = topKeySum * 0.05

positionKeys = list(repeatFragNPositionDict.keys())

# A key matches when each of its first `fragmentN` lengths differs from the
# reference by strictly less than `tolerance`. Using all() removes the old
# `flag` variable, which was only assigned inside the inner loop and was
# therefore undefined (or stale from the previous key) when fragmentN == 0.
matchKeys = [
    candidateKey
    for candidateKey in positionKeys
    if all(
        abs(topRepeatSeqKey[idx] - candidateKey[idx]) < tolerance
        for idx in range(fragmentN)
    )
]

# Position lists of every matching key, in the same order as matchKeys.
matchLenList = [repeatFragNPositionDict[matchKey] for matchKey in matchKeys]

In [None]:
# Flatten the per-key position lists into one list of position entries.
flattenMatchSeq = [entry for subset in matchLenList for entry in subset]

# Group the entries by their first field (used below as the chromosome index),
# collecting the second field (the start fragment index) in encounter order.
# A plain loop with setdefault replaces the former side-effect list
# comprehension, which built a throwaway list of None and re-materialised the
# key list for every element.
candidatePoiDic = dict()
for positionEntry in flattenMatchSeq:
    candidatePoiDic.setdefault(positionEntry[0], []).append(positionEntry[1])

# Same mapping, ordered by ascending key.
sortedCandidatePoiDic = OrderedDict(sorted(candidatePoiDic.items()))

In [None]:
# For every chromosome index, rebuild the sequence of each matched window by
# joining the `fragmentN` consecutive fragments that start at the recorded
# fragment index. Key order follows sortedCandidatePoiDic.
matchSeqDic = {
    chrIdx: [
        Seq('').join(fragmentsSeqList[chrIdx][startIdx:startIdx + fragmentN])
        for startIdx in startIdxList
    ]
    for chrIdx, startIdxList in sortedCandidatePoiDic.items()
}