In [3]:
import sys
import re
import time
import timeit
import random
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import statistics
%matplotlib inline

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqIO.FastaIO import FastaIterator
from collections import Counter
from itertools import islice, groupby, permutations
from collections import Counter, namedtuple
from prettytable import PrettyTable

import import_ipynb
from Util import parseSeq
from DataInfo import currDataset, datasetPath, matchPattern, cutter, cutterLen, fragmentN, commonCount
from DataStructure import SeqRepeatInfo, PositionInfo, RepeatEvaInfo

importing Jupyter notebook from Util.ipynb
importing Jupyter notebook from DataInfo.ipynb


In [1]:
def findRepeatSeqs(fragmentsLenList, cutter , fragmentN, commonCount, allRepeatSeqType = 1):
    '''
    Count every run of fragmentN consecutive fragment lengths and report the repeated ones.

    Parameter
    fragmentsLenList: per-sequence lists of fragment lengths, forwarded to findRepeatSeq
    cutter: cutter pattern; only its length is used (forwarded as cutterLen)
    fragmentN: number of consecutive fragments that form one combination
    commonCount: how many of the most frequent combinations to return
    allRepeatSeqType: 0 dict, 1 list of tuple (any truthy value behaves like 1)

    Return
    allRepeatSeq: repeat seq (if repeatCount > 1), as a dict or a list of (combination, count)
    commonRepeatSeq: top commonCount repeat seq, as a list of (combination, count)
    repeatCount: Counter of every combination, as returned by findRepeatSeq
    positionDic: combination -> list of (sequence index, fragment index), as returned by findRepeatSeq
    '''
    cutterLen = len(cutter)
    repeatCount, positionDic = findRepeatSeq(fragmentsLenList, fragmentN, cutterLen)
    repeated = ((k, v) for k, v in repeatCount.items() if v > 1)
    # Use a real conditional expression: the former `flag and [...] or {...}` idiom
    # fell through to the dict branch whenever the list happened to be empty.
    allRepeatSeq = list(repeated) if allRepeatSeqType else dict(repeated)
    commonRepeatSeq = repeatCount.most_common(commonCount)
    return allRepeatSeq, commonRepeatSeq, repeatCount, positionDic

In [None]:
def commonRepeatTable(commonRepeatInfo, positionDic, commonCount):
    '''
    Print a table with the fragment-length combination and count of the most common repeats.

    Parameter
    commonRepeatInfo: sequence of (fragmentLenValue, count) entries
    positionDic: kept for interface compatibility; the 'Position' column is currently
                 disabled, so this argument is not read
    commonCount: maximum number of rows to print
    '''
    compareTable = PrettyTable(['FragmentLen', 'Count'])
    # Slice instead of indexing range(commonCount): there may be fewer entries than
    # commonCount, which used to raise IndexError. max() keeps a negative count empty.
    for entry in commonRepeatInfo[:max(commonCount, 0)]:
        compareTable.add_row([entry[0], entry[1]])
    print(compareTable)

In [None]:
# Find interspersed repeats: count each combination of N consecutive fragment lengths and record where it occurs
def findRepeatSeq(lenList, N, cutterLen):
    '''
    Slide a window of N items over every list in lenList, counting each window
    and remembering where it was seen. Elapsed time is printed.

    Parameter
    lenList: iterable of sequences (one per input sequence) to scan
    N: window size
    cutterLen: not used by this function

    Return
    repeatCount: Counter mapping each window (as a tuple) to its number of occurrences
    positionDic: dict mapping each window to a list of (list index, start offset) tuples
    '''
    print(f'... start finding repeat seq ...')
    start = time.time()
    repeatCount = Counter()
    positionDic = {}
    for seqIdx, lengths in enumerate(lenList):
        windowCount = len(lengths) - N + 1
        for offset in range(windowCount):
            window = tuple(lengths[offset:offset + N])
            repeatCount[window] += 1
            # setdefault creates the list on first sight of this window
            positionDic.setdefault(window, []).append((seqIdx, offset))

    end = time.time()
    print(f'...cost{ end-start } sec to finding repeat seq  ...')
    return repeatCount, positionDic

In [2]:
def commonRepeatSeqsPosition(fragmentsLenList, commonRepeatInfo, positionDic, commonCount):
    '''
    Convert the fragment positions of the most common repeats into base offsets.

    Parameter
    fragmentsLenList: per-sequence lists of fragment lengths
    commonRepeatInfo: sequence of (combination, count) entries; only the combination is read
    positionDic: combination -> list of (sequence index, fragment index)
    commonCount: maximum number of entries of commonRepeatInfo to process

    Return
    positionList: list of (sequence index, base offset) tuples sorted by sequence index.
                  The offset is the sum of the preceding fragment lengths plus
                  cutterLen (module-level value) for each preceding fragment.
    '''
    positionList = []
    # Slice instead of indexing range(commonCount): commonRepeatInfo may hold fewer
    # entries than commonCount, which used to raise IndexError.
    for entry in commonRepeatInfo[:max(commonCount, 0)]:
        for occurrence in positionDic[entry[0]]:
            chrIndex, fragmentIndex = occurrence[0], occurrence[1]
            positionValue = sum(fragmentsLenList[chrIndex][:fragmentIndex]) + (cutterLen * fragmentIndex)
            positionList.append((chrIndex, positionValue))
    # Stable sort on the sequence index only, so offsets keep their discovery order.
    positionList.sort(key=lambda tup: tup[0])
    return positionList

In [5]:
# Repeat Position {chr i: startIndex1, startIndex2, startIndex3 ...}
def commonPositionListToDic(commonPositionList):
    '''
    Group (key, value) pairs into a dict of lists.

    Parameter
    commonPositionList: iterable of entries whose item 0 is the grouping key and
                        item 1 is the value to collect

    Return
    commonPositionDic: key -> list of values, in the order they appeared
    '''
    commonPositionDic = dict()
    # A plain loop with setdefault replaces the side-effect list comprehension, which
    # built a throwaway list and rebuilt list(keys()) for every membership test.
    for entry in commonPositionList:
        commonPositionDic.setdefault(entry[0], []).append(entry[1])
    return commonPositionDic

In [1]:
def printCommonPositionDic(commonPositionDic):
    '''
    Print every key of commonPositionDic followed by its values, comma separated.

    The value lists are sorted in place before printing, so the caller's dict is modified.
    '''
    for key, values in commonPositionDic.items():
        values.sort()
        joined = ','.join(map(str, values))
        print(f"\n{key}:\n{joined}\n")

In [None]:
def topRepeatSeqCounter(fragmentsSeqList, commonRepeatInfo, positionDic, fragmentN):
    '''
    Tally the actual sequences found at every occurrence of the top repeat.

    Parameter
    fragmentsSeqList: per-sequence lists of fragment sequences
    commonRepeatInfo: sequence of (combination, count) entries; only entry 0 is used
    positionDic: combination -> list of (sequence index, fragment index)
    fragmentN: number of consecutive fragments joined into the combined sequence

    Return
    singleRepeatSeqCount: Counter of the first fragment at each occurrence
    comRepeatSeqCount: Counter of the fragmentN fragments joined together at each occurrence
    '''
    topEntry = commonRepeatInfo[0]
    topRepeatValue = topEntry[0]
    topRepeatCount = topEntry[1]
    print(f"the top repeat sequence: {topRepeatValue},{topRepeatCount}")

    firstFragments = []
    joinedFragments = []
    for occurrence in positionDic[topRepeatValue]:
        chrIndex = occurrence[0]
        fragmentIndex = occurrence[1]
        firstFragment = fragmentsSeqList[chrIndex][fragmentIndex]
        window = fragmentsSeqList[chrIndex][fragmentIndex:fragmentIndex+fragmentN]
        firstFragments.append(firstFragment)
        joinedFragments.append(Seq('').join(window))

    return Counter(firstFragments), Counter(joinedFragments)

In [None]:
def printTopSeqPercentageTable(repeatSeqCount, showPartialSeq=True):
    '''
    Print the most common sequences with their count and share of the total count.

    Parameter
    repeatSeqCount: Counter of sequences
    showPartialSeq: when truthy only the first 10 items of each sequence are shown

    The number of rows is limited by the module-level commonCount.
    '''
    repeatTable = PrettyTable(['Seq', 'Count', 'Percentage(%)'])
    sumCount = sum(repeatSeqCount.values())
    # Iterate over what most_common() actually returned: it may hold fewer than
    # commonCount entries, which used to raise IndexError when indexed by range().
    for seq, currentCount in repeatSeqCount.most_common(commonCount):
        currentSeq = seq[:10] if showPartialSeq else seq
        # A Counter may hold only zero counts; avoid dividing by a zero total.
        currentPro = currentCount*100 / sumCount if sumCount else 0.0
        repeatTable.add_row([currentSeq, currentCount, currentPro])
    print(repeatTable)

In [None]:
def integrateRepeatInfo(fragmentsSeqList, fragmentsLenList, allRepeatSeq, positionDic, repeatCount):
    '''
    Bundle each repeated combination with its count and the details of every occurrence.

    Parameter
    fragmentsSeqList: per-sequence lists of fragment sequences
    fragmentsLenList: per-sequence lists of fragment lengths
    allRepeatSeq: indexable collection of (fragmentLenList, count) entries
    positionDic: fragmentLenList -> list of (chrIdx, fragmentIdx)
    repeatCount: number of entries of allRepeatSeq to process (an integer here)

    Return
    repeatInfoList: SeqRepeatInfo [(fragmentLenList, count, position: PositionInfo(chrIdx, fragmentIdx, baseIdx, seq)), ... ]

    cutterLen and fragmentN are read from module level.
    '''
    repeatInfoList = []
    for repeatIdx in range(repeatCount):
        lenData = allRepeatSeq[repeatIdx][0]
        countData = allRepeatSeq[repeatIdx][1]
        occurrences = positionDic[lenData]
        positionList = []
        for occurrence in occurrences:
            chrIdx, fragmentIdx = occurrence[0], occurrence[1]
            # base offset = preceding fragment lengths + one cutterLen per preceding fragment
            precedingLen = sum(fragmentsLenList[chrIdx][:fragmentIdx])
            baseIdx = precedingLen + (cutterLen * fragmentIdx)
            window = fragmentsSeqList[chrIdx][fragmentIdx:fragmentIdx+fragmentN]
            seq = Seq('').join(window)
            positionList.append(PositionInfo(chrIdx, fragmentIdx, baseIdx, seq))
        repeatInfoList.append(SeqRepeatInfo(lenData, countData, positionList))
    return repeatInfoList

In [1]:
def getSeqPermutation(repeatInfoList):
    '''
    Build, for every entry of repeatInfoList, the list of all ordered pairs of its positions.

    Return
    seqPermutation: one list per entry containing every 2-permutation of entry.position,
                    so both (a, b) and (b, a) are present
    '''
    return [list(permutations(info.position, 2)) for info in repeatInfoList]

In [None]:
def evaluateRepeat(seq1, seq2, match=1, mismatch=-0.5):
    '''
    Score two sequences position by position.

    Parameter
    seq1, seq2: sequences to compare; only the overlapping prefix (shorter length) is scored
    match: score added when both items are equal and not '-'
    mismatch: score added when the items differ and neither is '-'

    Return
    RepeatEvaInfo(score, seqLength, mismatchRatio) where seqLength is the longer of the
    two lengths and mismatchRatio is mismatches / seqLength (0.0 when both are empty)
    '''
    score = 0
    mismatchCount = 0
    for base1, base2 in zip(seq1, seq2):
        if base1 == base2 and base1 != '-':
            score += match
        elif base1 != base2 and base1 != '-' and base2 != '-':
            score += mismatch
            mismatchCount += 1
        # positions involving a gap ('-') contribute nothing
    seqLength = max(len(seq1), len(seq2))
    # Two empty sequences used to raise ZeroDivisionError here.
    mismatchRatio = mismatchCount/seqLength if seqLength else 0.0
    return RepeatEvaInfo(score ,seqLength, mismatchRatio)

In [2]:
def generateOutputFile(seqPermutation, outputPath='output.txt', maxGroups=3):
    '''
    Write every sufficiently similar sequence pair to a text file.

    Parameter
    seqPermutation: list of groups, each a list of (PositionInfo, PositionInfo) pairs
    outputPath: file to (over)write; defaults to 'output.txt'
    maxGroups: only the first maxGroups groups are processed (default 3);
               pass None to process every group. Expected to be non-negative.

    A pair is written when its score exceeds 60% of the compared length.
    '''
    with open(outputPath, 'w') as outputFile:
        # Slicing tolerates fewer than maxGroups groups; the former range(3)
        # indexing raised IndexError in that case.
        for pairs in seqPermutation[:maxGroups]:
            for pair in pairs:
                # type(pair[0 or 1]): PositionInfo
                seq1 = pair[0]
                seq2 = pair[1]
                repeatEvaInfo = evaluateRepeat(seq1.seq , seq2.seq)
                if repeatEvaInfo.score > repeatEvaInfo.length*0.6:
                    output = f"""score:{repeatEvaInfo.score}, length:{repeatEvaInfo.length}, mismatch:{repeatEvaInfo.mismatchRatio}\nSeq1:({seq1.chrIdx}, {seq1.baseIdx}) {seq1.seq}\nSeq2:({seq2.chrIdx}, {seq2.baseIdx}) {seq2.seq}\n\n"""
                    outputFile.write(output)