In [1]:
import re
import os
import sys
import csv
import copy
import random
import numpy as np 
import pandas as pd 
from Bio.Seq import Seq
from collections import Counter, OrderedDict, namedtuple
from prettytable import PrettyTable

import importlib
from Util.FragmentUtil import fragmentLenPlot
# from Evaluation.RepeatEvaluation import getRepeatPositionList, positionBucketClassifier, checkDfamMatch, generateDfamPositionData, getDfamPositionList, checkOutputMatch
from Util.SeqUtil import seqInfo, parseFasta, parseSeq, parseSeqByCutter
from DataInfo import currDatasetName, currDataset, datasetPath, matchPattern, cutter, cutterLen, fragmentN, commonCount
from RepeatFinder import findRepeatSeqs, integrateRepeatInfo, getIRComb, evaluateRepeat, generateIROutputFile, checkTandemRepeatExist, generateTROutputFile, generateFragmentOutputFile
from Evaluation.RepeatEvaluation import RepeatEvaluation
from Evaluation.DfamEvaluation import DfamEvaluation

# Hot-reload Evaluation/RepeatEvaluation.py so edits are picked up without
# restarting the kernel.
# importlib.reload() re-executes the module but does NOT rebind names that were
# brought in with `from ... import ...`; without the explicit re-binding below,
# later cells would keep instantiating the stale RepeatEvaluation class.
reloadedEvaluationModule = importlib.reload(sys.modules['Evaluation.RepeatEvaluation'])
RepeatEvaluation = reloadedEvaluationModule.RepeatEvaluation
reloadedEvaluationModule  # last expression: still displays the reloaded module

<module 'Evaluation.RepeatEvaluation' from '/Users/apple/Desktop/BioProject/GenomeAnalysis/src/Evaluation/RepeatEvaluation.py'>

In [2]:
from Bio import pairwise2

In [3]:
# Parse the configured FASTA dataset, then print a short summary of the result.
parseFastaSeqs = parseFasta(
    currDataset,
    datasetPath,
    matchPattern,
    matchMode=False,
)
seqInfo(currDataset, parseFastaSeqs)

...start parsing chrY.fa fasta file ...
...cost0.9186320304870605 sec to parse fasta file ...
chrY.fa dataset
 number of sequence:1
 total length:57227415



In [4]:
# Cut the parsed sequence(s) with the configured cutter and unpack the two
# results: fragment lengths and fragment sequences (per the variable names).
fragmentsLenList, fragmentsSeqList = parseSeqByCutter(parseFastaSeqs)

...start parse seq by cutter: GATC
...cost 0.21804189682006836 sec to cut sequence


In [5]:
# Search the fragment lengths for repeats; unpacks two results that the later
# integrateRepeatInfo() call consumes.
# NOTE(review): presumably a list of repeated length patterns and a dict of their positions — confirm in RepeatFinder.
repeatFragNLenList, repeatFragNPositionDict = findRepeatSeqs(fragmentsLenList)

... start finding repeat seq ...
...cost0.14251184463500977 sec to finding repeat seq  ...


In [6]:
# # tandem Repeat in N fragment
# tandemRepeatLenList = list(filter(checkTandemRepeatExist, repeatFragNLenList))
# tandemRepeatInfoList = integrateRepeatInfo(fragmentsSeqList, fragmentsLenList, tandemRepeatLenList, repeatFragNPositionDict, repeatType=1)
# generateTROutputFile(tandemRepeatInfoList, outputFileName= f"{currDatasetName}TRS", matchRatioOfSum=0.4)

In [7]:
# Interspersed repetitive sequences: build the repeat records (repeatType=2,
# filtered) that the evaluation cells consume.
# NOTE(review): `fragmentsSeqList[:1]` passes at most the first entry of
# fragmentsSeqList — confirm that restricting to it is intended.
repeatInfoList = integrateRepeatInfo(
    fragmentsSeqList[:1],
    fragmentsLenList,
    repeatFragNLenList,
    repeatFragNPositionDict,
    repeatType=2,
    filtered=True,
)
# seqPermutation = getIRComb(repeatInfoList)
# generateIROutputFile(seqPermutation, outputFileName= f"{currDatasetName}IRS", matchRatioOfSum=0.4)

In [8]:
# For Evaluation
# Wrap the repeat records in a RepeatEvaluation object, then obtain the repeat
# position list and a position lookup from it.
# NOTE(review): positionBucketClassifier() is called here with its defaults and
# takes no position list argument — presumably it works on state held by
# `evaluation`; confirm in Evaluation/RepeatEvaluation.py.
evaluation = RepeatEvaluation(repeatInfoList)
repeatPositionList = evaluation.getRepeatPositionList()
repeatPositionLookupDic = evaluation.positionBucketClassifier()

In [9]:
# generateDfamPositionData(readFileName = "hg38_dfam_nrph_chrY", outputFileName = "DfamChrYPositionData")
# dfamDatasetMatchList = evaluation.checkDfamMatch(repeatPositionLookupDic, readFileName = "DfamChrYPositionData")

In [12]:
# Load the Dfam position data from "DfamChrYPositionData" and build a lookup
# with 100 buckets.
# NOTE(review): presumably the lookup is built over the Dfam positions just
# loaded (the classifier takes no position list argument) — confirm.
dfamPositionList = evaluation.getDfamPositionList(
    readFileName="DfamChrYPositionData",
)
dfamPositionLookupDic = evaluation.positionBucketClassifier(
    bucketNum=100,
)

In [15]:
# Compare the detected repeats with the Dfam positions; unpacks the per-output
# match flags plus the matched family accessions and family names.
(
    outputMatchList,
    matchedFamilyAccList,
    matchedFamilyNameList,
) = evaluation.checkOutputMatch(
    dfamPositionList,
    dfamPositionLookupDic,
    bucketNum=100,
)

In [16]:
# Indices i where outputMatchList[i] == True; repeatPositionList[i] is the
# corresponding repeat, used to check the sequence position and bases.
# NOTE(review): `== True` is stricter than plain truthiness and is kept as-is
# to preserve the original selection.
matchIdxList = [
    matchIdx
    for matchIdx, isMatched in enumerate(outputMatchList)
    if isMatched == True
]
matchIdxList[:10]

[78]

In [17]:
# Inspect the first entry of the repeat position list.
repeatPositionList[0]

PositionInfo(startIdx=1638759, endIdx=1639250)

In [48]:
# Choose the sequence that is aligned against the consensus sequence.
# NOTE(review): the value read from repeatInfoList is immediately overwritten
# by the hard-coded literal on the next line, so the first assignment has no
# effect on targetSeq — confirm which of the two is intended.
targetSeq = repeatInfoList[0].position[0].seq
targetSeq = "TTGAAGTTCTGACCTCCTGTCAATATCCCTTCCCCTCACCTTGACCCTCCCATTCTGCCCCACCTGTCAG"
# Display the length of the sequence actually in use.
len(targetSeq)

Target: TTGAAGTTCTGACCTCCTGTCAATATCCCTTCCCCTCACCTTGACCCTCCCATTCTGCCCCACCTGTCAG
Candidate: 
1. GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGAG
2. AGCCGGGCGTGGTGGCGCGCGCCTGTAGTCCCAGCTACTCGGGAGGCTGAGGCGGGAG
3. GCTTGAGCCCAGGAGNTCGAGGCTGCAGTGAGCTAT

58

In [44]:
# Reference consensus sequence used as the first argument of the pairwise alignment.
# It contains one ambiguous base ('N') and ends in a poly-A run.
# NOTE(review): presumably the consensus of a Dfam family (per the "DF35" name) — confirm the accession.
DF35ConsensusSeq = "GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGAGGATCGCTTGAGGCCAGGAGTTCGAGACCAGCCTGGGCAACATAGCGAGACCCCGTCTCTACAAAAAATATAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAGTCCCAGCTACTCGGGAGGCTGAGGCGGGAGGATCGCTTGAGCCCAGGAGNTCGAGGCTGCAGTGAGCTATGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACCCTGTCTCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"

In [46]:
# Locally align the consensus (first sequence) against targetSeq (second sequence).
# In Bio.pairwise2 naming, "localxx" means: local alignment, identical
# characters score 1, no mismatch penalty and no gap penalties.
# NOTE(review): Bio.pairwise2 is deprecated in recent Biopython releases in
# favour of Bio.Align.PairwiseAligner — consider migrating.
alignments = pairwise2.align.localxx(DF35ConsensusSeq, targetSeq)
# Display only the first alignment returned.
alignments[0]

Alignment(seqA='GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGAGGATCGCTTGAGGCCAGGAGTTCGAGACCAGCCTGGGCAACATAGCGAGACCCCGTCTCTACAAAAAATATAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAGTCCCAGCTACTCGGGAGGCTGAGGCGGGAGGATCGCTTGAGCCCAGGAGNTCGAGGCTGCAGTGAGCTATGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACCCTGTCTCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA', seqB='--------------------------------------TT---------------GA--A--G-TT-----C------T---G---A-CCT---C--C-T-----------G----T-C-----A-AT----A-T--CC---C-T--T----C-C-CC-----T--CA-C--CT--------TGA--C--------C-C-T----CC-------C-A---T----T---C--TG--C-C-CCA---C-CT---G--T-----------C---A----G----------------------------------', score=70.0, start=38, end=278)

In [35]:
# Keep only the truthy entries of outputMatchList and report both sizes.
matchedList = [matchFlag for matchFlag in outputMatchList if matchFlag]
print(f'len(outputMatchList):{len(outputMatchList)}\nlen(matchedList):{len(matchedList)}\n')

len(outputMatchList):239
len(matchedList):117



In [36]:
# Tally the matched family names. most_common() with no argument returns a list
# of (name, count) tuples ordered from most to least frequent, so despite its
# name this variable is a list, not a Counter.
matchedFamilyNameCounter = Counter(matchedFamilyNameList).most_common()

In [37]:
# Build a two-column table with one row per matched family name and its count.
fTable = PrettyTable(['FamilyName', 'Count'])
for familyName, familyCount in matchedFamilyNameCounter:
    fTable.add_row([familyName, familyCount])

In [39]:
# Render the family-name/count table as plain text.
print(fTable)

+-------------+-------+
|  FamilyName | Count |
+-------------+-------+
|    AluSx3   |   20  |
|      L2     |   14  |
|    AluSc    |   8   |
|    AluJr4   |   8   |
|      L3     |   7   |
|    AluSg    |   6   |
|    AluJo    |   6   |
|     MIRc    |   4   |
|  L1MA1_3end |   4   |
|    AluYj4   |   4   |
|    AluSx1   |   4   |
|    AluJb    |   2   |
|    AluSx    |   2   |
|    AluSc8   |   2   |
|    AluJr    |   2   |
|  L1M3d_5end |   2   |
| L1M3de_5end |   2   |
|  L1P4_orf2  |   2   |
|    MLT1H    |   2   |
|  L1PA4_3end |   2   |
|    LTR32    |   2   |
|    MLT1K    |   2   |
| MamGypLTR1d |   2   |
|    MER20    |   2   |
|  L1M5_orf2  |   2   |
|  L1MEd_5end |   2   |
|     MIRb    |   2   |
+-------------+-------+
