In [26]:
import re
import os
import sys
import csv
import numpy as np 
import pandas as pd
import seaborn as sns
from prettytable import PrettyTable
from matplotlib import pyplot as plt
from Bio import SeqIO, pairwise2, AlignIO
from collections import Counter, namedtuple
from Bio.Align import AlignInfo, MultipleSeqAlignment
import importlib
# Pick up edits to MultipleCutter.py without restarting the kernel.
# On a fresh kernel the module is not in sys.modules yet (it is first imported
# further down in this cell), so only reload when it is already loaded;
# indexing sys.modules unconditionally raises KeyError on "Restart & Run All".
if 'MultipleCutter' in sys.modules:
    importlib.reload(sys.modules['MultipleCutter'])
from Sequence import Sequence
from MultipleCutter import MultipleCutter
from Util.SeqUtil import seqInfo, parseFasta, parseSeqByCutter
from DataStructure import refSeqSimilarityInfo
from Evaluation.DfamEvaluation import DfamEvaluation
from SharedInfo import currDatasetName, cutterA, cutterB, colorA, colorB
from Util.AnalysisUtil import listToSortedCounter, getStatisticData, mostCommonTable
from Util.PlotUtil import basicPlot, twoLabelBasicPlot, lengthScatterDistributionPlot


In [2]:
# Root folder for files written by this notebook; a relative path, so it is
# resolved against the kernel's current working directory.
outputFolder = "../outputFile"

In [3]:
# Cutter-A pipeline. Every step is a method call on the same `seqA` object and
# the names assigned here are reused by later cells.
# NOTE(review): the steps presumably depend on state set by the preceding call,
# so keep this order — confirm against the Sequence class.
seqA = Sequence(cutterA)
parseFastaA = seqA.parseFasta()
fragmentLenListA, fragmentSeqListA = seqA.parseSeqByCutter()
repeatInfoListA = seqA.findRepeatSeqs(lengthLimit=False)
filterRepeatInfoA = seqA.filterRepeatInfo() 
repeatPositionListA = seqA.getRepeatPositionList(filter=True)  # filter out identical
seqAState = seqA.seqStateGenerator()
# Write cutter A's repeat fragment file under outputFolder/seqRepeatPosition.
seqA.generateRepeatFragentFile(filePath=f'{outputFolder}/seqRepeatPosition/seq_{cutterA}.txt')
# seqA.calculateConsistencyRatio(seqA.repeatPositionTable).sum() , seqA.calculateConsistencyRatio(seqA.repeatPositionTable).mean()

...start parsing dm6/chrX_sequence.fasta fasta file ...
...cost0.13225388526916504 sec to parse fasta file ...
...start parse seq by cutter: GATC
...cost 0.7786622047424316 sec to cut sequence
... start finding repeat seq ...
...cost0.06371116638183594 sec to finding repeat seq  ...


In [4]:
# Cutter-B pipeline: the same sequence of steps as the cutter-A cell, run on
# `seqB`. The names assigned here are reused by later cells.
# NOTE(review): the steps presumably depend on state set by the preceding call,
# so keep this order — confirm against the Sequence class.
seqB = Sequence(cutterB)
parseFastaB = seqB.parseFasta()
fragmentLenListB, fragmentSeqListB = seqB.parseSeqByCutter()
repeatInfoListB = seqB.findRepeatSeqs(lengthLimit=False)
filterRepeatInfoB = seqB.filterRepeatInfo()
repeatPositionListB = seqB.getRepeatPositionList(filter=True)  # same flag as for cutter A
seqBState = seqB.seqStateGenerator()
# Write with a `.txt` extension so the file is named like cutter A's output
# (seq_<cutter>.txt); the extension was missing here.
seqB.generateRepeatFragentFile(filePath=f'{outputFolder}/seqRepeatPosition/seq_{cutterB}.txt')

...start parsing dm6/chrX_sequence.fasta fasta file ...
...cost0.11923623085021973 sec to parse fasta file ...
...start parse seq by cutter: AAGCTT
...cost 0.6895277500152588 sec to cut sequence
... start finding repeat seq ...
...cost0.005896091461181641 sec to finding repeat seq  ...


In [5]:
# Count the positions in cutter A's state vector whose value equals 1, then
# report that count as a fraction of the parsed record's length.
matchStateLen = sum(1 for state in seqAState if state == 1)
print(f'SeqState A Output Percentage: {matchStateLen / len(parseFastaA[0])}')

SeqState A Output Percentage: 0.037840232150925454


In [6]:
# Count the positions in cutter B's state vector whose value equals 1, then
# report that count as a fraction of the parsed record's length.
matchStateLen = sum(1 for state in seqBState if state == 1)
# Normalise by cutter B's own parsed record. The cell previously divided by
# parseFastaA[0] (copied from the cutter-A cell), which only gives the right
# number while both cutters parse the same FASTA file.
print(f'SeqState B Output Percentage: {matchStateLen / len(parseFastaB[0])}')

SeqState B Output Percentage: 0.006023165734520684


In [7]:
# Print a short summary of the parsed dataset; the recorded output lists the
# dataset name, the number of sequences and the total length.
seqInfo(currDatasetName, parseFastaA)

dm6/chrX_sequence dataset
 number of sequence:1
 total length:23542271



In [8]:
# Mean of each column of cutter A's consistency-ratio result; the recorded
# output shows the `length` and `consistencyRatio` columns.
seqA.calculateConsistencyRatio(seqA.repeatPositionTable).mean()

length              381.856549
consistencyRatio      0.665008
dtype: float64

In [9]:
# Mean of each column of cutter B's consistency-ratio result; the recorded
# output shows the `length` and `consistencyRatio` columns.
seqB.calculateConsistencyRatio(seqB.repeatPositionTable).mean()

length              959.271186
consistencyRatio      0.899881
dtype: float64

## Multiple cutters

In [27]:
# Combine the state vectors of both cutters over the record parsed for cutter A.
multipleCutter = MultipleCutter(sequence=parseFastaA[0], seqStateList = [seqAState, seqBState])
# mergeState is compared against 2 in a later cell to count positions in the
# intersection — presumably it is the element-wise sum of the two state
# vectors; confirm in MultipleCutter.getSeqStateSum.
mergeState = multipleCutter.getSeqStateSum()
unMatchState, unionState, intersectionState = multipleCutter.getSeqStateInfo()
# State category selected for the rest of the analysis; also referenced by the
# commented-out export cell further down.
stateName="intersection"
matchStateIdxList = multipleCutter.getSpecificStateIdxList(stateName)
# NOTE(review): called without arguments, so it presumably relies on the state
# selected by the getSpecificStateIdxList call above — keep these two in order.
matchStateRepeatInfoList = multipleCutter.getSpecificStatePositionList()

chr: 23542271
unMatch: 22630817, union:911454, intersection:121190


In [28]:
# Count the positions in the merged state vector whose value equals 2, then
# report that count as a fraction of the parsed record's length.
matchStateLen = sum(1 for state in mergeState if state == 2)
print(f'SeqState Output Percentage: {matchStateLen / len(parseFastaA[0])}')

SeqState Output Percentage: 0.005147761658167982


## Repeat Seq -> Repeat Fragment

In [29]:
# Cut the selected repeat sequences into fragments; the recorded log shows both
# cutters (AAGCTT and GATC) being applied repeatedly, one log pair per cut.
totalRepeat = multipleCutter.cutRepeatSeqToFragment()

...start parse seq by cutter: AAGCTT
...cost 0.0005171298980712891 sec to cut sequence
...start parse seq by cutter: GATC
...cost 0.0001800060272216797 sec to cut sequence
...start parse seq by cutter: AAGCTT
...cost 8.559226989746094e-05 sec to cut sequence
...start parse seq by cutter: GATC
...cost 0.00035691261291503906 sec to cut sequence
...start parse seq by cutter: AAGCTT
...cost 8.821487426757812e-05 sec to cut sequence
...start parse seq by cutter: GATC
...cost 0.0001971721649169922 sec to cut sequence
...start parse seq by cutter: AAGCTT
...cost 2.7894973754882812e-05 sec to cut sequence
...start parse seq by cutter: GATC
...cost 0.00031280517578125 sec to cut sequence
...start parse seq by cutter: GATC
...cost 4.887580871582031e-05 sec to cut sequence
...start parse seq by cutter: AAGCTT
...cost 0.00026917457580566406 sec to cut sequence
...start parse seq by cutter: GATC
...cost 4.076957702636719e-05 sec to cut sequence
...start parse seq by cutter: AAGCTT
...cost 0.0002539

In [30]:
# Group the fragments by length; the recorded run displayed 0 as this cell's value.
# NOTE(review): presumably operates on the fragments produced by
# cutRepeatSeqToFragment() in the previous cell — confirm in MultipleCutter.
multipleCutter.fragmentGroupbyLen()

0

In [None]:
# # [ Output GroupByData File ]
# df= pd.DataFrame(matchStateRepeatInfoList, columns=['length', 'startIdx', 'endIdx', 'seq'])
# matchDfGroupByLen = df.groupby(by=["length"], sort=True)
# tem = df.groupby(by=["length"]).agg({"length": "sum"})

# original_stdout = sys.stdout
# with open(f'../outputFile/seqRepeatPosition/seqGroupByLenData_{stateName}_Cutter_{cutterA}.txt', 'w') as f:
#     sys.stdout = f
#     for key, row in tem.iterrows():
#         print(f"{key}:")
#         for i in matchDfGroupByLen.get_group(key).index:
#             print(f"({ df.iloc[i]['startIdx']}, {df.iloc[i]['endIdx']})\n{ df.iloc[i]['seq']}")
#         print("\n")
#     sys.stdout = original_stdout

In [None]:
# [ Single cutter ] cutterA
# Evaluate cutter A's repeat positions against the Dfam hits file.
# NOTE(review): `dfam` and every result name below are rebound by each of the
# Dfam evaluation cells, so their values depend on which cell ran last.
dfam = DfamEvaluation(repeatPositionListA, hitFileName='chrX_dm6_dfam.nrph.hits')
repeatPositionLookupDic = dfam.positionBucketClassifier()
dfamPositionList = dfam.getDfamPositionList()

# Starting from the Dfam hits, look for matching repeats.
DRrepeatMatchList, DRmatchedFamilyAccList, DRmatchedFamilyNameList = dfam.checkDfamMatchWithRepeat()

# Starting from the repeats, look for matching Dfam hits.
RDrepeatMatchList, RDmatchedFamilyAccList, RDmatchedFamilyNameList = dfam.checkRepeatMatchWithDfam()

# dfam.familyMatchRatio(DRmatchedFamilyAccList)
# Last expression of the cell: the ratio is printed by matchRatio and also
# displayed as the cell's value.
dfam.matchRatio(DRrepeatMatchList)

matchCount:612	dfamCount:4458	Ratio:0.13728129205921938


0.13728129205921938

In [None]:
# [ Single cutter ] cutterB
# Evaluate cutter B's repeat positions against the Dfam hits file.
# NOTE(review): `dfam` and every result name below are rebound by each of the
# Dfam evaluation cells, so their values depend on which cell ran last.
dfam = DfamEvaluation(repeatPositionListB, hitFileName='chrX_dm6_dfam.nrph.hits')
repeatPositionLookupDic = dfam.positionBucketClassifier()
dfamPositionList = dfam.getDfamPositionList()

# Starting from the Dfam hits, look for matching repeats.
DRrepeatMatchList, DRmatchedFamilyAccList, DRmatchedFamilyNameList = dfam.checkDfamMatchWithRepeat()

# Starting from the repeats, look for matching Dfam hits.
RDrepeatMatchList, RDmatchedFamilyAccList, RDmatchedFamilyNameList = dfam.checkRepeatMatchWithDfam()

# dfam.familyMatchRatio(DRmatchedFamilyAccList)
dfam.matchRatio(DRrepeatMatchList)
# NOTE(review): the summary statistics printed in the recorded run have a
# negative minimum (-2588). If the summarised values are hit lengths, some hits
# may have start > end — confirm in DfamEvaluation.getUnmatchInfo.
unMatchDf = dfam.getUnmatchInfo(DRrepeatMatchList)

matchCount:138	dfamCount:4458	Ratio:0.03095558546433378
count     4320.000000
mean       600.948148
std       1057.043156
min      -2588.000000
25%        119.000000
50%        290.000000
75%        597.250000
max      11164.000000
dtype: float64


In [None]:
# [ MultipleCutter, intersection or union ]
# Evaluate the multi-cutter repeat list against the Dfam hits file. Which state
# the list represents is decided where matchStateRepeatInfoList is built
# (stateName is set to "intersection" in this notebook).
# NOTE(review): `dfam` and every result name below are rebound by each of the
# Dfam evaluation cells, so their values depend on which cell ran last.
dfam = DfamEvaluation(matchStateRepeatInfoList, hitFileName='chrX_dm6_dfam.nrph.hits')
repeatPositionLookupDic = dfam.positionBucketClassifier()
dfamPositionList = dfam.getDfamPositionList()

# Starting from the Dfam hits, look for matching repeats.
DRrepeatMatchList, DRmatchedFamilyAccList, DRmatchedFamilyNameList = dfam.checkDfamMatchWithRepeat()

# Starting from the repeats, look for matching Dfam hits.
RDrepeatMatchList, RDmatchedFamilyAccList, RDmatchedFamilyNameList = dfam.checkRepeatMatchWithDfam()

# dfam.familyMatchRatio(DRmatchedFamilyAccList)
dfam.matchRatio(DRrepeatMatchList)
# NOTE(review): the summary statistics printed in the recorded run have a
# negative minimum (-2588). If the summarised values are hit lengths, some hits
# may have start > end — confirm in DfamEvaluation.getUnmatchInfo.
unMatchDf = dfam.getUnmatchInfo(DRrepeatMatchList)

matchCount:117	dfamCount:4458	Ratio:0.026244952893674293
count     4341.000000
mean       602.378715
std       1058.456198
min      -2588.000000
25%        119.000000
50%        292.000000
75%        599.000000
max      11164.000000
dtype: float64
