In [95]:
import re
import os
import sys
import csv
import numpy as np 
import pandas as pd
import seaborn as sns
from matplotlib_venn import venn2
from prettytable import PrettyTable
from matplotlib import pyplot as plt
from Bio import SeqIO, pairwise2, AlignIO
from collections import Counter, namedtuple
from Bio.Align import AlignInfo, MultipleSeqAlignment
import importlib
# importlib.reload(sys.modules['Evaluation.DfamEvaluation'])
from Sequence import Sequence
from Util.SeqUtil import seqInfo, parseFasta
from Evaluation.DfamEvaluation import DfamEvaluation
from DataStructure import PositionInfo, refSeqSimilarityInfo
from SharedInfo import currDatasetName, cutterA, cutterB, colorA, colorB
from Util.AnalysisUtil import listToSortedCounter, getStatisticData, mostCommonTable
from Util.PlotUtil import basicPlot, twoLabelBasicPlot, lengthScatterDistributionPlot
from MultipleCutter import MultipleCutter


In [96]:
# ROO_LTR_df = pd.read_csv("./Evaluation/Source/chrX_ROO_LTR_repeatSeq.csv")
# ROO_counter = Counter(list(ROO_LTR_df['length']))
# ROO_counter.most_common(len(ROO_counter))

In [97]:
# Cutter A pipeline: build a Sequence for cutterA, parse the FASTA, cut it into
# fragments, then look for repeated fragments (see the recorded log below).
seqA = Sequence(cutterA)
parseFastaA = seqA.parseFasta()
# Returns two values, unpacked here as a fragment-length list and a fragment-sequence list.
fragmentLenListA, fragmentSeqListA = seqA.parseSeqByCutter()
repeatInfoListA = seqA.findRepeatSeqs(lengthLimit=False)
# NOTE(review): filterRepeatInfoA is not referenced again in the visible cells;
# presumably kept for inspection or for its effect on seqA's internal state — confirm.
filterRepeatInfoA = seqA.filterRepeatInfo()
# filter=False: presumably returns the unfiltered repeat positions — TODO confirm in Sequence.
repeatPositionListA = seqA.getRepeatPositionList(filter=False)

...start parsing dm6/chrX_sequence.fasta fasta file ...
...cost0.15546798706054688 sec to parse fasta file ...
...start parse seq by cutter: GATC
...cost 0.5550940036773682 sec to cut sequence
... start finding repeat seq ...
...cost0.0654139518737793 sec to finding repeat seq  ...


In [98]:
# Cutter B pipeline: same steps as the cutter A cell, but using cutterB
# (see the recorded log below).
seqB = Sequence(cutterB)
parseFastaB = seqB.parseFasta()
# Returns two values, unpacked here as a fragment-length list and a fragment-sequence list.
fragmentLenListB, fragmentSeqListB = seqB.parseSeqByCutter()
repeatInfoListB = seqB.findRepeatSeqs(lengthLimit=False)
# NOTE(review): filterRepeatInfoB is not referenced again in the visible cells;
# presumably kept for inspection or for its effect on seqB's internal state — confirm.
filterRepeatInfoB = seqB.filterRepeatInfo()
# filter=False: presumably returns the unfiltered repeat positions — TODO confirm in Sequence.
repeatPositionListB = seqB.getRepeatPositionList(filter=False)

...start parsing dm6/chrX_sequence.fasta fasta file ...
...cost0.1565690040588379 sec to parse fasta file ...
...start parse seq by cutter: AAGCTT
...cost 0.32227516174316406 sec to cut sequence
... start finding repeat seq ...
...cost0.006182193756103516 sec to finding repeat seq  ...


In [99]:
# Print a short dataset summary (dataset name, number of sequences, total length)
# for the FASTA parsed by the cutter A pipeline.
seqInfo(currDatasetName, parseFastaA)

dm6/chrX_sequence dataset
 number of sequence:1
 total length:23542271



In [100]:
# Combine the repeat positions found with cutter A and cutter B (A entries first).
repeatPositionList = repeatPositionListA + repeatPositionListB

In [101]:
# Sanity check: the combined list length should equal the sum of the two per-cutter lengths.
print(f'Check cutter A, B: \n {len(repeatPositionList)} = {len(repeatPositionListA)} + {len(repeatPositionListB)}')

Check cutter A, B: 
 69683 = 66728 + 2955


In [102]:
# Peek at the first entry to see the PositionInfo(startIdx, endIdx) record shape.
# NOTE(review): the recorded output shows startIdx=-4 — confirm a negative start index is intended.
repeatPositionList[0]

PositionInfo(startIdx=-4, endIdx=432)

In [103]:
# Export every repeat position (cutter A + cutter B) with its length to CSV.
#
# The rows are collected first and the DataFrame is built in a single call:
# DataFrame.append copied the whole frame on every iteration (quadratic for the
# ~70k positions here) and was removed in pandas 2.0.
positionRecords = [
    {
        'startIdx': position.startIdx,
        'endIdx': position.endIdx,
        'length': position.endIdx - position.startIdx,
    }
    for position in repeatPositionList
]
# Passing `columns` keeps the three expected headers even when the list is empty.
df = pd.DataFrame(positionRecords, columns=['startIdx', 'endIdx', 'length'])
df.to_csv(f'../outputFile/PartialSeqInfo/NonFilter_Position.csv')

In [122]:
# Combine the repeat positions of both cutters over the chromosome.
# chrLength is the length of the first (index 0) parsed sequence.
multipleCutter = MultipleCutter(chrLength=len(parseFastaA[0]),repeatPositionList = repeatPositionList )
seqStateList = multipleCutter.seqStateGenerator()
unMatchState, unionState, intersectionState = multipleCutter.getSeqStateInfo()
# State to extract; the [ ... Intersection or Union ] comment on the Dfam cell
# suggests "intersection" is the alternative value — TODO confirm in MultipleCutter.
stateName="union"
matchStateIdxList = multipleCutter.getSpecificStateIdxList(stateName)
# NOTE(review): takes no arguments, so it presumably relies on the state selected by
# the preceding getSpecificStateIdxList call — keep these two calls in this order.
matchStatePositionList = multipleCutter.getSpecificStatePositionList()

In [None]:
# Number of position entries obtained for the selected state (stateName).
len(matchStatePositionList)

1474

In [106]:
# df = pd.DataFrame(columns=['startIdx', 'endIdx', 'length'])
# for i in matchStatePositionList:
#     df = df.append({'startIdx': i.startIdx, 'endIdx': i.endIdx, 'length': i.endIdx-i.startIdx}, ignore_index=True)
# df.to_csv(f'../outputFile/PartialSeqInfo/{stateName}.csv')

In [107]:
# df = pd.DataFrame(columns=['startIdx', 'endIdx', 'length'])
# for i in matchStatePositionList:
#     df = df.append({'startIdx': i.startIdx, 'endIdx': i.endIdx, 'length': i.endIdx-i.startIdx}, ignore_index=True)
# df['length'] = df['length'].astype('int32')
# df['length'].describe()
# df.to_csv(f'../outputFile/txtFile/{stateName}AndNonFilter_Position.csv')

In [108]:
# repeatPositionList = matchStatePositionList

In [109]:
# [ MultipleCutter, Intersection or Union ]
# Evaluate the positions selected by MultipleCutter (stateName) against the Dfam hit file.
dfam = DfamEvaluation(matchStatePositionList, hitFileName='chrX_LTR_dm6_dfam.nrph.hits')
repeatPositionLookupDic = dfam.positionBucketClassifier()
dfamPositionList = dfam.getDfamPositionList()

# Starting from the Dfam hits, check each one against the repeat positions.
DRrepeatMatchList, DRmatchedFamilyAccList, DRmatchedFamilyNameList = dfam.checkDfamMatchWithRepeat()

# Starting from the repeat positions, check each one against the Dfam hits (disabled).
# RDrepeatMatchList, RDmatchedFamilyAccList, RDmatchedFamilyNameList = dfam.checkRepeatMatchWithDfam()

# dfam.familyMatchRatio(DRmatchedFamilyAccList)
# Prints matchCount / dfamCount / Ratio and returns the ratio (shown as the cell output).
dfam.matchRatio(DRrepeatMatchList)
# unMatchDf = dfam.getUnmatchInfo(DRrepeatMatchList)

matchCount:255	dfamCount:602	Ratio:0.42358803986710963


0.42358803986710963

In [110]:
# [ Non-identical ]
# Evaluate the full, non-filtered repeat position list (cutter A + cutter B)
# against the same Dfam hit file, as a baseline for the MultipleCutter result.
dfam2 = DfamEvaluation(repeatPositionList, hitFileName='chrX_LTR_dm6_dfam.nrph.hits')
repeatPositionLookupDic2 = dfam2.positionBucketClassifier()
dfamPositionList2 = dfam2.getDfamPositionList()

# Starting from the Dfam hits, check each one against the repeat positions.
DRrepeatMatchList2, DRmatchedFamilyAccList2, DRmatchedFamilyNameList2 = dfam2.checkDfamMatchWithRepeat()

# Report the ratio on this cell's own evaluator. The original called
# dfam.matchRatio(...), which silently depended on the previous cell's object.
dfam2.matchRatio(DRrepeatMatchList2)

matchCount:597	dfamCount:602	Ratio:0.9916943521594684


0.9916943521594684

In [111]:
def getSequenceLengthAnalsis(inputLengthList, num=10):
    """
    Run the standard three-step report on a collection of sequence lengths.

    Steps, in order:
      1. table of the `num` most frequent lengths
      2. statistic info for the lengths
      3. distribution plot of the sorted length counts

    Parameters:
        inputLengthList: iterable of sequence lengths to analyse.
        num: how many of the most common lengths to tabulate (default 10).

    Returns:
        None. The results are produced by the helper functions.
    """
    # Step 1: frequency table of the most common lengths.
    lengthFrequency = Counter(inputLengthList)
    topLengths = lengthFrequency.most_common(num)
    mostCommonTable(topLengths, num)

    # Step 2: summary statistics.
    getStatisticData(inputLengthList)

    # Step 3: plot the sorted (length, count) distribution.
    basicPlot(listToSortedCounter(inputLengthList))

In [112]:
# totalDfam = DfamEvaluation(repeatPositionList, hitFileName="chrX_dm6_dfam.nrph.hits")
# totalRepeatPositionLookupDic = totalDfam.positionBucketClassifier()
# totalDfamPositionList = totalDfam.getDfamPositionList()
# unionAndFilter_Position = pd.read_csv('../outputFile/txtFile/unionAndFilter_Position.csv')
# intersectionAndFilter_Position = pd.read_csv('../outputFile/txtFile/intersectionAndFilter_Position.csv')
# Filter_Position = pd.read_csv('../outputFile/txtFile/Filter_Position.csv')
# NonFilter_Position = pd.read_csv('../outputFile/txtFile/NonFilter_Position.csv')

# unionAndFilter_Counter = listToSortedCounter((unionAndFilter_Position['length']))
# intersectionAndFilter_Counter = listToSortedCounter((intersectionAndFilter_Position['length']))

In [113]:
# df = pd.DataFrame(columns=["x", "y", "type"], dtype=float)
# for row in unionAndFilter_Counter:
#     df = df.append({"x": row[0], "y": row[1], "type": "unionAndFilter"}, ignore_index=True)
# for row in intersectionAndFilter_Counter:
#     df = df.append({"x": row[0], "y": row[1], "type": "intersectionAndFilter"}, ignore_index=True)

# df.fillna(np.nan, inplace=True)
# fig, ax = plt.subplots(figsize=(10, 6), dpi=300)
# sns.set_style("whitegrid")
# sns.lineplot(data=df, x="x", y="y", hue="type", palette="Set1")
# ax.set_xlabel("Length", size=15)
# ax.set_ylabel("Count", size=15)
# ax.set_xlim(0, 1500)

In [114]:
# Dfam ref sequence 
# dfamSeqLenList = [ i.endIdx - i.startIdx for i in dfamPositionList ]
# getSequenceLengthAnalsis(dfamSeqLenList)

In [115]:
# Repeat sequence
# repeatFragmentLenList = [ i.endIdx - i.startIdx for i in repeatPositionList ]
# getSequenceLengthAnalsis(repeatFragmentLenList)

In [116]:
# def consensusSeqSimilarity(consensusSeq, seqDf):
#     print("hihi", len(seqDf))
#     seqSimilarityList = []
#     for targetSeq in seqDf:
#         alignments = pairwise2.align.globalxx(targetSeq, consensusSeq)
#         targetLength = len(targetSeq)
#         similarityPercentage = round(alignments[0].score / targetLength, 2)
#         seqSimilarityList.append(similarityPercentage)
#     return seqSimilarityList

# repeatMatchIdxList = []
# for idx, value in enumerate(RDrepeatMatchList):
#     if value == True:
#         repeatMatchIdxList.append(idx)
# repeatBasePositionList = [repeatPositionList[i] for i in repeatMatchIdxList]
# repeatSeqDf = pd.DataFrame(columns=['startIdx','endIdx', 'length', 'seq'])
# for i in repeatBasePositionList:
#     repeatSeqDf = repeatSeqDf.append({'startIdx':i.startIdx ,'endIdx': i.endIdx, 'length': (i.endIdx- i.startIdx), 'seq': str(parseFastaA[0][i.startIdx:i.endIdx])}, ignore_index=True)
# conParseFasta = parseFasta(
#     "DF0001696_ROO_LTR",
#     "./Evaluation/Source/DF0001696_ROO_LTR.fa",
#     "*",
#     matchMode=False,
# )
# consensusSeq = conParseFasta[0].upper()
# repeatDf = pd.read_csv('./Evaluation/Source/chrX_ROO_LTR_repeatSeq.csv')
# seqDf = repeatDf["seq"]
# seqSimilarityList = consensusSeqSimilarity(consensusSeq, seqDf)
# pd.Series(seqSimilarityList).describe()

In [117]:
# # Test cutter A
# repeatPositionListA = seqA.getRepeatPositionList()
# dfamA = DfamEvaluation(repeatPositionListA)
# repeatPositionLookupDicA = dfamA.positionBucketClassifier()
# dfamPositionListA = dfamA.getDfamPositionList()
# dfamPositionLookupDicA = dfamA.positionBucketClassifier()
# DRrepeatMatchListA, DRmatchedFamilyAccListA, DRmatchedFamilyNameListA = dfamA.checkDfamMatchWithRepeat()

In [118]:
# totalLen = len(DRrepeatMatchListA)
# matchLenA = len(list(filter(lambda x: x, DRrepeatMatchListA)))
# ratio = matchLenA / totalLen
# print(f"matchCount:{matchLenA}\tdfamCount:{totalLen}\tRatio:{ratio}")

In [119]:
# # Test cutter B
# repeatPositionListB = seqB.getRepeatPositionList()
# dfamB = DfamEvaluation(repeatPositionListB)
# repeatPositionLookupDicB = dfamB.positionBucketClassifier()
# dfamPositionListB = dfamB.getDfamPositionList()
# dfamPositionLookupDicB = dfamB.positionBucketClassifier()
# DRrepeatMatchListB, DRmatchedFamilyAccListB, DRmatchedFamilyNameListB = dfamB.checkDfamMatchWithRepeat()

In [120]:
# totalLen = len(DRrepeatMatchListB)
# matchLenB = len(list(filter(lambda x: x, DRrepeatMatchListB)))
# ratio = matchLenB / totalLen
# print(f"matchCount:{matchLenB}\tdfamCount:{totalLen}\tRatio:{ratio}")

In [121]:
# total = 597
# middle = matchLenA+matchLenB - total
# plt.figure(linewidth=10, facecolor="white", dpi=1200)
# # plt.figure(linewidth=10, facecolor="white")
# v = venn2(subsets = (matchLenA-middle, matchLenB-middle, middle), set_labels = (f'CutterA - {cutterA} ', f'CutterB - {cutterB}'), set_colors=(colorA, colorB))
# plt.show()
