In [1]:
# !conda install -c bioconda seqkit
# !pip3 install biopython
# This 2.0 version of the telo boundary script will use a slightly more advanced algorithm to identify 
# the telomere boundary. The algorithm will in effect look for a discontinuity in the telomere pattern, 
# marking the point where the sequence changes from telomere to non-telomere as the boundary. 

from Bio import SeqIO
from Bio.Seq import Seq
import numpy as np
import re
import matplotlib.pyplot as plt
from TeloBP import *
import constants as c
import statistics

In [2]:
import threading
import time

def testFunction(output, sample):
    for i in range(sample-10, sample):
        output.append(i)
        # time.sleep(1)

# Threading smoke test: two threads run testFunction and append to one shared list.
sharedOut = []

# One worker per upper bound; t1 handles 10 and t2 handles 30.
t1, t2 = (
    threading.Thread(target=testFunction, args=(sharedOut, upperBound))
    for upperBound in (10, 30)
)

# Start both workers before waiting on either of them.
for worker in (t1, t2):
    worker.start()
for worker in (t1, t2):
    worker.join()

print(f"output: {sharedOut}")

output: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]


In [4]:


def bruteForceTeloBPArgs(offsetScoresByValuesIn, windowMin, windowMax,
                         filename="Data/GCA_009914755.4_T2T-CHM13v2.0_genomic.500kb.ends.fna"):
    """Grid-search getTeloBoundary parameters and record the error of each combination.

    For every combination of
      * teloWindow in range(windowMin, windowMax, 6),
      * nucleotideGraphAreaWindowSize in range(6, 600, 6),
      * windowStep in range(6, 12, 3) (i.e. 6 and 9),
    getTeloBoundary is run on every record of the FASTA file and the value
    returned by testTeloLength is collected as an absolute offset.

    Args:
        offsetScoresByValuesIn: list that receives one row per combination:
            [teloWindow, nucleotideGraphAreaWindowSize, windowStep,
             sum, mean, median] of the absolute offsets. It is mutated in place.
        windowMin: first teloWindow value to try (inclusive).
        windowMax: end of the teloWindow range (exclusive).
        filename: FASTA file to evaluate; defaults to the CHM13 500kb-ends file.

    Returns:
        None. Results are appended to offsetScoresByValuesIn.

    Raises:
        statistics.StatisticsError: if the FASTA file yields no records, since
            the mean of an empty score list is undefined.

    Records whose id contains "p" are scored on their first 9000 bases with
    isGStrand=False; all other records are scored on their last 9000 bases
    with isGStrand=True.
    NOTE(review): testTeloLength presumably returns observed minus expected
    boundary position for the named chromosome end - confirm in TeloBP.
    """
    teloWindowValues = range(windowMin, windowMax, 6)
    if not teloWindowValues:
        # Empty window range: nothing to evaluate, so the file is never opened.
        return

    # Parse the FASTA once instead of once per parameter combination, and keep
    # only the terminal slice that getTeloBoundary is actually given.
    chromosomeEnds = []
    for record in SeqIO.parse(filename, "fasta"):
        chrName = record.id
        isPArm = "p" in chrName
        terminalSeq = record.seq[:9000] if isPArm else record.seq[-9000:]
        chromosomeEnds.append((chrName, isPArm, terminalSeq))

    for teloWindowValue in teloWindowValues:
        for nucleotideGraphSize in range(6, 600, 6):
            for windowStepValue in range(6, 12, 3):
                currentOffsetScores = []
                for chrName, isPArm, terminalSeq in chromosomeEnds:
                    # p arms are not the G strand; q arms are.
                    boundaryDistance = getTeloBoundary(
                        terminalSeq,
                        isGStrand=not isPArm,
                        teloWindow=teloWindowValue,
                        nucleotideGraphAreaWindowSize=nucleotideGraphSize,
                        windowStep=windowStepValue,
                        maxAreaThreshold=-60,
                        minAreaThreshold=-20,
                        showGraphs=False,
                    )
                    offset = testTeloLength(chrName, boundaryDistance, c.manualLabelsCHM13)
                    # Score by magnitude so over- and under-shoots do not cancel out.
                    currentOffsetScores.append(abs(offset))
                offsetScoresByValuesIn.append([
                    teloWindowValue,
                    nucleotideGraphSize,
                    windowStepValue,
                    sum(currentOffsetScores),
                    statistics.mean(currentOffsetScores),
                    statistics.median(currentOffsetScores),
                ])
# print(offsetScoresByValues)

    # break
# write_bed_file("./teloBoundaryOutput/TeloBP%2.0Values.bed", bed_data)

In [5]:
# Run the parameter search over teloWindow values 30..210, split into six
# 30-wide slices with one thread per slice. Every thread appends its result
# rows to the shared offsetScoresByValues list.
# NOTE(review): if getTeloBoundary is CPU-bound pure Python, CPython's GIL will
# keep these threads from running in parallel - confirm whether processes are needed.
offsetScoresByValues = []

t1, t2, t3, t4, t5, t6 = (
    threading.Thread(
        target=bruteForceTeloBPArgs,
        args=(offsetScoresByValues, lowerBound, lowerBound + 30),
    )
    for lowerBound in range(30, 210, 30)
)
searchThreads = (t1, t2, t3, t4, t5, t6)

# Start every worker before waiting on any of them.
for worker in searchThreads:
    worker.start()
for worker in searchThreads:
    worker.join()

print(offsetScoresByValues)

chr01q offset: -96bp (obs - exp)
chr01q offset: 0bp (obs - exp)
chr01q offset: -24bp (obs - exp)
chr01q offset: -6bp (obs - exp)chr01q offset: -12bp (obs - exp)

chr02q offset: 24bp (obs - exp)
chr01q offset: -6bp (obs - exp)
chr02q offset: 6bp (obs - exp)
chr02q offset: 6bp (obs - exp)
chr02q offset: 6bp (obs - exp)
chr03q offset: 107bp (obs - exp)
chr02q offset: 0bp (obs - exp)
chr02q offset: 6bp (obs - exp)
chr03q offset: -1bp (obs - exp)
chr03q offset: 29bp (obs - exp)
chr04q offset: 31bp (obs - exp)
chr03q offset: -1bp (obs - exp)
chr03q offset: -1bp (obs - exp)
chr03q offset: -1bp (obs - exp)
chr05q offset: 60bp (obs - exp)
chr04q offset: -35bp (obs - exp)
chr04q offset: 1bp (obs - exp)
chr04q offset: -5bp (obs - exp)
chr04q offset: -11bp (obs - exp)
chr04q offset: -11bp (obs - exp)
chr06q offset: 84bp (obs - exp)
chr05q offset: -30bp (obs - exp)
chr06q offset: -24bp (obs - exp)
chr05q offset: 24bp (obs - exp)
chr07q offset: -2bp (obs - exp)
chr05q offset: -6bp (obs - exp)
chr05q

  areaList.append((area/windowSize))


chr19p offset: -4bp (obs - exp)
chrYp offset: -24bp (obs - exp)
No telo boundary found on q end
chr01q offset: -3211bp (obs - exp)
chr19p offset: -22bp (obs - exp)
No telo boundary found on q end
chr02q offset: -2623bp (obs - exp)
No telo boundary found on q end
chr01q offset: -3211bp (obs - exp)
chr22p offset: 1bp (obs - exp)
No telo boundary found on q end
chr02q offset: -2623bp (obs - exp)
chr20p offset: 3bp (obs - exp)
No telo boundary found on q end
chr02q offset: -2623bp (obs - exp)
No telo boundary found on q end
chr03q offset: -4616bp (obs - exp)
No telo boundary found on q end
chr03q offset: -4616bp (obs - exp)
No telo boundary found on q end
chr03q offset: -4616bp (obs - exp)
chrXp offset: -8bp (obs - exp)
chr20p offset: -3bp (obs - exp)
No telo boundary found on q end
chr04q offset: -2322bp (obs - exp)
No telo boundary found on q end
chr04q offset: -2322bp (obs - exp)
chr21p offset: -30bp (obs - exp)
chr21p offset: -36bp (obs - exp)
chrYp offset: -12bp (obs - exp)
No telo bo

chr16q offset: 83bp (obs - exp)
chr22q offset: -10bp (obs - exp)
chr22q offset: -22bp (obs - exp)
chr16q offset: -37bp (obs - exp)chr16p offset: -1bp (obs - exp)

chr17q offset: 81bp (obs - exp)
chrXq offset: 15bp (obs - exp)
chrXq offset: -21bp (obs - exp)
chr17p offset: 5bp (obs - exp)
chr10p offset: -3bp (obs - exp)
chrYq offset: 32bp (obs - exp)
chr17q offset: -39bp (obs - exp)
chr18q offset: 117bp (obs - exp)
chr18p offset: 289bp (obs - exp)
chrYq offset: -16bp (obs - exp)
chr01p offset: 2bp (obs - exp)
chr18q offset: -3bp (obs - exp)
chr11p offset: 107bp (obs - exp)
chr19q offset: 90bp (obs - exp)
chr19q offset: 90bp (obs - exp)
chr19p offset: -7bp (obs - exp)
chr02p offset: -17bp (obs - exp)
chr20q offset: -15bp (obs - exp)
chr12p offset: -11bp (obs - exp)
chr01p offset: 2bp (obs - exp)
chr20q offset: -39bp (obs - exp)
chr21q offset: -1bp (obs - exp)


In [6]:
# Save offsetScoresByValues to a text file, one result row per line.
# Each row is written using its Python list formatting.
import os

outputPath = "./output/offsetScoresByValues.txt"
# Create the output directory first; open(..., "w") does not create missing
# parent directories and would otherwise raise FileNotFoundError.
os.makedirs(os.path.dirname(outputPath), exist_ok=True)
with open(outputPath, "w") as f:
    for item in offsetScoresByValues:
        f.write("%s\n" % item)

In [7]:
# Report the parameter combination with the lowest summed, mean and median
# absolute offset. Columns 3, 4 and 5 of each row hold sum, mean and median.
sumSorted = list(offsetScoresByValues)
sumSorted.sort(key=lambda row: row[3])

meanSorted = list(offsetScoresByValues)
meanSorted.sort(key=lambda row: row[4])

medianSorted = list(offsetScoresByValues)
medianSorted.sort(key=lambda row: row[5])

# After an ascending sort, the first row is the best-scoring combination.
print(f"sumSorted: {sumSorted[0]}")
print(f"meanSorted: {meanSorted[0]}")
print(f"medianSorted: {medianSorted[0]}")

sumSorted: [120, 6, 9, -142459, -2967.8958333333335, -2838.0]
meanSorted: [120, 6, 9, -142459, -2967.8958333333335, -2838.0]
medianSorted: [120, 6, 9, -142459, -2967.8958333333335, -2838.0]


In [3]:


# Evaluate one fixed parameter set (teloWindow=120, area window=120, step=6)
# on every record of the FASTA file, collecting a BED-style row and the
# testTeloLength offset for each chromosome end.
# NOTE(review): the whole record is passed to getTeloBoundary here, whereas
# bruteForceTeloBPArgs passes only the 9000-base terminal slice - confirm intended.
bed_data = []
offsetScores = []

filename = "Data/GCA_009914755.4_T2T-CHM13v2.0_genomic.500kb.ends.fna"
for record in SeqIO.parse(filename, "fasta"):
    chrName = record.id
    print(chrName)

    # Records whose id contains "p" are scored with isGStrand=False, all others with True.
    isPArm = "p" in chrName
    boundary = getTeloBoundary(
        record.seq[:],
        isGStrand=not isPArm,
        teloWindow=120,
        nucleotideGraphAreaWindowSize=120,
        windowStep=6,
        maxAreaThreshold=-60,
        minAreaThreshold=-20,
        showGraphs=False,
    )

    if isPArm:
        print("p end: " + str(boundary))
        # p end: the 6 bp interval ends at the boundary, counted from the record start.
        bed_data.append([chrName, boundary - 6, boundary, chrName])
    else:
        print("q end: " + str(boundary))
        # q end: the boundary is counted from the record end, so convert to a start-based index.
        qEndIndex = len(record.seq) - boundary
        bed_data.append([chrName, qEndIndex, qEndIndex + 6, chrName])

    offset = testTeloLength(chrName, boundary, c.manualLabelsCHM13)
    offsetScores.append([chrName, offset])
        
    # break
# write_bed_file("./teloBoundaryOutput/TeloBP%2.0Values.bed", bed_data)

chr01q
q end: 3204
chr01q offset: -6bp (obs - exp)
chr02q
q end: 2718
chr02q offset: 96bp (obs - exp)
chr03q
q end: 4722
chr03q offset: 107bp (obs - exp)
chr04q
q end: 2310
chr04q offset: -11bp (obs - exp)
chr05q
q end: 1518
chr05q offset: -12bp (obs - exp)
chr06q
q end: 2754
chr06q offset: -24bp (obs - exp)
chr07q
q end: 2178
chr07q offset: -44bp (obs - exp)
chr08q
q end: 2616
chr08q offset: -5bp (obs - exp)
chr09q
q end: 2970
chr09q offset: -7bp (obs - exp)
chr10q
q end: 3162
chr10q offset: -41bp (obs - exp)
chr11q
q end: 2766
chr11q offset: 177bp (obs - exp)
chr12q
q end: 2304
chr12q offset: -35bp (obs - exp)
chr13q
q end: 3498
chr13q offset: -8bp (obs - exp)
chr14q
q end: 1650
chr14q offset: -2bp (obs - exp)
chr15q
q end: 2892
chr15q offset: -37bp (obs - exp)
chr16q
q end: 2676
chr16q offset: -7bp (obs - exp)
chr17q
q end: 3012
chr17q offset: -9bp (obs - exp)
chr18q
q end: 3576
chr18q offset: 87bp (obs - exp)
chr19q
q end: 3030
chr19q offset: 90bp (obs - exp)
chr20q
q end: 3132
chr

In [5]:


# Score every record of the FASTA file with a single parameter set
# (teloWindow=120, nucleotideGraphAreaWindowSize=120, windowStep=6), building
# BED-style rows in bed_data and [name, offset] pairs in offsetScores.
# NOTE(review): getTeloBoundary receives the full record here rather than a
# terminal slice; the recorded output shows very large offsets - confirm intended.
bed_data = []
offsetScores = []

filename = "Data/GCA_009914755.4_T2T-CHM13v2.0_genomic.500kb.ends.fna"
for record in SeqIO.parse(filename, "fasta"):
    chrName = record.id
    print(chrName)

    onPArm = "p" in chrName
    boundary = getTeloBoundary(record.seq[:], isGStrand=not onPArm, teloWindow=120,
                               nucleotideGraphAreaWindowSize=120, windowStep=6,
                               maxAreaThreshold=-60, minAreaThreshold=-20,
                               showGraphs=False)
    print(("p end: " if onPArm else "q end: ") + str(boundary))

    # 6 bp BED interval: it ends at the boundary for p ends, and starts at the
    # boundary (converted from an end-based distance) for q ends.
    bedStart = boundary - 6 if onPArm else len(record.seq) - boundary
    bed_data.append([chrName, bedStart, bedStart + 6, chrName])

    offset = testTeloLength(chrName, boundary, c.manualLabelsCHM13)
    offsetScores.append([chrName, offset])
        
    # break
# write_bed_file("./teloBoundaryOutput/TeloBP%2.0Values.bed", bed_data)

chr01q
q end: 372660
chr01q offset: 369450bp (obs - exp)
chr02q
q end: 306870
chr02q offset: 304248bp (obs - exp)
chr03q
q end: 458322
chr03q offset: 453707bp (obs - exp)
chr04q
q end: 494844
chr04q offset: 492523bp (obs - exp)
chr05q
q end: 387504
chr05q offset: 385974bp (obs - exp)
chr06q
q end: 218106
chr06q offset: 215328bp (obs - exp)
chr07q
q end: 52110
chr07q offset: 49888bp (obs - exp)
chr08q
q end: 386724
chr08q offset: 384103bp (obs - exp)
chr09q
q end: 117714
chr09q offset: 114737bp (obs - exp)
chr10q
q end: 444996
chr10q offset: 441793bp (obs - exp)
chr11q
q end: 2766
chr11q offset: 177bp (obs - exp)
chr12q
q end: 319656
chr12q offset: 317317bp (obs - exp)
chr13q
q end: 410526
chr13q offset: 407020bp (obs - exp)
chr14q
q end: 1650
chr14q offset: -2bp (obs - exp)
chr15q
q end: 206382
chr15q offset: 203453bp (obs - exp)
chr16q
q end: 295308
chr16q offset: 292625bp (obs - exp)
chr17q
q end: 136104
chr17q offset: 133083bp (obs - exp)
chr18q
q end: 381240
chr18q offset: 377751bp