TeloNP Demo + Combining Reads with Alignment Tables

In [None]:
from Bio import SeqIO
import sys
import os
import gzip
import numpy as np
import pandas as pd

sys.path.insert(0, '../TeloBP/')
from TeloBP import *
import constants as c

Loading in data

In [None]:
def getSampleKeyFromFilename(file):
    """Build a sample key of the form "<F token>_<NB token>" from a file name.

    For each marker ("NB" and "F") the name is split on '.', the first piece
    containing the marker is split again on '_', and the first sub-piece
    containing the marker is taken as the token. Every "uq" is removed from
    the NB token.

    Raises:
        IndexError: if no piece of the name contains "NB" or "F".
    """
    def firstTokenWith(marker):
        # first '.'-separated piece mentioning the marker ...
        dottedPiece = [piece for piece in file.split('.') if marker in piece][0]
        # ... narrowed down to its first '_'-separated sub-piece
        return [piece for piece in dottedPiece.split('_') if marker in piece][0]

    barcodeToken = firstTokenWith("NB").replace("uq", "")
    return firstTokenWith("F") + "_" + barcodeToken

In [None]:
# Load the tables that hold read qnames and alignment information.
# Every non-empty .txt file below sampleQnamesDir becomes one DataFrame in
# sampleQnames, keyed by the sample key derived from its file name.

sampleQnamesDir = "../../Data/sampleQnamesData/assignment/"

sampleQnames = {}

for root, dirs, files in os.walk(sampleQnamesDir):
    for tableName in files:
        tablePath = os.path.join(root, tableName)
        # only non-empty .txt files are tables of interest
        if not tableName.endswith('.txt') or os.path.getsize(tablePath) == 0:
            continue
        print(tableName)

        sampleKey = getSampleKeyFromFilename(tableName)
        print(sampleKey)

        # the files are tab separated and have no header row, so the column
        # names are assigned here
        table = pd.read_csv(tablePath, sep='\t', header=None)
        sampleQnames[sampleKey] = table
        table.columns = ["qname", "seqLength", "sampleQnamesChr", "subTeloAlignLength"]


In [None]:
# This section reads the fastq files and attaches the sequence of each read
# to the matching row of its sample's table (new "seq" column). Reads that
# are not listed in the table are ignored; table rows without a matching read
# end up with NaN in "seq" because of the left merge.

fastqReadDir = "../../Data/Final.demultip.tagged.fastq"

for root, dirs, files in os.walk(fastqReadDir):
    for filename in files:
        print(filename)
        # Accept only files that can actually be parsed below. Letting any
        # other ".gz" file through would merge an empty read table and leave
        # the sample with an all-NaN "seq" column.
        isGzipped = filename.endswith("fastq.gz")
        isPlainFastq = filename.endswith(".fastq")
        if not (isGzipped or isPlainFastq) or "AG" in filename:
            continue

        sampleKey = getSampleKeyFromFilename(filename)
        print(sampleKey)

        if sampleKey not in sampleQnames:
            continue
        print(f"Processing {filename}")
        sampleDf = sampleQnames[sampleKey]

        # Build the lookup once per file: a set gives constant-time
        # membership tests instead of rebuilding and scanning a list for
        # every single read.
        wantedQnames = set(sampleDf["qname"])

        file = os.path.join(root, filename)
        print(file)
        # Both variants are read as text; only the opener differs.
        openFastq = gzip.open if isGzipped else open
        with openFastq(file, "rt") as handle:
            qnameTeloValues = [
                [record.id, record.seq]
                for record in SeqIO.parse(handle, "fastq")
                if record.id in wantedQnames
            ]
        qnameTeloValuesDf = pd.DataFrame(qnameTeloValues, columns=["qname", "seq"])
        sampleQnames[sampleKey] = pd.merge(sampleDf, qnameTeloValuesDf, on='qname', how='left')


Processing the data

In [None]:
# For each table in sampleQnames, remove rows with a missing value in the seq
# column. This is because some reads may not have been present in the fastq
# files. The removed rows are collected per sample in sampleQnamesNan, and
# tables that never received a seq column are removed from sampleQnames.
popKeys = []
sampleQnamesNan = {}
for sampleKey, sampleDf in sampleQnames.items():
    if "seq" not in sampleDf.columns:
        popKeys.append(sampleKey)
        continue
    # isna() tests by value, so it also catches NaN objects that are not the
    # np.nan singleton (and None), which an `is np.nan` comparison would miss.
    missingSeq = sampleDf["seq"].isna()
    if not missingSeq.any():
        continue
    sampleQnamesNan[sampleKey] = [row for _, row in sampleDf[missingSeq].iterrows()]
    # One in-place drop for all affected rows instead of one drop per row.
    sampleDf.drop(index=sampleDf.index[missingSeq], inplace=True)

print(popKeys)
for key in popKeys:
    sampleQnames.pop(key)


In [None]:
# Sanity check: look for qnames that occur more than once within a table.
# No output from this cell is good. It means there are no duplicated qnames.
for sampleKey, sampleDf in sampleQnames.items():
    hasDuplicateQname = sampleDf.duplicated(['qname']).any()
    if not hasDuplicateQname:
        continue
    # Show the whole table ordered by qname so that repeated names are adjacent.
    print(sampleKey)
    print(sampleDf.sort_values(by=['qname']))

Run TeloBP on the data

In [None]:
from pandarallel import pandarallel

# Directory for the intermediate per-sample CSV files written by this cell.
outputDir = "output/TeloNPInprogress"

def rowToTeloBP(row):
    """Return the TeloNP boundary for one table row.

    Args:
        row: mapping-like row (e.g. a pandas Series) with a "seq" entry.

    Returns:
        The result of getTeloNPBoundary(row["seq"]), or the sentinel -1000
        when the row has no sequence (None or a float NaN).
    """
    seq = row["seq"]
    # Rows without a sequence should already have been removed, but this guard
    # is kept just in case. The check is made by value rather than with
    # `is np.nan`: a NaN that has been copied or pickled (e.g. when rows are
    # handed to worker processes) is a different object from np.nan, so an
    # identity test would let it through.
    if seq is None or (isinstance(seq, float) and np.isnan(seq)):
        return -1000

    return getTeloNPBoundary(seq)

# The following will multiprocess the rowToTeloBP function.
# pandarallel only has to be initialised once, not once per sample.
pandarallel.initialize(progress_bar=True )

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(outputDir, exist_ok=True)

for sampleKey in sampleQnames.keys():
    sampleDf = sampleQnames[sampleKey]

    sampleDf["teloBPLengths"] = sampleDf.parallel_apply(rowToTeloBP, axis=1)

    # I highly recommend multi-processing, but if you want to single process,
    # comment out the parallel_apply line above and uncomment the following line:
    # sampleDf["teloBPLengths"] = sampleDf.apply(lambda row: rowToTeloBP(row), axis=1)

    # save the output to a csv file.
    # Note that the seq column is removed from the table before saving
    sampleQnames[sampleKey] = sampleDf.drop(columns=["seq"])
    sampleQnames[sampleKey].to_csv(f"{outputDir}/{sampleKey}.csv")


Processing the TeloNP results (much of this code could be reduced to a single cell; it is kept in separate cells for clarity)

In [None]:
# Some input tables carry their "end.aln" values under a prefixed column name,
# like "JH39.1ug.NB88uq.end.aln". This section collapses such columns into a
# single column named "end.aln".

for sampleKey, sampleDf in sampleQnames.items():
    if "end.aln" in sampleDf.columns:
        continue
    # every column whose name mentions end.aln
    alnColumns = [name for name in sampleDf.columns if "end.aln" in name]
    if not alnColumns:
        print(f"No end.aln column found in {sampleKey}")
        continue
    # the row-wise maximum across the matching columns becomes the merged value
    sampleDf["end.aln"] = sampleDf[alnColumns].max(axis=1)
    # the prefixed columns are no longer needed
    sampleDf.drop(columns=alnColumns, inplace=True)

In [None]:
# TeloNP reports the length in bp from the end of the sequence to the telomere
# boundary, so any barcode or telotag sequence is counted as telomere. The
# correction below subtracts "end.aln" plus 10 from that length.

for sampleDf in sampleQnames.values():
    shiftedAln = sampleDf["end.aln"] + 10
    sampleDf["end.aln + 10"] = shiftedAln
    sampleDf["TeloNPCorrectedLength"] = sampleDf["teloBPLengths"] - shiftedAln
    # drop the "Unnamed: 0" column when the table has one
    if "Unnamed: 0" in sampleDf.columns:
        sampleDf.drop(columns=["Unnamed: 0"], inplace=True)

In [None]:
# Keep only the rows whose TeloNPCorrectedLength is greater than 0
# (rows where it is zero, negative or NaN are removed).
for sampleKey in list(sampleQnames):
    correctedLengths = sampleQnames[sampleKey]["TeloNPCorrectedLength"]
    sampleQnames[sampleKey] = sampleQnames[sampleKey].loc[correctedLengths > 0]

In [None]:
# Save the processed files, one CSV per sample key.

outputDir = "output/TeloNPOutput"

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(outputDir, exist_ok=True)

for sampleKey, sampleDf in sampleQnames.items():
    sampleDf.to_csv(f"{outputDir}/{sampleKey}.csv")
