In [1]:
'''
This notebook filters mapped nanoCOP data to remove:
    polyadenylated transcripts,
    7SK transcripts,
    non-unique reads,
    and splicing intermediates
    
    Data generated in Drexler et al. 2019 (GEO accession: GSE123191)
'''

'\nThis notebook filters mapped nanoCOP data to remove:\n    polyadenylated transcripts,\n    7SK transcripts,\n    non-unique reads,\n    and splicing intermeditates\n    \n    Data generated in Drexler et al. 2019 (GEO accession: GSE123191)\n'

In [2]:
import os
import sys
import re
import glob

import pysam
import pybedtools
from pybedtools import BedTool

import numpy as np
import pandas as pd

from plotnine import *
import warnings
warnings.filterwarnings('ignore')

import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42 # export pdfs with editable font types in Illustrator

In [14]:
# Names of the sorted BAM files, one per run accession (SRR...), in a fixed order

sra_runs = (
    'SRR8268942', 'SRR8268943', 'SRR8268944', 'SRR8268945',
    'SRR8268946', 'SRR8268947', 'SRR8268948', 'SRR8268949',
    'SRR8932660', 'SRR8932661', 'SRR8932662', 'SRR8932663',
    'SRR8932664', 'SRR8932665', 'SRR8932666', 'SRR8932667',
    'SRR8932668',
    'SRR10097603', 'SRR10097604', 'SRR10097605', 'SRR10097606',
    'SRR10097607',
)

# Every file name is "<run>_sorted.bam"
bamFiles = ['{}_sorted.bam'.format(run) for run in sra_runs]

In [4]:
# Define a function for filtering non-unique readnames from each data file
def filter_nonunique_reads(bed_file):
    """Write a copy of a BED12 file keeping only reads whose name occurs once.

    The input is read as a headerless, tab-separated table with the twelve
    BED12 columns. Every row whose ``name`` appears on more than one line is
    dropped (all occurrences, not only the repeats). The remaining rows are
    written in their original order to ``<stem>_unique<ext>``, where
    ``<stem>``/``<ext>`` come from splitting ``bed_file`` with
    ``os.path.splitext``.

    Parameters
    ----------
    bed_file : str
        Path of the BED12 file to filter.

    Returns
    -------
    str
        Path of the filtered file that was written.
    """
    bed12_columns = ['chr', 'start', 'end', 'name', 'score', 'strand',
                     'thickStart', 'thickEnd', 'itemRgb', 'blockCount',
                     'blockSizes', 'blockStarts']

    stem, ext = os.path.splitext(bed_file)
    unique_path = "{name}_{id}{ext}".format(name=stem, id='unique', ext=ext)

    try:
        # Read every field as text (no NA conversion) so values are written
        # back exactly as they appear in the input.
        all_data = pd.read_csv(bed_file, sep='\t', names=bed12_columns,
                               dtype=str, na_filter=False)
    except pd.errors.EmptyDataError:
        # A file without any records has nothing to filter; emit an empty output.
        all_data = pd.DataFrame(columns=bed12_columns)

    # keep=False flags every member of a repeated name, so negating the mask
    # leaves only the rows whose read name is seen exactly once.
    data_unique = all_data[~all_data['name'].duplicated(keep=False)]

    # save unique reads to a new file
    data_unique.to_csv(unique_path,
                       sep='\t',
                       index=False,
                       columns=bed12_columns,
                       header=False)
    return unique_path

In [7]:
# Write a BED12 file next to every *_sorted.bam found in the working directory
sortedBAM = glob.glob('./*_sorted.bam')

for bam_path in sortedBAM:
    # Same path as the BAM, with the extension swapped for .bed
    bed_path = os.path.splitext(bam_path)[0] + '.bed'
    pybedtools.BedTool(bam_path).bam_to_bed(bed12=True).saveas(bed_path)

In [8]:
# For every *_sorted.bed file, write a copy without reads whose name is repeated
BED = glob.glob('./*_sorted.bed')

for bed_path in BED:
    filter_nonunique_reads(bed_path)

In [12]:
# Count the number of reads in each file along the way

def count_bed_records(bed_path):
    """Return the number of lines in ``bed_path`` (one BED record per line)."""
    with open(bed_path) as handle:
        # Stream the file instead of loading every line into memory
        return sum(1 for _ in handle)


def sample_rank(path, ordered_bams):
    """Sort key that places ``path`` at the position of its sample in ``ordered_bams``.

    The sample is the part of the file name before ``_sorted``. Files whose
    sample is not listed in ``ordered_bams`` sort after all listed ones, by name.
    """
    stem = os.path.basename(path).split('_sorted')[0]
    listed = [name.split('_sorted')[0] for name in ordered_bams]
    position = listed.index(stem) if stem in listed else len(listed)
    return (position, stem)


# glob returns files in arbitrary order. Both listings are ordered by bamFiles
# (defined in an earlier cell) so that position i in each list refers to the
# same sample as bamFiles[i] whenever every listed file is present.
input_count = []
for bam_path in sorted(glob.glob('./*_sorted.bam'),
                       key=lambda path: sample_rank(path, bamFiles)):
    # Context manager closes the BAM handle once it has been counted
    with pysam.AlignmentFile(bam_path, "rb") as samfile:
        input_count.append(samfile.count())

unique_reads_count = [
    count_bed_records(bed_path)
    for bed_path in sorted(glob.glob('./*_sorted_unique.bed'),
                           key=lambda path: sample_rank(path, bamFiles))
]

In [15]:
# Make a table of read counts that are filtered at each step

# zip() stops at the shortest input, so a length mismatch would silently drop
# samples or pair counts with the wrong label. Report it instead of hiding it.
if not (len(bamFiles) == len(input_count) == len(unique_reads_count)):
    print('WARNING: sample list and count lists differ in length '
          '(samples={}, mapped={}, unique={}); rows may be missing or mislabeled'
          .format(len(bamFiles), len(input_count), len(unique_reads_count)))

# NOTE: the 'Non-unique Reads' column holds the line counts of the
# *_sorted_unique.bed files, i.e. the reads left after non-unique read names
# were removed. The label is kept because later cells index by it.
count_rows = list(zip(bamFiles, input_count, unique_reads_count))
counts_df = pd.DataFrame(count_rows,
                         columns=['Sample', 'Mapped', 'Non-unique Reads'])

# Tab-separated despite the .csv extension
counts_df.to_csv('filtering_stats.csv',
                 sep='\t',
                 index=True,
                 header=True)

In [16]:
# Add a row with column totals

# To start from the saved table instead, reload 'filtering_stats.csv'
# (tab-delimited, first column is the index) into counts_df before this step.
count_columns = ['Mapped', 'Non-unique Reads']

# Leave out an existing 'Total' row so that re-running this cell does not add
# the previous total into the new one.
per_sample = counts_df.drop(index='Total', errors='ignore')

# Sum only the numeric count columns; the Sample label is set explicitly.
total_row = {'Sample': 'Total'}
total_row.update(per_sample[count_columns].sum().to_dict())
counts_df.loc['Total'] = pd.Series(total_row)

# Print a report on the number of reads filtered at each step
# (only the mapped and unique-read counts are tallied in this table)

mapped = counts_df.loc['Total', 'Mapped']
# Despite its name, this is the count taken from the *_unique.bed files
non_unique = counts_df.loc['Total', 'Non-unique Reads']

print('Number of mapped reads is: ' + str(mapped))
print('Percent of total reads filtered is: ' + str(((mapped-non_unique)/mapped)*100))

Number of mapped reads is: 11681738
Percent of total reads filtered is: 59.895976095337865


In [17]:
# Display the counts table
counts_df

Unnamed: 0,Sample,Mapped,Non-unique Reads
0,SRR8268942_sorted.bam,823145,668041
1,SRR8268943_sorted.bam,940505,612588
2,SRR8268944_sorted.bam,613955,488365
3,SRR8268945_sorted.bam,1096080,114908
4,SRR8268946_sorted.bam,1419783,88316
5,SRR8268947_sorted.bam,726363,419802
6,SRR8268948_sorted.bam,603698,235384
7,SRR8268949_sorted.bam,508999,310172
8,SRR8932660_sorted.bam,528121,264930
9,SRR8932661_sorted.bam,531011,235505


In [18]:
# Reshape the counts table to long format (one row per sample and count type) for plotting
count_columns = ['Mapped', 'Non-unique Reads']
df = counts_df.melt(id_vars=['Sample'], value_vars=count_columns)

# Categorical copy of 'variable' with an explicit category order
variable_cat = pd.Categorical(df['variable'], categories=count_columns)
df = df.assign(variable_cat=variable_cat)
df

Unnamed: 0,Sample,variable,value,variable_cat
0,SRR8268942_sorted.bam,Mapped,823145,Mapped
1,SRR8268943_sorted.bam,Mapped,940505,Mapped
2,SRR8268944_sorted.bam,Mapped,613955,Mapped
3,SRR8268945_sorted.bam,Mapped,1096080,Mapped
4,SRR8268946_sorted.bam,Mapped,1419783,Mapped
5,SRR8268947_sorted.bam,Mapped,726363,Mapped
6,SRR8268948_sorted.bam,Mapped,603698,Mapped
7,SRR8268949_sorted.bam,Mapped,508999,Mapped
8,SRR8932660_sorted.bam,Mapped,528121,Mapped
9,SRR8932661_sorted.bam,Mapped,531011,Mapped


In [20]:
# Bar chart of the count values, one panel per sample, saved as a PDF
bar_mapping = aes(x='variable_cat', y='value', fill='variable')

plt = ggplot(data=df, mapping=bar_mapping)
plt = plt + geom_bar(stat='identity', position='dodge')
plt = plt + facet_wrap('Sample', scales='free_y')  # independent y axis per panel
plt = plt + theme_classic()
plt = plt + theme(subplots_adjust={'wspace': 0.8})
plt = plt + theme(axis_text_x=element_text(rotation=45, hjust=1))

plt.save(filename='filtering_counts.pdf')