In [None]:
'''
This notebook filters mapped HBB-targeted LRS data to remove:
    polyadenylated transcripts,
    non-unique reads,
    and splicing intermediates
'''

In [1]:
import os
import sys
import re
import glob

import pysam
import pybedtools
from pybedtools import BedTool

import numpy as np
import pandas as pd

from plotnine import *
import warnings
warnings.filterwarnings('ignore')

import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42 # export pdfs with editable font types in Illustrator

In [2]:
# Input SAM files used for filtering: every tLRS sample, once per mapping
# target (file-name suffix _GLOBE or _mm10)
_TLRS_DIR = '../../0_mapped_data/HBB_tLRS/'
_TLRS_SAMPLES = ('HBB1', 'HBB2', 'HBB3', 'IVS1', 'IVS2', 'IVS3')

samFiles_GLOBE = [_TLRS_DIR + sample + '_GLOBE.sam' for sample in _TLRS_SAMPLES]

samFiles_mm10 = [_TLRS_DIR + sample + '_mm10.sam' for sample in _TLRS_SAMPLES]

# BED annotation of the 5' splice site (5'SS) coordinates of the GLOBE and mm10 b-globin introns
introns_5SS = '../../annotation_files/GLOBE_mm10_HBB_introns_5SS.bed'
# Wrap the annotation in a BedTool once, at module level: filter_splicing_intermediates()
# reads this global when it intersects read 3' ends with the 5'SS intervals
introns_5SS_bedtool = pybedtools.BedTool(introns_5SS)

In [3]:
# Define a function that splits tLRS reads mapped to the GLOBE HBB locus into polyadenylated and non-polyadenylated SAM files (readthrough and chimeric reads are discarded)

def filter_polyA_GLOBE(mapped_reads_file):
    """Split GLOBE-mapped tLRS alignments into poly(A)+ and poly(A)- SAM files.

    Reads ``mapped_reads_file`` (SAM text) and writes two SAM files into the
    current working directory, named after the basename of the input:

    * ``<name>_polyA_softclipped<ext>``: reads whose leading soft clip starts
      inside the poly(A)-site window and is T-rich.
    * ``<name>_polyAfiltered_softclipped<ext>``: every other read that is kept,
      including all reads without a leading soft clip.

    Header lines (starting with ``@``) are copied to both outputs and blank
    lines are skipped. Reads classified as readthrough or chimeric are written
    to neither file.

    The classification only looks at a soft clip that is the FIRST CIGAR
    operation. Per the original analysis notes, all reads are on the - strand,
    so the read's 3' end (and any poly(A) tail, seen as a run of T) is at the
    left end of the alignment.

    Parameters
    ----------
    mapped_reads_file : str
        Path to the input SAM file.

    Returns
    -------
    None
    """
    # Reference coordinates carried over unchanged from the original analysis.
    insert_boundary = 2448            # per the original comments, reads ending below this run past the HBB insertion -- TODO confirm
    polyA_window = range(4426, 4480)  # clip positions counted as "at the canonical poly(A) site"

    def output_name(tag):
        # Only the basename is used, so outputs land in the working directory.
        name, ext = os.path.splitext(os.path.basename(mapped_reads_file))
        return "{name}_{id}{ext}".format(name=name, id=tag, ext=ext)

    # One `with` for all three handles so the outputs are closed even if a
    # malformed record raises part-way through the file.
    with open(mapped_reads_file, 'r') as f, \
            open(output_name('polyAfiltered_softclipped'), 'w') as output_pAfiltered, \
            open(output_name('polyA_softclipped'), 'w') as output_pA:

        for line in f:
            line = line.strip('\n')
            if not line:  # tolerate blank lines (e.g. a trailing newline)
                continue
            if line[0] == '@':  # write header lines into both output files
                output_pAfiltered.write(line + '\n')
                output_pA.write(line + '\n')
                continue

            col = line.split('\t')
            cigar = col[5]

            # Anchored match: only a soft clip that is the first CIGAR operation
            # counts. This also copes with operators outside A-Z such as '='.
            leading_clip = re.match(r'(\d+)S', cigar)
            if leading_clip is None:
                output_pAfiltered.write(line + '\n')
                continue

            length_clipped = int(leading_clip.group(1))
            clipped_bases = col[9][:length_clipped]      # sequence of the clipped bases
            first100 = col[9][:100]                      # 100 leftmost bases of the read
            read_end = int(col[3]) - length_clipped      # leftmost coordinate including the clip
            clip_start = int(col[3]) - 1                 # position just left of the aligned part
            at_polyA_site = clip_start in polyA_window

            t_count = clipped_bases.count('T')
            # Guard the ratio so an empty clip cannot divide by zero.
            t_fraction = t_count / len(clipped_bases) if clipped_bases else 0.0

            if read_end < insert_boundary and first100.count("T") >= 70:
                # readthrough read with a polyT tail: discard
                continue
            elif at_polyA_site and (t_fraction >= 0.7 or t_count >= 4):
                # polyT tail at the canonical site; the >= 4 T rule keeps very short tails
                output_pA.write(line + '\n')
            elif read_end > insert_boundary and length_clipped >= 20:
                # long unexplained clip within the GLOBE sequence: chimeric / odd mapping, discard
                continue
            elif read_end < insert_boundary and clip_start > insert_boundary:
                # clip spans the boundary: other long chimeric reads, discard
                continue
            else:
                output_pAfiltered.write(line + '\n')

In [4]:
# Define a function that splits tLRS reads mapped to the mm10 Hbb locus into polyadenylated and non-polyadenylated SAM files (reads with other long soft clips are discarded)

def filter_polyA_mm10(mapped_reads_file):
    """Split mm10-mapped tLRS alignments into poly(A)+ and poly(A)- SAM files.

    Reads ``mapped_reads_file`` (SAM text) and writes two SAM files into the
    current working directory, named after the basename of the input:

    * ``<name>_polyA<ext>``: reads whose leading soft clip is T-rich and starts
      inside the poly(A)-site window, plus reads that end below the gene-end
      coordinate with a T-rich start (rare downstream poly(A) site use, per
      the original comments).
    * ``<name>_polyAfiltered<ext>``: every other read that is kept, including
      all reads without a leading soft clip.

    Header lines (starting with ``@``) are copied to both outputs and blank
    lines are skipped. Reads with a long (>= 20 nt) leading clip above the
    gene-end coordinate are written to neither file.

    The classification only looks at a soft clip that is the FIRST CIGAR
    operation. Per the original analysis notes, all reads are on the - strand,
    so a poly(A) tail appears as a run of T at the left end of the read.

    Parameters
    ----------
    mapped_reads_file : str
        Path to the input SAM file.

    Returns
    -------
    None
    """
    # Reference coordinates carried over unchanged from the original analysis.
    gene_end = 103826523                        # per the original comments, end of the gene -- TODO confirm
    polyA_window = range(103826506, 103826556)  # clip positions counted as "at the canonical poly(A) site"

    def output_name(tag):
        # Only the basename is used, so outputs land in the working directory.
        name, ext = os.path.splitext(os.path.basename(mapped_reads_file))
        return "{name}_{id}{ext}".format(name=name, id=tag, ext=ext)

    # One `with` for all three handles so the outputs are closed even if a
    # malformed record raises part-way through the file.
    with open(mapped_reads_file, 'r') as f, \
            open(output_name('polyAfiltered'), 'w') as output_pAfiltered, \
            open(output_name('polyA'), 'w') as output_pA:

        for line in f:
            line = line.strip('\n')
            if not line:  # tolerate blank lines (e.g. a trailing newline)
                continue
            if line[0] == '@':  # write header lines into both output files
                output_pAfiltered.write(line + '\n')
                output_pA.write(line + '\n')
                continue

            col = line.split('\t')
            cigar = col[5]

            # Anchored match: only a soft clip that is the first CIGAR operation
            # counts. This also copes with operators outside A-Z such as '='.
            leading_clip = re.match(r'(\d+)S', cigar)
            if leading_clip is None:
                output_pAfiltered.write(line + '\n')
                continue

            length_clipped = int(leading_clip.group(1))
            clipped_bases = col[9][:length_clipped]      # sequence of the clipped bases
            first100 = col[9][:100]                      # 100 leftmost bases of the read
            read_end = int(col[3]) - length_clipped      # leftmost coordinate including the clip
            clip_start = int(col[3]) - 1                 # position just left of the aligned part
            at_polyA_site = clip_start in polyA_window

            t_count = clipped_bases.count('T')
            # Guard the ratio so an empty clip cannot divide by zero.
            t_fraction = t_count / len(clipped_bases) if clipped_bases else 0.0

            if at_polyA_site and (t_fraction >= 0.7 or t_count >= 4):
                # polyT tail at the canonical site; the >= 4 T rule keeps very short tails
                output_pA.write(line + '\n')
            elif read_end < gene_end and first100.count("T") >= 70:
                # T-rich read extending past the gene end: rare downstream poly(A) site
                output_pA.write(line + '\n')
            elif read_end > gene_end and length_clipped >= 20:
                # any other long mismatch within the Hbb sequence: discard
                continue
            else:
                output_pAfiltered.write(line + '\n')

In [5]:
# Define a function to fill in soft-clipped bases past the end of the GLOBE annotation for reads mapped to GLOBE
def fill_soft_clipping(sam_file):
    """Rewrite leading soft clips as aligned bases and save a ``*_filled`` SAM.

    For every alignment whose CIGAR starts with ``<n>S<m>M``, the clip is
    merged into the following match (``<n+m>M`` plus the untouched remainder of
    the CIGAR) and POS is moved left by ``n``. All other records, and all
    header lines, are copied unchanged. Blank lines are skipped.

    The output is written next to the input as ``<name>_filled<ext>``.

    Notes
    -----
    * Rewritten records contain only the 11 mandatory SAM fields; any optional
      tag fields on those records are not written (unchanged records keep
      theirs).
    * A record is left unchanged if shifting POS by the clip length would give
      a position below 1, since that is not a valid mapped position in SAM.

    Parameters
    ----------
    sam_file : str
        Path to the input SAM file.

    Returns
    -------
    None
    """
    name, ext = os.path.splitext(sam_file)
    filled_path = "{name}_{id}{ext}".format(name=name, id='filled', ext=ext)

    # Both handles share one `with` so the output is closed on any error.
    with open(sam_file, 'r') as f, open(filled_path, 'w') as output_filled:
        for line in f:
            line = line.strip('\n')
            if not line:  # tolerate blank lines (e.g. a trailing newline)
                continue
            if line[0] == '@':  # write header lines into output file
                output_filled.write(line + "\n")
                continue

            col = line.split('\t')
            cigar = col[5]

            # Anchored match for "first operation is S, second is M". Unlike
            # indexing the list of A-Z operators, this cannot fail on CIGARs
            # with a single letter operator or with '=' operations.
            clip_then_match = re.match(r'(\d+)S(\d+)M', cigar)
            if clip_then_match is None:
                output_filled.write(line + "\n")
                continue

            length_first_S = int(clip_then_match.group(1))
            length_first_M = int(clip_then_match.group(2))
            new_POS = int(col[3]) - length_first_S
            if new_POS < 1:
                # Clip runs off the start of the reference: keep the record as mapped.
                output_filled.write(line + "\n")
                continue

            remainder = cigar[clip_then_match.end():]
            new_cigar = str(length_first_S + length_first_M) + "M" + remainder

            # Only the mandatory columns are emitted for rewritten records.
            fields = col[:3] + [str(new_POS), col[4], new_cigar] + col[6:11]
            output_filled.write("\t".join(fields) + "\n")

In [6]:
# Define a function for filtering non-unique readnames from each data file
def filter_nonunique_reads(bed_file):
    """Write a copy of a BED12 file keeping only reads whose name occurs once.

    The input is read as a headerless, tab-separated BED12 table. Every row
    whose ``name`` value appears more than once in the file is dropped, and the
    remaining rows are saved next to the input as ``<name>_unique<ext>`` in
    the same 12-column, headerless format.

    Parameters
    ----------
    bed_file : str
        Path to the BED12 file.

    Returns
    -------
    None
    """
    bed12_columns = ['chr', 'start', 'end', 'name', 'score', 'strand',
                     'thickStart', 'thickEnd', 'itemRgb', 'blockCount',
                     'blockSizes', 'blockStarts']

    stem, suffix = os.path.splitext(bed_file)
    unique_path = "{name}_{id}{ext}".format(name=stem, id='unique', ext=suffix)

    reads = pd.read_csv(bed_file, delimiter='\t', names=bed12_columns)

    # Count how many rows carry each read name, then keep the names seen exactly once
    name_counts = reads['name'].value_counts()
    singleton_names = name_counts[name_counts == 1].index

    singletons = reads[reads['name'].isin(singleton_names)]

    # save unique reads to a new file
    singletons.to_csv(unique_path,
                      sep='\t',
                      index=False,
                      columns=bed12_columns,
                      header=False)

In [7]:
# Define a function for filtering splicing intermediates from each data file
def filter_splicing_intermediates(bed_file):
    """Split a BED12 file into splicing intermediates and all other reads.

    For each read a 1-bp interval at its 3' end is built (strand-aware:
    ``[end-1, end)`` on ``+``, ``[start, start+1)`` on ``-``) and intersected
    with the module-level ``introns_5SS_bedtool`` annotation. Reads whose 3'
    end overlaps an annotated interval are written to
    ``<name>_splicing_int<ext>``; all others go to
    ``<name>_no_splicing_int<ext>``. Both outputs are headerless BED12 with the
    original start/end coordinates restored.

    Intermediate files ``tmp.bed``, ``tmp_splicing_int.bed`` and
    ``tmp_no_splicing_int.bed`` are created in the current working directory
    and removed again, also when a step fails.

    Parameters
    ----------
    bed_file : str
        Path to a headerless, tab-separated BED12 file.

    Returns
    -------
    None
    """
    bed12_columns = ['chr', 'start', 'end', 'name', 'score', 'strand',
                     'thickStart', 'thickEnd', 'itemRgb', 'blockCount',
                     'blockSizes', 'blockStarts']
    # Temp layout: 3'-end interval in the coordinate columns, with the original
    # start/end carried along as two extra trailing columns.
    tmp_columns = ['chr', 'newStart', 'newEnd'] + bed12_columns[3:] + ['start', 'end']

    name, ext = os.path.splitext(bed_file)
    splicing_int_path = "{name}_{id}{ext}".format(name=name, id='splicing_int', ext=ext)
    no_splicing_int_path = "{name}_{id}{ext}".format(name=name, id='no_splicing_int', ext=ext)

    tmp_all = 'tmp.bed'
    tmp_hits = 'tmp_splicing_int.bed'
    tmp_misses = 'tmp_no_splicing_int.bed'

    # first open and reorder coordinates of bed file to put 3'end in position for intersection
    data = pd.read_csv(bed_file, delimiter='\t', names=bed12_columns)
    plus = data['strand'] == '+'
    minus = data['strand'] == '-'
    data.loc[plus, 'newStart'] = data['end'] - 1
    data.loc[minus, 'newStart'] = data['start']
    data.loc[plus, 'newEnd'] = data['end']
    data.loc[minus, 'newEnd'] = data['start'] + 1

    # Masked assignment creates float columns; convert back to integers.
    # A row with a strand other than '+'/'-' stays NaN and makes this raise.
    data['newStart'] = data['newStart'].astype(np.int64)
    data['newEnd'] = data['newEnd'].astype(np.int64)

    try:
        # save a temporary bed file with data 3'end coordinates
        data.to_csv(tmp_all,
                    sep='\t',
                    index=False,
                    columns=tmp_columns,
                    header=False)

        # A file-backed BedTool can be reused for both intersections and
        # leaves no Python file handle open.
        reads_3end = pybedtools.BedTool(tmp_all)
        reads_3end.intersect(introns_5SS_bedtool, u=True).saveas(tmp_hits)
        reads_3end.intersect(introns_5SS_bedtool, v=True).saveas(tmp_misses)

        # restore the original coordinate columns and write the final BED12 files
        for tmp_path, out_path in ((tmp_hits, splicing_int_path),
                                   (tmp_misses, no_splicing_int_path)):
            subset = pd.read_csv(tmp_path, delimiter='\t', names=tmp_columns)
            subset.to_csv(out_path,
                          sep='\t',
                          index=False,
                          columns=bed12_columns,
                          header=False)
    finally:
        # clean up temp files, whether or not every step succeeded
        for tmp_path in (tmp_all, tmp_misses, tmp_hits):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

In [8]:
# Filter polyadenylated reads: run each SAM file through the filter that
# matches the reference it was mapped to (GLOBE files first, then mm10)
for polyA_filter, sam_paths in ((filter_polyA_GLOBE, samFiles_GLOBE),
                                (filter_polyA_mm10, samFiles_mm10)):
    for file in sam_paths:
        polyA_filter(file)

In [9]:
# Fill in soft-clipped bases for every *_softclipped.sam file in the working
# directory (the polyA and polyAfiltered outputs of the GLOBE filter)
soft_clipped = list(glob.glob('./*_softclipped.sam'))

for file in soft_clipped:
    fill_soft_clipping(file)

In [10]:
# Convert SAM files to BAM files for further filtering.
# Patterns are globbed in this order: GLOBE polyA, GLOBE polyAfiltered,
# mm10 polyA, mm10 polyAfiltered.
SAM = [
    path
    for pattern in ('./*_GLOBE_polyA_softclipped_filled.sam',
                    './*_GLOBE_polyAfiltered_softclipped_filled.sam',
                    './*_mm10_polyA.sam',
                    './*_mm10_polyAfiltered.sam')
    for path in glob.glob(pattern)
]

for samfile in SAM:
    # same path with the extension swapped for .bam
    bamfile = os.path.splitext(samfile)[0] + '.bam'
    pysam.view('-S', '-b', '-o', bamfile, samfile, catch_stdout=False)

In [11]:
# Sort and index BAM files.
# Files that already end in _sorted.bam are skipped, so re-running this cell
# does not produce *_sorted_sorted.bam copies of an earlier run's output.
BAM = [path for path in glob.glob('./*.bam') if not path.endswith('_sorted.bam')]

for bamfile in BAM:
    name, ext = os.path.splitext(bamfile)
    bamfileSorted = "{name}_{id}{ext}".format(name=name, id='sorted', ext=ext)
    pysam.sort('-o', bamfileSorted, bamfile, catch_stdout=False)
    pysam.index(bamfileSorted, catch_stdout=False)

# Convert BAM files to BED12
sortedBAM = glob.glob('./*_sorted.bam')

for file in sortedBAM:
    name, ext = os.path.splitext(file)
    bedfile = "{name}{ext}".format(name=name, ext='.bed')

    # bed12=True keeps the block (exon) structure of each alignment
    pybedtools.BedTool(file).bam_to_bed(bed12=True).saveas(bedfile)

In [12]:
# Filter non-unique reads from BED12 files.
# The first pattern in each pair matches the mm10 file names, the second the
# GLOBE (soft-clip filled) file names.
BED_polyA = (glob.glob('./*_polyA_sorted.bed')
             + glob.glob('./*_polyA_softclipped_filled_sorted.bed'))

BED_polyAfiltered = (glob.glob('./*_polyAfiltered_sorted.bed')
                     + glob.glob('./*_polyAfiltered_softclipped_filled_sorted.bed'))

for file in BED_polyA + BED_polyAfiltered:
    filter_nonunique_reads(file)

In [13]:
# Filter splicing intermediates from the unique, polyA-filtered BED12 files
# (mm10 file names first, then the GLOBE soft-clip filled file names)
BED_unique = (glob.glob('./*_polyAfiltered_sorted_unique.bed')
              + glob.glob('./*_polyAfiltered_softclipped_filled_sorted_unique.bed'))

for file in BED_unique:
    filter_splicing_intermediates(file)

chr7	103827783	103827784	intron_1_intermediate	-	0

chr7	103827783	103827784	intron_1_intermediate	-	0

chr7	103827783	103827784	intron_1_intermediate	-	0

chr7	103827783	103827784	intron_1_intermediate	-	0

chr7	103827783	103827784	intron_1_intermediate	-	0

chr7	103827783	103827784	intron_1_intermediate	-	0

chr7	103827783	103827784	intron_1_intermediate	-	0

chr7	103827783	103827784	intron_1_intermediate	-	0

chr7	103827783	103827784	intron_1_intermediate	-	0

chr7	103827783	103827784	intron_1_intermediate	-	0

chr7	103827783	103827784	intron_1_intermediate	-	0

chr7	103827783	103827784	intron_1_intermediate	-	0

chr7	103827783	103827784	intron_1_intermediate	-	0

chr7	103827783	103827784	intron_1_intermediate	-	0

chr7	103827783	103827784	intron_1_intermediate	-	0

chr7	103827783	103827784	intron_1_intermediate	-	0

chr7	103827783	103827784	intron_1_intermediate	-	0

chr7	103827783	103827784	intron_1_intermediate	-	0

chr7	103827783	103827784	intron_1_intermediate	-	0

chr7	1038277

In [14]:
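# TODO: update the glob patterns below so they also match the GLOBE '*_softclipped_filled' file names.
# The counting code is commented out until then, but the read-count table cell further down
# still needs the variables it defines (samFiles, input_count, polyA_count, ...).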

# # Count the number of reads in each file along the way

# samFiles = []
# for file in glob.glob('./*_GLOBE.sam'):
#     samFiles.append(file)
# for file in glob.glob('./*_mm10.sam'):
#     samFiles.append(file)

# input_count = []
# for file in samFiles:
#     samfile = pysam.AlignmentFile(file, "rb")
#     count = samfile.count()
#     input_count.append(count)
    
# polyA_filtered_count = []
# for file in glob.glob('./*_polyAfiltered.sam'):
#     samfile = pysam.AlignmentFile(file, "rb")
#     count = samfile.count()
#     polyA_filtered_count.append(count)
    
# polyA_count = []
# for file in glob.glob('./*_polyA.sam'):
#     samfile = pysam.AlignmentFile(file, "rb")
#     count = samfile.count()
#     polyA_count.append(count)
        
# polyA_unique_reads_count = []
# for file in glob.glob('./*_polyA_sorted_unique.bed'):
#     count = len(open(file).readlines())
#     polyA_unique_reads_count.append(count)
    
# polyAfiltered_unique_reads_count = []
# for file in glob.glob('./*_polyAfiltered_sorted_unique.bed'):
#     count = len(open(file).readlines())
#     polyAfiltered_unique_reads_count.append(count)
    
# no_splicing_int_count = []  
# for file in glob.glob('./*_no_splicing_int.bed'):
#     count = len(open(file).readlines())
#     no_splicing_int_count.append(count)
    
# splicing_int_count = []
# for file in glob.glob('./*_unique_splicing_int.bed'):
#     count = len(open(file).readlines())
#     splicing_int_count.append(count)

FileNotFoundError: [Errno 2] could not open alignment file `./HBB1_GLOBE_polyAfiltered.sam`: No such file or directory

In [90]:
# Make a table of read counts that are filtered at each step.
# NOTE(review): samFiles and the *_count lists used here are only assigned in
# the commented-out counting cell, so this cell fails on a fresh kernel until
# that cell is restored.
counts_df = pd.DataFrame(
    list(zip(samFiles,
             input_count,
             polyA_count,
             polyA_filtered_count,
             polyA_unique_reads_count,
             polyAfiltered_unique_reads_count,
             splicing_int_count,
             no_splicing_int_count)),
    columns=['Sample', 'Mapped', 'PolyA (+)', 'PolyA (-)', 'PolyA (+) Unique Reads', 'PolyA (-) Unique Reads', 'Splicing Intermediates', 'Non-Intermediates'])

# Add a row with column totals
counts_df.loc['Total'] = counts_df.sum()
# Summing the 'Sample' column concatenates the file names, so overwrite that
# cell. A single .loc call is used because chained indexing
# (counts_df['Sample']['Total'] = ...) may assign to a temporary copy.
counts_df.loc['Total', 'Sample'] = 'Total'
counts_df

Unnamed: 0,Sample,Mapped,PolyA (+),PolyA (-),PolyA (+) Unique Reads,PolyA (-) Unique Reads,Splicing Intermediates,Non-Intermediates
0,./HBB1_GLOBE.sam,24983,14450,9850,14450,9746,1122,8624
1,./IVS1_GLOBE.sam,25817,13258,9340,13258,9249,1071,8178
2,./IVS2_GLOBE.sam,26744,12741,9325,12741,9256,1088,8168
3,./HBB2_GLOBE.sam,23218,10871,14318,10871,14175,1326,12849
4,./HBB3_GLOBE.sam,22670,12710,13474,12710,13354,1256,12098
5,./IVS3_GLOBE.sam,26387,12275,13507,12275,13380,1403,11977
6,./HBB1_mm10.sam,24332,17215,7038,17215,6559,1687,4872
7,./HBB3_mm10.sam,21841,15104,6792,15104,6335,1573,4762
8,./IVS1_mm10.sam,26437,15307,6481,15307,6144,1680,4464
9,./IVS2_mm10.sam,26227,16009,10343,16009,9948,1215,8733


In [87]:
# Melt counts table from wide to long format for plotting
stage_order = ['Mapped', 'PolyA (+)', 'PolyA (-)', 'PolyA (+) Unique Reads', 'PolyA (-) Unique Reads', 'Splicing Intermediates', 'Non-Intermediates']

df = (
    pd.melt(counts_df, id_vars=['Sample'], value_vars=stage_order)
    # categorical copy of 'variable' whose category order controls the plotting order
    .assign(variable_cat=lambda long_counts: pd.Categorical(long_counts['variable'], categories=stage_order))
)

Unnamed: 0,Sample,variable,value,variable_cat
0,./HBB1_GLOBE.sam,Mapped,24983,Mapped
1,./IVS1_GLOBE.sam,Mapped,25817,Mapped
2,./IVS2_GLOBE.sam,Mapped,26744,Mapped
3,./HBB2_GLOBE.sam,Mapped,23218,Mapped
4,./HBB3_GLOBE.sam,Mapped,22670,Mapped
...,...,...,...,...
86,./IVS1_mm10.sam,Non-Intermediates,4464,Non-Intermediates
87,./IVS2_mm10.sam,Non-Intermediates,8733,Non-Intermediates
88,./HBB2_mm10.sam,Non-Intermediates,8737,Non-Intermediates
89,./IVS3_mm10.sam,Non-Intermediates,9007,Non-Intermediates


In [89]:
# Plot count values across all samples: one facet per sample, one bar per
# filtering stage. The name `plt` is kept because the save cell calls plt.save().
plt = (
    ggplot(data=df, mapping=aes(x='variable_cat', y='value', fill='variable'))
    + geom_bar(stat='identity', position='dodge')
    + facet_wrap('Sample', scales='free_y')
    + theme_classic()
    + theme(subplots_adjust={'wspace': 0.8})
    + theme(axis_text_x=element_text(rotation=45, hjust=1))
)
plt


In [None]:
# Save the figure as PDF and the counts table as a tab-separated file.
# NOTE(review): the file names say "tLSR" while the rest of the notebook says "tLRS" -- confirm before renaming.
plt.save(filename='tLSR_filtering_counts.pdf')
counts_df.to_csv('tLSR_filtering_stats.csv', sep='\t', index=True, header=True)