In [1]:
import pandas as pd
import sys, os
import statistics

In [2]:
def process_raw_count(fn):
    """Load a raw-count table and compute per-reporter isoform fractions.

    The tab-separated file ``fn`` is read, restricted to rows whose
    ``best_category`` contains ``full_length`` or ``spliced``, filtered for
    read depth, and annotated with two new columns:

    * ``fraction`` -- the row's ``count`` divided by the summed ``count`` of
      all retained rows sharing the same ``reporter``.
    * ``RE`` -- the part of ``reporter`` before the first underscore.

    Parameters
    ----------
    fn : str or path-like
        Tab-separated file providing at least the columns ``reporter``,
        ``best_category`` and ``count``.

    Returns
    -------
    pandas.DataFrame
        The retained rows with a fresh 0..n-1 index and the added
        ``fraction`` and ``RE`` columns.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    IndexError
        If a category containing ``spliced`` has no ``-`` separator.
    """
    df = pd.read_csv(fn, sep='\t')

    # Only include full length and spliced reads. na=False treats rows with a
    # missing category as non-matching instead of producing an NaN mask that
    # cannot be used for boolean indexing.
    wanted = df['best_category'].str.contains('full_length|spliced', na=False)
    df = df[wanted].reset_index(drop=True)

    # For spliced categories keep only the token after the first '-'.
    df['best_category'] = df['best_category'].apply(
        lambda g: g.split('-')[1] if 'spliced' in g else g
    )

    # Filter out reporters (barcodes) whose total read count is 10 or fewer.
    total_count = df.groupby('reporter')['count'].sum()
    reporter_low_count = total_count[total_count <= 10].index
    filtered_df = df[~df['reporter'].isin(reporter_low_count)].reset_index(drop=True)

    # Calculate the fraction of each isoform within its reporter.
    # transform('sum') returns a Series aligned with filtered_df's own index,
    # whereas groupby(...).apply(lambda g: g / g.sum()) prepends the group key
    # to the index on pandas >= 2.0 and can no longer be assigned as a column.
    reporter_total = filtered_df.groupby('reporter')['count'].transform('sum')
    filtered_df['fraction'] = filtered_df['count'] / reporter_total
    filtered_df['RE'] = filtered_df['reporter'].apply(lambda g: g.split('_')[0])

    return filtered_df

def filter_barcode_count(df, threshold):
    """Drop REs that have too few ``full_length`` rows.

    The rows of ``df`` whose ``best_category`` equals ``'full_length'`` are
    counted per ``RE``. Every RE whose count is less than or equal to
    ``threshold`` is removed, together with all of its rows in any category.
    An RE with no ``full_length`` row at all never appears in that count and
    is therefore kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns ``RE`` and ``best_category``.
    threshold : int
        Largest ``full_length`` row count that still leads to removal.

    Returns
    -------
    pandas.DataFrame
        The surviving rows of ``df`` with a fresh 0..n-1 index.
    """
    is_full_length = df['best_category'] == 'full_length'
    rows_per_re = df.loc[is_full_length, 'RE'].value_counts()

    # REs at or below the threshold are the ones to discard.
    sparse_res = rows_per_re[rows_per_re <= threshold].index

    keep_row = ~df['RE'].isin(sparse_res)
    return df[keep_row].reset_index(drop=True)

In [3]:
# Load and pre-process each DNA replicate.
dna1count = process_raw_count('ptreseq_raw_count/DNA-1_raw_count.txt')
dna2count = process_raw_count('ptreseq_raw_count/DNA-2_raw_count.txt')

# Stack the replicates, then remove REs with 5 or fewer full_length rows.
dna = pd.concat([dna1count, dna2count])
dna_filtered = filter_barcode_count(dna, 5)

# Median fraction for every (RE, best_category) pair across the remaining rows.
dna_med = (
    dna_filtered
    .groupby(['RE', 'best_category'])['fraction']
    .median()
    .reset_index()
)

dna_med.to_csv(
    'ptreseq_splicing_quantification/DNA_2rep_fraction.txt',
    sep='\t',
    index=False,
)

In [4]:
# Load and pre-process each HELA replicate.
hela1count = process_raw_count('ptreseq_raw_count/HELA-1_raw_count.txt')
hela2count = process_raw_count('ptreseq_raw_count/HELA-2_raw_count.txt')

# Stack the replicates, then remove REs with 5 or fewer full_length rows.
hela = pd.concat([hela1count, hela2count])
hela_filtered = filter_barcode_count(hela, 5)

# Median fraction for every (RE, best_category) pair across the remaining rows.
hela_med = (
    hela_filtered
    .groupby(['RE', 'best_category'])['fraction']
    .median()
    .reset_index()
)

hela_med.to_csv(
    'ptreseq_splicing_quantification/HELA_2rep_fraction.txt',
    sep='\t',
    index=False,
)

In [5]:
# Load and pre-process each replicate for the "mara" data set.
# NOTE(review): the inputs are the ETOH-* files while the variables and the
# output file are named "mara" / "HELA-mara" -- confirm this sample mapping
# is intended.
mara1count = process_raw_count('ptreseq_raw_count/ETOH-1_raw_count.txt')
mara2count = process_raw_count('ptreseq_raw_count/ETOH-2_raw_count.txt')

# Stack the replicates, then remove REs with 5 or fewer full_length rows.
mara = pd.concat([mara1count, mara2count])
mara_filtered = filter_barcode_count(mara, 5)

# Median fraction for every (RE, best_category) pair across the remaining rows.
mara_med = (
    mara_filtered
    .groupby(['RE', 'best_category'])['fraction']
    .median()
    .reset_index()
)

mara_med.to_csv(
    'ptreseq_splicing_quantification/HELA-mara_2rep_fraction.txt',
    sep='\t',
    index=False,
)

In [6]:
# Load and pre-process each HEK replicate.
hek1count = process_raw_count('ptreseq_raw_count/HEK-1_raw_count.txt')
hek2count = process_raw_count('ptreseq_raw_count/HEK-2_raw_count.txt')
hek3count = process_raw_count('ptreseq_raw_count/HEK-3_raw_count.txt')

# Stack the replicates, then remove REs with 5 or fewer full_length rows.
hek = pd.concat([hek1count, hek2count, hek3count])
hek_filtered = filter_barcode_count(hek, 5)

# Median fraction for every (RE, best_category) pair across the remaining rows.
hek_med = (
    hek_filtered
    .groupby(['RE', 'best_category'])['fraction']
    .median()
    .reset_index()
)

hek_med.to_csv(
    'ptreseq_splicing_quantification/HEK_3rep_fraction.txt',
    sep='\t',
    index=False,
)

In [7]:
# Load and pre-process each SH replicate.
sh1count = process_raw_count('ptreseq_raw_count/SH-1_raw_count.txt')
sh2count = process_raw_count('ptreseq_raw_count/SH-2_raw_count.txt')
sh3count = process_raw_count('ptreseq_raw_count/SH-3_raw_count.txt')

# Stack the replicates, then remove REs with 5 or fewer full_length rows.
sh = pd.concat([sh1count, sh2count, sh3count])
sh_filtered = filter_barcode_count(sh, 5)

# Median fraction for every (RE, best_category) pair across the remaining rows.
sh_med = (
    sh_filtered
    .groupby(['RE', 'best_category'])['fraction']
    .median()
    .reset_index()
)

sh_med.to_csv(
    'ptreseq_splicing_quantification/SH_3rep_fraction.txt',
    sep='\t',
    index=False,
)

In [8]:
# Load and pre-process each U87 replicate.
u871count = process_raw_count('ptreseq_raw_count/U87-1_raw_count.txt')
u872count = process_raw_count('ptreseq_raw_count/U87-2_raw_count.txt')
u873count = process_raw_count('ptreseq_raw_count/U87-3_raw_count.txt')

# Stack the replicates, then remove REs with 5 or fewer full_length rows.
u87 = pd.concat([u871count, u872count, u873count])
u87_filtered = filter_barcode_count(u87, 5)

# Median fraction for every (RE, best_category) pair across the remaining rows.
u87_med = (
    u87_filtered
    .groupby(['RE', 'best_category'])['fraction']
    .median()
    .reset_index()
)

u87_med.to_csv(
    'ptreseq_splicing_quantification/U87_3rep_fraction.txt',
    sep='\t',
    index=False,
)