# Exploratory Data Analysis

First pass at analysing the output of blasting AML RNA-Seq data against the three tryptase sequences from Jonathon.


In [1]:
import pysam
import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
def get_files(folder):
    
    return glob.glob('{folder}/*sam.sorted.bam'.format(folder=folder))

In [3]:
list_of_aml_results = get_files('../aml_results/')
list_of_control_results = get_files('../control_results/')

In [4]:
aml_file_names = [x.split('/')[len(x.split('/'))-1] for x in list_of_aml_results]
control_file_names = [x.split('/')[len(x.split('/'))-1] for x in list_of_control_results]

In [5]:
df_aml = pd.DataFrame(index=aml_file_names)
df_control = pd.DataFrame(index=control_file_names)

In [6]:
df_aml['file_location'] = list_of_aml_results
df_control['file_location'] = list_of_control_results

In [7]:
df_aml['source'] = 'AML'
df_control['source'] = 'control'

In [8]:
df_aml.head()

Unnamed: 0,file_location,source
aml_aabspliced_SRR948685.sam.sorted.bam,../aml_results/aml_aabspliced_SRR948685.sam.so...,AML
aml_aabspliced_SRR948686.sam.sorted.bam,../aml_results/aml_aabspliced_SRR948686.sam.so...,AML
aml_aabspliced_SRR1918637.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918637.sam.s...,AML
aml_aabspliced_SRR1918638.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918638.sam.s...,AML
aml_aabspliced_SRR1918639.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918639.sam.s...,AML


In [9]:
df_control.head()

Unnamed: 0,file_location,source


In [10]:
df = df_aml.append(df_control)

In [11]:
df.count()

file_location    2725
source           2725
dtype: int64

In [62]:
def get_read_count(df):

    return int(pysam.view("-c", df['file_location']))

In [63]:
df['alignment_count'] = df.apply(get_read_count, axis=1)

In [64]:
df.head()

Unnamed: 0,file_location,source,alignment_count
aml_aabspliced_SRR948685.sam.sorted.bam,../aml_results/aml_aabspliced_SRR948685.sam.so...,AML,197
aml_aabspliced_SRR948686.sam.sorted.bam,../aml_results/aml_aabspliced_SRR948686.sam.so...,AML,161
aml_aabspliced_SRR1918637.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918637.sam.s...,AML,3736
aml_aabspliced_SRR1918638.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918638.sam.s...,AML,3550
aml_aabspliced_SRR1918639.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918639.sam.s...,AML,182


In [None]:
df.shape

In [65]:
def get_transcript_count(df, transcript_name):
    
    """
    Return the count of the transcript i.e how many of each of the three are in there.
    Note - Does not look at the quality of the alignment.
    
    """
    
    sam_file_location = df['file_location']
    
    samfile = pysam.AlignmentFile(sam_file_location, "rb")
    
    count = 0
    
    for read in samfile:
        
        if read is not None and read.reference_name == transcript_name:
            
            count = count + 1
            
    return count

In [66]:
df['alpha_wt_count'] = df.apply(get_transcript_count, axis=1, args=['Alpha_GEX_64k_HEX'])

In [67]:
df['alpha_dup_count'] = df.apply(get_transcript_count, axis=1, args=['Alpha_GEX_79k_dup_FAM'])
df['beta_count'] = df.apply(get_transcript_count, axis=1, args=['BETA_new_GEX_FAM'])

In [68]:
df.head()

Unnamed: 0,file_location,source,alignment_count,alpha_wt_count,alpha_dup_count,beta_count
aml_aabspliced_SRR948685.sam.sorted.bam,../aml_results/aml_aabspliced_SRR948685.sam.so...,AML,197,8,9,180
aml_aabspliced_SRR948686.sam.sorted.bam,../aml_results/aml_aabspliced_SRR948686.sam.so...,AML,161,4,4,153
aml_aabspliced_SRR1918637.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918637.sam.s...,AML,3736,1776,1581,379
aml_aabspliced_SRR1918638.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918638.sam.s...,AML,3550,1669,1482,399
aml_aabspliced_SRR1918639.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918639.sam.s...,AML,182,78,70,34


In [69]:
def get_zero_edit_distance_count(df, transcript_name):
    
    """
    For a goven transcript e.g. Alpha_GEX_64k_HEX get how many of the matches have an NM tag of zero
    i.e. the match was exact.
    
    """
    
    sam_file_location = df['file_location']
    
    samfile = pysam.AlignmentFile(sam_file_location, "rb")
    
    count = 0
    
    for read in samfile:
        
        if read is not None and read.reference_name == transcript_name:
            
            edit_distance = read.get_tag('NM')
            
            if edit_distance == 0:
                
                count = count + 1
            
    return count

In [70]:
df['alpha_wt_zero_edit_count'] = df.apply(get_zero_edit_distance_count, axis=1, args=['Alpha_GEX_64k_HEX'])
df['alpha_dup_zero_edit_count'] = df.apply(get_zero_edit_distance_count, axis=1, args=['Alpha_GEX_79k_dup_FAM'])
df['beta_zero_edit_count'] = df.apply(get_zero_edit_distance_count, axis=1, args=['BETA_new_GEX_FAM'])

In [71]:
df.head()

Unnamed: 0,file_location,source,alignment_count,alpha_wt_count,alpha_dup_count,beta_count,alpha_wt_zero_edit_count,alpha_dup_zero_edit_count,beta_zero_edit_count
aml_aabspliced_SRR948685.sam.sorted.bam,../aml_results/aml_aabspliced_SRR948685.sam.so...,AML,197,8,9,180,5,6,108
aml_aabspliced_SRR948686.sam.sorted.bam,../aml_results/aml_aabspliced_SRR948686.sam.so...,AML,161,4,4,153,3,3,85
aml_aabspliced_SRR1918637.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918637.sam.s...,AML,3736,1776,1581,379,677,631,340
aml_aabspliced_SRR1918638.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918638.sam.s...,AML,3550,1669,1482,399,582,546,350
aml_aabspliced_SRR1918639.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918639.sam.s...,AML,182,78,70,34,39,37,26


In [72]:
def get_transcript_read_count_filtered(df, transcript_name, start, end):
    
    """
    Count hits which cover the bit of the reference we are interested in
    
    """
    
    sam_file_location = df['file_location']
    
    samfile = pysam.AlignmentFile(sam_file_location, "rb")
    
    iter = samfile.fetch(transcript_name, start, end)
    
    count =0
    
    for read in iter:
        
        if read.reference_start <=start and read.reference_end >= end:
            
            count = count +1
            
    return count

In [73]:
df['alpha_read_covers_snps_count'] = df.apply(get_transcript_read_count_filtered,
                                              axis=1,
                                              args=['Alpha_GEX_64k_HEX', 25,45])

In [74]:
df['alpha_dup_read_covers_snps_count'] = df.apply(get_transcript_read_count_filtered,
                                              axis=1,
                                              args=['Alpha_GEX_79k_dup_FAM', 25,45])

df['beta_read_covers_snps_count'] = df.apply(get_transcript_read_count_filtered,
                                              axis=1,
                                              args=['BETA_new_GEX_FAM', 25,45])

In [75]:
df.sort_values(by='alpha_dup_read_covers_snps_count', ascending=False).head()

Unnamed: 0,file_location,source,alignment_count,alpha_wt_count,alpha_dup_count,beta_count,alpha_wt_zero_edit_count,alpha_dup_zero_edit_count,beta_zero_edit_count,alpha_read_covers_snps_count,alpha_dup_read_covers_snps_count,beta_read_covers_snps_count
aml_aabspliced_SRR5626188.sam.sorted.bam,../aml_results/aml_aabspliced_SRR5626188.sam.s...,AML,173788,82692,90119,977,72328,78456,595,119,4060,50
aml_aabspliced_SRR1916270.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1916270.sam.s...,AML,126153,60754,65308,91,54343,58268,58,625,3108,3
aml_aabspliced_SRR1916268.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1916268.sam.s...,AML,128617,62044,66448,125,55942,59756,84,654,3078,1
aml_aabspliced_SRR1916269.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1916269.sam.s...,AML,127467,61392,65993,82,55298,59255,61,586,3078,1
aml_aabspliced_SRR5626173.sam.sorted.bam,../aml_results/aml_aabspliced_SRR5626173.sam.s...,AML,65055,30944,34015,96,27266,29823,85,4,1675,0


In [76]:
def get_transcript_read_count_filtered_exact(df, transcript_name, start, end):
    
    """
    Count hits which cover the bit of the reference we are interested in and which are exact.
    
    That is do they cross position 0 - 44 of the transcript ( given by transcipt_name) and \
    have an edit distance from the reference of 0. 
    
    Reads which cross this location will cross all four SNPs needed to tell the 3 transcripts apart.
    
    """
    
    sam_file_location = df['file_location']
    
    samfile = pysam.AlignmentFile(sam_file_location, "rb")
    
    iter = samfile.fetch(transcript_name, start, end)
    
    count =0
    
    for read in iter:
        
        if read.reference_start <=start and read.reference_end >=end:
            
            edit_distance = read.get_tag('NM')
            
            if edit_distance == 0:
                
                count = count + 1
                
    return count

In [77]:
df['alpha_read_covers_snps_count_exact'] = df.apply(get_transcript_read_count_filtered_exact,
                                              axis=1,
                                              args=['Alpha_GEX_64k_HEX', 25,45])

df['alpha_dup_read_covers_snps_count_exact'] = df.apply(get_transcript_read_count_filtered_exact,
                                              axis=1,
                                              args=['Alpha_GEX_79k_dup_FAM', 25,45])

df['beta_read_covers_snps_count_exact'] = df.apply(get_transcript_read_count_filtered_exact,
                                              axis=1,
                                              args=['BETA_new_GEX_FAM', 25,45])

In [78]:
# Quick check to see if any such reads exist

#df.sort_values(by='alpha_dup_read_covers_snps_count_exact', ascending=False).head()

df.head()

Unnamed: 0,file_location,source,alignment_count,alpha_wt_count,alpha_dup_count,beta_count,alpha_wt_zero_edit_count,alpha_dup_zero_edit_count,beta_zero_edit_count,alpha_read_covers_snps_count,alpha_dup_read_covers_snps_count,beta_read_covers_snps_count,alpha_read_covers_snps_count_exact,alpha_dup_read_covers_snps_count_exact,beta_read_covers_snps_count_exact
aml_aabspliced_SRR948685.sam.sorted.bam,../aml_results/aml_aabspliced_SRR948685.sam.so...,AML,197,8,9,180,5,6,108,0,0,4,0,0,2
aml_aabspliced_SRR948686.sam.sorted.bam,../aml_results/aml_aabspliced_SRR948686.sam.so...,AML,161,4,4,153,3,3,85,0,0,6,0,0,3
aml_aabspliced_SRR1918637.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918637.sam.s...,AML,3736,1776,1581,379,677,631,340,106,0,21,0,0,16
aml_aabspliced_SRR1918638.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918638.sam.s...,AML,3550,1669,1482,399,582,546,350,105,0,28,1,0,21
aml_aabspliced_SRR1918639.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918639.sam.s...,AML,182,78,70,34,39,37,26,4,0,0,0,0,0


## Sanity Check

Print out some of the reads. I then did a manual BLAST and alignment to check the code was working as expected

In [79]:
def get_transcript_read_count_filtered_exact_print(sam_file_location, transcript_name, start, end):
    
    """
    Same as get_transcript_read_count_filtered_exact() except we just print out the reads instead of \
    counting them.
    
    
    """
    
    samfile = pysam.AlignmentFile(sam_file_location, "rb")
    
    iter = samfile.fetch(transcript_name, start, end)
    
    for read in iter:
        
        if read.reference_start <=start and read.reference_end >= end:
            
            edit_distance = read.get_tag('NM')
            
            if edit_distance == 0:
                
                print (read)

In [80]:
#Pick a random bam file with some read alignments in this area

get_transcript_read_count_filtered_exact_print('../aml_results/aml_aabspliced_SRR5626188.sam.sorted.bam',
                                               'Alpha_GEX_64k_HEX',
                                               25,
                                               45 )

SRR5626188.47984736	163	0	0	255	7S94M	0	5	94	GGGAGAGCCCGCTGGGTAGAAGGAACAGGGAGTGGCCAGGATGCTGAGCCTGCTGCTGCTGGCGCTGCCCGTCCTGGCGAGCCGCGCCTACGCGGCCCCTG	None	[('NH', 1), ('AS', 94), ('NM', 0)]
SRR5626188.59402515	163	0	0	255	6S95M	0	150	95	GGAGAGCCCGCTGGGTAGAAGGAACAGGGAGTGGCCAGGATGCTGAGCCTGCTGCTGCTGGCGCTGCCCGTCCTGGCGAGCCGCGCCTACGCGGCCCCTGC	None	[('NH', 1), ('AS', 95), ('NM', 0)]
SRR5626188.63045359	163	0	0	255	24S77M	0	18	77	GAAGGATAAATGGGGAGGGGAGAGCCCGCTGGGTAGAAGGAACAGGGAGTGGCCAGGATGCTGAGCCTGCTGCTGCTGGCGCTGCCCGTCCTGGCGAGCCG	None	[('NH', 1), ('AS', 77), ('NM', 0)]
SRR5626188.63657800	163	0	0	255	6S95M	0	125	95	GGAGAGCCCGCTGGGTAGAAGGAACAGGGAGTGGCCAGGATGCTGAGCCTGCTGCTGCTGGCGCTGCCCGTCCTGGCGAGCCGCGCCTACGCGGCCCCTGC	None	[('NH', 1), ('AS', 95), ('NM', 0)]
SRR5626188.68846470	99	0	0	255	6S95M	0	41	95	GGAGAGCCCGCTGGGTAGAAGGAACAGGGAGTGGCCAGGATGCTGAGCCTGCTGCTGCTGGCGCTGCCCGTCCTGGCGAGCCGCGCCTACGCGGCCCCTGC	None	[('NH', 1), ('AS', 95), ('NM', 0)]
SRR5626188.16757617	89	0	0	255	41S60M	-1	-1	60	CGCCCCCTCCTG

In [81]:
df.describe()

Unnamed: 0,alignment_count,alpha_wt_count,alpha_dup_count,beta_count,alpha_wt_zero_edit_count,alpha_dup_zero_edit_count,beta_zero_edit_count,alpha_read_covers_snps_count,alpha_dup_read_covers_snps_count,beta_read_covers_snps_count,alpha_read_covers_snps_count_exact,alpha_dup_read_covers_snps_count_exact,beta_read_covers_snps_count_exact
count,2725.0,2725.0,2725.0,2725.0,2725.0,2725.0,2725.0,2725.0,2725.0,2725.0,2725.0,2725.0,2725.0
mean,2260.302018,868.292844,888.048073,503.961101,640.96,667.158165,327.904954,27.590459,21.880734,37.339817,13.159266,17.89578,30.640367
std,8527.026419,3775.567801,4360.133257,1465.435173,3204.826485,3642.627229,957.264726,101.067512,179.470645,125.224842,58.553842,150.958731,106.189714
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,32.0,7.0,7.0,7.0,5.0,4.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,220.0,48.0,47.0,45.0,25.0,26.0,33.0,1.0,0.0,1.0,0.0,0.0,1.0
75%,1552.0,440.0,400.0,331.0,224.0,214.0,216.0,15.0,0.0,21.0,2.0,0.0,17.0
max,173788.0,82692.0,90119.0,20264.0,72328.0,78456.0,12437.0,1916.0,4060.0,2020.0,1007.0,3477.0,1575.0


In [82]:
df.to_csv('data.3snps.csv')

In [83]:
aml_total_hits = df[df['source']=='AML'].sum(axis=0)

In [84]:
aml_total_hits

file_location                             ../aml_results/aml_aabspliced_SRR948685.sam.so...
source                                    AMLAMLAMLAMLAMLAMLAMLAMLAMLAMLAMLAMLAMLAMLAMLA...
alignment_count                                                                     6159323
alpha_wt_count                                                                      2366098
alpha_dup_count                                                                     2419931
beta_count                                                                          1373294
alpha_wt_zero_edit_count                                                            1746616
alpha_dup_zero_edit_count                                                           1818006
beta_zero_edit_count                                                                 893541
alpha_read_covers_snps_count                                                          75184
alpha_dup_read_covers_snps_count                                                

In [85]:
control_total_hits = df[df['source']=='control'].sum(axis=0)
control_total_hits

file_location                             0.0
source                                    0.0
alignment_count                           0.0
alpha_wt_count                            0.0
alpha_dup_count                           0.0
beta_count                                0.0
alpha_wt_zero_edit_count                  0.0
alpha_dup_zero_edit_count                 0.0
beta_zero_edit_count                      0.0
alpha_read_covers_snps_count              0.0
alpha_dup_read_covers_snps_count          0.0
beta_read_covers_snps_count               0.0
alpha_read_covers_snps_count_exact        0.0
alpha_dup_read_covers_snps_count_exact    0.0
beta_read_covers_snps_count_exact         0.0
dtype: float64

In [86]:
df.head()

Unnamed: 0,file_location,source,alignment_count,alpha_wt_count,alpha_dup_count,beta_count,alpha_wt_zero_edit_count,alpha_dup_zero_edit_count,beta_zero_edit_count,alpha_read_covers_snps_count,alpha_dup_read_covers_snps_count,beta_read_covers_snps_count,alpha_read_covers_snps_count_exact,alpha_dup_read_covers_snps_count_exact,beta_read_covers_snps_count_exact
aml_aabspliced_SRR948685.sam.sorted.bam,../aml_results/aml_aabspliced_SRR948685.sam.so...,AML,197,8,9,180,5,6,108,0,0,4,0,0,2
aml_aabspliced_SRR948686.sam.sorted.bam,../aml_results/aml_aabspliced_SRR948686.sam.so...,AML,161,4,4,153,3,3,85,0,0,6,0,0,3
aml_aabspliced_SRR1918637.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918637.sam.s...,AML,3736,1776,1581,379,677,631,340,106,0,21,0,0,16
aml_aabspliced_SRR1918638.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918638.sam.s...,AML,3550,1669,1482,399,582,546,350,105,0,28,1,0,21
aml_aabspliced_SRR1918639.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918639.sam.s...,AML,182,78,70,34,39,37,26,4,0,0,0,0,0


In [87]:
def get_bases_at_pos(df, transcript, pos):
    """
    For a given position in the reference what are the bases in the reads which align at \
    that position.
    
    NB - only counts bases that are properly aligned (see same flag 163) so the number of bases \
    may be less than the count of reads aligned fully over the area.
    
    """
    
    sam_file_location = df['file_location']
    
    samfile = pysam.AlignmentFile(sam_file_location, "rb")
    
    iter = samfile.pileup(contig=transcript)
    
    for column in iter:
        
        #print (column.reference_name, column.reference_pos)
        
        if column.reference_pos == pos:
            
            base_list = []
              
            for read in column.pileups:

                if read.is_del == 0:

                    base_list.append(read.alignment.query_sequence[read.query_position])

                    #print(read)

                    #print (read.query_position)

                    #print (read.alignment.query_sequence[read.query_position])

                else:

                    base_list.append('_')
                    
            return ''.join(base_list)

In [88]:
df['alpha_wt_pos3_pileup'] = df.apply(get_bases_at_pos, axis=1, args=['Alpha_GEX_64k_HEX', 3])
df['alpha_dp_pos3_pileup'] = df.apply(get_bases_at_pos, axis=1, args=['Alpha_GEX_79k_dup_FAM', 3])
df['beta_pos3_pileup'] = df.apply(get_bases_at_pos, axis=1, args=['BETA_new_GEX_FAM', 3])

In [89]:
df.head(5)

Unnamed: 0,file_location,source,alignment_count,alpha_wt_count,alpha_dup_count,beta_count,alpha_wt_zero_edit_count,alpha_dup_zero_edit_count,beta_zero_edit_count,alpha_read_covers_snps_count,alpha_dup_read_covers_snps_count,beta_read_covers_snps_count,alpha_read_covers_snps_count_exact,alpha_dup_read_covers_snps_count_exact,beta_read_covers_snps_count_exact,alpha_wt_pos3_pileup,alpha_dp_pos3_pileup,beta_pos3_pileup
aml_aabspliced_SRR948685.sam.sorted.bam,../aml_results/aml_aabspliced_SRR948685.sam.so...,AML,197,8,9,180,5,6,108,0,0,4,0,0,2,,,
aml_aabspliced_SRR948686.sam.sorted.bam,../aml_results/aml_aabspliced_SRR948686.sam.so...,AML,161,4,4,153,3,3,85,0,0,6,0,0,3,,,AA
aml_aabspliced_SRR1918637.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918637.sam.s...,AML,3736,1776,1581,379,677,631,340,106,0,21,0,0,16,,,
aml_aabspliced_SRR1918638.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918638.sam.s...,AML,3550,1669,1482,399,582,546,350,105,0,28,1,0,21,GGGG,,
aml_aabspliced_SRR1918639.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918639.sam.s...,AML,182,78,70,34,39,37,26,4,0,0,0,0,0,,,


In [90]:
df['alpha_wt_pos25_pileup'] = df.apply(get_bases_at_pos, axis=1, args=['Alpha_GEX_64k_HEX', 25])
df['alpha_dp_pos25_pileup'] = df.apply(get_bases_at_pos, axis=1, args=['Alpha_GEX_79k_dup_FAM', 25])
df['beta_pos25_pileup'] = df.apply(get_bases_at_pos, axis=1, args=['BETA_new_GEX_FAM', 25])

In [91]:
df['alpha_wt_pos40_pileup'] = df.apply(get_bases_at_pos, axis=1, args=['Alpha_GEX_64k_HEX', 40])
df['alpha_dp_pos40_pileup'] = df.apply(get_bases_at_pos, axis=1, args=['Alpha_GEX_79k_dup_FAM', 40])
df['beta_pos40_pileup'] = df.apply(get_bases_at_pos, axis=1, args=['BETA_new_GEX_FAM', 40])

In [92]:
df['alpha_wt_pos41_pileup'] = df.apply(get_bases_at_pos, axis=1, args=['Alpha_GEX_64k_HEX', 41])
df['alpha_dp_pos41_pileup'] = df.apply(get_bases_at_pos, axis=1, args=['Alpha_GEX_79k_dup_FAM', 41])
df['beta_pos41_pileup'] = df.apply(get_bases_at_pos, axis=1, args=['BETA_new_GEX_FAM', 41])

In [93]:
df.head(50)

Unnamed: 0,file_location,source,alignment_count,alpha_wt_count,alpha_dup_count,beta_count,alpha_wt_zero_edit_count,alpha_dup_zero_edit_count,beta_zero_edit_count,alpha_read_covers_snps_count,...,beta_pos3_pileup,alpha_wt_pos25_pileup,alpha_dp_pos25_pileup,beta_pos25_pileup,alpha_wt_pos40_pileup,alpha_dp_pos40_pileup,beta_pos40_pileup,alpha_wt_pos41_pileup,alpha_dp_pos41_pileup,beta_pos41_pileup
aml_aabspliced_SRR948685.sam.sorted.bam,../aml_results/aml_aabspliced_SRR948685.sam.so...,AML,197,8,9,180,5,6,108,0,...,,,,CC,,,AAAAAAAAAA,,,TTTTTTTTTT
aml_aabspliced_SRR948686.sam.sorted.bam,../aml_results/aml_aabspliced_SRR948686.sam.so...,AML,161,4,4,153,3,3,85,0,...,AA,,,CCCC,,,AAAAAAAAAA,,,TTTTTTTTTT
aml_aabspliced_SRR1918637.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918637.sam.s...,AML,3736,1776,1581,379,677,631,340,106,...,,TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT...,,CCCCCCCCCCCCCCCCCCC,GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG...,,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,,TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT...
aml_aabspliced_SRR1918638.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918638.sam.s...,AML,3550,1669,1482,399,582,546,350,105,...,,TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT...,,CCCCCCCCCCCCCCCCCC,GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG...,,AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,,TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT...
aml_aabspliced_SRR1918639.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918639.sam.s...,AML,182,78,70,34,39,37,26,4,...,,TTTT,,,GGGGGGGGG,,,CCCCCCCCC,,
aml_aabspliced_SRR1918640.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918640.sam.s...,AML,179,78,64,37,38,36,28,7,...,,TTTTTTT,,,GGGGGGGGGGGGGGGG,,,CCCCCCCCCCCCCCCC,,
aml_aabspliced_SRR1918641.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918641.sam.s...,AML,85,21,19,45,9,9,30,1,...,,T,,CCCCC,GGG,,AAAAAAAA,CCC,,TTTTTTTT
aml_aabspliced_SRR1918642.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918642.sam.s...,AML,91,19,19,53,10,10,44,0,...,,,,CCCC,GGG,,AAAAAAAAA,CCC,,TTTTTTTTT
aml_aabspliced_SRR1918643.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918643.sam.s...,AML,2519,1250,1108,161,463,439,127,79,...,A,TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT...,C,CCCCCCC,GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG...,G,AAAAAAAAA,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,C,TTTTTTTTT
aml_aabspliced_SRR1918644.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918644.sam.s...,AML,2396,1180,1054,162,450,427,124,73,...,,TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT...,,CCCC,GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG...,,AAAAAAAA,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,,TTTTTTTT


In [94]:
df[['file_location', 'source','alpha_wt_pos3_pileup',
       'alpha_dp_pos3_pileup', 'alpha_wt_pos25_pileup',
       'alpha_dp_pos25_pileup',  'alpha_wt_pos40_pileup',
       'alpha_dp_pos40_pileup',  'alpha_wt_pos41_pileup',
       'alpha_dp_pos41_pileup']].head(50)

Unnamed: 0,file_location,source,alpha_wt_pos3_pileup,alpha_dp_pos3_pileup,alpha_wt_pos25_pileup,alpha_dp_pos25_pileup,alpha_wt_pos40_pileup,alpha_dp_pos40_pileup,alpha_wt_pos41_pileup,alpha_dp_pos41_pileup
aml_aabspliced_SRR948685.sam.sorted.bam,../aml_results/aml_aabspliced_SRR948685.sam.so...,AML,,,,,,,,
aml_aabspliced_SRR948686.sam.sorted.bam,../aml_results/aml_aabspliced_SRR948686.sam.so...,AML,,,,,,,,
aml_aabspliced_SRR1918637.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918637.sam.s...,AML,,,TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT...,,GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG...,,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,
aml_aabspliced_SRR1918638.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918638.sam.s...,AML,GGGG,,TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT...,,GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG...,,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,
aml_aabspliced_SRR1918639.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918639.sam.s...,AML,,,TTTT,,GGGGGGGGG,,CCCCCCCCC,
aml_aabspliced_SRR1918640.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918640.sam.s...,AML,,,TTTTTTT,,GGGGGGGGGGGGGGGG,,CCCCCCCCCCCCCCCC,
aml_aabspliced_SRR1918641.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918641.sam.s...,AML,,,T,,GGG,,CCC,
aml_aabspliced_SRR1918642.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918642.sam.s...,AML,,,,,GGG,,CCC,
aml_aabspliced_SRR1918643.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918643.sam.s...,AML,GG,,TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT...,C,GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG...,G,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,C
aml_aabspliced_SRR1918644.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918644.sam.s...,AML,GG,,TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT...,,GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG...,,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...,


In [95]:
def combine_pileups(df,columns):
    
    final_string = ''
    
    for column in columns:
        
        if df[column] is not None:
        
            final_string = final_string + df[column]
        
    return final_string

In [96]:
df['pos3_pileup'] = df.apply(combine_pileups, axis=1, args=[['alpha_wt_pos3_pileup','alpha_dp_pos3_pileup']])
df['pos25_pileup'] = df.apply(combine_pileups, axis=1, args=[['alpha_wt_pos25_pileup','alpha_dp_pos25_pileup']])
df['pos40_pileup'] = df.apply(combine_pileups, axis=1, args=[['alpha_wt_pos40_pileup','alpha_dp_pos40_pileup']])
df['pos41_pileup'] = df.apply(combine_pileups, axis=1, args=[['alpha_wt_pos41_pileup','alpha_wt_pos41_pileup']])

In [97]:
df[['file_location', 'source','pos3_pileup','pos25_pileup','pos40_pileup', 'pos41_pileup']].tail()

Unnamed: 0,file_location,source,pos3_pileup,pos25_pileup,pos40_pileup,pos41_pileup
aml_aabspliced_SRR949125.sam.sorted.bam,../aml_results/aml_aabspliced_SRR949125.sam.so...,AML,GGGGG,TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT,GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...
aml_aabspliced_SRR949126.sam.sorted.bam,../aml_results/aml_aabspliced_SRR949126.sam.so...,AML,GGGG,TTTTTGTTTTTTTTTTTTTTTTTTTTTTTTTTTTT,GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...
aml_aabspliced_SRR949127.sam.sorted.bam,../aml_results/aml_aabspliced_SRR949127.sam.so...,AML,,TTTTTTTTT,GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...
aml_aabspliced_SRR949128.sam.sorted.bam,../aml_results/aml_aabspliced_SRR949128.sam.so...,AML,,TTTTTTTTTTT,GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...
aml_aabspliced_SRR949129.sam.sorted.bam,../aml_results/aml_aabspliced_SRR949129.sam.so...,AML,G,TTTT,GGGGGGGGGGGGGGNGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...


In [98]:
def max_base(df, column):
    
    base_dict = {}
    
    for base in df[column]:
        
        if base not in base_dict:
            
            base_dict[base]=1
            
        else:
            
            base_dict[base] = base_dict[base] +1
            
    return base_dict

In [99]:
df['pos3_call'] = df.apply(max_base, axis=1, args=['pos3_pileup'])
df['pos25_call'] = df.apply(max_base, axis=1, args=['pos25_pileup'])
df['pos40_call'] = df.apply(max_base, axis=1, args=['pos40_pileup'])
df['pos41_call'] = df.apply(max_base, axis=1, args=['pos41_pileup'])

In [100]:
df[['file_location', 'source','pos3_call','pos25_call','pos40_call','pos41_call']].head()

Unnamed: 0,file_location,source,pos3_call,pos25_call,pos40_call,pos41_call
aml_aabspliced_SRR948685.sam.sorted.bam,../aml_results/aml_aabspliced_SRR948685.sam.so...,AML,{},{},{},{}
aml_aabspliced_SRR948686.sam.sorted.bam,../aml_results/aml_aabspliced_SRR948686.sam.so...,AML,{},{},{},{}
aml_aabspliced_SRR1918637.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918637.sam.s...,AML,{},{'T': 90},"{'G': 280, 'T': 1}",{'C': 564}
aml_aabspliced_SRR1918638.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918638.sam.s...,AML,{'G': 4},{'T': 84},{'G': 268},{'C': 540}
aml_aabspliced_SRR1918639.sam.sorted.bam,../aml_results/aml_aabspliced_SRR1918639.sam.s...,AML,{},{'T': 4},{'G': 9},{'C': 18}
