# Exploratory Data Analysis

First pass at analysing the output of blasting AML RNA-Seq data against the three tryptase sequences from Jonathon.


In [303]:
import pysam
import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [304]:
def get_files(folder):
    
    return glob.glob('{folder}/*.filtered.bam'.format(folder=folder))

In [305]:
list_of_aml_results = get_files('../aml_results/')
list_of_control_results = get_files('../control_results/')

In [306]:
aml_file_names = [x.split('/')[len(x.split('/'))-1] for x in list_of_aml_results]
control_file_names = [x.split('/')[len(x.split('/'))-1] for x in list_of_control_results]

In [307]:
df_aml = pd.DataFrame(index=aml_file_names)
df_control = pd.DataFrame(index=control_file_names)

In [308]:
df_aml['file_location'] = list_of_aml_results
df_control['file_location'] = list_of_control_results

In [309]:
df_aml['source'] = 'AML'
df_control['source'] = 'control'

In [310]:
df_aml.head()

Unnamed: 0,file_location,source
aml_aabspliced_ERR1024254.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024254.sam.s...,AML
aml_aabspliced_ERR1024255.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024255.sam.s...,AML
aml_aabspliced_ERR1024256.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024256.sam.s...,AML
aml_aabspliced_ERR1024257.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024257.sam.s...,AML
aml_aabspliced_ERR1024258.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024258.sam.s...,AML


In [311]:
df_control.head()

Unnamed: 0,file_location,source


In [312]:
df = df_aml.append(df_control)

In [313]:
df.count()

file_location    2725
source           2725
dtype: int64

In [314]:
def get_read_count(df):

    return int(pysam.view("-c", df['file_location']))

In [315]:
df['alignment_count'] = df.apply(get_read_count, axis=1)

In [316]:
df.head()

Unnamed: 0,file_location,source,alignment_count
aml_aabspliced_ERR1024254.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024254.sam.s...,AML,74
aml_aabspliced_ERR1024255.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024255.sam.s...,AML,62
aml_aabspliced_ERR1024256.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024256.sam.s...,AML,0
aml_aabspliced_ERR1024257.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024257.sam.s...,AML,6
aml_aabspliced_ERR1024258.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024258.sam.s...,AML,6


In [317]:
def get_transcript_count(df, transcript_name):
    
    """
    Return the count of the transcript i.e how many of each of the three are in there.
    Note - Does not look at the quality of the alignment.
    
    """
    
    sam_file_location = df['file_location']
    
    samfile = pysam.AlignmentFile(sam_file_location, "rb")
    
    count = 0
    
    for read in samfile:
        
        if read is not None and read.reference_name == transcript_name:
            
            count = count + 1
            
    return count

In [318]:
df['alpha_wt_count'] = df.apply(get_transcript_count, axis=1, args=['Alpha_GEX_64k_HEX'])

In [319]:
df['alpha_dup_count'] = df.apply(get_transcript_count, axis=1, args=['Alpha_GEX_79k_dup_FAM'])
df['beta_count'] = df.apply(get_transcript_count, axis=1, args=['BETA_new_GEX_FAM'])

In [320]:
df.head()

Unnamed: 0,file_location,source,alignment_count,alpha_wt_count,alpha_dup_count,beta_count
aml_aabspliced_ERR1024254.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024254.sam.s...,AML,74,39,35,0
aml_aabspliced_ERR1024255.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024255.sam.s...,AML,62,32,30,0
aml_aabspliced_ERR1024256.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024256.sam.s...,AML,0,0,0,0
aml_aabspliced_ERR1024257.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024257.sam.s...,AML,6,3,3,0
aml_aabspliced_ERR1024258.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024258.sam.s...,AML,6,3,3,0


In [321]:
def get_zero_edit_distance_count(df, transcript_name):
    
    """
    For a goven transcript e.g. Alpha_GEX_64k_HEX get how many of the matches have an NM tag of zero
    i.e. the match was exact.
    
    """
    
    sam_file_location = df['file_location']
    
    samfile = pysam.AlignmentFile(sam_file_location, "rb")
    
    count = 0
    
    for read in samfile:
        
        if read is not None and read.reference_name == transcript_name:
            
            edit_distance = read.get_tag('NM')
            
            if edit_distance == 0:
                
                count = count + 1
            
    return count

In [322]:
df['alpha_wt_zero_edit_count'] = df.apply(get_zero_edit_distance_count, axis=1, args=['Alpha_GEX_64k_HEX'])
df['alpha_dup_zero_edit_count'] = df.apply(get_zero_edit_distance_count, axis=1, args=['Alpha_GEX_79k_dup_FAM'])
df['beta_zero_edit_count'] = df.apply(get_zero_edit_distance_count, axis=1, args=['BETA_new_GEX_FAM'])

In [323]:
df.head()

Unnamed: 0,file_location,source,alignment_count,alpha_wt_count,alpha_dup_count,beta_count,alpha_wt_zero_edit_count,alpha_dup_zero_edit_count,beta_zero_edit_count
aml_aabspliced_ERR1024254.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024254.sam.s...,AML,74,39,35,0,21,19,0
aml_aabspliced_ERR1024255.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024255.sam.s...,AML,62,32,30,0,17,16,0
aml_aabspliced_ERR1024256.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024256.sam.s...,AML,0,0,0,0,0,0,0
aml_aabspliced_ERR1024257.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024257.sam.s...,AML,6,3,3,0,1,1,0
aml_aabspliced_ERR1024258.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024258.sam.s...,AML,6,3,3,0,2,2,0


In [324]:
def get_transcript_read_count_filtered(df, transcript_name, start, end):
    
    """
    Count hits which cover the bit of the reference we are interested in
    
    """
    
    sam_file_location = df['file_location']
    
    samfile = pysam.AlignmentFile(sam_file_location, "rb")
    
    iter = samfile.fetch(transcript_name, start, end)
    
    count =0
    
    for read in iter:
        
        if read.reference_start <=3 and read.reference_end >= 41:
            
            count = count +1
            
    return count

In [325]:
df['alpha_read_covers_snps_count'] = df.apply(get_transcript_read_count_filtered,
                                              axis=1,
                                              args=['Alpha_GEX_64k_HEX', 0,45])

In [326]:
df['alpha_dup_read_covers_snps_count'] = df.apply(get_transcript_read_count_filtered,
                                              axis=1,
                                              args=['Alpha_GEX_79k_dup_FAM', 0,45])

df['beta_read_covers_snps_count'] = df.apply(get_transcript_read_count_filtered,
                                              axis=1,
                                              args=['BETA_new_GEX_FAM', 0,45])

In [327]:
df.sort_values(by='alpha_dup_read_covers_snps_count', ascending=False).head()

Unnamed: 0,file_location,source,alignment_count,alpha_wt_count,alpha_dup_count,beta_count,alpha_wt_zero_edit_count,alpha_dup_zero_edit_count,beta_zero_edit_count,alpha_read_covers_snps_count,alpha_dup_read_covers_snps_count,beta_read_covers_snps_count
aml_aabspliced_SRR5626173.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_SRR5626173.sam.s...,AML,64959,30944,34015,0,27266,29823,0,0,113,0
aml_aabspliced_SRR5626188.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_SRR5626188.sam.s...,AML,172811,82692,90119,0,72328,78456,0,7,101,0
aml_aabspliced_SRR1916268.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_SRR1916268.sam.s...,AML,128492,62044,66448,0,55942,59756,0,18,92,0
aml_aabspliced_SRR1916270.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_SRR1916270.sam.s...,AML,126062,60754,65308,0,54343,58268,0,17,80,0
aml_aabspliced_SRR1916269.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_SRR1916269.sam.s...,AML,127385,61392,65993,0,55298,59255,0,19,76,0


In [328]:
def get_transcript_read_count_filtered_exact(df, transcript_name, start, end):
    
    """
    Count hits which cover the bit of the reference we are interested in and which are exact.
    
    That is do they cross position 0 - 44 of the transcript ( given by transcipt_name) and \
    have an edit distance from the reference of 0. 
    
    Reads which cross this location will cross all four SNPs needed to tell the 3 transcripts apart.
    
    """
    
    sam_file_location = df['file_location']
    
    samfile = pysam.AlignmentFile(sam_file_location, "rb")
    
    iter = samfile.fetch(transcript_name, start, end)
    
    count =0
    
    for read in iter:
        
        if read.reference_start <=3 and read.reference_end >= 41:
            
            edit_distance = read.get_tag('NM')
            
            if edit_distance == 0:
                
                count = count + 1
                
    return count

In [329]:
df['alpha_read_covers_snps_count_exact'] = df.apply(get_transcript_read_count_filtered_exact,
                                              axis=1,
                                              args=['Alpha_GEX_64k_HEX', 0,45])

df['alpha_dup_read_covers_snps_count_exact'] = df.apply(get_transcript_read_count_filtered_exact,
                                              axis=1,
                                              args=['Alpha_GEX_79k_dup_FAM', 0,45])

df['beta_read_covers_snps_count_exact'] = df.apply(get_transcript_read_count_filtered_exact,
                                              axis=1,
                                              args=['BETA_new_GEX_FAM', 0,45])

In [330]:
# Quick check to see if any such reads exist

df.sort_values(by='alpha_dup_read_covers_snps_count_exact', ascending=False).head()

Unnamed: 0,file_location,source,alignment_count,alpha_wt_count,alpha_dup_count,beta_count,alpha_wt_zero_edit_count,alpha_dup_zero_edit_count,beta_zero_edit_count,alpha_read_covers_snps_count,alpha_dup_read_covers_snps_count,beta_read_covers_snps_count,alpha_read_covers_snps_count_exact,alpha_dup_read_covers_snps_count_exact,beta_read_covers_snps_count_exact
aml_aabspliced_SRR5626173.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_SRR5626173.sam.s...,AML,64959,30944,34015,0,27266,29823,0,0,113,0,0,100,0
aml_aabspliced_SRR5626188.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_SRR5626188.sam.s...,AML,172811,82692,90119,0,72328,78456,0,7,101,0,6,85,0
aml_aabspliced_SRR1916268.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_SRR1916268.sam.s...,AML,128492,62044,66448,0,55942,59756,0,18,92,0,17,81,0
aml_aabspliced_SRR1916270.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_SRR1916270.sam.s...,AML,126062,60754,65308,0,54343,58268,0,17,80,0,13,70,0
aml_aabspliced_SRR1916269.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_SRR1916269.sam.s...,AML,127385,61392,65993,0,55298,59255,0,19,76,0,17,66,0


## Sanity Check

Print out some of the reads. I then did a manual BLAST and alignment to check the code was working as expected

In [331]:
def get_transcript_read_count_filtered_exact_print(sam_file_location, transcript_name, start, end):
    
    """
    Same as get_transcript_read_count_filtered_exact() except we just print out the reads instead of \
    counting them.
    
    
    """
    
    samfile = pysam.AlignmentFile(sam_file_location, "rb")
    
    iter = samfile.fetch(transcript_name, start, end)
    
    for read in iter:
        
        if read.reference_start <=3 and read.reference_end >= 41:
            
            edit_distance = read.get_tag('NM')
            
            if edit_distance == 0:
                
                print (read)

In [332]:
#Pick a random bam file with some read alignments in this area

get_transcript_read_count_filtered_exact_print('../aml_results/aml_aabspliced_SRR5626188.sam.sorted.bam',
                                               'Alpha_GEX_64k_HEX',
                                               0,
                                               45 )

SRR5626188.47984736	163	0	0	255	7S94M	0	5	94	GGGAGAGCCCGCTGGGTAGAAGGAACAGGGAGTGGCCAGGATGCTGAGCCTGCTGCTGCTGGCGCTGCCCGTCCTGGCGAGCCGCGCCTACGCGGCCCCTG	None	[('NH', 1), ('AS', 94), ('NM', 0)]
SRR5626188.59402515	163	0	0	255	6S95M	0	150	95	GGAGAGCCCGCTGGGTAGAAGGAACAGGGAGTGGCCAGGATGCTGAGCCTGCTGCTGCTGGCGCTGCCCGTCCTGGCGAGCCGCGCCTACGCGGCCCCTGC	None	[('NH', 1), ('AS', 95), ('NM', 0)]
SRR5626188.63045359	163	0	0	255	24S77M	0	18	77	GAAGGATAAATGGGGAGGGGAGAGCCCGCTGGGTAGAAGGAACAGGGAGTGGCCAGGATGCTGAGCCTGCTGCTGCTGGCGCTGCCCGTCCTGGCGAGCCG	None	[('NH', 1), ('AS', 77), ('NM', 0)]
SRR5626188.63657800	163	0	0	255	6S95M	0	125	95	GGAGAGCCCGCTGGGTAGAAGGAACAGGGAGTGGCCAGGATGCTGAGCCTGCTGCTGCTGGCGCTGCCCGTCCTGGCGAGCCGCGCCTACGCGGCCCCTGC	None	[('NH', 1), ('AS', 95), ('NM', 0)]
SRR5626188.68846470	99	0	0	255	6S95M	0	41	95	GGAGAGCCCGCTGGGTAGAAGGAACAGGGAGTGGCCAGGATGCTGAGCCTGCTGCTGCTGGCGCTGCCCGTCCTGGCGAGCCGCGCCTACGCGGCCCCTGC	None	[('NH', 1), ('AS', 95), ('NM', 0)]
SRR5626188.16757617	89	0	0	255	41S60M	-1	-1	60	CGCCCCCTCCTG

In [333]:
df.describe()

Unnamed: 0,alignment_count,alpha_wt_count,alpha_dup_count,beta_count,alpha_wt_zero_edit_count,alpha_dup_zero_edit_count,beta_zero_edit_count,alpha_read_covers_snps_count,alpha_dup_read_covers_snps_count,beta_read_covers_snps_count,alpha_read_covers_snps_count_exact,alpha_dup_read_covers_snps_count_exact,beta_read_covers_snps_count_exact
count,2725.0,2725.0,2725.0,2725.0,2725.0,2725.0,2725.0,2725.0,2725.0,2725.0,2725.0,2725.0,2725.0
mean,1756.340917,868.292844,888.048073,0.0,640.96,667.158165,0.0,0.85945,0.474862,0.0,0.508624,0.407339,0.0
std,8072.591647,3775.567801,4360.133257,0.0,3204.826485,3642.627229,0.0,2.418356,4.545315,0.0,1.545807,3.981227,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,14.0,7.0,7.0,0.0,5.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,94.0,48.0,47.0,0.0,25.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,835.0,440.0,400.0,0.0,224.0,214.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,172811.0,82692.0,90119.0,0.0,72328.0,78456.0,0.0,50.0,113.0,0.0,17.0,100.0,0.0


In [334]:
aml_total_hits = df[df['source']=='AML'].sum(axis=0)

In [335]:
aml_total_hits

file_location                             ../aml_results/aml_aabspliced_ERR1024254.sam.s...
source                                    AMLAMLAMLAMLAMLAMLAMLAMLAMLAMLAMLAMLAMLAMLAMLA...
alignment_count                                                                     4786029
alpha_wt_count                                                                      2366098
alpha_dup_count                                                                     2419931
beta_count                                                                                0
alpha_wt_zero_edit_count                                                            1746616
alpha_dup_zero_edit_count                                                           1818006
beta_zero_edit_count                                                                      0
alpha_read_covers_snps_count                                                           2342
alpha_dup_read_covers_snps_count                                                

In [336]:
control_total_hits = df[df['source']=='control'].sum(axis=0)
control_total_hits

file_location                             0.0
source                                    0.0
alignment_count                           0.0
alpha_wt_count                            0.0
alpha_dup_count                           0.0
beta_count                                0.0
alpha_wt_zero_edit_count                  0.0
alpha_dup_zero_edit_count                 0.0
beta_zero_edit_count                      0.0
alpha_read_covers_snps_count              0.0
alpha_dup_read_covers_snps_count          0.0
beta_read_covers_snps_count               0.0
alpha_read_covers_snps_count_exact        0.0
alpha_dup_read_covers_snps_count_exact    0.0
beta_read_covers_snps_count_exact         0.0
dtype: float64

In [337]:
df.head()

Unnamed: 0,file_location,source,alignment_count,alpha_wt_count,alpha_dup_count,beta_count,alpha_wt_zero_edit_count,alpha_dup_zero_edit_count,beta_zero_edit_count,alpha_read_covers_snps_count,alpha_dup_read_covers_snps_count,beta_read_covers_snps_count,alpha_read_covers_snps_count_exact,alpha_dup_read_covers_snps_count_exact,beta_read_covers_snps_count_exact
aml_aabspliced_ERR1024254.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024254.sam.s...,AML,74,39,35,0,21,19,0,0,0,0,0,0,0
aml_aabspliced_ERR1024255.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024255.sam.s...,AML,62,32,30,0,17,16,0,0,0,0,0,0,0
aml_aabspliced_ERR1024256.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024256.sam.s...,AML,0,0,0,0,0,0,0,0,0,0,0,0,0
aml_aabspliced_ERR1024257.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024257.sam.s...,AML,6,3,3,0,1,1,0,0,0,0,0,0,0
aml_aabspliced_ERR1024258.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024258.sam.s...,AML,6,3,3,0,2,2,0,0,0,0,0,0,0


In [338]:
def get_bases_at_pos(df, transcript, pos):
    """
    For a given position in the reference what are the bases in the reads which align at \
    that position.
    
    NB - only counts bases that are properly aligned (see same flag 163) so the number of bases \
    may be less than the count of reads aligned fully over the area.
    
    """
    
    sam_file_location = df['file_location']
    
    samfile = pysam.AlignmentFile(sam_file_location, "rb")
    
    iter = samfile.pileup(contig=transcript)
    
    for column in iter:
        
        #print (column.reference_name, column.reference_pos)
        
        if column.reference_pos == pos:
            
            base_list = []
              
            for read in column.pileups:

                if read.is_del == 0:

                    base_list.append(read.alignment.query_sequence[read.query_position])

                    #print(read)

                    #print (read.query_position)

                    #print (read.alignment.query_sequence[read.query_position])

                else:

                    base_list.append('_')
                    
            return ''.join(base_list)

In [339]:
df['alpha_wt_pos3_pileup'] = df.apply(get_bases_at_pos, axis=1, args=['Alpha_GEX_64k_HEX', 3])
df['alpha_dp_pos3_pileup'] = df.apply(get_bases_at_pos, axis=1, args=['Alpha_GEX_79k_dup_FAM', 3])
df['beta_pos3_pileup'] = df.apply(get_bases_at_pos, axis=1, args=['BETA_new_GEX_FAM', 3])

In [340]:
df.head(5)

Unnamed: 0,file_location,source,alignment_count,alpha_wt_count,alpha_dup_count,beta_count,alpha_wt_zero_edit_count,alpha_dup_zero_edit_count,beta_zero_edit_count,alpha_read_covers_snps_count,alpha_dup_read_covers_snps_count,beta_read_covers_snps_count,alpha_read_covers_snps_count_exact,alpha_dup_read_covers_snps_count_exact,beta_read_covers_snps_count_exact,alpha_wt_pos3_pileup,alpha_dp_pos3_pileup,beta_pos3_pileup
aml_aabspliced_ERR1024254.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024254.sam.s...,AML,74,39,35,0,21,19,0,0,0,0,0,0,0,,,
aml_aabspliced_ERR1024255.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024255.sam.s...,AML,62,32,30,0,17,16,0,0,0,0,0,0,0,,,
aml_aabspliced_ERR1024256.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024256.sam.s...,AML,0,0,0,0,0,0,0,0,0,0,0,0,0,,,
aml_aabspliced_ERR1024257.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024257.sam.s...,AML,6,3,3,0,1,1,0,0,0,0,0,0,0,,,
aml_aabspliced_ERR1024258.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024258.sam.s...,AML,6,3,3,0,2,2,0,0,0,0,0,0,0,,,


In [341]:
df['alpha_wt_pos25_pileup'] = df.apply(get_bases_at_pos, axis=1, args=['Alpha_GEX_64k_HEX', 25])
df['alpha_dp_pos25_pileup'] = df.apply(get_bases_at_pos, axis=1, args=['Alpha_GEX_79k_dup_FAM', 25])
df['beta_pos25_pileup'] = df.apply(get_bases_at_pos, axis=1, args=['BETA_new_GEX_FAM', 25])

In [342]:
df['alpha_wt_pos40_pileup'] = df.apply(get_bases_at_pos, axis=1, args=['Alpha_GEX_64k_HEX', 40])
df['alpha_dp_pos40_pileup'] = df.apply(get_bases_at_pos, axis=1, args=['Alpha_GEX_79k_dup_FAM', 40])
df['beta_pos40_pileup'] = df.apply(get_bases_at_pos, axis=1, args=['BETA_new_GEX_FAM', 40])

In [343]:
df['alpha_wt_pos41_pileup'] = df.apply(get_bases_at_pos, axis=1, args=['Alpha_GEX_64k_HEX', 41])
df['alpha_dp_pos41_pileup'] = df.apply(get_bases_at_pos, axis=1, args=['Alpha_GEX_79k_dup_FAM', 41])
df['beta_pos41_pileup'] = df.apply(get_bases_at_pos, axis=1, args=['BETA_new_GEX_FAM', 41])

In [344]:
df.head(50)

Unnamed: 0,file_location,source,alignment_count,alpha_wt_count,alpha_dup_count,beta_count,alpha_wt_zero_edit_count,alpha_dup_zero_edit_count,beta_zero_edit_count,alpha_read_covers_snps_count,...,beta_pos3_pileup,alpha_wt_pos25_pileup,alpha_dp_pos25_pileup,beta_pos25_pileup,alpha_wt_pos40_pileup,alpha_dp_pos40_pileup,beta_pos40_pileup,alpha_wt_pos41_pileup,alpha_dp_pos41_pileup,beta_pos41_pileup
aml_aabspliced_ERR1024254.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024254.sam.s...,AML,74,39,35,0,21,19,0,0,...,,TT,,,GGGGGG,,,CCCCCC,,
aml_aabspliced_ERR1024255.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024255.sam.s...,AML,62,32,30,0,17,16,0,0,...,,T,,,GGGGG,,,CCCCC,,
aml_aabspliced_ERR1024256.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024256.sam.s...,AML,0,0,0,0,0,0,0,0,...,,,,,,,,,,
aml_aabspliced_ERR1024257.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024257.sam.s...,AML,6,3,3,0,1,1,0,0,...,,,,,,,,,,
aml_aabspliced_ERR1024258.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024258.sam.s...,AML,6,3,3,0,2,2,0,0,...,,,,,,,,,,
aml_aabspliced_ERR1024259.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024259.sam.s...,AML,8,4,4,0,2,2,0,0,...,,,,,,,,,,
aml_aabspliced_ERR1024260.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024260.sam.s...,AML,2,1,1,0,1,1,0,0,...,,,,,,,,,,
aml_aabspliced_ERR1024261.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024261.sam.s...,AML,6,3,3,0,2,2,0,0,...,,,,,,,,,,
aml_aabspliced_ERR1024262.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024262.sam.s...,AML,6,4,2,0,1,1,0,0,...,,T,,,G,,,C,,
aml_aabspliced_ERR1024263.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024263.sam.s...,AML,10,5,5,0,4,4,0,0,...,,,,,,,,,,


In [345]:
df[['file_location', 'source','alpha_wt_pos3_pileup',
       'alpha_dp_pos3_pileup', 'alpha_wt_pos25_pileup',
       'alpha_dp_pos25_pileup',  'alpha_wt_pos40_pileup',
       'alpha_dp_pos40_pileup',  'alpha_wt_pos41_pileup',
       'alpha_dp_pos41_pileup']].head(50)

Unnamed: 0,file_location,source,alpha_wt_pos3_pileup,alpha_dp_pos3_pileup,alpha_wt_pos25_pileup,alpha_dp_pos25_pileup,alpha_wt_pos40_pileup,alpha_dp_pos40_pileup,alpha_wt_pos41_pileup,alpha_dp_pos41_pileup
aml_aabspliced_ERR1024254.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024254.sam.s...,AML,,,TT,,GGGGGG,,CCCCCC,
aml_aabspliced_ERR1024255.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024255.sam.s...,AML,,,T,,GGGGG,,CCCCC,
aml_aabspliced_ERR1024256.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024256.sam.s...,AML,,,,,,,,
aml_aabspliced_ERR1024257.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024257.sam.s...,AML,,,,,,,,
aml_aabspliced_ERR1024258.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024258.sam.s...,AML,,,,,,,,
aml_aabspliced_ERR1024259.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024259.sam.s...,AML,,,,,,,,
aml_aabspliced_ERR1024260.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024260.sam.s...,AML,,,,,,,,
aml_aabspliced_ERR1024261.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024261.sam.s...,AML,,,,,,,,
aml_aabspliced_ERR1024262.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024262.sam.s...,AML,,,T,,G,,C,
aml_aabspliced_ERR1024263.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024263.sam.s...,AML,,,,,,,,


In [346]:
def combine_pileups(df,columns):
    
    final_string = ''
    
    for column in columns:
        
        if df[column] is not None:
        
            final_string = final_string + df[column]
        
    return final_string

In [347]:
df['pos3_pileup'] = df.apply(combine_pileups, axis=1, args=[['alpha_wt_pos3_pileup','alpha_dp_pos3_pileup']])
df['pos25_pileup'] = df.apply(combine_pileups, axis=1, args=[['alpha_wt_pos25_pileup','alpha_dp_pos25_pileup']])
df['pos40_pileup'] = df.apply(combine_pileups, axis=1, args=[['alpha_wt_pos40_pileup','alpha_dp_pos40_pileup']])
df['pos41_pileup'] = df.apply(combine_pileups, axis=1, args=[['alpha_wt_pos41_pileup','alpha_wt_pos41_pileup']])

In [348]:
df[['file_location', 'source','pos3_pileup','pos25_pileup','pos40_pileup', 'pos41_pileup']].tail()

Unnamed: 0,file_location,source,pos3_pileup,pos25_pileup,pos40_pileup,pos41_pileup
aml_aabspliced_SRR949125.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_SRR949125.sam.so...,AML,GGGGG,TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT,GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...
aml_aabspliced_SRR949126.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_SRR949126.sam.so...,AML,GGGG,TTTTTGTTTTTTTTTTTTTTTTTTTTTTTTTTTTT,GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...
aml_aabspliced_SRR949127.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_SRR949127.sam.so...,AML,,TTTTTTTTT,GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...
aml_aabspliced_SRR949128.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_SRR949128.sam.so...,AML,,TTTTTTTTTTT,GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG...,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...
aml_aabspliced_SRR949129.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_SRR949129.sam.so...,AML,G,TTTT,GGGGGGGGGGGGGGNGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...


In [364]:
def max_base(df, column):
    
    base_dict = {}
    
    for base in df[column]:
        
        if base not in base_dict:
            
            base_dict[base]=1
            
        else:
            
            base_dict[base] = base_dict[base] +1
            
    total_count = sum(base_dict.values()
            
    return base_dict

In [365]:
df['pos3_call'] = df.apply(max_base, axis=1, args=['pos3_pileup'])
df['pos25_call'] = df.apply(max_base, axis=1, args=['pos25_pileup'])
df['pos40_call'] = df.apply(max_base, axis=1, args=['pos40_pileup'])
df['pos41_call'] = df.apply(max_base, axis=1, args=['pos41_pileup'])

In [366]:
df[['file_location', 'source','pos3_call','pos25_call','pos40_call','pos41_call']].head()

Unnamed: 0,file_location,source,pos3_call,pos25_call,pos40_call,pos41_call
aml_aabspliced_ERR1024254.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024254.sam.s...,AML,0,2,6,12
aml_aabspliced_ERR1024255.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024255.sam.s...,AML,0,1,5,10
aml_aabspliced_ERR1024256.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024256.sam.s...,AML,0,0,0,0
aml_aabspliced_ERR1024257.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024257.sam.s...,AML,0,0,0,0
aml_aabspliced_ERR1024258.sam.sorted.bam.filtered.bam,../aml_results/aml_aabspliced_ERR1024258.sam.s...,AML,0,0,0,0
