# Exploratory Data Analysis

First pass at analysing the output of blasting AML RNA-Seq data against the three tryptase sequences from Jonathon.


In [1]:
import pysam
import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
def get_files(folder):
    
    return glob.glob('{folder}/*sorted.bam'.format(folder=folder))

In [3]:
list_of_control_results = get_files('../control_results/')

In [4]:
control_file_names = [x.split('/')[len(x.split('/'))-1] for x in list_of_control_results]

In [5]:
df_control = pd.DataFrame(index=control_file_names)

In [6]:
df_control['file_location'] = list_of_control_results

In [7]:
df_control['source'] = 'control'

In [8]:
df_control.head()

Unnamed: 0,file_location,source


In [9]:
df = df_control

In [10]:
df.count()

file_location    0
source           0
dtype: int64

In [11]:
def get_read_count(df):

    return int(pysam.view("-c", df['file_location']))

In [12]:
df['alignment_count'] = df.apply(get_read_count, axis=1)

ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series

In [None]:
df.head()

In [None]:
def get_transcript_count(df, transcript_name):
    
    """
    Return the count of the transcript i.e how many of each of the three are in there.
    Note - Does not look at the quality of the alignment.
    
    """
    
    sam_file_location = df['file_location']
    
    samfile = pysam.AlignmentFile(sam_file_location, "rb")
    
    count = 0
    
    for read in samfile:
        
        if read is not None and read.reference_name == transcript_name:
            
            count = count + 1
            
    return count

In [None]:
df['alpha_wt_count'] = df.apply(get_transcript_count, axis=1, args=['Alpha_GEX_64k_HEX'])

In [None]:
df['alpha_dup_count'] = df.apply(get_transcript_count, axis=1, args=['Alpha_GEX_79k_dup_FAM'])
df['beta_count'] = df.apply(get_transcript_count, axis=1, args=['BETA_new_GEX_FAM'])

In [None]:
df.head()

In [None]:
def get_zero_edit_distance_count(df, transcript_name):
    
    """
    For a goven transcript e.g. Alpha_GEX_64k_HEX get how many of the matches have an NM tag of zero
    i.e. the match was exact.
    
    """
    
    sam_file_location = df['file_location']
    
    samfile = pysam.AlignmentFile(sam_file_location, "rb")
    
    count = 0
    
    for read in samfile:
        
        if read is not None and read.reference_name == transcript_name:
            
            edit_distance = read.get_tag('NM')
            
            if edit_distance == 0:
                
                count = count + 1
            
    return count

In [None]:
df['alpha_wt_zero_edit_count'] = df.apply(get_zero_edit_distance_count, axis=1, args=['Alpha_GEX_64k_HEX'])
df['alpha_dup_zero_edit_count'] = df.apply(get_zero_edit_distance_count, axis=1, args=['Alpha_GEX_79k_dup_FAM'])
df['beta_zero_edit_count'] = df.apply(get_zero_edit_distance_count, axis=1, args=['BETA_new_GEX_FAM'])

In [None]:
df.head()

In [None]:
def get_transcript_read_count_filtered(df, transcript_name, start, end):
    
    """
    Count hits which cover the bit of the reference we are interested in
    
    """
    
    sam_file_location = df['file_location']
    
    samfile = pysam.AlignmentFile(sam_file_location, "rb")
    
    iter = samfile.fetch(transcript_name, start, end)
    
    count =0
    
    for read in iter:
        
        if read.reference_start <=start and read.reference_end >= end:
            
            count = count +1
            
    return count

In [None]:
df['alpha_read_covers_snps_count'] = df.apply(get_transcript_read_count_filtered,
                                              axis=1,
                                              args=['Alpha_GEX_64k_HEX', 25,45])

In [None]:
df['alpha_dup_read_covers_snps_count'] = df.apply(get_transcript_read_count_filtered,
                                              axis=1,
                                              args=['Alpha_GEX_79k_dup_FAM', 25,45])

df['beta_read_covers_snps_count'] = df.apply(get_transcript_read_count_filtered,
                                              axis=1,
                                              args=['BETA_new_GEX_FAM', 25,45])

In [None]:
df.sort_values(by='alpha_dup_read_covers_snps_count', ascending=False).head()

In [None]:
def get_transcript_read_count_filtered_exact(df, transcript_name, start, end):
    
    """
    Count hits which cover the bit of the reference we are interested in and which are exact.
    
    That is do they cross position 0 - 44 of the transcript ( given by transcipt_name) and \
    have an edit distance from the reference of 0. 
    
    Reads which cross this location will cross all four SNPs needed to tell the 3 transcripts apart.
    
    """
    
    sam_file_location = df['file_location']
    
    samfile = pysam.AlignmentFile(sam_file_location, "rb")
    
    iter = samfile.fetch(transcript_name, start, end)
    
    count =0
    
    for read in iter:
        
        if read.reference_start <=start and read.reference_end >=end:
            
            edit_distance = read.get_tag('NM')
            
            if edit_distance == 0:
                
                count = count + 1
                
    return count

In [None]:
df['alpha_read_covers_snps_count_exact'] = df.apply(get_transcript_read_count_filtered_exact,
                                              axis=1,
                                              args=['Alpha_GEX_64k_HEX', 25,45])

df['alpha_dup_read_covers_snps_count_exact'] = df.apply(get_transcript_read_count_filtered_exact,
                                              axis=1,
                                              args=['Alpha_GEX_79k_dup_FAM', 25,45])

df['beta_read_covers_snps_count_exact'] = df.apply(get_transcript_read_count_filtered_exact,
                                              axis=1,
                                              args=['BETA_new_GEX_FAM', 25,45])

In [None]:
# Quick check to see if any such reads exist

#df.sort_values(by='alpha_dup_read_covers_snps_count_exact', ascending=False).head()

df.head()

## Sanity Check

Print out some of the reads. I then did a manual BLAST and alignment to check the code was working as expected

In [None]:
def get_transcript_read_count_filtered_exact_print(sam_file_location, transcript_name, start, end):
    
    """
    Same as get_transcript_read_count_filtered_exact() except we just print out the reads instead of \
    counting them.
    
    
    """
    
    samfile = pysam.AlignmentFile(sam_file_location, "rb")
    
    iter = samfile.fetch(transcript_name, start, end)
    
    for read in iter:
        
        if read.reference_start <=start and read.reference_end >= end:
            
            edit_distance = read.get_tag('NM')
            
            if edit_distance == 0:
                
                print (read)

In [None]:
#Pick a random bam file with some read alignments in this area

get_transcript_read_count_filtered_exact_print('../aml_results/aml_aabspliced_SRR5626188.sam.sorted.bam',
                                               'Alpha_GEX_64k_HEX',
                                               25,
                                               45 )

In [None]:
df.describe()

In [None]:
df.to_csv('data.control.3snps.csv')