# A pipeline to search for the presence of a listed set of non-synonymous mutaitons in the reads of a given sorted `.bam` file

In [215]:
import os
import subprocess

import numpy as np
import pandas as pd

from collections import Counter
import re

import ConfigParser

## I. User inputs

Later, this will be: (1) a sorted `.bam` file and (2) an `.ini` file containing mutation information (3) a downsamlping flag.

In [216]:
input_bam = "results/ont/uk/2019-03-20/BC03.pf.bam"
input_path = os.path.dirname(input_bam)

In [217]:
gene_ini = "data/resources/pf-regions/kelch13.ini"

In [218]:
config = ConfigParser.ConfigParser()
config.read(gene_ini)

['data/resources/pf-regions/kelch13.ini']

In [219]:
gene_dt = {}
gene_dt["genome"] = config.get("Location", "genome")
gene_dt["chromosome"] = config.get("Location", "chromosome")
gene_dt["start"] = config.getint("Location", "start")
gene_dt["end"] = config.getint("Location", "end")
gene_dt["strand"] = config.get("Location", "strand")

In [220]:
downsample = True

In [221]:
# Output directory
output_path = input_path.replace("results", "analysis")
if not os.path.isdir(output_path):
    os.mkdir(output_path)

In [222]:
print input_path
print output_path

results/ont/uk/2019-03-20
analysis/ont/uk/2019-03-20


## II. Execute Downsampling

Useful in cases where there is more read data than necessary -- or when looking to speed up analysis (e.g. beta-testing).

In [223]:
if downsample:
    print "Downsampling from SAM file..."
    n_reads = 1000
    
    # You have to downsample from the `.sam` file
    input_sam = input_bam.replace("bam", "sam")
    dwn_sam = os.path.join(output_path, os.path.basename(input_sam))
    dwn_bam = dwn_sam.replace("sam", "bam")
    dwn_sorted_bam = dwn_bam.replace("bam", "sorted.bam")
    
    # Downsample by shuffling lines, first extract header
    # Get header
    os.system("grep '^@' %s > %s" % (input_sam, dwn_sam))

    # Downsample
    os.system("sed '/^@/d' %s > no_header.tmp.bam" % input_sam)
    os.system("gshuf -n %d no_header.tmp.bam >> %s" % (n_reads, dwn_sam))
    os.system("rm no_header.tmp.bam")
    
    # Prepare file for pileup
    print "Converting to BAM..."
    os.system("samtools view -S -b %s > %s" % (dwn_sam, dwn_bam))
    print "Sorting BAM..."
    os.system("samtools sort %s -o %s" % (dwn_sam, dwn_sorted_bam))
    print "Indexing BAM..."
    os.system("samtools index %s" % (dwn_sorted_bam))
    print "Done."
    
    pileup_bam = dwn_sorted_bam
else:
    pileup_bam = input_bam

Downsampling from SAM file...
Converting to BAM...
Sorting BAM...
Indexing BAM...
Done.


## III. Excute `samtools mpileup`

In [102]:
pileup_path = pileup_bam.replace("sorted.bam", "pileup")
position = gene_dt["chromosome"] + ":" + str(gene_dt["start"]) + "-" + str(gene_dt["end"])

cmd = "samtools mpileup -f %s -r %s -Q 0 -aa -B %s > %s" % (gene_dt["genome"], position, 
                                                            pileup_bam, pileup_path)
print "Generating pileup..."
print cmd
os.system(cmd)
print "Done."

Generating pileup...
samtools mpileup -f data/resources/plasmodb/39/PlasmoDB-39_Pfalciparum3D7_Genome.fasta -r Pf3D7_13_v3:1724817-1726997 -Q 0 -aa -B analysis/ont/uk/2019-03-20/BC03.pf.sorted.bam > analysis/ont/uk/2019-03-20/BC03.pf.pileup
Done.


## IV. Search for Mutations

In [104]:
# Make necessary prepartions if the gene is on the reverse strand
if gene_dt['strand'] == 'reverse':
    print "Gene is on reverse strand, inverting pileup."
    print "Note: still need to perform reverse complementation."
    pileup_path_reverse = pileup_path.replace("pileup", "reverse.pileup")
    os.system('tail -r %s > %s' % (pileup_path, pileup_path_reverse))
    pileup_path = pileup_path_reverse
else:
    print "Gene is on forward strand."

Gene is on reverse strand, inverting pileup.
Note: still need to perform reverse complementation.


In [211]:
mutations = config.get("Mutations", "listed").split(", ")
mutations
n_mutations = len(mutations)

mutation_dt = {
    "mutation": [],
    "detected": [],
    "total_count": [],
    "major_amino": [],
    "major_count": [],
    "mutation_amino": [],
    "mutation_count": [],
    "n_aminos": [],
}

In [212]:
for mutation in mutations:
    
    # Parse Mutation Information
    print("====================================================================================================")
    print("Searching for...")
    print("  Mutation:", mutation)
    codon = int(mutation[1:-1])
    codon_nts = np.arange(3*(codon - 1), 3*codon)
    print("  Codon:", codon)
    print("  Corresponding bases:", codon_nts)
    amino_alt = mutation[-1]
    
    with open(pileup_path, "r") as fn:
        
        # Extract pileups corresponding to codon position
        codon_ref_nts = []
        codon_pileup = []
        for i, line in enumerate(fn):
            if i in codon_nts:
                
                # Extract pileup information
                chrom, pos, ref, coverage, pileup, _ = line.split("\t")
                
                # Convert the pileup string to A, T, C, G, +, -
                processed_pileup = process_pileup(pileup, ref)
                
                # Reverse complement if necessary
                if gene_dt["strand"] == 'reverse':
                    complement_map = {"A": "T", "T": "A", 
                                      "G": "C", "C": "G", 
                                      "-": "-", "+": "+" }
                    ref = complement_map[ref]
                    processed_pileup = "".join([complement_map[base] for base in processed_pileup])
                
                # Append 
                codon_ref_nts.append(ref)
                codon_pileup.append(processed_pileup)
                
        # Check complete pileup has been extracted
        assert len(codon_ref_nts) == 3
        assert len(codon_pileup) == 3
        
        # Pre-process from lists into strings
        codon_ref = "".join(codon_ref_nts)
        codon_pileup = ["".join(c) for c in zip(*codon_pileup)]
        
        # Get frequencies of codons (i.e. nucleotide level)
        codon_frequencies = Counter(codon_pileup)
        major_codon, major_codon_count = codon_frequencies.most_common(1)[0]
        ref_codon_count = codon_frequencies[codon_ref]
        total_codon_count = sum(codon_frequencies.values())
        
        print("Discovered...")
        print("  Reference codon (from 3D7):", codon_ref)
        print("  Majority codon (from pileup):", major_codon)
        print("  Number of unique codons discovered (including indels):", len(codon_frequencies))
        print("  Reference codon count:", ref_codon_count)
        print("  Majority codon count:", major_codon_count)
        print("  Total codon count:", total_codon_count)
        print("")
        
        # Get frequencies of amino acids
        amino_ref = codon_to_amino(codon_ref, genetic_code)
        amino_frequencies = Counter([codon_to_amino(c, genetic_code) for c in codon_pileup])
        # next line removes indels which have prevented making amino acid calls
        amino_frequencies = Counter(dict([(k, v) for k, v in amino_frequencies.iteritems() if k != None]))
        major_amino, major_amino_count = amino_frequencies.most_common(1)[0]
        ref_amino_count = amino_frequencies[amino_ref]
        total_amino_count = sum(amino_frequencies.values())
        
        print("  Reference amino (from 3D7):", amino_ref)
        print("  Majority amino (from pileup):", major_amino)
        print("  Number of unique aminos discovered (including indels):", len(amino_frequencies))
        print("  Reference amino count:", ref_amino_count)
        print("  Majority amino count:", major_amino_count)
        print("  Total amino count:", total_amino_count)
            
        # Finally, check for non-synonymous change of interest
        if amino_alt in amino_frequencies.keys():
            mutation_detected = True
            mutation_count = amino_frequencies[amino_alt]
        else:
            mutation_detected = False
            mutation_count = 0
            
        print("Mutation detected?:", mutation_detected)
        print("Mutation count:", mutation_count)
        print("Percent of total: %.02f%%" % (100*float(mutation_count)/total_amino_count))
        
        
        # Store output
        mutation_dt["mutation"].append(mutation)
        mutation_dt["detected"].append(mutation_detected)
        mutation_dt["total_count"].append(total_amino_count)
        mutation_dt["major_amino"].append(major_amino)
        mutation_dt["major_count"].append(major_amino_count)
        mutation_dt["mutation_amino"].append(amino_alt)
        mutation_dt["mutation_count"].append(mutation_count)
        mutation_dt["n_aminos"].append(len(amino_frequencies))
        
        print("====================================================================================================")

Searching for...
  Mutation: N458Y
  Codon: 458
  Corresponding bases: [1371 1372 1373]
Discovered...
  Reference codon (from 3D7): AAT
  Majority codon (from pileup): AAT
  Number of unique codons discovered (including indels): 21
  Reference codon count: 409
  Majority codon count: 409
  Total codon count: 495

  Reference amino (from 3D7): N
  Majority amino (from pileup): N
  Number of unique aminos discovered (including indels): 8
  Reference amino count: 410
  Majority amino count: 410
  Total amino count: 428
Mutation detected?: True
Mutation count: 1
Percent of total: 0.23%
Searching for...
  Mutation: Y493H
  Codon: 493
  Corresponding bases: [1476 1477 1478]
Discovered...
  Reference codon (from 3D7): TAC
  Majority codon (from pileup): TAC
  Number of unique codons discovered (including indels): 32
  Reference codon count: 339
  Majority codon count: 339
  Total codon count: 504

  Reference amino (from 3D7): Y
  Majority amino (from pileup): Y
  Number of unique aminos disc

KeyError: "['major_amino_count'] not in index"

In [213]:
mutation_df = pd.DataFrame(mutation_dt)
mutation_df = mutation_df[["mutation", "detected", 
                           "total_count",
                           "major_amino", "major_count",
                           "mutation_amino", "mutation_count",
                           "n_aminos"]]

In [214]:
mutation_df

Unnamed: 0,mutation,detected,total_count,major_amino,major_count,mutation_amino,mutation_count,n_aminos
0,N458Y,True,428,N,410,Y,1,8
1,Y493H,True,403,Y,353,H,4,8
2,R539T,True,382,R,364,T,1,5
3,I543T,True,396,I,376,T,5,7
4,R561H,True,417,R,386,H,17,7
5,C580Y,True,439,C,412,Y,10,5
6,P441L,True,440,P,429,L,5,4


## -. Library

In [194]:
def process_pileup(pileup, ref):
    """
    Process a read pileup from 
    `samtools mpileup` into
    A,T,C,G,+,-
    
    Note, the solution is ugly, but
    re.sub(\+([0-9])[ATCGatcg]+, ... fails when a variant
    follows directly after an insertion deltion.
    
    """
    
    n_pileup = len(pileup)
    processed_pileup = ''
    i = 0
    while i < n_pileup:
        p = pileup[i]
        if p in ["+", "-"]:
            size = int(pileup[i + 1])
            j = size + 2
            processed_pileup += p
        elif p == "*":
            j = 1
            processed_pileup += "-"
        else:
            j = 1
            processed_pileup += p
        i += j
    
    processed_pileup = re.sub("\$|\^\]", "", processed_pileup)
    processed_pileup = re.sub("\.|,", ref, processed_pileup)
    processed_pileup = processed_pileup.upper()
    
    return processed_pileup

In [165]:
genetic_code = { 
        'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M', 
        'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', 
        'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K', 
        'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',                  
        'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 
        'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 
        'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q', 
        'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R', 
        'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 
        'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A', 
        'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', 
        'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 
        'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 
        'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L', 
        'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_', 
        'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W', 
    }

In [166]:
def codon_to_amino(codon, genetic_code):
    if codon in genetic_code.keys():
        amino = genetic_code[codon]
    else:
        amino = None
    return amino