In [14]:
# MMPP: Mobile Malaria Project Pipelines
# ----------------------------------------
# Search for the presence of 
# a listed set of non-synonymous mutations
# in a specified gene 
# amongst the reads of a sorted `.bam` file
# ----------------------------------------
# JHendry, 2019/03/27

In [15]:
import configparser
import getopt
import os
import subprocess
import sys
from collections import Counter

import numpy as np
import pandas as pd

from lib.mutation import *

## User Inputs

In [16]:
# Sorted BAM file whose reads are searched for mutations.
input_bam = "data/ont/uk/2019-03-28/BC08.sorted.bam"
# INI file with a "Location" section (gene name, genome, chromosome, start,
# end, strand) and a "Mutations" section (the listed mutations).
gene_ini = "data/resources/pf-regions/kelch13.ini"
# A non-synonymous codon is stored only when its share of the indel-free codon
# observations at that position is strictly greater than this value.
min_freq = 0.1
# NOTE(review): only echoed in the run summary in the cells visible here;
# presumably meant to switch on BAM downsampling -- confirm.
downsample = False

In [17]:
# Derive the output directory from the directory holding the input BAM by
# substituting "analysis" for "data".
# NOTE(review): str.replace substitutes *every* occurrence of "data" in the
# path, not just the leading directory -- confirm that is acceptable for the
# paths in use.
input_path = os.path.dirname(input_bam)
output_path = input_path.replace("data", "analysis")
# os.makedirs (rather than os.mkdir) also creates any missing intermediate
# directories, and exist_ok=True keeps the cell safe to re-run.
os.makedirs(output_path, exist_ok=True)

In [18]:
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; fail here with a clear message instead of a
# later, misleading NoSectionError.
if not config.read(gene_ini):
    raise FileNotFoundError("Could not read gene configuration file: %s" % gene_ini)

# hold gene location information
gene_dt = {
    "name": config.get("Location", "name"),
    "genome": config.get("Location", "genome"),
    "chromosome": config.get("Location", "chromosome"),
    "start": config.getint("Location", "start"),
    "end": config.getint("Location", "end"),
    "strand": config.get("Location", "strand"),
}

# hold mutation information
# Split on bare commas and strip whitespace so that "A, B", "A,B" and
# multi-line lists all parse the same way; empty entries are dropped so an
# empty list yields zero mutations rather than one empty string.
mutations = [
    entry.strip()
    for entry in config.get("Mutations", "listed").split(",")
    if entry.strip()
]
n_mutations = len(mutations)

In [19]:
# Run summary banner: gene location, number of listed mutations, and the
# input/output locations used by this run.
heavy_rule = "================================================================================"
light_rule = "--------------------------------------------------------------------------------"

print(heavy_rule)
print("MMP Mutation Search Pipeline")
print(light_rule)
# Gene location fields, printed in the order they appear in the banner
for banner_label, banner_key in (
    ("Gene:", "name"),
    ("Chromosome:", "chromosome"),
    ("Start:", "start"),
    ("End:", "end"),
    ("Strand:", "strand"),
):
    print(banner_label, gene_dt[banner_key])
print("")
print("Searching for %d mutations." % n_mutations)
print("")
for banner_label, banner_value in (
    ("Input BAM:", input_bam),
    ("Output path:", output_path),
    ("Reference genome:", gene_dt["genome"]),
):
    print(banner_label, banner_value)
print("")
print("Downsampling?", downsample)
print(heavy_rule)

MMP Mutation Search Pipeline
--------------------------------------------------------------------------------
Gene: KELCH13
Chromosome: Pf3D7_13_v3
Start: 1724817
End: 1726997
Strand: reverse

Searching for 31 mutations.

Input BAM: data/ont/uk/2019-03-28/BC08.sorted.bam
Output path: analysis/ont/uk/2019-03-28
Reference genome: data/resources/plasmodb/39/PlasmoDB-39_Pfalciparum3D7_Genome.fasta

Downsampling? False


In [20]:
# BAM file handed to samtools for the pileup; here it is simply the input BAM.
# NOTE(review): `downsample` is not consulted at this point -- presumably a
# downsampled BAM would be substituted here when it is enabled; confirm.
pileup_bam = input_bam

In [21]:
# Generate read pileup using samtools
# The pileup is written next to the BAM, with "pileup" replacing "sorted.bam".
pileup_path = pileup_bam.replace("sorted.bam", "pileup")
if pileup_path == pileup_bam:
    # Without this guard the pileup output would overwrite the input BAM.
    raise ValueError("Expected the BAM file name to contain 'sorted.bam': %s" % pileup_bam)
position = gene_dt["chromosome"] + ":" + str(gene_dt["start"]) + "-" + str(gene_dt["end"])

# Shell-style rendering of the command, kept for display only.
cmd = "samtools mpileup -f %s -r %s -Q 0 -aa -B %s > %s" % (gene_dt["genome"], 
                                                            position, 
                                                            pileup_bam, 
                                                            pileup_path)
# The same command as an argument list: running it without a shell means paths
# containing spaces or shell metacharacters are passed through untouched.
samtools_args = ["samtools", "mpileup",
                 "-f", gene_dt["genome"],
                 "-r", position,
                 "-Q", "0", "-aa", "-B",
                 pileup_bam]
print("Generating pileup...")
print(cmd)
with open(pileup_path, "w") as pileup_file:
    # check=True raises CalledProcessError on a non-zero exit status, so
    # "Done." is only printed when samtools actually succeeded.
    subprocess.run(samtools_args, stdout=pileup_file, check=True)
print("Done.")

Generating pileup...
samtools mpileup -f data/resources/plasmodb/39/PlasmoDB-39_Pfalciparum3D7_Genome.fasta -r Pf3D7_13_v3:1724817-1726997 -Q 0 -aa -B data/ont/uk/2019-03-28/BC08.sorted.bam > data/ont/uk/2019-03-28/BC08.pileup
Done.


In [22]:
# Make necessary preparations if the gene is on the reverse strand
if gene_dt['strand'] == 'reverse':
    print("Gene is on reverse strand, inverting pileup.")
    print("Note: still need to perform reverse complementation.")
    if pileup_path.endswith("reverse.pileup"):
        # pileup_path already names the inverted file (this cell was re-run);
        # inverting again would restore forward order under a new name.
        pileup_path_reverse = pileup_path
    else:
        # Rewrite only the last "pileup" in the path so that directory names
        # containing "pileup" are left alone.
        pileup_path_reverse = "reverse.pileup".join(pileup_path.rsplit("pileup", 1))
        if pileup_path_reverse == pileup_path:
            # Guard against writing the inverted pileup over its own input.
            raise ValueError("Expected the pileup path to contain 'pileup': %s" % pileup_path)
        # Reverse the line order in Python instead of shelling out to
        # `tail -r`, which is BSD-only and not available in GNU coreutils.
        with open(pileup_path, "r") as forward_file:
            pileup_lines = forward_file.readlines()
        # Make sure the final line is newline-terminated so it does not fuse
        # with its neighbour once it becomes the first line.
        pileup_lines = [ln if ln.endswith("\n") else ln + "\n" for ln in pileup_lines]
        with open(pileup_path_reverse, "w") as reverse_file:
            reverse_file.writelines(reversed(pileup_lines))
    pileup_path = pileup_path_reverse
else:
    print("Gene is on forward strand.")

Gene is on reverse strand, inverting pileup.
Note: still need to perform reverse complementation.


In [23]:
# Search for mutations
# Column-oriented store for the results: one initially empty list per output
# column, keyed by column name, in the order listed here.
mutation_fields = [
    "position",
    "total_counts",
    "total_counts_noindel",
    "ref_codon",
    "ref_amino",
    "ref_freq",
    "major_codon",
    "major_amino",
    "major_freq",
    "n_mutation_types",
    "mutation_codon",
    "mutation_amino",
    "mutation_freq",
]
# Each key gets its own list object (a comprehension, not dict.fromkeys).
mutation_dt = {field: [] for field in mutation_fields}

In [24]:
# Scan the pileup codon by codon. Each pileup line is one reference position;
# three consecutive lines are grouped into a codon, the per-read codons are
# tallied, and every non-synonymous codon whose share of the indel-free
# observations exceeds `min_freq` is appended to `mutation_dt`.
with open(pileup_path, "r") as fn:
    
    # want to loop until you have a full codon
    nts = 0
    codon_position = 0
    codon_ref_nts = []
    codon_pileup = []
    for line in fn:
        nts += 1
        chrom, pos, ref, coverage, pileup, _ = line.split("\t")
        processed_pileup = process_pileup(pileup, ref)
        if gene_dt["strand"] == 'reverse':
            # The pileup file was already inverted line-wise; complementing
            # each symbol completes the reverse complementation.
            try:
                ref = complement_map[ref]
                processed_pileup = "".join([complement_map[base] for base in processed_pileup])
            except KeyError as err:
                # Re-raise with the position so the offending pileup line can
                # be found.
                # NOTE(review): the recorded run stopped here with
                # KeyError: '6' and the inspected pileup held multi-digit
                # indel lengths (e.g. "-106..."); presumably process_pileup
                # leaves such digits behind -- confirm in lib.mutation.
                raise KeyError(
                    "Symbol %s at %s:%s is not in complement_map; "
                    "the processed pileup contains a character that cannot be complemented."
                    % (err, chrom, pos)
                ) from err
        codon_ref_nts.append(ref)
        codon_pileup.append(processed_pileup)

        if nts == 3:
            codon_position += 1
            print("--------------------------------------------------------------------------------")
            print("Checking Codon:", codon_position)
            # Search the codon for non-synonymous mutations above a specified frequency
            assert len(codon_ref_nts) == 3
            assert len(codon_pileup) == 3
            
            # Pre-process from lists into strings
            codon_ref = "".join(codon_ref_nts)
            amino_ref = codon_to_amino(codon_ref, genetic_code)
            # Combine the i-th symbol of each of the three positions into one
            # per-read codon. zip() stops at the shortest of the three strings.
            # NOTE(review): this assumes the i-th symbol at every position
            # belongs to the same read -- confirm this holds when coverage
            # differs between neighbouring positions.
            read_codons = ["".join(c) for c in zip(*codon_pileup)]
            
            # Get frequencies of codons (i.e. nucleotide level)
            codon_frequencies = Counter(read_codons)
            major_codon, major_codon_count = codon_frequencies.most_common(1)[0]
            ref_codon_count = codon_frequencies[codon_ref]
            total_codon_count = sum(codon_frequencies.values())

            print("Discovered...")
            print("  Total codon count (including indels):", total_codon_count)
            print("  Reference codon (from 3D7):", codon_ref)
            print("  Reference codon count:", ref_codon_count)
            print("  Majority codon (from pileup):", major_codon)
            print("  Majority codon count:", major_codon_count)
            print("  Number of unique codons discovered (including indels):", len(codon_frequencies))
            print("")

            # Restrict to codon observations without indels
            # - these can be used to look for non-synonymous changes
            codon_noindel_frequencies = Counter(dict([(c, v) for c, v in codon_frequencies.items() 
                                                      if not "+" in c and not "-" in c]))
            total_noindel_count = sum(codon_noindel_frequencies.values())

            # Now get non-synonymous frequencies
            codon_nonsyn_frequencies = Counter(dict([(c, v) for c, v in codon_noindel_frequencies.items()
                                                     if codon_to_amino(c, genetic_code) != amino_ref]))

            print("  Total codon count (excluding indels):", total_noindel_count)
            print("    Unique types:", len(codon_noindel_frequencies))
            print("    Non-synonymous:", len(codon_nonsyn_frequencies))
            print("")
            if len(codon_nonsyn_frequencies) > 0:
                highest_nonsyn_codon, highest_nonsyn_count = codon_nonsyn_frequencies.most_common(1)[0]
                print("    Highest frequency non-synonymous: %s = %s" 
                      % (highest_nonsyn_codon, codon_to_amino(highest_nonsyn_codon, genetic_code)))
                print("                        at frequency: %.05f" 
                      % (highest_nonsyn_count/total_noindel_count))
                
                # Finally, check if any of the non-synonymous mutations are above
                # set threshold
                above = Counter(dict([(c, v) for c, v in codon_nonsyn_frequencies.items() 
                      if v/total_noindel_count > min_freq]))
                if len(above) > 0:
                    n_above = len(above)
                    print("Detected %d non-synonymous mutations above frequency threshold of %.02f."
                          % (n_above, min_freq))
                    print("Storing results.")
                    
                    # One row per mutation above threshold; the codon-level
                    # fields are repeated on each row.
                    for mut_codon, mut_counts in above.items():
                        mutation_dt["position"].append(codon_position)
                        mutation_dt["total_counts"].append(total_codon_count)
                        mutation_dt["total_counts_noindel"].append(total_noindel_count)
                        
                        # NOTE(review): ref_freq is relative to all codon
                        # observations, whereas major_freq and mutation_freq
                        # are relative to the indel-free observations --
                        # confirm the mixed denominators are intended.
                        mutation_dt["ref_codon"].append(codon_ref)
                        mutation_dt["ref_amino"].append(amino_ref)
                        mutation_dt["ref_freq"].append(ref_codon_count/total_codon_count)
                        
                        mutation_dt["major_codon"].append(major_codon)
                        mutation_dt["major_amino"].append(codon_to_amino(major_codon, genetic_code))
                        mutation_dt["major_freq"].append(major_codon_count/total_noindel_count)
                        
                        mutation_dt["n_mutation_types"].append(len(codon_nonsyn_frequencies))
                        mutation_dt["mutation_codon"].append(mut_codon)
                        mutation_dt["mutation_amino"].append(codon_to_amino(mut_codon, genetic_code))
                        mutation_dt["mutation_freq"].append(mut_counts/total_noindel_count)

            else:
                print("    No non-synonymous mutations discovered.")
            print("")
            
            # Once mutation search is complete, reset codon and continue
            nts = 0
            codon_ref_nts = []
            codon_pileup = []

    # Positions left over after the last complete codon are not analysed;
    # say so rather than dropping them silently.
    if nts != 0:
        print("Warning: %d trailing position(s) did not form a complete codon and were ignored." % nts)

--------------------------------------------------------------------------------
Checking Codon: 1
Discovered...
  Total codon count (including indels): 373
  Reference codon (from 3D7): ATG
  Reference codon count: 291
  Majority codon (from pileup): ATG
  Majority codon count: 291
  Number of unique codons discovered (including indels): 18

  Total codon count (excluding indels): 304
    Unique types: 7
    Non-synonymous: 6

    Highest frequency non-synonymous: ACG = T
                        at frequency: 0.00987

--------------------------------------------------------------------------------
Checking Codon: 2
Discovered...
  Total codon count (including indels): 373
  Reference codon (from 3D7): GAA
  Reference codon count: 308
  Majority codon (from pileup): GAA
  Majority codon count: 308
  Number of unique codons discovered (including indels): 15

  Total codon count (excluding indels): 312
    Unique types: 5
    Non-synonymous: 3

    Highest frequency non-synonymous: GAT =

--------------------------------------------------------------------------------
Checking Codon: 45
Discovered...
  Total codon count (including indels): 371
  Reference codon (from 3D7): TTT
  Reference codon count: 307
  Majority codon (from pileup): TTT
  Majority codon count: 307
  Number of unique codons discovered (including indels): 15

  Total codon count (excluding indels): 313
    Unique types: 4
    Non-synonymous: 3

    Highest frequency non-synonymous: TCT = S
                        at frequency: 0.00639

--------------------------------------------------------------------------------
Checking Codon: 46
Discovered...
  Total codon count (including indels): 372
  Reference codon (from 3D7): ATG
  Reference codon count: 271
  Majority codon (from pileup): ATG
  Majority codon count: 271
  Number of unique codons discovered (including indels): 24

  Total codon count (excluding indels): 297
    Unique types: 9
    Non-synonymous: 8

    Highest frequency non-synonymous: ATA

KeyError: '6'

In [27]:
# Debug inspection: show the raw pileup string from the most recent line read
# by the codon scan.
# NOTE(review): the execution count of this cell is out of sequence with the
# cells around it, so the value shown depends on kernel state -- consider
# removing this cell once the failure above is resolved.
pileup

',.,,,..-1C.-1C,.......*..........-1C...-6CTAGTT....,,,,,,,,,,,......-1C...........*...,,,,,,,,,,,,*,,,,,,,,,,,,,........................*..*......**........,,,,,,,,,,,,,,,,,,,,,.........,,,,,,,............T........-1C.*......*....+1C.......G.....-1C.............,,-106ctagttagattcataaatgaattagaatcattctcgctactacttccgcttttatcatcactattgctgttaccaccagattccctatcatacgtcatagaaaaat,,,,,,,,....T,..-1C..,,....*.-1C....,,,..-1C,,,...,,,,,,,,.,..,,,.,,-1c,,,,,,..,,,-2ct,,.*.,,,,,,...,,*,,,,*,.,*,.,,..,,.,,*,,,,.-1C,,,-1c,T,,.,,.,*,,,,,.,'

In [None]:
# Turn the accumulated per-mutation records into a table, with the columns
# selected in a fixed order, then report that the scan has finished.
columns = [
    "position",
    "total_counts",
    "total_counts_noindel",
    "ref_codon",
    "ref_amino",
    "ref_freq",
    "major_codon",
    "major_amino",
    "major_freq",
    "n_mutation_types",
    "mutation_codon",
    "mutation_amino",
    "mutation_freq",
]
mutation_df = pd.DataFrame(mutation_dt)[columns]
for closing_line in (
    "--------------------------------------------------------------------------------",
    "Mutation scan complete.",
    "================================================================================",
):
    print(closing_line)