In [14]:
# MMPP: Mobile Malaria Project Pipelines
# ----------------------------------------
# Search for the presence of 
# a listed set of non-synonymous mutations
# in a specified gene 
# amongst the reads of a sorted `.bam` file
# ----------------------------------------
# JHendry, 2019/03/27

In [15]:
import os
import sys
import configparser
import getopt
import numpy as np
import pandas as pd
from collections import Counter
from lib.mutation import *

## User Inputs

In [16]:
input_bam = "data/ont/uk/2019-03-28/BC08.sorted.bam"
gene_ini = "data/resources/pf-regions/kelch13.ini"
min_freq = 0.1
downsample = False

In [17]:
input_path = os.path.dirname(input_bam)
output_path = input_path.replace("data", "analysis")
if not os.path.isdir(output_path):
    os.mkdir(output_path)

In [18]:
config = configparser.ConfigParser()
config.read(gene_ini)

# hold gene location information
gene_dt = {}
gene_dt["name"] = config.get("Location", "name")
gene_dt["genome"] = config.get("Location", "genome")
gene_dt["chromosome"] = config.get("Location", "chromosome")
gene_dt["start"] = config.getint("Location", "start")
gene_dt["end"] = config.getint("Location", "end")
gene_dt["strand"] = config.get("Location", "strand")
     
# hold mutation information
mutations = config.get("Mutations", "listed").split(", ")
n_mutations = len(mutations)

In [19]:
print("================================================================================")
print("MMP Mutation Search Pipeline")
print("--------------------------------------------------------------------------------")
print("Gene:", gene_dt["name"])
print("Chromosome:", gene_dt["chromosome"])
print("Start:", gene_dt["start"])
print("End:", gene_dt["end"])
print("Strand:", gene_dt["strand"])
print("")
print("Searching for %d mutations." % n_mutations)
print("")
print("Input BAM:", input_bam)
print("Output path:", output_path)
print("Reference genome:", gene_dt["genome"])
print("")
print("Downsampling?", downsample)
print("================================================================================")

MMP Mutation Search Pipeline
--------------------------------------------------------------------------------
Gene: KELCH13
Chromosome: Pf3D7_13_v3
Start: 1724817
End: 1726997
Strand: reverse

Searching for 31 mutations.

Input BAM: data/ont/uk/2019-03-28/BC08.sorted.bam
Output path: analysis/ont/uk/2019-03-28
Reference genome: data/resources/plasmodb/39/PlasmoDB-39_Pfalciparum3D7_Genome.fasta

Downsampling? False


In [20]:
pileup_bam = input_bam

In [21]:
# Generate read pileup using samtools
pileup_path = pileup_bam.replace("sorted.bam", "pileup")
position = gene_dt["chromosome"] + ":" + str(gene_dt["start"]) + "-" + str(gene_dt["end"])

cmd = "samtools mpileup -f %s -r %s -Q 0 -aa -B %s > %s" % (gene_dt["genome"], 
                                                            position, 
                                                            pileup_bam, 
                                                            pileup_path)
print("Generating pileup...")
print(cmd)
os.system(cmd)
print("Done.")

Generating pileup...
samtools mpileup -f data/resources/plasmodb/39/PlasmoDB-39_Pfalciparum3D7_Genome.fasta -r Pf3D7_13_v3:1724817-1726997 -Q 0 -aa -B data/ont/uk/2019-03-28/BC08.sorted.bam > data/ont/uk/2019-03-28/BC08.pileup
Done.


In [160]:
# Make necessary prepartions if the gene is on the reverse strand
if gene_dt['strand'] == 'reverse':
    print("Gene is on reverse strand, inverting pileup.")
    print("Note: still need to perform reverse complementation.")
    pileup_path_reverse = pileup_path.replace("pileup", "reverse.pileup")
    os.system('tail -r %s > %s' % (pileup_path, pileup_path_reverse))
    pileup_path = pileup_path_reverse
else:
    print("Gene is on forward strand.")

Gene is on reverse strand, inverting pileup.
Note: still need to perform reverse complementation.


In [161]:
# Search for mutations
mutation_dt = {
    "position": [],
    "total_counts": [],
    "total_counts_noindel": [],
    "ref_codon": [],
    "ref_amino": [],
    "ref_freq": [],
    "major_codon": [],
    "major_amino": [],
    "major_freq": [],
    "n_mutation_types": [],
    "mutation_codon": [],
    "mutation_amino": [],
    "mutation_freq": [],
}

In [162]:
with open(pileup_path, "r") as fn:
    
    # want to loop until you have a full codon
    nts = 0
    codon_position = 0
    codon_ref_nts = []
    codon_pileup = []
    for i, line in enumerate(fn):
        nts += 1
        chrom, pos, ref, coverage, pileup, _ = line.split("\t")
        processed_pileup = process_pileup(pileup, ref)
        if gene_dt["strand"] == 'reverse':
            ref = complement_map[ref]
            processed_pileup = "".join([complement_map[base] for base in processed_pileup])
        codon_ref_nts.append(ref)
        codon_pileup.append(processed_pileup)

        if nts == 3:
            codon_position += 1
            print("--------------------------------------------------------------------------------")
            print("Checking Codon:", codon_position)
            # Search the codon for non-synonymous mutations above a specified frequency
            assert len(codon_ref_nts) == 3
            assert len(codon_pileup) == 3
            
            # Pre-process from lists into strings
            codon_ref = "".join(codon_ref_nts)
            amino_ref = codon_to_amino(codon_ref, genetic_code)
            codon_pileup = ["".join(c) for c in zip(*codon_pileup)]
            
            # Get frequencies of codons (i.e. nucleotide level)
            codon_frequencies = Counter(codon_pileup)
            major_codon, major_codon_count = codon_frequencies.most_common(1)[0]
            ref_codon_count = codon_frequencies[codon_ref]
            total_codon_count = sum(codon_frequencies.values())

            print("Discovered...")
            print("  Total codon count (including indels):", total_codon_count)
            print("  Reference codon (from 3D7):", codon_ref)
            print("  Reference codon count:", ref_codon_count)
            print("  Majority codon (from pileup):", major_codon)
            print("  Majority codon count:", major_codon_count)
            print("  Number of unique codons discovered (including indels):", len(codon_frequencies))
            print("")

            # Restrict to codon observations without indels
            # - these can be used to look for non-synonymous changes
            codon_noindel_frequencies = Counter(dict([(c, v) for c, v in codon_frequencies.items() 
                                                      if not "+" in c and not "-" in c]))
            total_noindel_count = sum(codon_noindel_frequencies.values())

            # Now get non-synonymous frequencies
            codon_nonsyn_frequencies = Counter(dict([(c, v) for c, v in codon_noindel_frequencies.items()
                                                     if codon_to_amino(c, genetic_code) != amino_ref]))
            
            
            
            
            print("  Total codon count (excluding indels):", total_noindel_count)
            print("    Unique types:", len(codon_noindel_frequencies))
            print("    Non-synonymous:", len(codon_nonsyn_frequencies))
            print("")
            if len(codon_nonsyn_frequencies) > 0:
                highest_nonsyn_codon, highest_nonsyn_count = codon_nonsyn_frequencies.most_common(1)[0]
                print("    Highest frequency non-synonymous: %s = %s" 
                      % (highest_nonsyn_codon, codon_to_amino(highest_nonsyn_codon, genetic_code)))
                print("                        at frequency: %.05f" 
                      % (highest_nonsyn_count/total_noindel_count))
                
                # Finally, check if any of the non-synonymous mutations are above
                # set threshold
                above = Counter(dict([(c, v) for c, v in codon_nonsyn_frequencies.items() 
                      if v/total_noindel_count > min_freq]))
                if len(above) > 0:
                    n_above = len(above)
                    print("Detected %d non-synonymous mutations above frequency threshold of %.02f."
                          % (n_above, min_freq))
                    print("Storing results.")
                    
                    for mut_codon, mut_counts in above.items():
                        mutation_dt["position"].append(codon_position)
                        mutation_dt["total_counts"].append(total_codon_count)
                        mutation_dt["total_counts_noindel"].append(total_noindel_count)
                        
                        mutation_dt["ref_codon"].append(codon_ref)
                        mutation_dt["ref_amino"].append(amino_ref)
                        mutation_dt["ref_freq"].append(ref_codon_count/total_codon_count)
                        
                        mutation_dt["major_codon"].append(major_codon)
                        mutation_dt["major_amino"].append(codon_to_amino(major_codon, genetic_code))
                        mutation_dt["major_freq"].append(major_codon_count/total_noindel_count)
                        
                        mutation_dt["n_mutation_types"].append(len(codon_nonsyn_frequencies))
                        mutation_dt["mutation_codon"].append(mut_codon)
                        mutation_dt["mutation_amino"].append(codon_to_amino(mut_codon, genetic_code))
                        mutation_dt["mutation_freq"].append(mut_counts/total_noindel_count)

            else:
                print("    No non-synonymous mutations discovered.")
            print("")
            
            # Once mutation search is complete, reset codon and continue
            nts = 0
            codon_ref_nts = []
            codon_pileup = []

--------------------------------------------------------------------------------
Checking Codon: 1
Discovered...
  Total codon count (including indels): 373
  Reference codon (from 3D7): ATG
  Reference codon count: 291
  Majority codon (from pileup): ATG
  Majority codon count: 291
  Number of unique codons discovered (including indels): 15

  Total codon count (excluding indels): 304
    Unique types: 7
    Non-synonymous: 6

    Highest frequency non-synonymous: ACG = T
                        at frequency: 0.00987

--------------------------------------------------------------------------------
Checking Codon: 2
Discovered...
  Total codon count (including indels): 373
  Reference codon (from 3D7): GAA
  Reference codon count: 308
  Majority codon (from pileup): GAA
  Majority codon count: 308
  Number of unique codons discovered (including indels): 13

  Total codon count (excluding indels): 312
    Unique types: 5
    Non-synonymous: 3

    Highest frequency non-synonymous: GAT =

    Non-synonymous: 7

    Highest frequency non-synonymous: TCA = S
                        at frequency: 0.01429

--------------------------------------------------------------------------------
Checking Codon: 64
Discovered...
  Total codon count (including indels): 373
  Reference codon (from 3D7): AAT
  Reference codon count: 294
  Majority codon (from pileup): AAT
  Majority codon count: 294
  Number of unique codons discovered (including indels): 19

  Total codon count (excluding indels): 322
    Unique types: 10
    Non-synonymous: 8

    Highest frequency non-synonymous: AAG = K
                        at frequency: 0.01863

--------------------------------------------------------------------------------
Checking Codon: 65
Discovered...
  Total codon count (including indels): 373
  Reference codon (from 3D7): AAT
  Reference codon count: 314
  Majority codon (from pileup): AAT
  Majority codon count: 314
  Number of unique codons discovered (including indels): 16

  Total cod

  Majority codon (from pileup): TTG
  Majority codon count: 278
  Number of unique codons discovered (including indels): 21

  Total codon count (excluding indels): 296
    Unique types: 11
    Non-synonymous: 8

    Highest frequency non-synonymous: TGG = W
                        at frequency: 0.01014

--------------------------------------------------------------------------------
Checking Codon: 120
Discovered...
  Total codon count (including indels): 369
  Reference codon (from 3D7): TTA
  Reference codon count: 275
  Majority codon (from pileup): TTA
  Majority codon count: 275
  Number of unique codons discovered (including indels): 22

  Total codon count (excluding indels): 288
    Unique types: 9
    Non-synonymous: 6

    Highest frequency non-synonymous: TTC = F
                        at frequency: 0.01042

--------------------------------------------------------------------------------
Checking Codon: 121
Discovered...
  Total codon count (including indels): 374
  Refere

    Highest frequency non-synonymous: ACA = T
                        at frequency: 0.04364

--------------------------------------------------------------------------------
Checking Codon: 175
Discovered...
  Total codon count (including indels): 361
  Reference codon (from 3D7): GCA
  Reference codon count: 235
  Majority codon (from pileup): GCA
  Majority codon count: 235
  Number of unique codons discovered (including indels): 23

  Total codon count (excluding indels): 266
    Unique types: 11
    Non-synonymous: 8

    Highest frequency non-synonymous: ACA = T
                        at frequency: 0.04511

--------------------------------------------------------------------------------
Checking Codon: 176
Discovered...
  Total codon count (including indels): 357
  Reference codon (from 3D7): AAT
  Reference codon count: 278
  Majority codon (from pileup): AAT
  Majority codon count: 278
  Number of unique codons discovered (including indels): 13

  Total codon count (excluding i

  Majority codon (from pileup): TTA
  Majority codon count: 278
  Number of unique codons discovered (including indels): 20

  Total codon count (excluding indels): 291
    Unique types: 7
    Non-synonymous: 5

    Highest frequency non-synonymous: TTT = F
                        at frequency: 0.01718

--------------------------------------------------------------------------------
Checking Codon: 231
Discovered...
  Total codon count (including indels): 354
  Reference codon (from 3D7): AAA
  Reference codon count: 291
  Majority codon (from pileup): AAA
  Majority codon count: 291
  Number of unique codons discovered (including indels): 16

  Total codon count (excluding indels): 297
    Unique types: 7
    Non-synonymous: 5

    Highest frequency non-synonymous: TAT = Y
                        at frequency: 0.00337

--------------------------------------------------------------------------------
Checking Codon: 232
Discovered...
  Total codon count (including indels): 354
  Referen

    Highest frequency non-synonymous: TTT = F
                        at frequency: 0.01832

--------------------------------------------------------------------------------
Checking Codon: 286
Discovered...
  Total codon count (including indels): 360
  Reference codon (from 3D7): AAT
  Reference codon count: 283
  Majority codon (from pileup): AAT
  Majority codon count: 283
  Number of unique codons discovered (including indels): 16

  Total codon count (excluding indels): 300
    Unique types: 8
    Non-synonymous: 6

    Highest frequency non-synonymous: GAT = D
                        at frequency: 0.02333

--------------------------------------------------------------------------------
Checking Codon: 287
Discovered...
  Total codon count (including indels): 360
  Reference codon (from 3D7): GGT
  Reference codon count: 281
  Majority codon (from pileup): GGT
  Majority codon count: 281
  Number of unique codons discovered (including indels): 19

  Total codon count (excluding in

  Majority codon count: 135
  Number of unique codons discovered (including indels): 12

  Total codon count (excluding indels): 32
    Unique types: 3
    Non-synonymous: 2

    Highest frequency non-synonymous: TTT = F
                        at frequency: 0.03125

--------------------------------------------------------------------------------
Checking Codon: 342
Discovered...
  Total codon count (including indels): 357
  Reference codon (from 3D7): GTT
  Reference codon count: 293
  Majority codon (from pileup): GTT
  Majority codon count: 293
  Number of unique codons discovered (including indels): 20

  Total codon count (excluding indels): 316
    Unique types: 9
    Non-synonymous: 7

    Highest frequency non-synonymous: GGT = G
                        at frequency: 0.01582

--------------------------------------------------------------------------------
Checking Codon: 343
Discovered...
  Total codon count (including indels): 357
  Reference codon (from 3D7): GAT
  Reference 


--------------------------------------------------------------------------------
Checking Codon: 397
Discovered...
  Total codon count (including indels): 349
  Reference codon (from 3D7): GAT
  Reference codon count: 296
  Majority codon (from pileup): GAT
  Majority codon count: 296
  Number of unique codons discovered (including indels): 10

  Total codon count (excluding indels): 306
    Unique types: 5
    Non-synonymous: 3

    Highest frequency non-synonymous: AAT = N
                        at frequency: 0.01634

--------------------------------------------------------------------------------
Checking Codon: 398
Discovered...
  Total codon count (including indels): 350
  Reference codon (from 3D7): AGG
  Reference codon count: 264
  Majority codon (from pileup): AGG
  Majority codon count: 264
  Number of unique codons discovered (including indels): 18

  Total codon count (excluding indels): 282
    Unique types: 7
    Non-synonymous: 5

    Highest frequency non-synonymous: 

  Majority codon count: 288
  Number of unique codons discovered (including indels): 10

  Total codon count (excluding indels): 294
    Unique types: 5
    Non-synonymous: 4

    Highest frequency non-synonymous: GGT = G
                        at frequency: 0.01020

--------------------------------------------------------------------------------
Checking Codon: 453
Discovered...
  Total codon count (including indels): 344
  Reference codon (from 3D7): GGT
  Reference codon count: 256
  Majority codon (from pileup): GGT
  Majority codon count: 256
  Number of unique codons discovered (including indels): 18

  Total codon count (excluding indels): 274
    Unique types: 9
    Non-synonymous: 5

    Highest frequency non-synonymous: TGT = C
                        at frequency: 0.01460

--------------------------------------------------------------------------------
Checking Codon: 454
Discovered...
  Total codon count (including indels): 353
  Reference codon (from 3D7): GTA
  Reference


--------------------------------------------------------------------------------
Checking Codon: 508
Discovered...
  Total codon count (including indels): 340
  Reference codon (from 3D7): ACT
  Reference codon count: 258
  Majority codon (from pileup): ACT
  Majority codon count: 258
  Number of unique codons discovered (including indels): 20

  Total codon count (excluding indels): 297
    Unique types: 9
    Non-synonymous: 6

    Highest frequency non-synonymous: GCT = A
                        at frequency: 0.04377

--------------------------------------------------------------------------------
Checking Codon: 509
Discovered...
  Total codon count (including indels): 349
  Reference codon (from 3D7): GAG
  Reference codon count: 238
  Majority codon (from pileup): GAG
  Majority codon count: 238
  Number of unique codons discovered (including indels): 24

  Total codon count (excluding indels): 267
    Unique types: 11
    Non-synonymous: 9

    Highest frequency non-synonymous:

  Majority codon count: 247
  Number of unique codons discovered (including indels): 19

  Total codon count (excluding indels): 262
    Unique types: 9
    Non-synonymous: 8

    Highest frequency non-synonymous: CAA = Q
                        at frequency: 0.01527

--------------------------------------------------------------------------------
Checking Codon: 564
Discovered...
  Total codon count (including indels): 349
  Reference codon (from 3D7): GCA
  Reference codon count: 249
  Majority codon (from pileup): GCA
  Majority codon count: 249
  Number of unique codons discovered (including indels): 18

  Total codon count (excluding indels): 263
    Unique types: 9
    Non-synonymous: 5

    Highest frequency non-synonymous: GTA = V
                        at frequency: 0.01901

--------------------------------------------------------------------------------
Checking Codon: 565
Discovered...
  Total codon count (including indels): 344
  Reference codon (from 3D7): TGG
  Reference

    Highest frequency non-synonymous: TCA = S
                        at frequency: 0.01439

--------------------------------------------------------------------------------
Checking Codon: 619
Discovered...
  Total codon count (including indels): 346
  Reference codon (from 3D7): TTA
  Reference codon count: 281
  Majority codon (from pileup): TTA
  Majority codon count: 281
  Number of unique codons discovered (including indels): 17

  Total codon count (excluding indels): 292
    Unique types: 6
    Non-synonymous: 4

    Highest frequency non-synonymous: ATA = I
                        at frequency: 0.01370

--------------------------------------------------------------------------------
Checking Codon: 620
Discovered...
  Total codon count (including indels): 357
  Reference codon (from 3D7): GAA
  Reference codon count: 219
  Majority codon (from pileup): GAA
  Majority codon count: 219
  Number of unique codons discovered (including indels): 17

  Total codon count (excluding in

  Majority codon count: 143
  Number of unique codons discovered (including indels): 19

  Total codon count (excluding indels): 166
    Unique types: 4
    Non-synonymous: 3

    Highest frequency non-synonymous: AGA = R
                        at frequency: 0.09036

--------------------------------------------------------------------------------
Checking Codon: 675
Discovered...
  Total codon count (including indels): 359
  Reference codon (from 3D7): GCT
  Reference codon count: 222
  Majority codon (from pileup): GCT
  Majority codon count: 222
  Number of unique codons discovered (including indels): 21

  Total codon count (excluding indels): 247
    Unique types: 8
    Non-synonymous: 4

    Highest frequency non-synonymous: CCT = P
                        at frequency: 0.00405

--------------------------------------------------------------------------------
Checking Codon: 676
Discovered...
  Total codon count (including indels): 348
  Reference codon (from 3D7): GCC
  Reference

In [153]:
def process_pileup(pileup, ref):
    """
    Process a read pileup from
    `samtools mpileup` into
    A,T,C,G,+,-
    
    Note, the solution is ugly, but
    re.sub([-+][0-9]+[ATCGatcg]+, ... fails when a variant
    follows directly after an indel.
    
    params
        pileup: str
            String giving all nucleotides mapped to a specific
            position, with characters defined by 
            `samtools mpileup`.
        ref: str
            String giving the reference base at this position.
    returns
        pileup:
            String with length <= pileup. Characters given in
            `samtools mpileup` are converted to the appropriate
            A, T, C, and G values. Insertions are represented
            by a single '+', deletions by a single '-'.
    """
    # indels are initiated with [+-][0-9]+[ATCGatcg]
    for indel_size in set(re.findall("\d+", pileup)):
        indel_size = int(indel_size)
        pileup = re.sub(r"[+-]%d[ATCGatcg]{%d}" % (indel_size, indel_size), "-", pileup)
   
    pileup = re.sub("\*", "-", pileup)  
    pileup = re.sub("\$|\^.", "", pileup)
    pileup = re.sub("\.|,", ref, pileup)
    pileup = pileup.upper()
    return pileup

In [163]:
mutation_df = pd.DataFrame(mutation_dt)
columns = ["position", "total_counts", "total_counts_noindel", 
           "ref_codon", "ref_amino", "ref_freq",
           "major_codon", "major_amino", "major_freq",
           "n_mutation_types",
           "mutation_codon", "mutation_amino", "mutation_freq"]
mutation_df = mutation_df[columns]
print("--------------------------------------------------------------------------------")
print("Mutation scan complete.")
print("================================================================================")

--------------------------------------------------------------------------------
Mutation scan complete.


In [164]:
mutation_df

Unnamed: 0,position,total_counts,total_counts_noindel,ref_codon,ref_amino,ref_freq,major_codon,major_amino,major_freq,n_mutation_types,mutation_codon,mutation_amino,mutation_freq
0,189,354,193,AAA,K,0.141243,ACA,T,0.42487,15,ACA,T,0.42487
1,189,354,193,AAA,K,0.141243,ACA,T,0.42487,15,CAA,Q,0.196891
2,413,351,206,CCG,P,0.384615,CCG,P,0.65534,10,CTG,L,0.131068
3,553,347,274,CCG,P,0.628242,CCG,P,0.79562,7,CTG,L,0.105839
4,603,352,267,GTA,V,0.65625,GTA,V,0.865169,3,GCA,A,0.108614
5,708,349,239,CTT,L,0.555874,CTT,L,0.811715,4,TTT,F,0.108787
6,715,349,256,CCC,P,0.510029,CCC,P,0.695312,9,CTC,L,0.136719
