* Load up the sequencing counts produced by proteins.py
* Combine data from two sequencing gates
* Take input FASTA files
* Perform needle alignment for input files
* Retrieve counts from sequencing for those variants

In [1]:
# Perform the initialization and imports
import sys
import pickle
import re
import os
import csv
import argparse
import math
import pprint

from string import ascii_lowercase
from collections import Counter, defaultdict

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from Bio import SeqIO, AlignIO
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
from Bio.Emboss.Applications import NeedleallCommandline

# Demand Python 3.
# The message is %-formatted *inside* the print() call: formatting the value
# returned by print() only works while print is a Python 2 statement and raises
# TypeError as soon as print is a function. %-formatting (not an f-string) is
# kept on purpose so this check still parses on the old interpreters it is
# meant to reject.
if sys.version_info[0] < 3:
    print("Python 3 is required, but you are using Python %i.%i.%i"
          % tuple(sys.version_info[:3]))
    sys.exit(1)



This cell specifies where the InDelScanner scripts are located: modify `indels_path` as needed.

In [2]:
# Make the InDelScanner code importable so the functions from ind.py,
# composition.py and proteins.py can be pulled in below.
# NOTE(review): the imports that follow are written as `from indels.<module>`,
# which normally needs the *parent* of the `indels` package on sys.path;
# confirm whether this should point at /PATH/TO/InDelScanner instead.
indels_path = "/home/maya/InDelScanner/indels"  # /PATH/TO/InDelScanner
if indels_path not in sys.path:  # avoid duplicate entries when the cell is re-run
    sys.path.append(indels_path)
from indels.ind import trim_read, findEnds, endMatch, findGap #, gapAlign
from indels.composition import find_dna_diff, find_dna_hgvs, find_protein_diff
from indels.proteins import protein_needle, find_protein_short

### Results specific to MEK1 protein libraries: interrogating the sequencing counts

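Change to the directory containing the pickled sequencing count dictionaries, load them, and combine the counts into `mek`: a dictionary holding one Counter per fraction (`high`, `med` and `low-t`, where `low-t` is the sum of the `low` and `low-v2` counts).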

In [None]:
# Work from the directory that holds the pickled sequencing count dictionaries;
# the relative file names below resolve against it.
os.chdir("/mnt/c/Users/Maya/Dropbox/mek_results")

# NOTE: unpickling can execute arbitrary code, so only load count files that
# you generated yourself.
with open('Remkes_protein.p', 'rb') as f:
    all_ref = pickle.load(f)
with open('Remkes_protein_low.p', 'rb') as f:
    low = pickle.load(f)

# Attach the separately pickled 'low-v2' counts to the main dictionary.
all_ref['mek']['low-v2'] = low['mek']['low-v2']

# One Counter per fraction. 'high' and 'med' are copied as they are, while
# 'low-t' is the sum of the 'low' and 'low-v2' counts.
mek = {fraction: Counter(all_ref['mek'][fraction]) for fraction in ('high', 'med')}
mek['low-t'] = Counter(all_ref['mek']['low']) + Counter(all_ref['mek']['low-v2'])

In [None]:
# Extract information from individually sequenced clones

def retrieve_ind_clones(mek, filename, outfile):
    """Write per-fraction sequencing counts for individually sequenced clones.

    Each (reference, read) alignment pair in *filename* is trimmed, checked
    with ``endMatch`` and turned into a protein description by
    ``find_protein_short``. That description is then looked up in every
    entry of *mek* and the result is written as one CSV row.

    Args:
        mek: Mapping of fraction name to counts, indexed as
            ``mek[fraction][protein]``. In this notebook the values are
            ``Counter`` objects, so a protein that was never sequenced
            yields 0.
        filename: Alignment file in FASTA format holding consecutive pairs
            of records, reference first and read second.
        outfile: Path of the CSV file to create. Its columns are ``Sample``,
            ``Protein`` and one column per key of *mek*.

    Reads whose ends fail the ``endMatch`` check are skipped without a
    message.
    """
    fractions = list(mek)
    columns = ['Sample', 'Protein'] + fractions

    # newline='' is required by the csv module; without it the writer's own
    # '\r\n' line endings are translated again on Windows, which leaves an
    # empty line after every row.
    with open(outfile, 'w', newline='') as f:
        writer = csv.DictWriter(f, delimiter=',', fieldnames=columns)
        writer.writeheader()

        for pair in AlignIO.parse(filename, "fasta", seq_count=2):
            # here both ref and read are handled as plain strings
            ref = str(pair[0].seq)
            read = str(pair[1].seq)
            readname = pair[1].id

            ref, read = trim_read(ref, read)

            # check that there is no frame shift or gross mistranslation
            ends = findEnds(read, ref, 0)
            if not endMatch(read, ref, ends, 2):
                continue

            protein = find_protein_short(read, ref, ends)
            row = {'Sample': readname, 'Protein': protein}
            for fraction in fractions:
                row[fraction] = mek[fraction][protein]

            writer.writerow(row)
    


In [None]:
# NOTE(review): presumably aligns the sequences in table1.fa against the
# reference in Xref.fa and produces table1.aln, which the following cell
# reads - confirm against protein_needle in indels/proteins.py.
protein_needle(['table1.fa'], 'Xref.fa')

In [None]:
# Look up the aligned clones from table1.aln in the combined counts and
# write one row per clone to table1.csv.
retrieve_ind_clones(mek, filename='table1.aln', outfile='table1.csv')

### PTE single variants for TRIAD manuscript

Adapted from `sanger.py`

In [3]:
# Hard-coded working directory: the relative .aln and .csv file names used in
# the cells below resolve against it, so adjust the path for your machine.
os.chdir("/mnt/c/Users/Maya/Dropbox/PTE_sanger/results")

In [None]:
# print mutations present in a list of clones
def print_Sanger_variants(alnfile, outfile, start_offset=6, end_trail=6, debug=False, refname='PTE-R0'):
    """Write the DNA and protein mutations of each aligned clone to a CSV file.

    Each (reference, read) alignment pair in *alnfile* is trimmed and checked
    with ``endMatch``. Pairs that pass are described by ``find_dna_diff``,
    ``find_dna_hgvs`` and ``find_protein_diff``, and the results are written
    as one row per read.

    Args:
        alnfile: Alignment file in FASTA format holding consecutive pairs of
            records, reference first and read second.
        outfile: Path of the CSV file to create, with the columns ``Sample``,
            ``DNA_hgvs``, ``Protein_short``, ``Protein_tuple`` and
            ``DNA_tuple``.
        start_offset: Passed unchanged to the three ``find_*`` helpers.
        end_trail: Passed unchanged to the three ``find_*`` helpers.
        debug: Passed unchanged to the three ``find_*`` helpers.
        refname: Reference name handed to ``find_dna_hgvs``.

    Reads whose ends fail the ``endMatch`` check are reported on stdout and
    left out of the CSV.
    """
    columns = ['Sample', 'DNA_hgvs', 'Protein_short', 'Protein_tuple', 'DNA_tuple']

    # newline='' is required by the csv module; without it the writer's own
    # '\r\n' line endings are translated again on Windows, which leaves an
    # empty line after every row.
    with open(outfile, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, delimiter=',', fieldnames=columns)
        writer.writeheader()

        for pair in AlignIO.parse(alnfile, "fasta", seq_count=2):
            # both read and ref are MutableSeq
            ref = pair[0].seq.tomutable()
            read = pair[1].seq.tomutable()
            readname = pair[1].id
            ref, read = trim_read(ref, read)

            ends = findEnds(read, ref, 0)
            if not endMatch(read, ref, ends, 2):
                print(readname, 'ends not match')
                continue

            dna_errors = find_dna_diff(read, ref, debug, start_offset, end_trail)  # errors = a tuple
            dna_hgvs = find_dna_hgvs(read, ref, refname, debug, start_offset, end_trail)  # string in HGVS format (ish)
            prot_errors, prot_short = find_protein_diff(read, ref, debug, start_offset, end_trail)

            row = {'Sample': readname, 'DNA_hgvs': dna_hgvs, 'Protein_short': prot_short,
                   'Protein_tuple': prot_errors, 'DNA_tuple': dna_errors}
            writer.writerow(row)

# Tabulate the mutations found on each Sanger-sequenced plate, using the
# default offsets and reference name.
print_Sanger_variants(alnfile='PTE_plate1_filtered.aln', outfile='PTE_plate1.csv')
print_Sanger_variants(alnfile='PTE_plate2_filtered.aln', outfile='PTE_plate2.csv')

In [None]:
# Re-run for plate 1 only; this overwrites PTE_plate1.csv with the same call
# that the previous cell already made.
print_Sanger_variants(alnfile='PTE_plate1_filtered.aln', outfile='PTE_plate1.csv')