* Load up the sequencing counts produced by proteins.py
* Combine data from two sequencing gates
* Take input FASTA files
* Perform needle alignment for input files
* Retrieve counts from sequencing for those variants

In [None]:
# Perform the initialization and imports
import sys
import pickle
import re
import os
import csv
import argparse
import math
import pprint

from string import ascii_lowercase
from collections import Counter, defaultdict

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from Bio import SeqIO, AlignIO
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
from Bio.Emboss.Applications import NeedleallCommandline

# Demand Python 3.
# The message is formatted *inside* the print call: applying `%` to the result
# of print(...) only works where print is a statement, and raises TypeError
# wherever print is a function (its return value is None).
if sys.version_info[0] < 3:
    print("Python 3 is required, but you are using Python %i.%i.%i"
          % tuple(sys.version_info[:3]))
    sys.exit(1)



This cell specifies where the InDelScanner scripts are located: modify `indels_path` as needed.

In [None]:
# Make the InDelScanner checkout importable so that the helper functions from
# ind and proteins.py can be imported right below.
indels_path = "/home/maya/InDelScanner"  # /PATH/TO/InDelScanner
if sys.path.count(indels_path) == 0:
    # Only register the directory once, even if this cell is re-run.
    sys.path.append(indels_path)
from indels.ind import trim_read, findEnds, endMatch, findGap, gapAlign
from indels.proteins import protein_needle

Change directory to where the sequencing count dictionaries are located, load them, and combine the counts to get `mek`: a dictionary of Counters (one per fraction: `high`, `med` and the combined `low-t`) containing the sequencing results.

In [None]:
# All file names below are relative to the results directory.
os.chdir("/mnt/c/Users/Maya/VMShare/mek_results")

# Main sequencing results, plus the separately stored 'low-v2' data.
with open('Remkes_protein.p', mode='rb') as f:
    all_ref = pickle.load(f)
with open('Remkes_protein_low.p', mode='rb') as f:
    low = pickle.load(f)

# Attach the separately stored low-gate counts to the main dictionary.
all_ref['mek']['low-v2'] = low['mek']['low-v2']

# One Counter per fraction; 'low-t' is the sum of both low-gate data sets.
mek = {}
for fraction in ('high', 'med'):
    mek[fraction] = Counter(all_ref['mek'][fraction])
mek['low-t'] = (
    Counter(all_ref['mek']['low'])
    + Counter(all_ref['mek']['low-v2'])
)

In [None]:
# Extract information from individually sequenced clones

def retrieve_ind_clones(mek, filename, outfile):
    """Write per-fraction sequencing counts for individually sequenced clones.

    Reads pairwise alignments from ``filename`` (two records per alignment:
    the reference first, the clone read second), determines the protein
    variant of each clone and writes one CSV row per clone to ``outfile``
    holding the sample name, the variant and its count in every fraction of
    ``mek``.

    Clones whose alignment fails the ``endMatch`` check are skipped and do not
    appear in the output.

    Args:
        mek: Mapping of fraction name to a per-protein count lookup. In this
            notebook the values are Counters, so a variant that was never
            sequenced is reported as 0.
        filename: FASTA-formatted alignment file with consecutive
            (reference, read) record pairs.
        outfile: Path of the CSV file to write; an existing file is
            overwritten.
    """
    fractions = list(mek)
    columns = ['Sample', 'Protein'] + fractions

    # newline='' is what the csv module requires of files it writes to;
    # without it, platforms that translate newlines produce '\r\r\n' row
    # endings (shown as blank lines between rows).
    with open(outfile, 'w', newline='') as f:
        writer = csv.DictWriter(f, delimiter=',', fieldnames=columns)
        writer.writeheader()

        for pair in AlignIO.parse(filename, "fasta", seq_count=2):
            # Work on plain strings from here on.
            ref = str(pair[0].seq)
            read = str(pair[1].seq)
            readname = pair[1].id

            ref, read = trim_read(ref, read)

            # check that there is no frame shift or gross mistranslation
            ends = findEnds(read, ref, 0)
            if not endMatch(read, ref, ends, 2):
                continue

            # NOTE(review): find_protein_short is not among the names imported
            # from the indels package in the cells shown here; confirm it is
            # in scope before running.
            protein = find_protein_short(read, ref, ends)
            row = {'Sample': readname, 'Protein': protein}
            for fraction in fractions:
                row[fraction] = mek[fraction][protein]

            writer.writerow(row)
    


In [None]:
# Align the individually tested clones and tabulate their sequencing counts
# per fraction into FRET_counts.csv.
# NOTE(review): `args` is not defined in any cell shown here (argparse is
# imported but no parser is built), so this line raises NameError as written;
# replace `args.reference` with the path to the reference FASTA.
aln_file = protein_needle('FRET tested.fasta', args.reference)
# NOTE(review): the alignment path is hard-coded instead of using `aln_file`;
# presumably protein_needle writes 'FRET tested.aln' and returns that name.
# TODO confirm against proteins.py.
retrieve_ind_clones(mek, 'FRET tested.aln', 'FRET_counts.csv')