* Load up the sequencing counts produced by proteins.py
* Combine data from two sequencing gates
* Take input FASTA files
* Perform needle alignment for input files
* Retrieve counts from sequencing for those variants

In [1]:
# Perform the initialization and imports
import sys
import pickle
import re
import os
import csv
import argparse
import math
import pprint

from string import ascii_lowercase
from collections import Counter, defaultdict

import pandas as pd
import matplotlib.pyplot as plt
#import seaborn as sns   This is just for plots

from Bio import SeqIO, AlignIO
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
from Bio.Emboss.Applications import NeedleallCommandline

# Demand Python 3.
# The message is formatted inside the print() call. Applying % to the result of
# print() only works through Python 2's print-statement parsing and raises a
# TypeError wherever print is a function.
if sys.version_info[0] < 3:
    print("Python 3 is required, but you are using Python %i.%i.%i" % (
        sys.version_info[0], sys.version_info[1], sys.version_info[2]))
    sys.exit(1)

This cell specifies where the InDelScanner scripts are located: modify `indels_path` as needed.

In [2]:
# Retrieve the specific functions from ind and proteins.py
# The InDelScanner directory has to be on sys.path before the `indels` imports
# below can resolve; the membership test avoids appending a duplicate entry
# when this cell is re-run.
indels_path="/home/maya/InDelScanner"  # /PATH/TO/InDelScanner
if indels_path not in sys.path:
    sys.path.append(indels_path)
from indels.ind import trim_read, findEnds, endMatch, findGap, gapAlign
from indels.proteins import protein_needle, indel_len, find_protein_short

Change directory to where the sequencing counts are stored and load `mek_counter.pickle` to get `mek`, which holds the combined sequencing counts of each protein variant for every sequencing fraction.

In [3]:
# Switch the working directory so that the relative file names used in this
# notebook resolve against the results folder.
os.chdir("/mnt/c/Users/Maya/Dropbox/mek_results")

# Load the pickled sequencing counts into `mek`.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('mek_counter.pickle', 'rb') as f:
    mek = pickle.load(f)

In [12]:
# Extract information from individually sequenced clones
def retrieve_ind_clones(mek, filename, outfile):
    """Look up sequencing counts for individually sequenced clones.

    Reads pairwise alignments (reference first, clone read second) from
    `filename`, turns every clone that passes the end-match check into a
    protein variant string and writes one CSV row per clone to `outfile`,
    holding the clone name, the variant string and the variant's count in
    every fraction of `mek`. The clone name and variant are also printed.

    Parameters
    ----------
    mek : mapping
        Maps each fraction name to a mapping of variant string -> count.
        The inner mapping is indexed directly with the variant, so it has to
        tolerate variants it has never seen (e.g. a Counter).
    filename : str
        FASTA-formatted alignment file, consumed two records at a time.
    outfile : str
        Path of the CSV file to write; an existing file is overwritten.

    Returns
    -------
    list of str
        Variant strings of the clones written to the CSV, in file order.
    """
    variants = []

    fractions = list(mek.keys())
    columns = ['Sample', 'Protein'] + fractions

    # newline='' is required by the csv module for files it writes to:
    # the writer emits '\r\n' itself, and newline translation by the file
    # object would otherwise turn that into '\r\r\n' on some platforms.
    with open(outfile, 'w', newline='') as f:
        writer = csv.DictWriter(f, delimiter=',', fieldnames=columns)
        writer.writeheader()

        for pair in AlignIO.parse(filename, "fasta", seq_count=2):
            # work on plain strings: record 0 is the reference, record 1 the read
            ref = str(pair[0].seq)
            read = str(pair[1].seq)
            readname = pair[1].id
            print(readname)

            ref, read = trim_read(ref, read)

            # check that there is no frame shift or gross mistranslation;
            # clones failing the check are skipped and get no CSV row
            ends = findEnds(read, ref, 0)
            if not endMatch(read, ref, ends, 2):
                continue

            protein = find_protein_short(read, ref, ends)
            print(protein)
            variants.append(protein)

            row = {'Sample': readname, 'Protein': protein}
            for fraction in fractions:
                row[fraction] = mek[fraction][protein]

            writer.writerow(row)

    return variants

In [9]:
# Run protein_needle on the clone FASTA file with Xref.fa as the second input.
# NOTE(review): presumably this performs the needle alignment and writes
# Lib1-10.aln - confirm against indels.proteins.protein_needle.
aln_file = protein_needle(['Lib1-10.fa'], 'Xref.fa')

In [13]:
# Write the per-fraction counts of every aligned clone to single_sub.csv and
# keep the list of recovered protein variants.
variants = retrieve_ind_clones(mek, 'Lib1-10.aln', 'single_sub.csv')

Lib1
6L/7aI/8aA/9L/11F/13M
Lib2
6F/7aP/9W/11L/13M
Lib3
6L/7aF/9L/11I/13I
Lib4
6A/7aI/8aA/9L/11L/13I
Lib5
6W/7aI/9F/11L/13V
Lib6
6A/7aP/8aA/9L/11V/13W
Lib7
6L/7aI/8aA/9M/11W/13W
Lib8
6V/7aP/8aA/9F/11F/13M
Lib9
6A/7aK/9L/11L/13W
Lib10
6A/7aI/8aA/9L/11V/13M
WT
6P/9I/11L/13P
ILAA
6P/9A/11A/13P
Consensus
6L/7aL/9L/11L/13I


In [14]:
# Display the protein variants returned by retrieve_ind_clones
variants

['6L/7aI/8aA/9L/11F/13M',
 '6F/7aP/9W/11L/13M',
 '6L/7aF/9L/11I/13I',
 '6A/7aI/8aA/9L/11L/13I',
 '6W/7aI/9F/11L/13V',
 '6A/7aP/8aA/9L/11V/13W',
 '6L/7aI/8aA/9M/11W/13W',
 '6V/7aP/8aA/9F/11F/13M',
 '6A/7aK/9L/11L/13W',
 '6A/7aI/8aA/9L/11V/13M',
 '6P/9I/11L/13P',
 '6P/9A/11A/13P',
 '6L/7aL/9L/11L/13I']