# Find substitutions

In [1]:
import pandas as pd
import os
from Bio import Seq, SeqIO

In [2]:
cj_plasmid = [seq for seq in SeqIO.parse('Cj_plasmid.fna', 'fasta')][0]

In [3]:
start = 150
end = 3213

In [4]:
threshold = 0.05

In [5]:
protein_seq = cj_plasmid[start:end].translate().seq
protein_seq

Seq('MGKPIPNPLLGLDSTSPKKKRKVEASARILAFDIGISSIGWAFSENDELKDCGV...AS*')

In [18]:
data = pd.read_csv('CjCas9_cDNA_4th_round.pileup', sep='\t')

In [None]:
data_in_range = data[data['pos'].between(start, end)]

In [20]:
data_in_range = data_in_range.assign(substitutions=['-']*data_in_range.shape[0],
                                    below_threshold = [False]*data_in_range.shape[0])

In [21]:
# for each position, find which bases have relevant alternative bases
# then see which aa is in that position, replace base and see new aa
for i in data_in_range.index:
    pos = data_in_range.loc[i, 'pos'] - 1 #pileup is 1-indexed, sequence is 0-indexed
    original_aa = str(protein_seq[(pos-start)//3])
    new_substitutions = []
    for base in ['A', 'C', 'T', 'G']:
        if base == data_in_range.loc[i, 'ref']: continue
        if data_in_range.loc[i, base]/data_in_range.loc[i, 'cov'] > threshold/2:
            if data_in_range.loc[i, base]/data_in_range.loc[i, 'cov'] < threshold:
                data_in_range.loc[i, 'below_threshold'] = True
            new_nucl_seq = cj_plasmid.seq[start:pos] + base + cj_plasmid.seq[pos+1:end]
            new_prot_seq = new_nucl_seq.translate()
            new_aa = str(new_prot_seq[(pos-start)//3])
            new_substitutions.append(original_aa + str(((pos-start)//3)+1) + new_aa)
            if original_aa == new_aa:
                print('Warning: synonymous substitution at position ' + str(pos+1) + '!')
    if len(new_substitutions) != 0:
        data_in_range.loc[i, 'substitutions'] = ', '.join(new_substitutions)



In [22]:
data_in_range

Unnamed: 0,chr,pos,ref,A,C,G,T,af,cov,substitutions,below_threshold
149,Cj_plasmid,150,A,32667,1,2,1,0.000122,32671,-,False
150,Cj_plasmid,151,A,32254,0,2,3,0.000155,32259,-,False
151,Cj_plasmid,152,T,9,2,3,32532,0.000430,32546,-,False
152,Cj_plasmid,153,G,4,2,32384,17,0.000710,32407,-,False
153,Cj_plasmid,154,G,323,8,32313,5,0.010291,32649,-,False
...,...,...,...,...,...,...,...,...,...,...,...
3208,Cj_plasmid,3209,G,25,2,39847,128,0.003875,40002,-,False
3209,Cj_plasmid,3210,C,28,40921,1,118,0.003579,41068,-,False
3210,Cj_plasmid,3211,T,48,5,9,41335,0.001498,41397,-,False
3211,Cj_plasmid,3212,A,41136,3,83,6,0.002231,41228,-,False
