# Notes

- Manuscript: https://www.nature.com/articles/s41598-017-17081-y
- Data were collected from the manuscript's supplementary material: https://static-content.springer.com/esm/art%3A10.1038%2Fs41598-017-17081-y/MediaObjects/41598_2017_17081_MOESM2_ESM.xlsx

In [1]:
!pip install xlrd
!pip install openpyxl



In [2]:
import math

from pathlib import Path
from pandas import read_excel, DataFrame

In [3]:
# Input and output locations, expressed relative to the parent of the
# current working directory.
data_path = Path('..', 'data', 'cas')
cas_data_path = data_path.joinpath('41598_2017_17081_MOESM2_ESM.xlsx')

task_path = Path('..', 'tasks', 'cas')

# Worksheets to read from the workbook. The trailing space in
# 'Positive Selection ' is kept deliberately -- presumably it matches the
# sheet's actual name in the file (confirm before "fixing" it).
sheet_names = [
    'Positive Selection ',
    'Negative Selection',
    'All Count Data',
    'Mutability Scores',
]

In [4]:
# Load every requested worksheet in one call; the result is indexed by
# sheet name further down.
cas_sheets = read_excel(io=cas_data_path, sheet_name=sheet_names)

In [5]:
# Work with the 'All Count Data' worksheet for the rest of the analysis.
cas_all_counts = cas_sheets['All Count Data']

In [6]:
# Show the column labels of the 'All Count Data' worksheet.
cas_all_counts.columns

Index(['Nucleotide Position', 'AA Position', 'WT Codon', 'WT AA', 'Domain',
       'Mutant AA', 'Mutant nucleotide', 'Mutant Codon', 'Initial Counts',
       'Initial Total Counts', 'Synonymous Mutation ', 'HindIII NdeI Cut Site',
       'WT Nucleotide', 'Error Counts', 'Error Sum Reads per Position',
       'Negative 1 Counts', 'Negative 1 Sum Reads per Position',
       'Negative 2 Counts', 'Negative 2 Sum Reads per Position',
       'Negative 3 Counts', 'Negative 3 Sum Reads per Position',
       'Positive 1 Counts', 'Positive 1 Sum Reads per Position',
       'Positive 2 Counts', 'Positive 2 Sum Reads per Position',
       'Positive 3 Counts', 'Positive 3 Sum Reads per Position',
       'Negative Counts', 'Negative Total Counts', 'Positive Counts',
       'Positive Total Counts', 'Negative Adjusted Fisher P Values',
       'Positive Adjusted Fisher P Values',
       'Log2 Fold Change after Negative Selection',
       'Log2 Fold Change after Positive Selection'],
      dtype='object')

In [7]:
# Quick check that every AA position is paired with exactly one WT residue.
# After dropping duplicate (position, residue) pairs, any position that still
# shows up more than once has conflicting WT residues in the sheet.
t = {}  # AA position -> number of repeat occurrences seen so far

for i in cas_all_counts[['AA Position', 'WT AA']].drop_duplicates()['AA Position'].values:
    # Test membership rather than truthiness: the stored count starts at 0,
    # which is falsy, so `if t.get(i)` could never flag a repeated position.
    if i in t:
        t[i] += 1
        print(f'AA position {i} repeated {t[i]} time(s)')
    else:
        t[i] = 0

In [8]:
# Infer WT sequence from data
# Rebuild the wild-type sequence: keep one row per distinct
# (position, residue) pair, then concatenate the residues in row order.
wt_residues = cas_all_counts[['AA Position', 'WT AA']].drop_duplicates()
wt_sequence = ''.join(wt_residues['WT AA'].values)

print(wt_sequence)

MDKKYSIGLDIGTNSVGWAVITDEYKVPSKKFKVLGNTDRHSIKKNLIGALLFDSGETAEATRLKRTARRRYTRRKNRICYLQEIFSNEMAKVDDSFFHRLEESFLVEEDKKHERHPIFGNIVDEVAYHEKYPTIYHLRKKLVDSTDKADLRLIYLALAHMIKFRGHFLIEGDLNPDNSDVDKLFIQLVQTYNQLFEENPINASGVDAKAILSARLSKSRRLENLIAQLPGEKKNGLFGNLIALSLGLTPNFKSNFDLAEDAKLQLSKDTYDDDLDNLLAQIGDQYADLFLAAKNLSDAILLSDILRVNTEITKAPLSASMIKRYDEHHQDLTLLKALVRQQLPEKYKEIFFDQSKNGYAGYIDGGASQEEFYKFIKPILEKMDGTEELLVKLNREDLLRKQRTFDNGSIPHQIHLGELHAILRRQEDFYPFLKDNREKIEKILTFRIPYYVGPLARGNSRFAWMTRKSEETITPWNFEEVVDKGASAQSFIERMTNFDKNLPNEKVLPKHSLLYEYFTVYNELTKVKYVTEGMRKPAFLSGEQKKAIVDLLFKTNRKVTVKQLKEDYFKKIECFDSVEISGVEDRFNASLGTYHDLLKIIKDKDFLDNEENEDILEDIVLTLTLFEDREMIEERLKTYAHLFDDKVMKQLKRRRYTGWGRLSRKLINGIRDKQSGKTILDFLKSDGFANRNFMQLIHDDSLTFKEDIQKAQVSGQGDSLHEHIANLAGSPAIKKGILQTVKVVDELVKVMGRHKPENIVIEMARENQTTQKGQKNSRERMKRIEEGIKELGSQILKEHPVENTQLQNEKLYLYYLQNGRDMYVDQELDINRLSDYDVDHIVPQSFLKDDSIDNKVLTRSDKNRGKSDNVPSEEVVKKMKNYWRQLLNAKLITQRKFDNLTKAERGGLSELDKAGFIKRQLVETRQITKHVAQILDSRMNTKYDENDKLIREVKVITLKSKLVSDFRKDFQFYKVREINNYHHAHDAYLNAVVGTALIKK

In [9]:
# Distinct values of the 'Mutant AA' column, in order of first appearance.
# NOTE(review): rows with no mutant residue (the first preview row looks like
# one) would contribute a NaN entry here -- confirm whether that is wanted.
possible_mutations = cas_all_counts.loc[:, 'Mutant AA'].unique()

In [10]:
# Preview the first three rows of the count table.
cas_all_counts.head(3)

Unnamed: 0,Nucleotide Position,AA Position,WT Codon,WT AA,Domain,Mutant AA,Mutant nucleotide,Mutant Codon,Initial Counts,Initial Total Counts,...,Positive 3 Counts,Positive 3 Sum Reads per Position,Negative Counts,Negative Total Counts,Positive Counts,Positive Total Counts,Negative Adjusted Fisher P Values,Positive Adjusted Fisher P Values,Log2 Fold Change after Negative Selection,Log2 Fold Change after Positive Selection
0,4,1,ATG,M,RuvC,,A,,35094,35220,...,37999,38352,86324,86806,125416,126550,,,-0.002863,-0.004953
1,5,1,ATG,M,RuvC,K,A,AAG,330,35200,...,725,38349,2092,86784,2685,126540,1.15e-67,0.000147,1.35884,-0.1842
2,6,1,ATG,M,RuvC,I,A,ATA,273,35168,...,2712,37985,1387,86653,9027,125407,9.41e-29,0.0,1.039798,2.168108


In [11]:
# One dict per AA position, appended in the order the groups are visited.
ordered_change_impact = []

def assign_score(group):
    """Record which mutant residues were observed at one AA position.

    Parameters:
        group: rows of the count table for a single AA position; must
            provide the 'AA Position' and 'Mutant AA' columns.

    For every entry of the module-level ``possible_mutations`` the recorded
    dict holds the group's AA position number when that residue occurs in
    the group's 'Mutant AA' column, and NaN otherwise. Note that the stored
    value is the position itself, not a fitness score. The dict is appended
    to the module-level ``ordered_change_impact``; nothing is returned.
    """
    group_AAs = group['Mutant AA'].unique()
    # All rows of a group share one position, so look it up once.
    position = group['AA Position'].values[0]
    group_score = {}
    for AA in possible_mutations:
        # Membership is tested against the NumPy array on purpose: a NaN
        # entry never compares equal, so it always maps to NaN.
        group_score[AA] = position if AA in group_AAs else math.nan
    ordered_change_impact.append(group_score)

# Walk the groups explicitly instead of groupby(...).apply(assign_score).
# assign_score works purely by side effect, and pandas does not promise to
# call an applied function exactly once per group; newer pandas releases also
# change whether the grouping column is passed to applied functions.
# Iterating visits each position once, in ascending order, with both columns.
for _, position_rows in cas_all_counts[['AA Position', 'Mutant AA']].groupby("AA Position"):
    assign_score(position_rows)

In [13]:
# Build a table with one row per sequence position (labelled by its WT
# residue), then flip it so mutant residues are rows and positions are columns.
mutation_matrix = DataFrame(
    ordered_change_impact,
    index=list(wt_sequence),
).transpose()

In [14]:
# Display the matrix: rows are mutant residues, columns are WT residues in
# sequence order.
mutation_matrix

Unnamed: 0,M,D,K,K.1,Y,S,I,G,L,D.1,...,Y.1,P,Y.2,D.2,V,P.1,D.3,Y.3,A,*
,,,,,,,,,,,...,,,,,,,,,,
K,1.0,,3.0,4.0,,,,,,,...,,,,,,,,,,1391.0
I,1.0,,3.0,4.0,,,7.0,,,,...,,,,,1386.0,,,,,
L,1.0,,,,,6.0,7.0,,9.0,,...,,1383.0,,,1386.0,1387.0,,,,1391.0
T,1.0,,3.0,4.0,,6.0,7.0,,,,...,,1383.0,,,,1387.0,,,1390.0,
V,1.0,2.0,,,,,7.0,8.0,9.0,10.0,...,,,,1385.0,1386.0,,1388.0,,1390.0,
R,1.0,,3.0,4.0,,,,8.0,9.0,,...,,1383.0,,,,1387.0,,,,
N,,2.0,3.0,4.0,5.0,,7.0,,,10.0,...,1382.0,,1384.0,1385.0,,,1388.0,1389.0,,
E,,2.0,3.0,4.0,,,,8.0,,10.0,...,,,,1385.0,,,1388.0,,,1391.0
H,,2.0,,,5.0,,,,,10.0,...,1382.0,1383.0,1384.0,1385.0,,1387.0,1388.0,1389.0,,
