# Introduction

The purpose of this notebook is to provide official documentation of parts of the software used in the paper.

**Input**: This notebook mainly takes codon_counts files generated with dms_tools2 and bcsubamp from the Bloom laboratory.

**Output**: Either dataframes or figures are generated for the paper.

Dataframes that were generated in this notebook were then visualized in Prism. The final formatting was done in Adobe Illustrator.

**TODO:** Update the GitHub link.
We recommend running this notebook in Google Colab by opening the following link:

To run the code, click "Run all". Alternatively, first run all code cells in the "Run setup" section and then run any of the code cells in the figure section.

For further documentation please refer to the paper.

# Run setup

#### Initial setup

In [None]:
#@markdown Link to Github
# Clone the project repository (IPython shell escape) and switch the working
# directory to it, so that the relative 'data/...' paths used in this notebook resolve.
import os
import warnings
from IPython.display import display, HTML, Markdown
!git clone https://github.com/HBV-DMS/R1
# Not the last expression of the cell, so this value is not displayed.
os.getcwd()
# NOTE(review): absolute path — presumably where the clone lands on Colab; confirm before running elsewhere.
os.chdir('/content/R1/')
# Last expression of the cell: echoes the new working directory.
os.getcwd()

In [None]:
#@markdown Install dependencies for Google Colab
# Install the extra packages only when the string form of the IPython shell
# mentions 'google.colab'; in any other environment just print a notice.
if 'google.colab' in str(get_ipython()):
    # NOTE(review): the package versions are not pinned.
    !pip install altair
    !pip install altair_saver
    !pip install biopython
    !pip install colour
    import os
    import warnings
    import pandas
    from IPython.display import display, HTML, Markdown
else:
    print('Not running on CoLab')

In [None]:
#@markdown Import dependencies
import altair as alt
import numpy as np
import pandas as pd
import math
from Bio.Seq import Seq
from scipy.stats import (ttest_ind, fisher_exact, chi2_contingency)
from sklearn.linear_model import LinearRegression
from matplotlib.pyplot import figure
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
#@markdown Set paths to files
# Relative paths to the codon-count CSV files. Variable names encode the region
# (Core, TP, Spacer, RT, RNaseH), the sample type (plasmid or cell) and the
# replicate number.
Core_plasmid1 = 'data/codoncounts/Core_plasmid1_codoncounts.csv'
Core_plasmid2 = 'data/codoncounts/Core_plasmid2_codoncounts.csv'
Core_cell1 = 'data/codoncounts/Core_cell1_codoncounts.csv'
Core_cell2 = 'data/codoncounts/Core_cell2_codoncounts.csv'
Core_cell3 = 'data/codoncounts/Core_cell3_codoncounts.csv'


TP_plasmid1 = 'data/codoncounts/TP_plasmid1_codoncounts.csv'
TP_plasmid2 = 'data/codoncounts/TP_plasmid2_codoncounts.csv'
TP_plasmid3 = 'data/codoncounts/TP_plasmid3_codoncounts.csv'
TP_cell1 = 'data/codoncounts/TP_cell1_codoncounts.csv'
TP_cell2 = 'data/codoncounts/TP_cell2_codoncounts.csv'
TP_cell3 = 'data/codoncounts/TP_cell3_codoncounts.csv'

Spacer_plasmid1 = 'data/codoncounts/Spacer_plasmid1_codoncounts.csv'
Spacer_plasmid2 = 'data/codoncounts/Spacer_plasmid2_codoncounts.csv'
Spacer_plasmid3 = 'data/codoncounts/Spacer_plasmid3_codoncounts.csv'
Spacer_cell1 = 'data/codoncounts/Spacer_cell1_codoncounts.csv'
Spacer_cell2 = 'data/codoncounts/Spacer_cell2_codoncounts.csv'
Spacer_cell3 = 'data/codoncounts/Spacer_cell3_codoncounts.csv'

RT_plasmid1 = 'data/codoncounts/RT_plasmid1_codoncounts.csv'
RT_plasmid2 = 'data/codoncounts/RT_plasmid2_codoncounts.csv'
RT_plasmid3 = 'data/codoncounts/RT_plasmid3_codoncounts.csv'
RT_cell1 = 'data/codoncounts/RT_cell1_codoncounts.csv'
RT_cell2 = 'data/codoncounts/RT_cell2_codoncounts.csv'
RT_cell3 = 'data/codoncounts/RT_cell3_codoncounts.csv'

# RNaseH has a single plasmid file and six cell files.
RNaseH_plasmid1 = 'data/codoncounts/RNaseH_plasmid1_codoncounts.csv'
RNaseH_cell1 = 'data/codoncounts/RNaseH_cell1_codoncounts.csv'
RNaseH_cell2 = 'data/codoncounts/RNaseH_cell2_codoncounts.csv'
RNaseH_cell3 = 'data/codoncounts/RNaseH_cell3_codoncounts.csv'
RNaseH_cell4 = 'data/codoncounts/RNaseH_cell4_codoncounts.csv'
RNaseH_cell5 = 'data/codoncounts/RNaseH_cell5_codoncounts.csv'
RNaseH_cell6 = 'data/codoncounts/RNaseH_cell6_codoncounts.csv'

# Auxiliary tables.
# OrderOfAA: Excel sheet merged on the 'AA' column by dftoheatmap_AAgrouped.
OrderOfAA = "./data/other/AAorder.xlsx"
insilicoDMS_Pol = "./data/other/HBV_pol_RiceSeq_full_t099_b09_single_mutant_matrix.csv"
# vkeym: (wildtype, variable) pairs whose counts fundropvforcore sets to zero
# when mainfunction2 is called with _filter == "yesfilter".
vkeym = "./data/other/vkeym.csv"
vkeyultradetail = "./data/other/vkeyultradetail.csv"
# df41: despite the name, this is a file path; the CSV is read as a codon -> amino-acid
# table with 'Codon' and 'AA' columns (see collapser / GetWTSeqFromDfAsString).
df41 = './data/other/codonaatranslation.csv'

In [None]:
#@markdown Hide code warning

import warnings

# Colab form checkbox: True silences every Python warning for the session,
# False restores the 'default' warning filter.
Hide_Warnings = True #@param {type:"boolean"}

if Hide_Warnings:
    warnings.filterwarnings('ignore')
else:
    warnings.filterwarnings('default')

In [None]:
#@markdown Do not reset variables
# NOTE(review): this flag is not read by any code visible in this section —
# presumably consumed by later cells; confirm.
Do_not_reset = True #@param {type:"boolean"}

#### Define functions

In [None]:
def fundropvforcore(_RNaseH_plasmid1, whichkey):
    """Set the counts of listed (wildtype, variant) pairs to zero.

    Parameters
    ----------
    _RNaseH_plasmid1 : str or DataFrame
        Wide count table with 'site' and 'wildtype' columns plus one column per
        variant. A string is read with ``pd.read_csv``; anything else is used as is.
    whichkey : path or file-like
        CSV with 'wildtype' and 'variable' columns naming the pairs to zero out.

    Returns
    -------
    DataFrame
        Wide table again ('site', 'wildtype', one column per variant) in which
        every listed pair has the value 0.
    """
    if isinstance(_RNaseH_plasmid1, str):
        counts_wide = pd.read_csv(_RNaseH_plasmid1)
    else:
        counts_wide = _RNaseH_plasmid1

    # Long format: one row per (site, wildtype, variable) with its count in 'value'.
    counts_long = counts_wide.melt(id_vars=['site', 'wildtype'])

    # Every pair present in the key file gets the marker sus == 1.
    flagged_pairs = pd.read_csv(whichkey)
    flagged_pairs['sus'] = 1

    marked = pd.merge(counts_long, flagged_pairs, how='left', on=['wildtype', 'variable'])
    marked.loc[marked.sus == 1, ['value']] = 0

    # Back to the wide layout; 'first' keeps one value per cell.
    rebuilt = pd.pivot_table(
        marked,
        index=["site", "wildtype"],
        columns="variable",
        values="value",
        aggfunc='first',
    )
    return rebuilt.reset_index()


In [None]:
def collapser(df):
    """Collapse a long codon-level table to amino-acid level.

    Both the 'wildtype' and the 'variable' codon of every row are translated
    with the table read from the path ``df41`` ('Codon' and 'AA' columns). The
    original codons are kept under 'wtx' / 'varx', the amino acids take over
    the names 'wildtype' / 'variable', and rows are summed per
    (site, variable, wildtype).

    NOTE(review): ``.sum()` is called without ``numeric_only``, so what happens
    to the text columns 'wtx' / 'varx' depends on the installed pandas version — confirm.
    """
    codon_table = pd.read_csv(df41)
    wildtype_lookup = codon_table.rename(columns={'Codon': 'wildtype'})
    variant_lookup = codon_table.rename(columns={'Codon': 'variable', 'AA': 'AA2'})

    translated = pd.merge(df, wildtype_lookup, how='left', on=['wildtype'])
    translated = pd.merge(translated, variant_lookup, how='left', on=['variable'])
    translated = translated.rename(
        columns={'wildtype': 'wtx', 'variable': 'varx', 'AA': 'wildtype', 'AA2': 'variable'}
    )

    return translated.groupby(['site', 'variable', 'wildtype']).sum().reset_index()

In [None]:
def pcal(_df):
    """Attach a chi-square p-value to one row of the enrichment table.

    The row must provide 'pre_sum', 'pre_value', 'post_sum' and 'post_value'.
    A 2x2 table [[other, variant] before], [[other, variant] after] is passed
    to ``chi2_contingency`` and the p-value is stored under 'pvalue'.
    The (modified) row is returned.
    """
    pre_other = _df['pre_sum'] - _df['pre_value']
    post_other = _df['post_sum'] - _df['post_value']
    contingency = [
        [pre_other, _df['pre_value']],
        [post_other, _df['post_value']],
    ]
    test_result = chi2_contingency(contingency)
    # Element 1 of the result is the p-value.
    _df['pvalue'] = test_result[1]
    return _df

def plusoneadderifzero(_post, _pre):
    """Replace a zero post-selection count by a small stand-in value.

    A non-zero ``_post`` is returned unchanged. When ``_post`` is zero the
    result is ``_pre`` if ``_pre`` is below 1, and 1 otherwise.
    """
    if _post != 0:
        return _post
    # Post count is zero: use the (sub-unit) expected count, capped at 1.
    return _pre if _pre < 1 else 1

def concat123(_dfs):
    """Pool several count CSVs by summing them row by row.

    Each entry of ``_dfs`` is read with ``pd.read_csv``. Rows sharing the same
    row index are summed over all files (numeric columns only). 'wildtype' is
    copied from the first file and 'site', which was summed as well, is divided
    by the number of files.
    """
    tables = [pd.read_csv(source) for source in _dfs]

    stacked = pd.concat(tables)
    pooled = stacked.groupby(stacked.index).sum(numeric_only=True)

    pooled['wildtype'] = tables[0]['wildtype']
    # Undo the summation of the site numbers.
    pooled['site'] = pooled['site'] / len(_dfs)
    return pooled

def mainfunction2(i, ii, _rep, _filter, _newpref, _yespvalue, mykozakyesorno = 'nokozak', _codonorAA="codon"):
    """Build the per-site, per-variant enrichment table from a pre and a post count table.

    Parameters
    ----------
    i, ii : str or DataFrame
        Pre- and post-selection count tables (wide: 'site', 'wildtype', one
        column per variant) or CSV paths. Without filtering, the type of ``i``
        decides whether BOTH arguments are read from disk.
    _rep : object
        Label written to the 'replicate' column.
    _filter : str
        "yesfilter" passes both tables through ``fundropvforcore(..., vkeym)``.
    _newpref : str
        'yesnewpref' makes 'factor' a ratio of odds (value / (sum - value));
        any other value makes it the ratio of fractions.
    _yespvalue : str
        'yespvalue' applies ``pcal`` to every row and adds 'pvalue' / 'pvaluelog'.
    mykozakyesorno : str
        'yeskozak' calls ``mykozak``, 'yeskozak2' calls ``mykozak2``.
        NOTE(review): ``mykozak`` is not defined in the visible part of the notebook.
    _codonorAA : str
        "AA" collapses codons to amino acids via ``collapser``; otherwise the
        codon table read from ``df41`` is merged in to provide 'AA' / 'wtAA'.

    Returns
    -------
    DataFrame
        One row per (site, wildtype, variable, replicate) with counts, fractions,
        'factor', 'mfactor' (odds ratio post vs. pre), 'logmfactor' (log2), and
        the text column 'info'. Rows containing NaN and rows whose
        'logmfactor' is not below 10000 (e.g. +inf) are removed.
    """
    if _filter == "yesfilter":
        predf = fundropvforcore(i, vkeym)
        postdf = fundropvforcore(ii, vkeym)
    else:
        if isinstance(i, str):
            predf = pd.read_csv(i)
            postdf = pd.read_csv(ii)
        else:
            predf = i
            postdf = ii

    # Long format: one row per (site, wildtype, variable) with the count in 'value'.
    predf1m = predf.melt(id_vars=['site',  'wildtype'])
    postdf1m = postdf.melt(id_vars=['site',  'wildtype'])

    if _codonorAA == "AA":
        predf1m = collapser(predf1m)
        postdf1m = collapser(postdf1m)

    predf1m['replicate'] = _rep
    postdf1m['replicate'] = _rep

    df_concat_pre = predf1m.rename(columns={"value": "pre_value"})
    df_concat_post = postdf1m.rename(columns={"value": "post_value"})

    # --- pre-selection: per-site totals and fractions ---
    multi_pre = df_concat_pre.set_index(['site', 'wildtype', 'replicate']).sort_index()

    # The former `axis=0` argument was dropped: it is the default, and the
    # keyword is deprecated in recent pandas releases.
    multi_pre_sum = multi_pre.groupby(level=['site', 'wildtype', 'replicate']).sum(numeric_only=True)

    multi_pre_sum = multi_pre_sum.rename(columns={"pre_value": "pre_sum"})
    multi_pre = pd.merge(multi_pre, multi_pre_sum, on=['site', 'wildtype', 'replicate'])
    multi_pre['pre_frac'] = multi_pre['pre_value']/multi_pre['pre_sum']

    multi_pre.reset_index(inplace=True)
    multi_pre = multi_pre.set_index(['site', 'wildtype', 'variable', 'replicate']).sort_index()

    # --- post-selection: per-site totals ---
    multi_post = df_concat_post.set_index(['site', 'wildtype', 'replicate']).sort_index()

    multi_post_sum = multi_post.groupby(level=['site', 'wildtype', 'replicate']).sum(numeric_only=True)

    multi_post_sum = multi_post_sum.rename(columns={"post_value": "post_sum"})
    multi_post = pd.merge(multi_post, multi_post_sum, on=['site', 'wildtype', 'replicate'])

    multi_post.reset_index(inplace=True)
    multi_post = multi_post.set_index(['site', 'wildtype', 'variable', 'replicate']).sort_index()

    df = pd.merge(multi_pre, multi_post, on=['site', 'wildtype', 'variable', 'replicate', ]).sort_index()

    # Pre-selection count rescaled to the post-selection read depth of the site.
    df['nrmpre'] = df['pre_frac'] * df['post_sum']

    # Zero post counts are replaced so that the ratios below stay finite.
    df['post_value'] = df.apply(lambda x: plusoneadderifzero(x.post_value, x.nrmpre), axis=1)

    df['post_frac'] = df['post_value']/df['post_sum']

    if _newpref == 'yesnewpref':
        df['pre_ratio'] = df['pre_value']/(df['pre_sum'] - df['pre_value'])
        df['post_ratio'] = df['post_value']/(df['post_sum'] - df['post_value'])
        df['factor'] = df['post_ratio']/df['pre_ratio']
    else:
        df['factor'] = df['post_frac']/df['pre_frac']

    df = df.reset_index()

    if not _codonorAA=="AA":
        # Attach to every row the factor of the wild-type codon at the same site.
        wtlist = df[['site', 'variable', 'replicate', 'factor']].rename(columns={"variable": "wildtype", 'factor': 'wtnormfactor'})

        df = pd.merge(df, wtlist, on=['site', 'wildtype', 'replicate'])

        # Translate variant and wild-type codons ('AA' and 'wtAA').
        tdf = pd.read_csv(df41).rename(columns={"Codon": "variable"})

        df = pd.merge(df, tdf, on='variable')

        tdf = tdf.rename(columns={"variable": "wildtype", 'AA':'wtAA'})

        df = pd.merge(df, tdf, on='wildtype')

    if _codonorAA=="AA":
        # After collapsing, 'variable' / 'wildtype' already hold amino acids.
        df['AA']=df['variable']
        df['wtAA']=df['wildtype']

    # Odds ratio of the variant after vs. before selection.
    df['mfactor'] = df['post_frac']/df['pre_frac'] * (1 - df['pre_frac'])/(1-df['post_frac'])

    df['logmfactor'] = np.log2(df['mfactor'])

    df['fullname'] = df['AA'] + ', ' + df['variable']
    df['info_norpre'] = np.round(df['pre_frac'] * df['post_sum'], 1)

    df = df.sort_values('mfactor', ascending=False).dropna()
    df = df[df.logmfactor < 10000]

    if _yespvalue == 'yespvalue':
        df = df.apply(pcal, axis=1)
        df['info'] = 'norpre#: ' + df['info_norpre'].astype(str) + ' post#: ' + df['post_value'].astype(str) + " wt: " + df['wildtype'].astype(str) + " var: " + df['variable'].astype(str) + " factor: " + np.round(df['factor'],2).astype(str) + " pvalue: " + df['pvalue'].astype(str)
        df['pvaluelog'] = -np.log10(df['pvalue'])
    else:
        df['info'] = 'norpre#: ' + df['info_norpre'].astype(str) + ' post#: ' + df['post_value'].astype(str) + " wt: " + df['wildtype'].astype(str) + " var: " + df['variable'].astype(str) + " factor: " + np.round(df['factor'],2).astype(str) + " pvalue: " + 'empty'

    if mykozakyesorno == 'yeskozak':
        df = mykozak(df)
    elif mykozakyesorno == 'yeskozak2':
        df = mykozak2(df)

    df = somechanges(df)

    return df

In [None]:
def mykozak2(df):
    """Add a 'kozakscore' column: the Kozak similarity of the window around each row's site.

    The wild-type codon sequence is read from ``Core_cell1`` and padded with
    ten 'X' on both sides so that a 23-base window exists for every codon.
    The padded sequence is now handed to the row helper explicitly; before, it
    was built here but never used, and the helper silently read a module-level
    variable of the same name.

    NOTE(review): the sequence always comes from ``Core_cell1``, whatever
    region ``df`` describes — confirm this is intended.
    NOTE: the passed-in ``df`` receives the 'kozakscore' column as a side effect.
    """
    padded_seq = 'XXXXXXXXXX' + GetWTSeqFromDfAsString(pd.read_csv(Core_cell1), _codonORamino = 'codon') + 'XXXXXXXXXX'
    df['kozakscore']=0
    # Extra keyword arguments of DataFrame.apply are forwarded to the row function.
    df = df.apply(row_iter_mykozak2, axis=1, codonseq=padded_seq)
    return df

def row_iter_mykozak2(row, codonseq=None):
    """Score the 23-base wild-type window centred on the codon of ``row['site']``.

    ``codonseq`` is the codon sequence padded with 10 characters on each side.
    When omitted, the module-level ``fullcodonseq`` is used (previous behaviour).
    The row's 'variable' is not substituted into the window.
    """
    if codonseq is None:
        codonseq = fullcodonseq
    # 1-based position of the codon's first base, then shifted by the 10-base padding.
    mypos0 = int(row['site']) * 3 - 2
    mypos1 = mypos0 + 10
    # 10 bases upstream + codon (3) + 10 bases downstream = 23 characters.
    kozakseq = codonseq[(int(mypos1)-9-1-1):(int(mypos1)+3+9-1+1)]
    row['kozakscore'] = similarity_score(kozakseq)
    return row


In [None]:
def myprefcal_deter(pre_counts, post_counts, _min, _max, scaletype, outputtype, _filter, _newpref, _concat, _yespvalue = 'yespvalue'):
    """Enrichment table annotated with start/stop-codon changes in the neighbouring frames.

    Parameters
    ----------
    pre_counts, post_counts : list of str
        Paths of the pre- and post-selection codon-count CSVs.
    _min, _max, scaletype, outputtype
        Accepted for signature compatibility; not used in this function.
    _filter, _newpref, _yespvalue
        Forwarded to ``mainfunction2``.
    _concat : str
        'yesconcat' pools all files with ``concat123`` (replicate label 'ABC');
        any other value uses only the first file of each list (label 'A').

    Returns
    -------
    DataFrame
        Output of ``mainfunction2`` after ``codonchecker`` was applied to every
        row, plus the columns 'siteB' and 'siteC'.
    """
    if _concat == 'yesconcat':
        preconcat = concat123(pre_counts)
        postconcat = concat123(post_counts)
        df = mainfunction2(preconcat, postconcat, 'ABC', _filter, _newpref, _yespvalue)
    else:
        # Previously `df` was left unassigned on this path and the function
        # crashed below; use the first replicate only, as myprefcal does.
        df = mainfunction2(pre_counts[0], post_counts[0], 'A', _filter, _newpref, _yespvalue)

    # Wild-type nucleotide sequence, padded with two characters on each side so
    # that codonchecker can look two bases beyond every codon.
    df_wt = pd.read_csv(pre_counts[0])
    codonseq = ''.join(df_wt['wildtype'])
    _codonseq = 'XX' + codonseq + 'XX'

    df = df.apply(codonchecker, codonseq = _codonseq, axis=1)

    # NOTE(review): the flag columns are picked by position (first 16, next 16).
    # Presumably these are the '-1...' and '-2...' flags written by
    # codonchecker — confirm, since both order and number of columns depend on
    # which flags were actually produced.
    df['siteB']=0
    df['siteC']=0
    for i in df.iloc[: , :16].columns:
        df.loc[df[i]==1, 'siteC']=1
    # site - 1 where any of those flags is set, otherwise site.
    df['siteC']=df['site']-df['siteC']

    for i in df.iloc[: , 16:32].columns:
        df.loc[df[i]==1, 'siteB']=1
    df['siteB']=df['site']-df['siteB']

    return df

In [None]:
def checkifsamegroup(_mutseqfrag, _targetcodon):
    """Classify a triplet as 'start' (ATG/CTG), 'stop' (TAG/TAA/TGA) or neither.

    ``_targetcodon`` is accepted but does not influence the result.

    Returns
    -------
    list
        ``[True, 'start']``, ``[True, 'stop']`` or ``[False, 'nogroup']``.
    """
    codon_groups = (
        ('start', ('ATG', 'CTG')),
        ('stop', ('TAG', 'TAA', 'TGA')),
    )
    for group_name, members in codon_groups:
        if _mutseqfrag in members:
            return [True, group_name]
    return [False, 'nogroup']

def codonchecker(_df, codonseq):
    """Flag start/stop motifs that a codon substitution breaks, preserves or creates.

    ``_df`` is one row with 'site' and 'variable'; ``codonseq`` is the
    wild-type nucleotide sequence padded with two characters on each side.
    A 7-base window (2 + codon + 2) is taken around the site, the codon is
    replaced by 'variable', and the five triplets at offsets -2..2 are
    compared between the wild-type and the mutated window.

    Keys written to the row (value 1), with ``<o>`` the offset:
    ``<o><motif>break``, ``<o><group>preserve`` (mutant triplet is another
    start/stop codon), ``<o><motif>preserve`` and ``<o><motif>maker``; plus
    'mypos', 'wtseq' and 'mutseq'.
    """
    first_base = int(_df['site']) * 3 - 2

    window_wt = codonseq[first_base - 1:first_base + 6]
    window_mut = window_wt[0:2] + _df['variable'] + window_wt[5:7]

    _df['mypos'] = first_base
    _df['wtseq'] = window_wt
    _df['mutseq'] = window_mut

    motifs = ['ATG', 'CTG', 'TAG', 'TAA', 'TGA']
    for offset in [-2, -1, 0, 1, 2]:
        label = str(offset)
        wt_triplet = window_wt[(offset + 2):(offset + 5)]
        mut_triplet = window_mut[(offset + 2):(offset + 5)]
        triplet_unchanged = mut_triplet == wt_triplet

        for motif in motifs:
            if wt_triplet == motif:
                if triplet_unchanged:
                    _df[label + motif + 'preserve'] = 1
                else:
                    _df[label + motif + 'break'] = 1
                    # The motif is gone, but the replacement may still be a
                    # start or stop codon.
                    same_group, group_name = checkifsamegroup(mut_triplet, motif)
                    if same_group:
                        _df[label + group_name + 'preserve'] = 1
            elif mut_triplet == motif:
                _df[label + motif + 'maker'] = 1
    return _df

In [None]:
def GetWTSeqFromDfAsString(_df, _codonORamino = 'codon'):
    """Return the wild-type sequence of a count table as one string.

    Parameters
    ----------
    _df : DataFrame
        Table with 'site' and 'wildtype' (codon) columns, one row per site.
    _codonORamino : str
        'codon' returns the concatenated codons; any other value returns the
        concatenated amino acids, translated with the table read from ``df41``
        ('Codon' and 'AA' columns).

    Only the requested sequence is assembled, so a codon that is missing from
    the translation table no longer breaks a codon-only request.
    """
    dftrans = pd.read_csv(df41).rename(columns={'Codon':'wildtype'})

    # The merge is kept for both modes so that the rows are the same as before.
    df = pd.merge(_df, dftrans,  how='left', on=['wildtype'])[['site', 'wildtype', 'AA']]

    column = 'wildtype' if _codonORamino == 'codon' else 'AA'
    return ''.join(df[column])



In [None]:
def thecomparer(WTc, VARc):
    """Encode how a variant codon differs from the wild-type codon as a 3-digit string.

    For each of the (at most three) positions the digit is 0 when the bases
    are equal, otherwise the rank (1-3) of the variant base in the list that
    belongs to the wild-type base, e.g. ``thecomparer('ATG', 'CTG') == '300'``.

    Raises
    ------
    ValueError
        If a differing position holds a wild-type or variant base other than
        A, T, G, C. Previously an unknown wild-type base either failed with an
        unbound-variable error or silently reused the list of the preceding
        position.
    """
    # Rank of every replacement base, per wild-type base (index 0 = unchanged).
    substitution_order = {
        'A': ['A', 'T', 'G', 'C'],
        'T': ['T', 'A', 'C', 'G'],
        'G': ['G', 'C', 'A', 'T'],
        'C': ['C', 'G', 'T', 'A'],
    }

    result = [0, 0, 0]
    for wt_base, var_base, position in zip(WTc, VARc, range(3)):
        if wt_base == var_base:
            continue
        if wt_base not in substitution_order:
            raise ValueError(
                "unsupported wild-type base %r at position %d of %r" % (wt_base, position, WTc)
            )
        ranked = substitution_order[wt_base]
        if var_base not in ranked:
            raise ValueError(
                "unsupported variant base %r at position %d of %r" % (var_base, position, VARc)
            )
        result[position] = ranked.index(var_base)
    return (str(result[0]) + str(result[1]) + str(result[2]))

def mysortbynumbermutations(_df):
    """Add the column 'sortedbymutations' to every row.

    The value is ``thecomparer(row['wildtype'], row['variable'])``, i.e. the
    3-digit code describing how the variant differs from the wild type.
    Returns the table produced by the row-wise ``apply``.
    """
    def _tag_row(record):
        record['sortedbymutations'] = thecomparer(record['wildtype'], record['variable'])
        return record

    return _df.apply(_tag_row, axis=1)

In [None]:
# Row order for the y axis of the heatmaps (passed as `sort=nucsort`):
# the 64 codons grouped by amino acid, followed by the 20 amino-acid letters
# and '*' in the same group order.
nucsort = ['GCT', 'GCA', 'GCG', 'GCC', 'ATC', 'ATA', 'ATT', 'TTA', 'TTG', 'CTG', 'CTA', 'CTC', 'CTT', 'GTT', 'GTA', 'GTG', 'GTC', 'ATG', 'TTC', 'TTT', 'TAC', 'TAT', 'TGG', 'TGC', 'TGT', 'GGT', 'GGC', 'GGA', 'GGG', 'CCT', 'CCC', 'CCA', 'CCG', 'CAC', 'CAT', 'CGC', 'AGG', 'CGT', 'AGA', 'CGG', 'CGA', 'AAA', 'AAG', 'GAC', 'GAT', 'GAG', 'GAA', 'CAA', 'CAG', 'AAC', 'AAT', 'AGC', 'TCT', 'TCA', 'TCG', 'AGT', 'TCC', 'ACG', 'ACA', 'ACT', 'ACC', 'TAG', 'TAA', 'TGA','A', 'I', 'L', 'V', 'M', 'F', 'Y', 'W', 'C', 'G', 'P', 'H', 'R', 'K', 'D', 'E', 'Q', 'N', 'S', 'T', '*']

In [None]:
# Lift Altair's row limit so that charts can be built from the full tables.
alt.data_transformers.disable_max_rows()

def myprefcal(pre_counts, post_counts, _min, _max, scaletype, outputtype, _filter, _newpref, _concat, _yespvalue, mykozakyesorno = 'nokozak', __removetext='noremove', __codonorAA="codon", _sortbynumbermutations = 'nosortby', cscheme="redyellowgreen", creverse=False, _sea = 'nosea'):
    """Compute the enrichment table and return it as a table, a heatmap or a cluster map.

    Parameters
    ----------
    pre_counts, post_counts : list of str
        Paths of the pre- and post-selection codon-count CSVs.
    _min, _max, scaletype, cscheme, creverse
        Colour-scale settings forwarded to ``dftoheatmap``.
    outputtype : str
        'table' returns the DataFrame; anything else returns the heatmap.
    _filter, _newpref, _yespvalue, mykozakyesorno
        Forwarded to ``mainfunction2``.
    _concat : str
        'yesconcat' pools all files with ``concat123`` (replicate label 'ABC');
        otherwise only the first file of each list is used (label 'A').
    __removetext : str
        Forwarded to ``dftoheatmap`` as ``_removetext``.
    __codonorAA : str
        Forwarded to ``mainfunction2`` as ``_codonorAA``.
    _sortbynumbermutations : str
        'yessortby' adds the 'sortedbymutations' column before output.
    _sea : str
        'yessea' returns a clustered heatmap of 'logmfactor' instead
        (takes precedence over ``outputtype``).
    """
    if _concat == 'yesconcat':
        preconcat = concat123(pre_counts)
        postconcat = concat123(post_counts)
        df = mainfunction2(preconcat, postconcat, 'ABC', _filter, _newpref, _yespvalue, mykozakyesorno = mykozakyesorno, _codonorAA=__codonorAA)
    else:
        df = mainfunction2(pre_counts[0], post_counts[0], 'A', _filter, _newpref, _yespvalue, mykozakyesorno = mykozakyesorno, _codonorAA=__codonorAA)

    if _sea == 'yessea':
        # NOTE(review): `sns` is not imported in the setup cells visible in this
        # part of the notebook; unless it is imported elsewhere, this branch
        # fails with a NameError — confirm and add the import to the setup cell.
        # Sites become columns, variants rows; missing and infinite values are shown as 0.
        seadf = df.pivot(index=['AA', 'variable'], columns='site', values='logmfactor').fillna(0).replace([np.inf, -np.inf], 0)
        sns.set(rc={'figure.figsize':(40, 20)})
        sns.set_theme(style="whitegrid")
        return sns.clustermap(seadf, annot=False, cmap=sns.diverging_palette(20, 140, l=60, center="light", as_cmap=True), col_cluster=False, figsize=(40, 20), vmin = -5, vmax = 5)

    if _sortbynumbermutations == 'yessortby':
        df = mysortbynumbermutations(df)

    if outputtype == 'table':
        return df
    else:
        return dftoheatmap(df, _min, _max, scaletype, _removetext=__removetext, _sortbynt = _sortbynumbermutations, cscheme=cscheme, creverse=creverse)



# NOTE(review): `name` is not referenced by any code visible in this section —
# presumably a label used by later cells; confirm or remove.
name = "corecodonmypref notwtnorm 210614"

In [None]:
def somechanges(_df):
    """Pass-through hook called at the end of mainfunction2; returns its argument unchanged."""
    untouched = _df
    return untouched

In [None]:
def dftoheatmap(df, _min=-6, _max=6, scaletype='linear', _sizer='new_just_pre_frac_per_1E6', _sizervalue = 16, _sizerwho = 'nrmpre', _removetext='noremove',_sortbynt = 'nosortbynt', cscheme="redyellowgreen", creverse=False):
    """Draw the enrichment table as a heatmap of squares (site on x, variant on y).

    Colour encodes 'logmfactor', clipped to ``[_min, _max]``; square size
    encodes the value chosen by ``_sizer``.

    Parameters
    ----------
    df : DataFrame
        Enrichment table as produced by ``mainfunction2``. The column
        'pre_frac_/1E6' is added to the passed-in table as a side effect.
    _min, _max : number
        Clipping limits and colour-scale domain.
    scaletype, cscheme, creverse
        Passed to the Altair colour scale (type, scheme, reverse).
    _sizer : str
        'new_just_pre_frac_per_1E6': pre fraction per million, capped at ``_sizervalue``.
        'standard': column ``_sizerwho``, capped at ``_sizervalue``.
        'random': 6 / 4 / 2 for ``_sizerwho`` above 100 / above 10 / otherwise.
        'nstandard': 4 when 'pvalue' < 0.05/10000 or 'post_value' > 150, else 2.
    _removetext : str
        'yesremove' omits the amino-acid and codon text layers.
    _sortbynt
        Accepted for signature compatibility; not used in this function.

    Raises
    ------
    ValueError
        For an unknown ``_sizer`` (previously an unbound-variable error).
    """
    if _sizer not in ('new_just_pre_frac_per_1E6', 'standard', 'random', 'nstandard'):
        raise ValueError("unknown _sizer: %r" % (_sizer,))

    df['pre_frac_/1E6']=df['pre_frac']*1000000

    if (_sizer == 'new_just_pre_frac_per_1E6'):
        def myf(row):
            if (row['pre_frac_/1E6'] > _sizervalue):
                row['size'] = _sizervalue
            else:
                row['size'] = row['pre_frac_/1E6']
            return row
    elif (_sizer == 'standard'):
        def myf(row):
            if (row[_sizerwho] > _sizervalue):
                row['size'] = _sizervalue
            else:
                row['size'] = row[_sizerwho]
            return row
    elif (_sizer == 'random'):
        def myf(row):
            if (row[_sizerwho] > 100):
                row['size'] = 6
            elif (row[_sizerwho] > 10):
                row['size'] = 4
            else:
                row['size'] = 2
            return row
    else:  # 'nstandard'
        def myf(row):
            if (row['pvalue'] < (0.05/10000) or row['post_value'] > 150):
                row['size'] = 4
            else:
                row['size'] = 2
            return row

    df = df.apply(myf, axis=1)

    def logmfactormax(row):
        # Clip to the colour-scale domain so out-of-range values take the end colours.
        if (row['logmfactor']>_max):
            row['logmfactor'] = _max
        if (row['logmfactor']<_min):
            row['logmfactor'] = _min
        return row

    df = df.apply(logmfactormax, axis=1)

    main_heatmap = alt.Chart(df).mark_square(opacity=1).encode(
        size='size:Q',
        x=alt.X('site:N'),
        tooltip=['site', 'wildtype', 'variable', 'AA', 'wtAA', 'pre_value', 'pre_sum', 'post_value', 'post_sum', 'logmfactor'],
        y=alt.Y('variable:O', sort=nucsort),
        color=alt.Color(
            'logmfactor:Q',
            scale=alt.Scale(
                scheme=cscheme,
                domain=[_min, _max],
                domainMid=0,
                reverse= creverse,
                type=scaletype
            )
        )
    ).properties(
        title='test'
    )

    # Black dot on the wild-type entry of every site.
    wdf1w = df[['site','wildtype']].rename(columns={"wildtype": "variable"})

    wildtype = alt.Chart(wdf1w).mark_circle(color='black').encode(
        x=alt.X('site:O'),
        y=alt.Y('variable:O', sort=nucsort)
        )

    # Text layers: amino acid above, variant name below the centre of each cell.
    transtext = alt.Chart(df).mark_text(color='black', fontSize=7, dy=-3).encode(
        x=alt.X('site:O'),
        y=alt.Y('variable:O', sort=nucsort),
        text='AA',
        tooltip='info'
        )
    codontext = alt.Chart(df).mark_text(color='black', fontSize=7, dy=3).encode(
        x=alt.X('site:O'),
        y=alt.Y('variable:O', sort=nucsort),
        text='variable',
        tooltip='info',
        )

    if _removetext == 'yesremove':
        return (main_heatmap + wildtype)
    else:
        return (main_heatmap + wildtype + transtext + codontext)

In [None]:
def ntmuttype(row):
    """Count the nucleotide substitutions between a row's wild-type and variant codon.

    For every position where the two codons differ and both bases are one of
    A, T, G, C, the counter ``'<wt>to<var>'`` (e.g. 'AtoG') of the row is
    increased by one. The counters must already exist in the row. Positions
    involving any other character are ignored. Returns the row.
    """
    bases = ('A', 'T', 'G', 'C')
    for wt_base, var_base in zip(row['wildtype'], row['variable']):
        if wt_base == var_base:
            continue
        if wt_base in bases and var_base in bases:
            counter = wt_base + 'to' + var_base
            row[counter] = row[counter] + 1

    return row

In [None]:
#https://github.com/Agleason1/TIS-Predictor/blob/main/Koazk_Similarity_Score_Algorithm.ipynb
#Kozak Consensus Scoring System

# Position weight matrix used by similarity_score: one row per position of the
# 23-base window, one column per base in the order A, T, G, C, other (the
# column indices 0-4 assigned in similarity_score). Rows 10-12 (0-based) are
# all zero, so the three central bases do not contribute to the score.
weights = np.array([
       [0.04210526, 0.        , 0.03157895, 0.05263158, 0.        ],
       [0.04210526, 0.05263158, 0.10526316, 0.0625    , 0.        ],
       [0.03157895, 0.04210526, 0.05263158, 0.07368421, 0.        ],
       [0.03157895, 0.01052632, 0.04210526, 0.05263158, 0.        ],
       [0.08421053, 0.07368421, 0.18947368, 0.10526316, 0.        ],
       [0.04210526, 0.05263158, 0.05263158, 0.08421053, 0.        ],
       [0.12631579, 0.0625    , 0.12631579, 0.21052632, 0.        ],
       [0.83157895, 0.12631579, 0.65263158, 0.16842105, 0.        ],
       [0.15789474, 0.06315789, 0.11578947, 0.2       , 0.        ],
       [0.21052632, 0.09473684, 0.31578947, 0.51578947, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.24210526, 0.16666667, 0.53684211, 0.13684211, 0.        ],
       [0.15789474, 0.09473684, 0.09473684, 0.24210526, 0.        ],
       [0.05263158, 0.08421053, 0.14736842, 0.09473684, 0.        ],
       [0.07216495, 0.05263158, 0.10526316, 0.06315789, 0.        ],
       [0.        , 0.        , 0.        , 0.05263158, 0.        ],
       [0.05263158, 0.05263158, 0.10526316, 0.09473684, 0.        ],
       [0.04210526, 0.03157895, 0.05263158, 0.04210526, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.04210526, 0.04210526, 0.08421053, 0.07368421, 0.        ],
       [0.0625    , 0.04210526, 0.09473684, 0.05263158, 0.        ]
])

In [None]:
def similarity_score(sequence):
    """Score a 23-base window against the Kozak weight matrix ``weights``.

    The window is upper-cased and 'U' is read as 'T'. Each position contributes
    the weight of its base (columns A, T, G, C; any other character uses the
    fifth column). The sum is divided by the highest score attainable, i.e. the
    sum of the row maxima of ``weights``.

    Raises
    ------
    AssertionError
        If ``sequence`` is not exactly 23 characters long.
    """
    assert len(sequence)==23,'Sequence must be 23 bases long. Codon of interest must be centered, with 10 bases flanking both sides.'

    bases = sequence.upper().replace('U', 'T')

    column_of = {'A': 0, 'T': 1, 'G': 2, 'C': 3}
    other_column = 4

    # Accumulate position by position, in order.
    score = 0
    for position, base in enumerate(bases):
        score += weights[position][column_of.get(base, other_column)]

    best_possible = np.sum(weights.max(axis=1))

    return score / best_possible

In [None]:
# Module-level wild-type codon sequence read from Core_cell1, padded with ten
# 'X' on each side. Read as a global by row_iter_mykozak2 and
# row_iter_mykozak_fixedPos2, so this cell must run before those are called.
fullcodonseq = 'XXXXXXXXXX' + GetWTSeqFromDfAsString(pd.read_csv(Core_cell1), _codonORamino = 'codon') + 'XXXXXXXXXX'

def KozakScoreAtCertainPos2(df, _NucPosition, deter_df_to_merge = 0):
    """Kozak similarity at one fixed nucleotide position, for every mutation in ``df``.

    For each row the codon at ``row['site']`` is replaced by ``row['variable']``
    in the wild-type sequence, and the 23-base window around ``_NucPosition``
    is scored with ``similarity_score``; the result is stored in 'kozakscore'.

    Parameters
    ----------
    df : DataFrame
        Needs 'site' and 'variable'. Receives the 'kozakscore' column as a side effect.
    _NucPosition : int
        1-based position (in the unpadded sequence) of the first base of the
        triplet on which the window is centred.
    deter_df_to_merge : DataFrame, optional
        When a DataFrame is given, the result is merged with it on the shared
        columns; otherwise a hint is printed and the unmerged table is returned.

    The padded sequence is now handed to the row helper explicitly; before, it
    was built here but never used, and the helper read the module-level
    ``fullcodonseq`` instead.
    NOTE(review): the sequence always comes from ``Core_cell1`` — confirm this is intended.
    """
    padded_seq = 'XXXXXXXXXX' + GetWTSeqFromDfAsString(pd.read_csv(Core_cell1), _codonORamino = 'codon') + 'XXXXXXXXXX'

    df['kozakscore']=0

    # Extra keyword arguments of DataFrame.apply are forwarded to the row function.
    df = df.apply(row_iter_mykozak_fixedPos2, NucPosition = _NucPosition, codonseq = padded_seq, axis=1)

    if isinstance(deter_df_to_merge, pd.DataFrame):
        df = pd.merge(df, deter_df_to_merge)
    else:
        print('deter_df_to_merge not assigned, generate e.g. dfCore_deter = ...')

    return df

def row_iter_mykozak_fixedPos2(_row, NucPosition, codonseq=None):
    """Score the window around ``NucPosition`` after applying the row's codon substitution.

    ``codonseq`` is the wild-type codon sequence padded with 10 characters on
    each side. When omitted, the module-level ``fullcodonseq`` is used
    (previous behaviour).
    """
    if codonseq is None:
        codonseq = fullcodonseq

    # Shift both positions by the 10-character padding.
    NucPosition10 = NucPosition + 10

    mypos0 = int(_row['site']) * 3 - 2
    mypos1 = mypos0 + 10

    # Replace the three bases of the mutated codon (0-based start: mypos1 - 1).
    codonseq_with_mut = codonseq[:mypos1-1]+_row['variable']+codonseq[mypos1+2:]
    # 10 bases upstream + triplet (3) + 10 bases downstream = 23 characters.
    kozakseq = codonseq_with_mut[(NucPosition10-9-1-1):(NucPosition10+3+9-1+1)]

    _row['kozakscore'] = similarity_score(kozakseq)

    return _row

In [None]:
def dftoheatmap_AAgrouped(df, _min=-6, _max=6, scaletype='linear', _sizer='new_just_pre_frac_per_1E6', _sizervalue = 16, _sizerwho = 'nrmpre', _removetext='noremove',_sortbynt = 'nosortbynt', cscheme="redyellowgreen", creverse=False):
    """Heatmap of the enrichment table after joining the amino-acid order sheet.

    Same encoding as ``dftoheatmap`` (colour = clipped 'logmfactor', size chosen
    by ``_sizer``), but the table is first merged on 'AA' with the Excel sheet
    at ``OrderOfAA``, the x axis is hidden, the y axis has no title, and the
    wild-type dots are drawn only where 'wildtype' equals 'variable'.

    Parameters
    ----------
    df : DataFrame
        Enrichment table as produced by ``mainfunction2``. The column
        'pre_frac_/1E6' is added to the passed-in table as a side effect.
    _min, _max : number
        Clipping limits and colour-scale domain.
    scaletype, cscheme, creverse
        Passed to the Altair colour scale (type, scheme, reverse).
    _sizer : str
        'new_just_pre_frac_per_1E6': pre fraction per million, capped at ``_sizervalue``.
        'standard': column ``_sizerwho``, capped at ``_sizervalue``.
        'random': 6 / 4 / 2 for ``_sizerwho`` above 100 / above 10 / otherwise.
        'nstandard': 4 when 'pvalue' < 0.05/10000 or 'post_value' > 150, else 2.
    _removetext : str
        'yesremove' omits the amino-acid and codon text layers.
    _sortbynt
        Accepted for signature compatibility; not used in this function.

    Raises
    ------
    ValueError
        For an unknown ``_sizer`` (previously an unbound-variable error).
    """
    if _sizer not in ('new_just_pre_frac_per_1E6', 'standard', 'random', 'nstandard'):
        raise ValueError("unknown _sizer: %r" % (_sizer,))

    df['pre_frac_/1E6']=df['pre_frac']*1000000

    if (_sizer == 'new_just_pre_frac_per_1E6'):
        def myf(row):
            if (row['pre_frac_/1E6'] > _sizervalue):
                row['size'] = _sizervalue
            else:
                row['size'] = row['pre_frac_/1E6']
            return row
    elif (_sizer == 'standard'):
        def myf(row):
            if (row[_sizerwho] > _sizervalue):
                row['size'] = _sizervalue
            else:
                row['size'] = row[_sizerwho]
            return row
    elif (_sizer == 'random'):
        def myf(row):
            if (row[_sizerwho] > 100):
                row['size'] = 6
            elif (row[_sizerwho] > 10):
                row['size'] = 4
            else:
                row['size'] = 2
            return row
    else:  # 'nstandard'
        def myf(row):
            if (row['pvalue'] < (0.05/10000) or row['post_value'] > 150):
                row['size'] = 4
            else:
                row['size'] = 2
            return row

    df = df.apply(myf, axis=1)

    def logmfactormax(row):
        # Clip to the colour-scale domain so out-of-range values take the end colours.
        if (row['logmfactor']>_max):
            row['logmfactor'] = _max
        if (row['logmfactor']<_min):
            row['logmfactor'] = _min
        return row

    df = df.apply(logmfactormax, axis=1)

    # Join the amino-acid order sheet; rows whose 'AA' is absent from the sheet
    # are dropped by this (inner) merge.
    df_OrderOfAA = pd.read_excel(OrderOfAA)

    df = pd.merge(df, df_OrderOfAA, on=['AA'])

    main_heatmap = alt.Chart(df).mark_square(opacity=1).encode(
        size='size:Q',
        x=alt.X('site:N', axis=None),
        tooltip=['site', 'variable', 'wildtype', 'AA', 'wtAA', 'pre_value', 'pre_sum', 'post_value', 'post_sum', 'logmfactor'],
        y=alt.Y('variable:O', sort=nucsort, title=""),
        color=alt.Color(
            'logmfactor:Q',
            scale=alt.Scale(
                scheme=cscheme,
                domain=[_min, _max],
                domainMid=0,
                reverse= creverse,
                type=scaletype
            )
        )
    ).properties(

    )

    # Rows in which the variant equals the wild type get a black dot.
    wdf1w = df.loc[df['wildtype'] == df['variable']]

    wildtype = alt.Chart(wdf1w).mark_circle(color='black').encode(
        x=alt.X('site:O'),
        y=alt.Y('variable:O', sort=nucsort)
        )

    # Text layers: amino acid above, variant name below the centre of each cell.
    transtext = alt.Chart(df).mark_text(color='black', fontSize=7, dy=-3).encode(
        x=alt.X('site:O'),
        y=alt.Y('variable:O', sort=nucsort),
        text='AA',
        tooltip='info'
        )
    codontext = alt.Chart(df).mark_text(color='black', fontSize=7, dy=3).encode(
        x=alt.X('site:O'),
        y=alt.Y('variable:O', sort=nucsort),
        text='variable',
        tooltip='info',
        )

    if _removetext == 'yesremove':
        return (main_heatmap + wildtype)
    else:
        return (main_heatmap + wildtype + transtext + codontext)

In [None]:
def dftoheatmap_precount(df, _min=1, _max=1000, scaletype='log', _removetext='yesremove',_sortbynt = 'yessortbynt', cscheme="greenblue", creverse=False, _normtoreaddepth = 'yesnorm'):
    """Heatmap of the pre-selection abundance of every variant.

    Parameters
    ----------
    df : DataFrame
        Enrichment table as produced by ``mainfunction2``. The column
        'color_value' is added to the passed-in table as a side effect.
    _min, _max, scaletype, cscheme
        Colour-scale domain, type and scheme.
    _removetext : str
        'yesremove' omits the text layers (only relevant without nucleotide sorting).
    _sortbynt : str
        "yessortbynt" replaces 'variable' by the 3-digit mutation code from
        ``mysortbynumbermutations``, orders the y axis by that code and returns
        the heatmap layer alone.
    creverse
        Accepted for signature compatibility; not used in this function.
    _normtoreaddepth : str
        'yesnorm_but_to_post_sum' colours by 'nrmpre', 'yesnorm' by the pre
        fraction per million, anything else by the raw 'pre_value'.
        The first option was previously overwritten by the following if/else
        and therefore had no effect.

    The colour value is capped at 1000.
    """
    if _normtoreaddepth == 'yesnorm_but_to_post_sum':
        df['color_value']=df['nrmpre']
    elif _normtoreaddepth == 'yesnorm':
        df['color_value']=df['pre_frac']*1000000
    else:
        df['color_value']=df['pre_value']

    df.loc[df.color_value>1000,['color_value']]=1000

    # Default y order: codons grouped by amino acid, then the amino-acid letters.
    nucsort = ['GCT', 'GCA', 'GCG', 'GCC', 'ATC', 'ATA', 'ATT', 'TTA', 'TTG', 'CTG', 'CTA', 'CTC', 'CTT', 'GTT', 'GTA', 'GTG', 'GTC', 'ATG', 'TTC', 'TTT', 'TAC', 'TAT', 'TGG', 'TGC', 'TGT', 'GGT', 'GGC', 'GGA', 'GGG', 'CCT', 'CCC', 'CCA', 'CCG', 'CAC', 'CAT', 'CGC', 'AGG', 'CGT', 'AGA', 'CGG', 'CGA', 'AAA', 'AAG', 'GAC', 'GAT', 'GAG', 'GAA', 'CAA', 'CAG', 'AAC', 'AAT', 'AGC', 'TCT', 'TCA', 'TCG', 'AGT', 'TCC', 'ACG', 'ACA', 'ACT', 'ACC', 'TAG', 'TAA', 'TGA','A', 'I', 'L', 'V', 'M', 'F', 'Y', 'W', 'C', 'G', 'P', 'H', 'R', 'K', 'D', 'E', 'Q', 'N', 'S', 'T', '*']

    if _sortbynt == "yessortbynt":
        df = mysortbynumbermutations(df)
        df['variable']=df['sortedbymutations']
        # Unchanged codon first, then the codes ordered by how many positions differ.
        nucsort = ['000', '001', '002', '003', '010', '020', '030', '100', '200', '300', '011', '012', '013', '021', '022', '023', '031', '032', '033', '101', '102', '103', '110', '120', '130', '201', '202', '203', '210', '220', '230', '301', '302', '303', '310', '320', '330', '111', '112', '113', '121', '122', '123', '131', '132', '133', '211', '212', '213', '221', '222', '223', '231', '232', '233', '311', '312', '313', '321', '322', '323', '331', '332', '333']

    main_heatmap = alt.Chart(df).mark_rect().encode(
        x=alt.X('site:N'),
        tooltip=['site', 'variable', 'nrmpre', 'pre_value', 'pre_sum', 'post_value', 'post_sum', 'color_value'],
        y=alt.Y('variable:O', sort=nucsort),
        color=alt.Color(
            'color_value:Q',
            scale=alt.Scale(
                scheme=cscheme,
                domain=[_min, _max],
                type=scaletype
            )
        )
    ).properties(
        title='test'
    )

    # Black dot on the wild-type entry of every site.
    wdf1w = df[['site','wildtype']].rename(columns={"wildtype": "variable"})

    wildtype = alt.Chart(wdf1w).mark_circle(color='black').encode(
        x=alt.X('site:O'),
        y=alt.Y('variable:O', sort=nucsort)
        )

    transtext = alt.Chart(df).mark_text(color='black', fontSize=7, dy=-3).encode(
        x=alt.X('site:O'),
        y=alt.Y('variable:O', sort=nucsort),
        text='AA',
        tooltip=['site', 'nrmpre']
        )
    codontext = alt.Chart(df).mark_text(color='black', fontSize=7, dy=3).encode(
        x=alt.X('site:O'),
        y=alt.Y('variable:O', sort=nucsort),
        text='variable',
        tooltip=['site', 'nrmpre'],
        )
    if _sortbynt == "yessortbynt":
        return main_heatmap
    elif _removetext == 'yesremove':
        return (main_heatmap + wildtype)
    else:
        return (main_heatmap + wildtype + transtext + codontext)

In [None]:
def dftoheatmap_insilico(df, _min=-6, _max=6, scaletype='linear', _sizer='standard', _sizervalue = 16, _sizerwho = 'nrmpre', _removetext='noremove',_sortbynt = 'nosortbynt', cscheme="redyellowgreen", creverse=False):
    """Build a site-by-substitution heatmap from an in-silico prediction table.

    The input frame must provide the columns ``prediction_independent``, ``pos``,
    ``wt`` and ``subs``; they are copied into ``logmfactor``, ``site``,
    ``wildtype`` and ``variable``. ``nrmpre`` is set to the constant 10. The two
    text layers additionally read an ``AA`` column. The caller's frame is not
    modified: all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Prediction table (see required columns above).
    _min, _max : number
        ``logmfactor`` is clamped to ``[_min, _max]``; the same pair is used as
        the colour-scale domain.
    scaletype : str
        Passed to ``alt.Scale(type=...)``.
    _sizer : {'standard', 'random', 'nstandard'}
        How the square size is derived:
        'standard'  -> ``df[_sizerwho]`` capped at ``_sizervalue``;
        'random'    -> 6 / 4 / 2 for ``df[_sizerwho]`` > 100 / > 10 / otherwise;
        'nstandard' -> 4 when ``pvalue < 0.05/10000`` or ``post_value > 150``,
                       else 2 (requires those two columns).
    _sizervalue : number
        Cap used by the 'standard' sizer.
    _sizerwho : str
        Column the 'standard' and 'random' sizers read.
    _removetext : str
        'yesremove' returns only the heatmap and wild-type layers; any other
        value also adds the ``AA`` and ``variable`` text layers.
    _sortbynt : str
        Accepted for signature compatibility; not used by this function.
    cscheme : str
        Colour scheme name passed to ``alt.Scale(scheme=...)``.
    creverse : bool
        Passed to ``alt.Scale(reverse=...)``.

    Returns
    -------
    altair layered chart

    Raises
    ------
    ValueError
        If ``_sizer`` is not one of the three supported modes.
    """
    if _sizer not in ('standard', 'random', 'nstandard'):
        raise ValueError(
            "_sizer must be 'standard', 'random' or 'nstandard', got %r" % (_sizer,)
        )

    # Work on a copy so the derived columns do not leak into the caller's frame.
    df = df.copy()
    df['nrmpre'] = 10
    df['logmfactor'] = df['prediction_independent']
    df['site'] = df['pos']
    df['wildtype'] = df['wt']
    df['variable'] = df['subs']

    # Marker size, computed column-wise.
    if _sizer == 'standard':
        # Values above the cap are replaced by the cap; everything else
        # (including missing values) is kept as is.
        df['size'] = df[_sizerwho].mask(df[_sizerwho] > _sizervalue, _sizervalue)
    elif _sizer == 'random':
        df['size'] = np.select([df[_sizerwho] > 100, df[_sizerwho] > 10], [6, 4], default=2)
    else:  # 'nstandard'
        emphasised = (df['pvalue'] < (0.05/10000)) | (df['post_value'] > 150)
        df['size'] = np.where(emphasised, 4, 2)

    # Clamp the colour value to the scale domain.
    df['logmfactor'] = df['logmfactor'].clip(lower=_min, upper=_max)

    # NOTE(review): `nucsort` is not defined inside this function; it is looked up
    # in the enclosing (module/notebook) scope. Confirm that a global `nucsort`
    # exists before this function is called.
    main_heatmap = alt.Chart(df).mark_square(opacity=1).encode(
        size='size:Q',
        x=alt.X('site:N'),
        tooltip=['site','nrmpre', 'logmfactor'],
        y=alt.Y('variable:O', sort=nucsort),
        color=alt.Color(
            'logmfactor:Q',
            scale=alt.Scale(
                scheme=cscheme,
                domain=[_min, _max],
                domainMid=0,
                reverse= creverse,
                type=scaletype
            )
        )
    ).properties(
        title='test'
    )

    # Wild-type layer: one black circle per row, placed at the wild-type entry.
    wdf1w = df[['site','wildtype']].rename(columns={"wildtype": "variable"})

    wildtype = alt.Chart(wdf1w).mark_circle(color='black').encode(
        x=alt.X('site:O'),
        y=alt.Y('variable:O', sort=nucsort)
        )

    if _removetext == 'yesremove':
        return (main_heatmap + wildtype)

    # Text layers: the AA label slightly above, the substitution slightly below.
    transtext = alt.Chart(df).mark_text(color='black', fontSize=7, dy=-3).encode(
        x=alt.X('site:O'),
        y=alt.Y('variable:O', sort=nucsort),
        text='AA',
        )
    codontext = alt.Chart(df).mark_text(color='black', fontSize=7, dy=3).encode(
        x=alt.X('site:O'),
        y=alt.Y('variable:O', sort=nucsort),
        text='variable',
        )
    return (main_heatmap + wildtype + transtext + codontext)

In [None]:
def muttypesingleNTcombined(_df, _yesgraph="yesgraph", _preorpost = "pre"):
    """Summarise counts by nucleotide-substitution type.

    A table of every ordered (wildtype, variable) codon pair is created with
    twelve zero-initialised substitution-type columns (``AtoT`` ... ``CtoG``) and
    passed row by row through ``ntmuttype`` (defined elsewhere in this notebook).
    The result is inner-joined to ``_df`` on ``wildtype``/``variable``, each type
    column is multiplied by the chosen count column, and the products are summed
    per type.

    Parameters
    ----------
    _df : pandas.DataFrame
        Must contain ``wildtype``, ``variable`` and the selected count column.
    _yesgraph : str
        "yesgraph" returns an Altair bar chart; any other value returns the
        summary table.
    _preorpost : str
        "pre" weights by ``pre_value``; any other value weights by
        ``post_value``.

    Returns
    -------
    altair.Chart or pandas.DataFrame
        The table has columns ``type``, ``value``, ``sum`` (total over all
        types, repeated on every row) and ``frac`` (``value / sum``).
    """
    codonlist = ['ATG', 'ATC', 'ATA', 'ATT', 'TTA', 'TTG', 'CTG', 'CTA', 'CTC', 'CTT', 'GTT', 'GTA', 'GTG', 'GTC', 'GCT', 'GCA', 'GCG', 'GCC', 'TTC', 'TTT', 'TAC', 'TAT', 'TGG', 'TGC', 'TGT', 'GGT', 'GGC', 'GGA', 'GGG', 'CCT', 'CCC', 'CCA', 'CCG', 'CAC', 'CAT', 'CGC', 'AGG', 'CGT', 'AGA', 'CGG', 'CGA', 'AAA', 'AAG', 'GAC', 'GAT', 'GAG', 'GAA', 'CAA', 'CAG', 'AAC', 'AAT', 'AGC', 'TCT', 'TCA', 'TCG', 'AGT', 'TCC', 'ACG', 'ACA', 'ACT', 'ACC', 'TAG', 'TAA', 'TGA']
    typelist = ['AtoT', 'AtoG', 'AtoC', 'TtoA', 'TtoG', 'TtoC', 'GtoA', 'GtoT', 'GtoC', 'CtoA', 'CtoT', 'CtoG']

    # All ordered codon pairs, built with one constructor call rather than by
    # concatenating 64*64 single-row frames.
    dfall = pd.DataFrame(
        [(wt_codon, var_codon) for wt_codon in codonlist for var_codon in codonlist],
        columns=['wildtype', 'variable'],
    )
    for muttype in typelist:
        dfall[muttype] = 0

    # ntmuttype fills in the substitution-type columns for each codon pair.
    df = pd.merge(_df, dfall.apply(ntmuttype, axis=1), on=['wildtype', 'variable'])

    var_value = "pre_value" if _preorpost == "pre" else "post_value"

    # Weight every type column by the observed count of the row.
    for muttype in typelist:
        df[muttype] = df[var_value]*df[muttype]

    df = df[typelist].sum().rename_axis('type').reset_index(name='value')
    df['sum'] = df['value'].sum()
    df['frac'] = df['value'] / df['sum']

    if _yesgraph != "yesgraph":
        return df

    # NOTE(review): the summary table has no 'name' column, so the x channel
    # refers to a field that does not exist and every row shares one x position
    # (a single stacked bar). Confirm this is the intended layout.
    graph = alt.Chart(df).mark_bar().encode(
        alt.X('name:O'),
        alt.Y('sum(frac):Q', scale=alt.Scale(domain=[0,1])),
        color=alt.Color('type', scale=alt.Scale(scheme='set2')),
        tooltip=['frac'],
    )
    return graph

# Figures

## Fig. 2B  [*](#cell-id)

In [None]:
#@markdown Figure

# Recompute when a reset is requested (Do_not_reset is falsy) or when no cached
# dfCore exists yet; otherwise reuse the cached frame.
if not Do_not_reset or 'dfCore' not in globals():
    dfCore = myprefcal([Core_plasmid1, Core_plasmid2], [Core_cell1, Core_cell2, Core_cell3], -6, 6, 'linear', 'table', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue', __codonorAA="codon", cscheme="redblue", creverse=True, __removetext='yesremove')

# Keep sites above 29 and renumber them by -28 (site 30 becomes 2). The
# explicit copy leaves the cached dfCore untouched.
df = dfCore[dfCore['site']>29].copy()
df['site'] = df['site']-28

# Attach the per-amino-acid grouping columns read from the OrderOfAA workbook.
df_OrderOfAA = pd.read_excel(OrderOfAA)
df = pd.merge(df, df_OrderOfAA, on=['AA'])

# One heatmap per AAgrouplarge value, stacked vertically on a shared x axis.
# Both view settings go into a single configure_view call because a second call
# replaces the view configuration set by the first.
alt.vconcat(*[
    dftoheatmap_AAgrouped(df[df['AAgrouplarge']==aagroup], cscheme="redblue", creverse=True, _removetext = 'yesremove')
    for aagroup in ['nonpolar', 'start', 'placeholder', 'stop']
]).resolve_scale(x='shared').configure_concat(spacing=5).configure_view(strokeOpacity=0, strokeWidth=0)#.save("Fig_2B.html")

In [None]:
#@markdown Dataframe

# Recompute when a reset is requested (Do_not_reset is falsy) or when no cached
# dfCore exists yet; otherwise reuse the cached frame.
if not Do_not_reset or 'dfCore' not in globals():
    dfCore = myprefcal([Core_plasmid1, Core_plasmid2], [Core_cell1, Core_cell2, Core_cell3], -6, 6, 'linear', 'table', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue', __codonorAA="codon", cscheme="redblue", creverse=True, __removetext='yesremove')

# Keep sites above 29 and renumber them by -28 (site 30 becomes 2). The
# explicit copy leaves the cached dfCore untouched.
df = dfCore[dfCore['site']>29].copy()
df['site'] = df['site']-28

# Attach the per-amino-acid grouping columns read from the OrderOfAA workbook.
df_OrderOfAA = pd.read_excel(OrderOfAA)
df = pd.merge(df, df_OrderOfAA, on=['AA'])
df#.to_csv('Fig_2B.csv')

## Fig. 2C [**](#cell-id2)

In [None]:
#@markdown Figure
# Recompute when a reset is requested (Do_not_reset is falsy) or when no cached
# dfCore_deter exists yet; otherwise reuse the cached frame.
if not Do_not_reset or 'dfCore_deter' not in globals():
    dfCore_deter = myprefcal_deter([Core_plasmid1,Core_plasmid2], [Core_cell1, Core_cell2, Core_cell3],-6,6, 'linear', 'ntable', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue')

# Sites 30..163 only. Both conditions are combined into one mask; chaining two
# [] filters would apply a mask built on the unfiltered frame to the filtered one.
df = dfCore_deter[(dfCore_deter['site']>29) & (dfCore_deter['site']<164)].copy()

# Label rows; later assignments take precedence over earlier ones
# (in-frame stop < WT < in-frame start).
df['group'] = 'other'
df.loc[df['variable'].isin(['TAG', 'TAA', 'TGA']), 'group']='in-frame stop'
df.loc[df['variable']==df['wildtype'], 'group']='WT'
df.loc[df['variable']=='ATG', 'group']='in-frame start'

df = df.loc[df['group']!='other']

# Jittered points; the x position is a normally distributed random value
# computed in the browser, so the horizontal scatter differs between renders.
points = alt.Chart().mark_point(size = 22).encode(
        x=alt.X(
        'jitter:Q',
        title=None,
        axis=alt.Axis(values=[0], ticks=True, grid=False, labels=False),
        scale=alt.Scale(domain=[-4, 4]),
        ),
        y=alt.Y('logmfactor:Q'),
        color=alt.Color('group:N', legend=None),
        tooltip=['variable', 'wildtype', 'logmfactor', 'site', 'AA', 'wtAA', 'pre_value', 'pre_sum', 'nrmpre', 'post_value', 'post_sum'],
).transform_calculate(jitter='sqrt(-2*log(random()))*cos(2*PI*random())')

boxplot = alt.Chart().mark_boxplot(color='gray', size=10, opacity=0.4, outliers={'size': 0}).encode(
    alt.Y('logmfactor:Q')
).properties(width=100)

error = alt.Chart().mark_errorbar(extent='ci', ticks=True, color='black', opacity=0.8).encode(
    y=alt.Y('logmfactor:Q'),
    strokeWidth=alt.value(3)
)

# NOTE(review): the facet title and sort list name groups that this cell never
# assigns (it only produces 'in-frame stop', 'WT' and 'in-frame start'); confirm
# whether they should be updated for this figure.
fullgraph = alt.layer(points, boxplot, error, data=df).properties(width=75, height=400).facet(column=alt.Column('group:N', title='with Pol start kozak value', sort=['C1 optimized', 'C1 reduced', 'J start optimized', 'J start reduced', 'J start breaker', 'J start preserver', 'J stop break', 'J stop preserve']), spacing=0).configure_view(stroke=None).resolve_scale(x=alt.ResolveMode("independent"))
fullgraph#.save("Fig_2C.html")

In [None]:
#@markdown Dataframe
# Recompute when a reset is requested (Do_not_reset is falsy) or when no cached
# dfCore_deter exists yet; otherwise reuse the cached frame.
if not Do_not_reset or 'dfCore_deter' not in globals():
    dfCore_deter = myprefcal_deter([Core_plasmid1,Core_plasmid2], [Core_cell1, Core_cell2, Core_cell3],-6,6, 'linear', 'ntable', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue')

# Sites 30..163 only. Both conditions are combined into one mask; chaining two
# [] filters would apply a mask built on the unfiltered frame to the filtered one.
df = dfCore_deter[(dfCore_deter['site']>29) & (dfCore_deter['site']<164)].copy()

# Label rows; later assignments take precedence over earlier ones
# (in-frame stop < WT < in-frame start).
df['group'] = 'other'
df.loc[df['variable'].isin(['TAG', 'TAA', 'TGA']), 'group']='in-frame stop'
df.loc[df['variable']==df['wildtype'], 'group']='WT'
df.loc[df['variable']=='ATG', 'group']='in-frame start'

df = df.loc[df['group']!='other']

df#.to_csv('Fig_2C.csv')

## Fig. 2D [**](#cell-id2)

In [None]:
#@markdown Figure
# Each cached frame is recomputed when a reset is requested (Do_not_reset is
# falsy) or when it does not exist yet; otherwise the cached frame is reused.
if not Do_not_reset or 'dfCore' not in globals():
    dfCore = myprefcal([Core_plasmid1,Core_plasmid2], [Core_cell1, Core_cell2, Core_cell3],-6,6, 'linear', 'table', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue', __codonorAA="codon", cscheme="redblue", creverse=True, __removetext = 'yesremove')
if not Do_not_reset or 'dfCore_deter' not in globals():
    dfCore_deter = myprefcal_deter([Core_plasmid1,Core_plasmid2], [Core_cell1, Core_cell2, Core_cell3],-6,6, 'linear', 'ntable', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue')
if not Do_not_reset or 'dfCorewithKozak2_280' not in globals():
    dfCorewithKozak2_280 = KozakScoreAtCertainPos2(dfCore, 280, dfCore_deter)
if not Do_not_reset or 'dfCorewithKozak2_116B' not in globals():
    dfCorewithKozak2_116B = KozakScoreAtCertainPos2(dfCore, (116*3-2+1), dfCore_deter)

# Every section below filters the cached frame first and labels the resulting
# copy, so no 'group' column is written into the cached frames.

###############################################
# C1: site 93 without ATG / stop-codon variants, split by Kozak score.

df = dfCorewithKozak2_280
df = df.loc[df['site'].isin([93]) & ~df['variable'].isin(['ATG', 'TAG', 'TGA', 'TAA'])].copy()

df['group'] = 'C1 reduced'
df.loc[df['kozakscore']>0.5181337952805991, 'group'] = 'C1 optimized'
df_C1_Kozak = df

###############################################
# J start, site 115: only variants that break neither ATG, split by Kozak score.

df = dfCorewithKozak2_116B
df = df.loc[df['site'].isin([115]) & ~(df['1ATGbreak']==1) & ~(df['-2ATGbreak']==1)].copy()

df['group'] = 'J start reduced'
df.loc[df['kozakscore']>0.539792965618, 'group'] = 'J start optimized'
df_Jstart_Kozak = df

###############################################
# J start, site 116: breaker vs. preserver.

df = dfCorewithKozak2_116B
df = df.loc[df['site'].isin([116])].copy()

df['group']='J start preserver'
df.loc[(df['1ATGbreak']==1) | (df['-2ATGbreak']==1), 'group']='J start breaker'

df_Jstart_break = df

###############################################
# J stop, sites 123 and 124: break vs. preserve, with in-frame stop variants
# moved into their own groups.

df = dfCorewithKozak2_116B
df = df.loc[df['site'].isin([123, 124])].copy()

df['group']='J stop break'

jstop_preserved_123 = (df[['1TAGpreserve', '1TGApreserve', '1TAApreserve', '1stoppreserve']]==1).any(axis=1) & df['site'].isin([123])
jstop_preserved_124 = (df[['-2TAGpreserve', '-2TGApreserve', '-2TAApreserve', '-2stoppreserve']]==1).any(axis=1) & df['site'].isin([124])
df.loc[jstop_preserved_123 | jstop_preserved_124, 'group']='J stop preserve'

is_inframe_stop = df['variable'].isin(['TAA', 'TGA', 'TAG'])
df.loc[is_inframe_stop & (df['group']=='J stop preserve'), 'group']='in-frame stop from J stop preserve'
df.loc[is_inframe_stop & (df['group']=='J stop break'), 'group']='in-frame stop from J stop break'

df_Jstop_break = df
df = pd.concat([df_C1_Kozak, df_Jstart_Kozak, df_Jstart_break, df_Jstop_break])

# Jittered points; the x position is a normally distributed random value
# computed in the browser, so the horizontal scatter differs between renders.
points = alt.Chart().mark_point(size = 22).encode(
        x=alt.X(
        'jitter:Q',
        title=None,
        axis=alt.Axis(values=[0], ticks=True, grid=False, labels=False),
        scale=alt.Scale(domain=[-4, 4]),
        ),
        y=alt.Y('logmfactor:Q'),
        color=alt.Color('group:N', legend=None),
        tooltip=['variable', 'wildtype', 'mutseq', 'logmfactor', 'site', 'AA', 'wtAA', 'pre_value', 'pre_sum', 'nrmpre', 'post_value', 'post_sum'],
).transform_calculate(jitter='sqrt(-2*log(random()))*cos(2*PI*random())')

boxplot = alt.Chart().mark_boxplot(color='gray', size=10, opacity=0.4, outliers={'size': 0}).encode(
    alt.Y('logmfactor:Q')
).properties(width=100)

error = alt.Chart().mark_errorbar(extent='ci', ticks=True, color='black', opacity=0.8).encode(
    y=alt.Y('logmfactor:Q'),
    strokeWidth=alt.value(3)
)

fullgraph = alt.layer(points, boxplot, error, data=df).properties(width=75, height=400).facet(column=alt.Column('group:N', title='with Pol start kozak value', sort=['C1 optimized', 'C1 reduced', 'J start optimized', 'J start reduced', 'J start breaker', 'J start preserver', 'J stop break', 'J stop preserve']), spacing=0).configure_view(stroke=None).resolve_scale(x=alt.ResolveMode("independent"))
fullgraph#.save("Fig_2D.html")

In [None]:
#@markdown Dataframe
# Each cached frame is recomputed when a reset is requested (Do_not_reset is
# falsy) or when it does not exist yet; otherwise the cached frame is reused.
if not Do_not_reset or 'dfCore' not in globals():
    dfCore = myprefcal([Core_plasmid1,Core_plasmid2], [Core_cell1, Core_cell2, Core_cell3],-6,6, 'linear', 'table', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue', __codonorAA="codon", cscheme="redblue", creverse=True, __removetext = 'yesremove')
if not Do_not_reset or 'dfCore_deter' not in globals():
    dfCore_deter = myprefcal_deter([Core_plasmid1,Core_plasmid2], [Core_cell1, Core_cell2, Core_cell3],-6,6, 'linear', 'ntable', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue')
if not Do_not_reset or 'dfCorewithKozak2_280' not in globals():
    dfCorewithKozak2_280 = KozakScoreAtCertainPos2(dfCore, 280, dfCore_deter)
if not Do_not_reset or 'dfCorewithKozak2_116B' not in globals():
    dfCorewithKozak2_116B = KozakScoreAtCertainPos2(dfCore, (116*3-2+1), dfCore_deter)

# Every section below filters the cached frame first and labels the resulting
# copy, so no 'group' column is written into the cached frames.

###############################################
# C1: site 93 without ATG / stop-codon variants, split by Kozak score.

df = dfCorewithKozak2_280
df = df.loc[df['site'].isin([93]) & ~df['variable'].isin(['ATG', 'TAG', 'TGA', 'TAA'])].copy()

df['group'] = 'C1 reduced'
df.loc[df['kozakscore']>0.5181337952805991, 'group'] = 'C1 optimized'
df_C1_Kozak = df

###############################################
# J start, site 115: only variants that break neither ATG, split by Kozak score.

df = dfCorewithKozak2_116B
df = df.loc[df['site'].isin([115]) & ~(df['1ATGbreak']==1) & ~(df['-2ATGbreak']==1)].copy()

df['group'] = 'J start reduced'
df.loc[df['kozakscore']>0.539792965618, 'group'] = 'J start optimized'
df_Jstart_Kozak = df

###############################################
# J start, site 116: breaker vs. preserver.

df = dfCorewithKozak2_116B
df = df.loc[df['site'].isin([116])].copy()

df['group']='J start preserver'
df.loc[(df['1ATGbreak']==1) | (df['-2ATGbreak']==1), 'group']='J start breaker'

df_Jstart_break = df

###############################################
# J stop, sites 123 and 124: break vs. preserve, with in-frame stop variants
# moved into their own groups.

df = dfCorewithKozak2_116B
df = df.loc[df['site'].isin([123, 124])].copy()

df['group']='J stop break'

jstop_preserved_123 = (df[['1TAGpreserve', '1TGApreserve', '1TAApreserve', '1stoppreserve']]==1).any(axis=1) & df['site'].isin([123])
jstop_preserved_124 = (df[['-2TAGpreserve', '-2TGApreserve', '-2TAApreserve', '-2stoppreserve']]==1).any(axis=1) & df['site'].isin([124])
df.loc[jstop_preserved_123 | jstop_preserved_124, 'group']='J stop preserve'

is_inframe_stop = df['variable'].isin(['TAA', 'TGA', 'TAG'])
df.loc[is_inframe_stop & (df['group']=='J stop preserve'), 'group']='in-frame stop from J stop preserve'
df.loc[is_inframe_stop & (df['group']=='J stop break'), 'group']='in-frame stop from J stop break'

df_Jstop_break = df
df = pd.concat([df_C1_Kozak, df_Jstart_Kozak, df_Jstart_break, df_Jstop_break])

df#.to_csv('Fig_2D.csv')

## Fig. 3A [*](#cell-id)

In [None]:
#@markdown Figure (Part 1)
# Settings shared by the four myprefcal calls of this cell (removed again below).
_pref_args = (-6, 6, 'linear', 'table', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue')
_pref_kwargs = dict(__codonorAA="codon", cscheme="redblue", creverse=True, __removetext='yesremove')

dfTP = myprefcal([TP_plasmid1, TP_plasmid2, TP_plasmid3], [TP_cell1, TP_cell2, TP_cell3], *_pref_args, **_pref_kwargs)
dfSpacer = myprefcal([Spacer_plasmid1, Spacer_plasmid2, Spacer_plasmid3], [Spacer_cell1, Spacer_cell2, Spacer_cell3], *_pref_args, **_pref_kwargs)
dfRT = myprefcal([RT_plasmid1, RT_plasmid2, RT_plasmid3], [RT_cell1, RT_cell2, RT_cell3], *_pref_args, **_pref_kwargs)
dfRNaseH = myprefcal(
    [RNaseH_plasmid1],
    [RNaseH_cell1, RNaseH_cell2, RNaseH_cell3, RNaseH_cell4, RNaseH_cell5, RNaseH_cell6],
    *_pref_args, **_pref_kwargs
)
del _pref_args, _pref_kwargs

# Move the per-domain site numbers onto one common axis: each domain is offset
# by the summed offsets of the domains in front of it.
dfSpacer['site'] = dfSpacer['site'].add(179)
dfRT['site'] = dfRT['site'].add(179 + 169)
dfRNaseH['site'] = dfRNaseH['site'].add(179 + 169 + 344)

dfPol = pd.concat([dfTP, dfSpacer, dfRT, dfRNaseH])
# The single full-length heatmap is kept for reference only:
#   dftoheatmap(dfPol, cscheme="redblue", creverse=True, _removetext = 'yesremove')#.save("Fig_3A.html")
# This cell shows the TP + Spacer half of the figure.
dftoheatmap(
    pd.concat([dfTP, dfSpacer]),
    cscheme="redblue",
    creverse=True,
    _removetext='yesremove',
)

In [None]:
#@markdown Figure (Part 2)
# The "Figure (Part 1)" cell normally leaves dfRT and dfRNaseH, already offset
# onto the common site axis, in the namespace. Build them here when either is
# missing so that this cell can also be run on its own after "Run setup".
if 'dfRT' not in globals() or 'dfRNaseH' not in globals():
    dfRT = myprefcal([RT_plasmid1, RT_plasmid2, RT_plasmid3], [RT_cell1,RT_cell2,RT_cell3],-6,6, 'linear', 'table', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue', __codonorAA="codon", cscheme="redblue", creverse=True, __removetext = 'yesremove')
    dfRNaseH = myprefcal([RNaseH_plasmid1], [RNaseH_cell1, RNaseH_cell2, RNaseH_cell3, RNaseH_cell4, RNaseH_cell5, RNaseH_cell6],-6,6, 'linear', 'table', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue', __codonorAA="codon", cscheme="redblue", creverse=True, __removetext = 'yesremove')
    dfRT['site'] = dfRT['site'] + 179 + 169
    dfRNaseH['site'] = dfRNaseH['site'] + 179 + 169 + 344

# RT + RNaseH half of the figure.
dftoheatmap(pd.concat([dfRT, dfRNaseH]), cscheme="redblue", creverse=True, _removetext = 'yesremove')

In [None]:
#@markdown Dataframe
# Settings shared by the four myprefcal calls of this cell (removed again below).
_pref_args = (-6, 6, 'linear', 'table', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue')
_pref_kwargs = dict(__codonorAA="codon", cscheme="redblue", creverse=True, __removetext='yesremove')

dfTP = myprefcal([TP_plasmid1, TP_plasmid2, TP_plasmid3], [TP_cell1, TP_cell2, TP_cell3], *_pref_args, **_pref_kwargs)
dfSpacer = myprefcal([Spacer_plasmid1, Spacer_plasmid2, Spacer_plasmid3], [Spacer_cell1, Spacer_cell2, Spacer_cell3], *_pref_args, **_pref_kwargs)
dfRT = myprefcal([RT_plasmid1, RT_plasmid2, RT_plasmid3], [RT_cell1, RT_cell2, RT_cell3], *_pref_args, **_pref_kwargs)
dfRNaseH = myprefcal(
    [RNaseH_plasmid1],
    [RNaseH_cell1, RNaseH_cell2, RNaseH_cell3, RNaseH_cell4, RNaseH_cell5, RNaseH_cell6],
    *_pref_args, **_pref_kwargs
)
del _pref_args, _pref_kwargs

# Move the per-domain site numbers onto one common axis: each domain is offset
# by the summed offsets of the domains in front of it.
dfSpacer['site'] = dfSpacer['site'].add(179)
dfRT['site'] = dfRT['site'].add(179 + 169)
dfRNaseH['site'] = dfRNaseH['site'].add(179 + 169 + 344)

# Full-length table behind Fig. 3A.
dfPol = pd.concat([dfTP, dfSpacer, dfRT, dfRNaseH])
dfPol#.to_csv('Fig_3A.csv')

## Fig. 3D [**](#cell-id2)

In [None]:
#@markdown Figure
# Settings shared by the four myprefcal calls of this cell (removed again below).
_pref_args = (-6, 6, 'linear', 'table', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue')
_pref_kwargs = dict(__codonorAA="codon", cscheme="redblue", creverse=True, __removetext='yesremove')

dfTP = myprefcal([TP_plasmid1, TP_plasmid2, TP_plasmid3], [TP_cell1, TP_cell2, TP_cell3], *_pref_args, **_pref_kwargs)
dfSpacer = myprefcal([Spacer_plasmid1, Spacer_plasmid2, Spacer_plasmid3], [Spacer_cell1, Spacer_cell2, Spacer_cell3], *_pref_args, **_pref_kwargs)
dfRT = myprefcal([RT_plasmid1, RT_plasmid2, RT_plasmid3], [RT_cell1, RT_cell2, RT_cell3], *_pref_args, **_pref_kwargs)
dfRNaseH = myprefcal(
    [RNaseH_plasmid1],
    [RNaseH_cell1, RNaseH_cell2, RNaseH_cell3, RNaseH_cell4, RNaseH_cell5, RNaseH_cell6],
    *_pref_args, **_pref_kwargs
)
del _pref_args, _pref_kwargs

# Move the per-domain site numbers onto one common axis.
dfSpacer['site'] = dfSpacer['site'].add(179)
dfRT['site'] = dfRT['site'].add(179 + 169)
dfRNaseH['site'] = dfRNaseH['site'].add(179 + 169 + 344)

dfPol = pd.concat([dfTP, dfSpacer, dfRT, dfRNaseH])
dfPol['abs_logmfactor'] = dfPol['logmfactor'].abs()

# Per site: sum of |logmfactor|, then min-max scaled to [0, 1].
df = dfPol[['site', 'abs_logmfactor']].groupby(['site']).sum().reset_index()
df = df.assign(
    norm_abs_logmfactor=lambda d: (d['abs_logmfactor'] - d['abs_logmfactor'].min())
    / (d['abs_logmfactor'].max() - d['abs_logmfactor'].min())
)

# Region label per site. The first matching condition wins, so the list runs
# from highest to lowest precedence.
df['group'] = np.select(
    [
        df['site'].isin([325, 336, 340, 354, 481, 484, 504, 508]),
        df['site'].isin([65,553,554,702,731,750,790]),
        df['site'].between(179, 324),
        df['site'].between(456, 508),
        df['site'].between(325, 354),
    ],
    ['Zn fingers', 'Active sites', 'Spacer', 'RT insertion', 'RT extension'],
    default='Remaining',
)

# Layer 1: jittered points (random horizontal offset computed at render time).
points = (
    alt.Chart()
    .mark_point(size=22)
    .encode(
        x=alt.X(
            'jitter:Q',
            title=None,
            axis=alt.Axis(values=[0], ticks=True, grid=False, labels=False),
            scale=alt.Scale(domain=[-4, 4]),
        ),
        y=alt.Y('norm_abs_logmfactor:Q'),
        color=alt.Color('group:N', legend=None),
    )
    .transform_calculate(jitter='sqrt(-2*log(random()))*cos(2*PI*random())')
)

# Layer 2: box plot with its outlier marks sized to zero.
boxplot = (
    alt.Chart()
    .mark_boxplot(color='gray', size=10, opacity=0.4, outliers={'size': 0})
    .encode(alt.Y('norm_abs_logmfactor:Q'))
    .properties(width=100)
)

# Layer 3: confidence-interval error bar.
error = (
    alt.Chart()
    .mark_errorbar(extent='ci', ticks=True, color='black', opacity=0.8)
    .encode(y=alt.Y('norm_abs_logmfactor:Q'), strokeWidth=alt.value(3))
)

# One facet column per region, in a fixed order.
fullgraph = (
    alt.layer(points, boxplot, error, data=df)
    .properties(width=75, height=400)
    .facet(
        column=alt.Column('group:N', sort=['Zn fingers', 'RT extension', 'RT insertion', 'Spacer', 'Active sites', 'Remaining']),
        spacing=0,
    )
    .configure_view(stroke=None)
    .resolve_scale(x=alt.ResolveMode("independent"))
)
fullgraph#.save("Fig_3D.html")

In [None]:
#@markdown Dataframe
# Settings shared by the four myprefcal calls of this cell (removed again below).
_pref_args = (-6, 6, 'linear', 'table', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue')
_pref_kwargs = dict(__codonorAA="codon", cscheme="redblue", creverse=True, __removetext='yesremove')

dfTP = myprefcal([TP_plasmid1, TP_plasmid2, TP_plasmid3], [TP_cell1, TP_cell2, TP_cell3], *_pref_args, **_pref_kwargs)
dfSpacer = myprefcal([Spacer_plasmid1, Spacer_plasmid2, Spacer_plasmid3], [Spacer_cell1, Spacer_cell2, Spacer_cell3], *_pref_args, **_pref_kwargs)
dfRT = myprefcal([RT_plasmid1, RT_plasmid2, RT_plasmid3], [RT_cell1, RT_cell2, RT_cell3], *_pref_args, **_pref_kwargs)
dfRNaseH = myprefcal(
    [RNaseH_plasmid1],
    [RNaseH_cell1, RNaseH_cell2, RNaseH_cell3, RNaseH_cell4, RNaseH_cell5, RNaseH_cell6],
    *_pref_args, **_pref_kwargs
)
del _pref_args, _pref_kwargs

# Move the per-domain site numbers onto one common axis.
dfSpacer['site'] = dfSpacer['site'].add(179)
dfRT['site'] = dfRT['site'].add(179 + 169)
dfRNaseH['site'] = dfRNaseH['site'].add(179 + 169 + 344)

dfPol = pd.concat([dfTP, dfSpacer, dfRT, dfRNaseH])
dfPol['abs_logmfactor'] = dfPol['logmfactor'].abs()

# Per site: sum of |logmfactor|, then min-max scaled to [0, 1].
df = dfPol[['site', 'abs_logmfactor']].groupby(['site']).sum().reset_index()
df = df.assign(
    norm_abs_logmfactor=lambda d: (d['abs_logmfactor'] - d['abs_logmfactor'].min())
    / (d['abs_logmfactor'].max() - d['abs_logmfactor'].min())
)

# Region label per site. The first matching condition wins, so the list runs
# from highest to lowest precedence.
df['group'] = np.select(
    [
        df['site'].isin([325, 336, 340, 354, 481, 484, 504, 508]),
        df['site'].isin([65,553,554,702,731,750,790]),
        df['site'].between(179, 324),
        df['site'].between(456, 508),
        df['site'].between(325, 354),
    ],
    ['Zn fingers', 'Active sites', 'Spacer', 'RT insertion', 'RT extension'],
    default='Remaining',
)

df#.to_csv('Fig_3D.csv')

# Supplemental Figures

## Fig. S1B **

In [None]:
#@markdown Figure
# Recompute when a reset is requested (Do_not_reset is falsy) or when no cached
# dfCore_nofilt exists yet; otherwise reuse the cached frame.
if not Do_not_reset or 'dfCore_nofilt' not in globals():
    dfCore_nofilt = myprefcal([Core_plasmid1,Core_plasmid2], [Core_cell1, Core_cell2, Core_cell3],-6,6, 'linear', 'table', 'nofilter', 'yesnewpref', 'yesconcat', 'nopvalue', __codonorAA="codon", cscheme="redblue", creverse=True, __removetext = 'yesremove')

# Keep sites above 29 and renumber them by -28 in a SEPARATE frame. Writing the
# result back into dfCore_nofilt would make every later use of the cached frame
# filter and shift the sites again, dropping further rows each time.
dfCore_nofilt_shifted = dfCore_nofilt[dfCore_nofilt['site']>29].copy()
dfCore_nofilt_shifted['site'] = dfCore_nofilt_shifted['site']-28

dfTP_nofilt = myprefcal([TP_plasmid1, TP_plasmid2, TP_plasmid3], [TP_cell1,TP_cell2,TP_cell3],-6,6, 'linear', 'table', 'nofilter', 'yesnewpref', 'yesconcat', 'nopvalue', __codonorAA="codon", cscheme="redblue", creverse=True, __removetext = 'yesremove')
dfSpacer_nofilt = myprefcal([Spacer_plasmid1, Spacer_plasmid2, Spacer_plasmid3], [Spacer_cell1,Spacer_cell2,Spacer_cell3],-6,6, 'linear', 'table', 'nofilter', 'yesnewpref', 'yesconcat', 'nopvalue', __codonorAA="codon", cscheme="redblue", creverse=True, __removetext = 'yesremove')
dfRT_nofilt = myprefcal([RT_plasmid1, RT_plasmid2, RT_plasmid3], [RT_cell1,RT_cell2,RT_cell3],-6,6, 'linear', 'table', 'nofilter', 'yesnewpref', 'yesconcat', 'nopvalue', __codonorAA="codon", cscheme="redblue", creverse=True, __removetext = 'yesremove')
dfRNaseH_nofilt = myprefcal([RNaseH_plasmid1], [RNaseH_cell1, RNaseH_cell2, RNaseH_cell3, RNaseH_cell4, RNaseH_cell5, RNaseH_cell6],-6,6, 'linear', 'table', 'nofilter', 'yesnewpref', 'yesconcat', 'nopvalue', __codonorAA="codon", cscheme="redblue", creverse=True, __removetext = 'yesremove')

# Move the per-domain site numbers onto one common axis.
dfSpacer_nofilt['site'] = dfSpacer_nofilt['site'] + 179
dfRT_nofilt['site'] = dfRT_nofilt['site'] + 179 + 169
dfRNaseH_nofilt['site'] = dfRNaseH_nofilt['site'] + 179 + 169 + 344
dffull_nofilt = pd.concat([dfTP_nofilt, dfSpacer_nofilt, dfRT_nofilt, dfRNaseH_nofilt])

# Fraction of post-selection counts per nucleotide-substitution type.
df = muttypesingleNTcombined(pd.concat([dfCore_nofilt_shifted, dffull_nofilt]), _yesgraph="df", _preorpost = "post")

alt.Chart(df).mark_bar().encode(
    alt.X('type:O', sort=["AtoC", "TtoG", "AtoG", "TtoC", "AtoT", "TtoA", "CtoA", "GtoT", "CtoG", "GtoC", "CtoT", "GtoA"]),
    alt.Y('frac:Q'),
)#.save("Fig_S1B.html")

In [None]:
#@markdown Dataframe
# Recompute when a reset is requested (Do_not_reset is falsy) or when no cached
# dfCore_nofilt exists yet; otherwise reuse the cached frame.
if not Do_not_reset or 'dfCore_nofilt' not in globals():
    dfCore_nofilt = myprefcal([Core_plasmid1,Core_plasmid2], [Core_cell1, Core_cell2, Core_cell3],-6,6, 'linear', 'table', 'nofilter', 'yesnewpref', 'yesconcat', 'nopvalue', __codonorAA="codon", cscheme="redblue", creverse=True, __removetext = 'yesremove')

# Keep sites above 29 and renumber them by -28 in a SEPARATE frame. Writing the
# result back into dfCore_nofilt would make every later use of the cached frame
# filter and shift the sites again, dropping further rows each time.
dfCore_nofilt_shifted = dfCore_nofilt[dfCore_nofilt['site']>29].copy()
dfCore_nofilt_shifted['site'] = dfCore_nofilt_shifted['site']-28

dfTP_nofilt = myprefcal([TP_plasmid1, TP_plasmid2, TP_plasmid3], [TP_cell1,TP_cell2,TP_cell3],-6,6, 'linear', 'table', 'nofilter', 'yesnewpref', 'yesconcat', 'nopvalue', __codonorAA="codon", cscheme="redblue", creverse=True, __removetext = 'yesremove')
dfSpacer_nofilt = myprefcal([Spacer_plasmid1, Spacer_plasmid2, Spacer_plasmid3], [Spacer_cell1,Spacer_cell2,Spacer_cell3],-6,6, 'linear', 'table', 'nofilter', 'yesnewpref', 'yesconcat', 'nopvalue', __codonorAA="codon", cscheme="redblue", creverse=True, __removetext = 'yesremove')
dfRT_nofilt = myprefcal([RT_plasmid1, RT_plasmid2, RT_plasmid3], [RT_cell1,RT_cell2,RT_cell3],-6,6, 'linear', 'table', 'nofilter', 'yesnewpref', 'yesconcat', 'nopvalue', __codonorAA="codon", cscheme="redblue", creverse=True, __removetext = 'yesremove')
dfRNaseH_nofilt = myprefcal([RNaseH_plasmid1], [RNaseH_cell1, RNaseH_cell2, RNaseH_cell3, RNaseH_cell4, RNaseH_cell5, RNaseH_cell6],-6,6, 'linear', 'table', 'nofilter', 'yesnewpref', 'yesconcat', 'nopvalue', __codonorAA="codon", cscheme="redblue", creverse=True, __removetext = 'yesremove')

# Move the per-domain site numbers onto one common axis.
dfSpacer_nofilt['site'] = dfSpacer_nofilt['site'] + 179
dfRT_nofilt['site'] = dfRT_nofilt['site'] + 179 + 169
dfRNaseH_nofilt['site'] = dfRNaseH_nofilt['site'] + 179 + 169 + 344
dffull_nofilt = pd.concat([dfTP_nofilt, dfSpacer_nofilt, dfRT_nofilt, dfRNaseH_nofilt])

# Fraction of post-selection counts per nucleotide-substitution type.
muttypesingleNTcombined(pd.concat([dfCore_nofilt_shifted, dffull_nofilt]), _yesgraph="df", _preorpost = "post")#.to_csv('Fig_S1B.csv')

## Fig. S2 [*](#cell-id)

In [None]:
#@markdown Figure
# Recompute when a reset is requested (Do_not_reset is falsy) or when no cached
# dfCore exists yet; otherwise reuse the cached frame.
if not Do_not_reset or 'dfCore' not in globals():
    dfCore = myprefcal([Core_plasmid1,Core_plasmid2], [Core_cell1, Core_cell2, Core_cell3],-6,6, 'linear', 'table', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue', __codonorAA="codon", cscheme="redblue", creverse=True, __removetext = 'yesremove')

# Keep sites above 29 and renumber them by -28 (site 30 becomes 2). The
# explicit copy leaves the cached dfCore untouched.
df = dfCore[dfCore['site']>29].copy()
df['site'] = df['site']-28

dftoheatmap_precount(df, _sortbynt = "nosortbynt")#.save("Fig_S2.html")

In [None]:
#@markdown Dataframe
# Recompute when a reset is requested (Do_not_reset is falsy) or when no cached
# dfCore exists yet; otherwise reuse the cached frame.
if not Do_not_reset or 'dfCore' not in globals():
    dfCore = myprefcal([Core_plasmid1,Core_plasmid2], [Core_cell1, Core_cell2, Core_cell3],-6,6, 'linear', 'table', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue', __codonorAA="codon", cscheme="redblue", creverse=True, __removetext = 'yesremove')

# Keep sites above 29 and renumber them by -28 (site 30 becomes 2). The
# explicit copy leaves the cached dfCore untouched.
df = dfCore[dfCore['site']>29].copy()
df['site'] = df['site']-28

df#.to_csv('Fig_S2.csv')

## Fig. S3B  [**](#cell-id2)

In [None]:
#@markdown Figure
# Each cached frame is recomputed when a reset is requested (Do_not_reset is
# falsy) or when it does not exist yet; otherwise the cached frame is reused.
if not Do_not_reset or 'dfCore' not in globals():
    dfCore = myprefcal([Core_plasmid1,Core_plasmid2], [Core_cell1, Core_cell2, Core_cell3],-6,6, 'linear', 'table', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue', __codonorAA="codon", cscheme="redblue", creverse=True, __removetext = 'yesremove')
if not Do_not_reset or 'dfCore_deter' not in globals():
    dfCore_deter = myprefcal_deter([Core_plasmid1,Core_plasmid2], [Core_cell1, Core_cell2, Core_cell3],-6,6, 'linear', 'ntable', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue')
if not Do_not_reset or 'dfCorewithKozak2' not in globals():
    dfCorewithKozak2 = KozakScoreAtCertainPos2(dfCore, 280, dfCore_deter)

# Site 93 only, without the ATG and stop-codon variants.
df = dfCorewithKozak2
df = df.loc[df['site'].isin([93]) & ~df['variable'].isin(['ATG', 'TAG', 'TGA', 'TAA'])]

chart = alt.Chart(df).mark_point().encode(
    y=alt.Y('logmfactor', scale=alt.Scale(domain=[-5, 1.5])),
    x=alt.X('kozakscore', scale=alt.Scale(domain=[0.4, 0.85])),
    tooltip=['info', 'site', 'nrmpre', 'pre_value', 'pre_sum', 'post_value', 'post_sum', 'logmfactor', 'kozakscore']
).properties(width=500, height=500)

# Reference rules: a vertical rule at the Kozak score 0.5181337952805991 and a
# horizontal rule at logmfactor = 0.
line1 = alt.Chart(pd.DataFrame({'kozakscore': [0.5181337952805991]})).mark_rule().encode(x='kozakscore')
line2 = alt.Chart(pd.DataFrame({'logmfactor': [0]})).mark_rule().encode(y='logmfactor')

# NOTE(review): transform_regression(on, regression) takes the predictor first.
# As written, logmfactor (the y axis) is the predictor and kozakscore (the x
# axis) is the predicted field; confirm this direction is intended.
(chart + chart.transform_regression('logmfactor', 'kozakscore').mark_line() + line1 + line2)#.save("Fig_S3B.html")

In [None]:
#@markdown Dataframe
# Each cached frame below is computed only when Do_not_reset is truthy and the
# name is not yet a global; otherwise the existing object is reused.
dfCore = (
    myprefcal(
        [Core_plasmid1, Core_plasmid2],
        [Core_cell1, Core_cell2, Core_cell3],
        -6, 6, 'linear', 'table', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue',
        __codonorAA="codon", cscheme="redblue", creverse=True, __removetext='yesremove',
    )
    if Do_not_reset and 'dfCore' not in globals()
    else dfCore
)
dfCore_deter = (
    myprefcal_deter(
        [Core_plasmid1, Core_plasmid2],
        [Core_cell1, Core_cell2, Core_cell3],
        -6, 6, 'linear', 'ntable', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue',
    )
    if Do_not_reset and 'dfCore_deter' not in globals()
    else dfCore_deter
)
dfCorewithKozak2 = (
    KozakScoreAtCertainPos2(dfCore, 280, dfCore_deter)
    if Do_not_reset and 'dfCorewithKozak2' not in globals()
    else dfCorewithKozak2
)

# Restrict to site 93 and drop the ATG codon and the three stop codons.
df = dfCorewithKozak2.loc[
    dfCorewithKozak2['site'].isin([93])
    & ~dfCorewithKozak2['variable'].isin(['ATG', 'TAG', 'TGA', 'TAA'])
]

# Append .to_csv('Fig_S3B.csv') to the line below to export the table.
df  # .to_csv('Fig_S3B.csv')

## Fig. S3C [**](#cell-id2)

In [None]:
#@markdown Figure
# Each cached frame below is computed only when Do_not_reset is truthy and the
# name is not yet a global; otherwise the existing object is reused.
# NOTE(review): with a falsy Do_not_reset the else branch is always taken, so a
# frame that was never built raises NameError here — confirm this is intended.
dfCore = (
    myprefcal(
        [Core_plasmid1, Core_plasmid2],
        [Core_cell1, Core_cell2, Core_cell3],
        -6, 6, 'linear', 'table', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue',
        __codonorAA="codon", cscheme="redblue", creverse=True, __removetext='yesremove',
    )
    if Do_not_reset and 'dfCore' not in globals()
    else dfCore
)
dfCore_deter = (
    myprefcal_deter(
        [Core_plasmid1, Core_plasmid2],
        [Core_cell1, Core_cell2, Core_cell3],
        -6, 6, 'linear', 'ntable', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue',
    )
    if Do_not_reset and 'dfCore_deter' not in globals()
    else dfCore_deter
)
dfCorewithKozak2_116B = (
    KozakScoreAtCertainPos2(dfCore, (116*3-2+1), dfCore_deter)
    if Do_not_reset and 'dfCorewithKozak2_116B' not in globals()
    else dfCorewithKozak2_116B
)

# .assign returns a new frame, so the cached dfCorewithKozak2_116B is no longer
# modified when the 'group' label is added.
df = dfCorewithKozak2_116B.assign(group='J start preserver')
df.loc[(df['1ATGbreak'] == 1) | (df['-2ATGbreak'] == 1), 'group'] = 'J start breaker'

# Keep only the rows labelled 'J start preserver' at site 115.
df = df.loc[(df['group'] == 'J start preserver') & df['site'].isin([115])].copy()

# 'position -3' holds the second character of the codon string in 'variable';
# CTvsAG records whether that character is A/G or anything else.
df['position -3'] = df['variable'].astype(str).str[1]
df['CTvsAG'] = 'C or T'
df.loc[df['position -3'].isin(['A', 'G']), 'CTvsAG'] = 'A or G'

# Scatter of logmfactor (y) against kozakscore (x), coloured by CTvsAG.
chart = alt.Chart(df).mark_point().encode(
    x=alt.X('kozakscore', scale=alt.Scale(domain=[0.25, 0.70])),
    y=alt.Y('logmfactor', scale=alt.Scale(domain=[-4, 1.5])),
    color='CTvsAG',
    tooltip=['info', 'site', 'nrmpre', 'pre_value', 'pre_sum', 'post_value', 'post_sum', 'logmfactor', 'kozakscore'],
).properties(width=500, height=500)

# Reference rules: a vertical line at kozakscore = 0.539792965618 (presumably
# the wild-type score — TODO confirm) and a horizontal line at logmfactor = 0.
line1 = alt.Chart(pd.DataFrame({'kozakscore': [0.539792965618]})).mark_rule().encode(x='kozakscore')
line2 = alt.Chart(pd.DataFrame({'logmfactor': [0]})).mark_rule().encode(y='logmfactor')

# transform_regression(on, regression): `on` is the independent (x) field and
# `regression` the dependent (y) field, so logmfactor is fitted as a function
# of kozakscore, matching the plotted axes.
# NOTE(review): no groupby is passed, so one line is fitted across both CTvsAG
# groups — confirm that a pooled fit is what is wanted.
(
    chart
    + chart.transform_regression('kozakscore', 'logmfactor').mark_line()
    + line1
    + line2
)  # .save("Fig_S3C.html")

In [None]:
#@markdown Dataframe
# Each cached frame below is computed only when Do_not_reset is truthy and the
# name is not yet a global; otherwise the existing object is reused.
# NOTE(review): with a falsy Do_not_reset the else branch is always taken, so a
# frame that was never built raises NameError here — confirm this is intended.
dfCore = (
    myprefcal(
        [Core_plasmid1, Core_plasmid2],
        [Core_cell1, Core_cell2, Core_cell3],
        -6, 6, 'linear', 'table', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue',
        __codonorAA="codon", cscheme="redblue", creverse=True, __removetext='yesremove',
    )
    if Do_not_reset and 'dfCore' not in globals()
    else dfCore
)
dfCore_deter = (
    myprefcal_deter(
        [Core_plasmid1, Core_plasmid2],
        [Core_cell1, Core_cell2, Core_cell3],
        -6, 6, 'linear', 'ntable', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue',
    )
    if Do_not_reset and 'dfCore_deter' not in globals()
    else dfCore_deter
)
dfCorewithKozak2_116B = (
    KozakScoreAtCertainPos2(dfCore, (116*3-2+1), dfCore_deter)
    if Do_not_reset and 'dfCorewithKozak2_116B' not in globals()
    else dfCorewithKozak2_116B
)

# .assign returns a new frame, so the cached dfCorewithKozak2_116B is no longer
# modified when the 'group' label is added.
df = dfCorewithKozak2_116B.assign(group='J start preserver')
df.loc[(df['1ATGbreak'] == 1) | (df['-2ATGbreak'] == 1), 'group'] = 'J start breaker'

# Keep only the rows labelled 'J start preserver' at site 115.
df = df.loc[(df['group'] == 'J start preserver') & df['site'].isin([115])].copy()

# 'position -3' holds the second character of the codon string in 'variable';
# CTvsAG records whether that character is A/G or anything else.
df['position -3'] = df['variable'].astype(str).str[1]
df['CTvsAG'] = 'C or T'
df.loc[df['position -3'].isin(['A', 'G']), 'CTvsAG'] = 'A or G'

# Append .to_csv('Fig_S3C.csv') to the line below to export the table.
df  # .to_csv('Fig_S3C.csv')

## Fig. S3D [**](#cell-id2)

In [None]:
#@markdown Figure
# Each cached frame below is computed only when Do_not_reset is truthy and the
# name is not yet a global; otherwise the existing object is reused.
# NOTE(review): with a falsy Do_not_reset the else branch is always taken, so a
# frame that was never built raises NameError here — confirm this is intended.
dfCore = (
    myprefcal(
        [Core_plasmid1, Core_plasmid2],
        [Core_cell1, Core_cell2, Core_cell3],
        -6, 6, 'linear', 'table', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue',
        __codonorAA="codon", cscheme="redblue", creverse=True, __removetext='yesremove',
    )
    if Do_not_reset and 'dfCore' not in globals()
    else dfCore
)
dfCore_deter = (
    myprefcal_deter(
        [Core_plasmid1, Core_plasmid2],
        [Core_cell1, Core_cell2, Core_cell3],
        -6, 6, 'linear', 'ntable', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue',
    )
    if Do_not_reset and 'dfCore_deter' not in globals()
    else dfCore_deter
)
dfCorewithKozak2_116B = (
    KozakScoreAtCertainPos2(dfCore, (116*3-2+1), dfCore_deter)
    if Do_not_reset and 'dfCorewithKozak2_116B' not in globals()
    else dfCorewithKozak2_116B
)

# .assign returns a new frame, so the cached dfCorewithKozak2_116B is no longer
# modified when the 'group' label is added.
df = dfCorewithKozak2_116B.assign(group='J start preserver')
df.loc[(df['1ATGbreak'] == 1) | (df['-2ATGbreak'] == 1), 'group'] = 'J start breaker'

# Keep only the rows labelled 'J start preserver' at site 115.
df = df.loc[(df['group'] == 'J start preserver') & df['site'].isin([115])].copy()

# 'position -3' holds the second character of the codon string in 'variable';
# CTvsAG records whether that character is A/G or anything else.
df['position -3'] = df['variable'].astype(str).str[1]
df['CTvsAG'] = 'C or T'
df.loc[df['position -3'].isin(['A', 'G']), 'CTvsAG'] = 'A or G'

# Individual observations, spread horizontally by a random jitter value.
# The jitter expression is a Box-Muller transform evaluated by Vega, so the
# horizontal positions differ between renders.
points = alt.Chart().mark_point(size=22).encode(
    x=alt.X(
        'jitter:Q',
        title=None,
        axis=alt.Axis(values=[0], ticks=True, grid=False, labels=False),
        scale=alt.Scale(domain=[-4, 4]),
    ),
    y=alt.Y('logmfactor:Q'),
    tooltip=['variable', 'wildtype', 'mutseq', 'logmfactor', 'site', 'AA', 'wtAA', 'pre_value', 'pre_sum', 'nrmpre', 'post_value', 'post_sum'],
).transform_calculate(jitter='sqrt(-2*log(random()))*cos(2*PI*random())')

# Box plot of logmfactor; outlier marks are given size 0.
boxplot = alt.Chart().mark_boxplot(
    color='gray', size=10, opacity=0.4, outliers={'size': 0}
).encode(
    alt.Y('logmfactor:Q')
).properties(width=100)

# Error bar with extent='ci' on logmfactor.
error = alt.Chart().mark_errorbar(
    extent='ci', ticks=True, color='black', opacity=0.8
).encode(
    y=alt.Y('logmfactor:Q'),
    strokeWidth=alt.value(3),
)

# Layer the three marks on the shared data and facet into one column per
# CTvsAG value, with 'C or T' first.
fullgraph = (
    alt.layer(points, boxplot, error, data=df)
    .properties(width=75, height=400)
    .facet(column=alt.Column('CTvsAG:N', sort=['C or T', 'A or G']), spacing=0)
    .configure_view(stroke=None)
    .resolve_scale(x=alt.ResolveMode("independent"))
)
fullgraph  # .save("Fig_S3D.html")

In [None]:
#@markdown Dataframe
# Each cached frame below is computed only when Do_not_reset is truthy and the
# name is not yet a global; otherwise the existing object is reused.
# NOTE(review): with a falsy Do_not_reset the else branch is always taken, so a
# frame that was never built raises NameError here — confirm this is intended.
dfCore = (
    myprefcal(
        [Core_plasmid1, Core_plasmid2],
        [Core_cell1, Core_cell2, Core_cell3],
        -6, 6, 'linear', 'table', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue',
        __codonorAA="codon", cscheme="redblue", creverse=True, __removetext='yesremove',
    )
    if Do_not_reset and 'dfCore' not in globals()
    else dfCore
)
dfCore_deter = (
    myprefcal_deter(
        [Core_plasmid1, Core_plasmid2],
        [Core_cell1, Core_cell2, Core_cell3],
        -6, 6, 'linear', 'ntable', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue',
    )
    if Do_not_reset and 'dfCore_deter' not in globals()
    else dfCore_deter
)
dfCorewithKozak2_116B = (
    KozakScoreAtCertainPos2(dfCore, (116*3-2+1), dfCore_deter)
    if Do_not_reset and 'dfCorewithKozak2_116B' not in globals()
    else dfCorewithKozak2_116B
)

# .assign returns a new frame, so the cached dfCorewithKozak2_116B is no longer
# modified when the 'group' label is added.
df = dfCorewithKozak2_116B.assign(group='J start preserver')
df.loc[(df['1ATGbreak'] == 1) | (df['-2ATGbreak'] == 1), 'group'] = 'J start breaker'

# Keep only the rows labelled 'J start preserver' at site 115.
df = df.loc[(df['group'] == 'J start preserver') & df['site'].isin([115])].copy()

# 'position -3' holds the second character of the codon string in 'variable';
# CTvsAG records whether that character is A/G or anything else.
df['position -3'] = df['variable'].astype(str).str[1]
df['CTvsAG'] = 'C or T'
df.loc[df['position -3'].isin(['A', 'G']), 'CTvsAG'] = 'A or G'

# Append .to_csv('Fig_S3D.csv') to the line below to export the table.
df  # .to_csv('Fig_S3D.csv')

## Fig. S4 [*](#cell-id)

In [None]:
#@markdown Figure (Part 1)
# One myprefcal table per segment (TP, Spacer, RT, RNaseH), all built with the
# same settings. These frames are rebuilt on every run of the cell, so the site
# offsets applied below are never stacked on top of an earlier shift.
dfTP = myprefcal(
    [TP_plasmid1, TP_plasmid2, TP_plasmid3],
    [TP_cell1, TP_cell2, TP_cell3],
    -6, 6, 'linear', 'table', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue',
    __codonorAA="codon", cscheme="redblue", creverse=True, __removetext='yesremove',
)
dfSpacer = myprefcal(
    [Spacer_plasmid1, Spacer_plasmid2, Spacer_plasmid3],
    [Spacer_cell1, Spacer_cell2, Spacer_cell3],
    -6, 6, 'linear', 'table', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue',
    __codonorAA="codon", cscheme="redblue", creverse=True, __removetext='yesremove',
)
dfRT = myprefcal(
    [RT_plasmid1, RT_plasmid2, RT_plasmid3],
    [RT_cell1, RT_cell2, RT_cell3],
    -6, 6, 'linear', 'table', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue',
    __codonorAA="codon", cscheme="redblue", creverse=True, __removetext='yesremove',
)
dfRNaseH = myprefcal(
    [RNaseH_plasmid1],
    [RNaseH_cell1, RNaseH_cell2, RNaseH_cell3, RNaseH_cell4, RNaseH_cell5, RNaseH_cell6],
    -6, 6, 'linear', 'table', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue',
    __codonorAA="codon", cscheme="redblue", creverse=True, __removetext='yesremove',
)

# Offset the per-segment site numbers: Spacer by 179, RT by 179 + 169 and
# RNaseH by 179 + 169 + 344 (presumably the lengths of the preceding
# segments — TODO confirm).
dfSpacer['site'] = dfSpacer['site'].add(179)
dfRT['site'] = dfRT['site'].add(179).add(169)
dfRNaseH['site'] = dfRNaseH['site'].add(179).add(169).add(344)

dfPol = pd.concat([dfTP, dfSpacer, dfRT, dfRNaseH])

# The heatmap is drawn in two parts; this cell shows TP + Spacer only.
# Single-chart alternative over all four segments:
#   dftoheatmap_precount(dfPol, _sortbynt = "nosortbynt")#.save("Fig_S4.html")
dftoheatmap_precount(
    pd.concat([dfTP, dfSpacer]),
    _sortbynt="nosortbynt",
)

In [None]:
#@markdown Figure (Part 2)
# Second half of the heatmap: RT + RNaseH. dfRT and dfRNaseH are not built in
# this cell, so the "Figure (Part 1)" cell must have been run first.
dftoheatmap_precount(
    pd.concat([dfRT, dfRNaseH]),
    _sortbynt="nosortbynt",
)

In [None]:
#@markdown Dataframe
# One myprefcal table per segment (TP, Spacer, RT, RNaseH), all built with the
# same settings. These frames are rebuilt on every run of the cell, so the site
# offsets applied below are never stacked on top of an earlier shift.
dfTP = myprefcal(
    [TP_plasmid1, TP_plasmid2, TP_plasmid3],
    [TP_cell1, TP_cell2, TP_cell3],
    -6, 6, 'linear', 'table', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue',
    __codonorAA="codon", cscheme="redblue", creverse=True, __removetext='yesremove',
)
dfSpacer = myprefcal(
    [Spacer_plasmid1, Spacer_plasmid2, Spacer_plasmid3],
    [Spacer_cell1, Spacer_cell2, Spacer_cell3],
    -6, 6, 'linear', 'table', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue',
    __codonorAA="codon", cscheme="redblue", creverse=True, __removetext='yesremove',
)
dfRT = myprefcal(
    [RT_plasmid1, RT_plasmid2, RT_plasmid3],
    [RT_cell1, RT_cell2, RT_cell3],
    -6, 6, 'linear', 'table', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue',
    __codonorAA="codon", cscheme="redblue", creverse=True, __removetext='yesremove',
)
dfRNaseH = myprefcal(
    [RNaseH_plasmid1],
    [RNaseH_cell1, RNaseH_cell2, RNaseH_cell3, RNaseH_cell4, RNaseH_cell5, RNaseH_cell6],
    -6, 6, 'linear', 'table', 'yesfilter', 'yesnewpref', 'yesconcat', 'nopvalue',
    __codonorAA="codon", cscheme="redblue", creverse=True, __removetext='yesremove',
)

# Offset the per-segment site numbers: Spacer by 179, RT by 179 + 169 and
# RNaseH by 179 + 169 + 344 (presumably the lengths of the preceding
# segments — TODO confirm).
dfSpacer['site'] = dfSpacer['site'].add(179)
dfRT['site'] = dfRT['site'].add(179).add(169)
dfRNaseH['site'] = dfRNaseH['site'].add(179).add(169).add(344)

# Stack the four segment tables into one frame.
dfPol = pd.concat([dfTP, dfSpacer, dfRT, dfRNaseH])

# Append .to_csv('Fig_S4.csv') to the line below to export the table.
dfPol  # .to_csv('Fig_S4.csv')

## Fig. S5 [*](#cell-id)

In [None]:
#@markdown Figure
# Read the CSV referenced by insilicoDMS_Pol and pass it to the in-silico
# heatmap helper with a colour range of -13 to 13 on the reversed "redblue"
# scheme. Append .save("Fig_S5.html") to the call to export the chart.
dftoheatmap_insilico(
    pd.read_csv(insilicoDMS_Pol),
    _min=-13,
    _max=13,
    cscheme="redblue",
    creverse=True,
    _removetext='yesremove',
)  # .save("Fig_S5.html")

In [None]:
#@markdown Code
# Show the table stored in the CSV referenced by insilicoDMS_Pol.
# Append .to_csv('Fig_S5.csv') to the expression below to write a copy.
pd.read_csv(
    insilicoDMS_Pol
)  # .to_csv('Fig_S5.csv')

# Footnotes

<a name="cell-id"></a>
\* For the final figure in the paper, the figure was formatted in Adobe Illustrator.

<a name="cell-id2"></a>
\** For the final figure in the paper, the dataframe was visualized in GraphPad Prism and formatted in Adobe Illustrator.