# Notebook to generate dataframes of DMS mutants

## Import libraries

In [1]:
import pandas as pd
import numpy as np

## Functions

In [2]:
def get_nt_seq(wt_seq, mut_dic):
    """Return the nucleotide sequence of a variant built from the wild type.

    Parameters
    ----------
    wt_seq : str
        Wild-type nucleotide sequence, read as consecutive 3-nt codons.
    mut_dic : dict
        Maps 0-based codon positions to the codon placed at that position.

    Returns
    -------
    str
        ``wt_seq`` in which every codon whose position is a key of
        ``mut_dic`` is replaced by the mapped codon. Keys that match no
        codon position are ignored.
    """
    # Converting WT nucleotide sequence to list of codons
    wt_codons = [wt_seq[i:i + 3] for i in range(0, len(wt_seq), 3)]
    # dict.get keeps the WT codon wherever no mutation is specified
    return ''.join(mut_dic.get(pos, codon) for pos, codon in enumerate(wt_codons))

In [3]:
def get_alt_codons(nt_seq, wt_seq):
    """List the codons of a variant that differ from the wild type.

    Both sequences are read as consecutive 3-nt codons and compared
    position by position, following the codons of ``nt_seq``.

    Parameters
    ----------
    nt_seq : str
        Nucleotide sequence of the variant.
    wt_seq : str
        Wild-type nucleotide sequence.

    Returns
    -------
    tuple of (list, list, list)
        1-based ranks of the altered codons (1..k), their 0-based codon
        positions, and the altered codons themselves.

    Raises
    ------
    IndexError
        If ``nt_seq`` contains more codons than ``wt_seq``.
    """
    wt_codons = [wt_seq[k:k + 3] for k in range(0, len(wt_seq), 3)]
    var_codons = [nt_seq[k:k + 3] for k in range(0, len(nt_seq), 3)]

    # Indexing (rather than zip) so a variant longer than the WT still raises
    diffs = [
        (idx, codon)
        for idx, codon in enumerate(var_codons)
        if codon != wt_codons[idx]
    ]
    positions = [idx for idx, _ in diffs]
    codons = [codon for _, codon in diffs]
    ranks = list(range(1, len(diffs) + 1))
    return ranks, positions, codons

## Specify locus

In [35]:
# Locus identifier: selects the WT row in the wtseqs table and is embedded in the ortholog and output paths
locus = 'FKS2-HS2'

## Specify paths

In [36]:
# Input: codon table (columns used below: 'codon', 'aminoacid')
codon_table_path = 'data/general_use/ScerevisiaeTAXID559292_Cocoputs_codon_table.csv'
# Input: WT sequences (columns used below: 'locus', 'nt_seq')
wtseqs_path = 'data/general_use/wtseqs.csv'
# Input: ortholog nucleotide sequences for the selected locus (column used below: 'nt_seq')
ortho_path = f'orthologs/{locus}_ortholog_nt.csv'
# Output: dataframe of all variants for the selected locus
outpath_f = f'data/{locus}/{locus}_var_df.csv'

## Get WT and possible mutations

In [7]:
# Load the table of WT sequences and keep the first one listed for the selected locus
wtseqs = pd.read_csv(wtseqs_path, header=0)
wtseq = wtseqs.loc[wtseqs['locus'] == locus, 'nt_seq'].to_numpy()[0]
wtseq

'tttttagttttatctttgagagatcca'

In [8]:
# Load the codon table and lowercase the codons before they are compared with the WT sequence
codon_table = pd.read_csv(codon_table_path, header=0)
codon_table = codon_table.assign(codon=codon_table['codon'].str.lower())
codon_table.head(3)

Unnamed: 0,codon,aminoacid,freq,number
0,ttt,F,26.26,76999
1,ttc,F,17.89,52459
2,tta,L,26.31,77131


In [9]:
# Codon -> amino acid lookup built from the (codon, aminoacid) pairs of the table
codon_dic = dict(codon_table[['codon', 'aminoacid']].itertuples(index=False, name=None))
print(codon_dic)

{'ttt': 'F', 'ttc': 'F', 'tta': 'L', 'ttg': 'L', 'ctt': 'L', 'ctc': 'L', 'cta': 'L', 'ctg': 'L', 'att': 'I', 'atc': 'I', 'ata': 'I', 'atg': 'M', 'gtt': 'V', 'gtc': 'V', 'gta': 'V', 'gtg': 'V', 'tct': 'S', 'tcc': 'S', 'tca': 'S', 'tcg': 'S', 'cct': 'P', 'ccc': 'P', 'cca': 'P', 'ccg': 'P', 'act': 'T', 'acc': 'T', 'aca': 'T', 'acg': 'T', 'gct': 'A', 'gcc': 'A', 'gca': 'A', 'gcg': 'A', 'tat': 'Y', 'tac': 'Y', 'taa': '*', 'tag': '*', 'cat': 'H', 'cac': 'H', 'caa': 'Q', 'cag': 'Q', 'aat': 'N', 'aac': 'N', 'aaa': 'K', 'aag': 'K', 'gat': 'D', 'gac': 'D', 'gaa': 'E', 'gag': 'E', 'tgt': 'C', 'tgc': 'C', 'tga': '*', 'tgg': 'W', 'cgt': 'R', 'cgc': 'R', 'cga': 'R', 'cgg': 'R', 'agt': 'S', 'agc': 'S', 'aga': 'R', 'agg': 'R', 'ggt': 'G', 'ggc': 'G', 'gga': 'G', 'ggg': 'G'}


In [10]:
# NNK codons: any base at the first two positions, g or t at the third
nnk = [codon for codon in codon_dic if codon[2] in ('g', 't')]
len(nnk)

32

## Generate single mutants

In [10]:
# One row per codon position of the WT, holding every NNK codon except the WT codon itself
rows_l = [
    [
        start // 3,  # 0-based position (aa)
        [codon for codon in nnk if codon != wtseq[start:start + 3]],  # possible NNK codons other than WT
    ]
    for start in range(0, len(wtseq), 3)
]

singles_compact_df = pd.DataFrame(rows_l, columns=['aa_pos', 'alt_codons']).assign(
    seq_type='single',
    mutated_codon=1,
)
singles_compact_df

Unnamed: 0,aa_pos,alt_codons,seq_type,mutated_codon
0,0,"[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ...",single,1
1,1,"[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ...",single,1
2,2,"[ttt, ttg, ctt, ctg, att, atg, gtt, tct, tcg, ...",single,1
3,3,"[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ...",single,1
4,4,"[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ...",single,1
5,5,"[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ...",single,1
6,6,"[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ...",single,1
7,7,"[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ...",single,1


In [11]:
# Expand the per-position codon lists: one row per (position, alternative codon)
# The original row label is repeated for every exploded row (see index in the output)
singles_df = singles_compact_df.explode('alt_codons')
singles_df

Unnamed: 0,aa_pos,alt_codons,seq_type,mutated_codon
0,0,ttt,single,1
0,0,ttg,single,1
0,0,ctt,single,1
0,0,ctg,single,1
0,0,att,single,1
...,...,...,...,...
7,7,cgg,single,1
7,7,agt,single,1
7,7,agg,single,1
7,7,ggt,single,1


In [12]:
# One {position: codon} dictionary per single mutant, in the format expected by get_nt_seq
singles_df['mutations'] = [
    {pos: codon}
    for pos, codon in zip(singles_df['aa_pos'], singles_df['alt_codons'])
]
singles_df

Unnamed: 0,aa_pos,alt_codons,seq_type,mutated_codon,mutations
0,0,ttt,single,1,{0: 'ttt'}
0,0,ttg,single,1,{0: 'ttg'}
0,0,ctt,single,1,{0: 'ctt'}
0,0,ctg,single,1,{0: 'ctg'}
0,0,att,single,1,{0: 'att'}
...,...,...,...,...,...
7,7,cgg,single,1,{7: 'cgg'}
7,7,agt,single,1,{7: 'agt'}
7,7,agg,single,1,{7: 'agg'}
7,7,ggt,single,1,{7: 'ggt'}


In [13]:
# Build the full nucleotide sequence of each single mutant, then discard the helper column
singles_df['nt_seq'] = [get_nt_seq(wtseq, mut) for mut in singles_df['mutations']]
singles_df = singles_df.drop(columns='mutations')
singles_df

Unnamed: 0,aa_pos,alt_codons,seq_type,mutated_codon,nt_seq
0,0,ttt,single,1,ttttgggtgagacgttatacactc
0,0,ttg,single,1,ttgtgggtgagacgttatacactc
0,0,ctt,single,1,ctttgggtgagacgttatacactc
0,0,ctg,single,1,ctgtgggtgagacgttatacactc
0,0,att,single,1,atttgggtgagacgttatacactc
...,...,...,...,...,...
7,7,cgg,single,1,gattgggtgagacgttatacacgg
7,7,agt,single,1,gattgggtgagacgttatacaagt
7,7,agg,single,1,gattgggtgagacgttatacaagg
7,7,ggt,single,1,gattgggtgagacgttatacaggt


## Generate double mutants

In [14]:
# All pairs (i, j) with i < j, where i is the position of the first mutated amino acid
# and j the position of the second one
import itertools
pos = range(len(wtseq) // 3)
combinations = list(itertools.combinations(pos, 2))
print(combinations)

[(0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6), (0, 7), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (2, 3), (2, 4), (2, 5), (2, 6), (2, 7), (3, 4), (3, 5), (3, 6), (3, 7), (4, 5), (4, 6), (4, 7), (5, 6), (5, 7), (6, 7)]


In [15]:
# Two rows per combination (one per mutated codon), each reusing the list of
# alternative codons already computed for that position in singles_compact_df
rows_l = [
    [
        pair,  # Combination
        rank,  # Mutated codon
        aa,  # 0-based position (aa)
        singles_compact_df.loc[singles_compact_df['aa_pos'] == aa, 'alt_codons'].values[0],  # possible NNK codons other than WT
    ]
    for pair in combinations
    for rank, aa in enumerate(pair, start=1)
]

doubles_compact_df = pd.DataFrame(
    rows_l,
    columns=['combination', 'mutated_codon', 'aa_pos', 'alt_codons'],
).assign(seq_type='double')
doubles_compact_df

Unnamed: 0,combination,mutated_codon,aa_pos,alt_codons,seq_type
0,"(0, 1)",1,0,"[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ...",double
1,"(0, 1)",2,1,"[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ...",double
2,"(0, 2)",1,0,"[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ...",double
3,"(0, 2)",2,2,"[ttt, ttg, ctt, ctg, att, atg, gtt, tct, tcg, ...",double
4,"(0, 3)",1,0,"[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ...",double
5,"(0, 3)",2,3,"[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ...",double
6,"(0, 4)",1,0,"[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ...",double
7,"(0, 4)",2,4,"[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ...",double
8,"(0, 5)",1,0,"[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ...",double
9,"(0, 5)",2,5,"[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ...",double


In [16]:
# Dataframe is pivoted to wide format (one row per combination) only to be able to use DataFrame.explode(),
# then later on melted to go back to long format.
# Each (combination, mutated_codon) pair occurs once in doubles_compact_df, so aggfunc='first' simply carries that value over.
doubles_piv = doubles_compact_df.pivot_table(index = ['seq_type', 'combination'], columns = 'mutated_codon', values = ['aa_pos', 'alt_codons'], aggfunc = 'first').reset_index()
# Flatten the two-level column labels: all but the last four columns keep only their first level,
# the last four become <value><mutated_codon> (aa_pos1, aa_pos2, alt_codons1, alt_codons2)
doubles_piv.columns = [x[0] for x in doubles_piv.columns[:-4]] + [f"{x[0]}{x[1]}" for x in doubles_piv.columns[-4:]]
doubles_piv

Unnamed: 0,seq_type,combination,aa_pos1,aa_pos2,alt_codons1,alt_codons2
0,double,"(0, 1)",0,1,"[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ...","[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ..."
1,double,"(0, 2)",0,2,"[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ...","[ttt, ttg, ctt, ctg, att, atg, gtt, tct, tcg, ..."
2,double,"(0, 3)",0,3,"[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ...","[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ..."
3,double,"(0, 4)",0,4,"[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ...","[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ..."
4,double,"(0, 5)",0,5,"[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ...","[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ..."
5,double,"(0, 6)",0,6,"[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ...","[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ..."
6,double,"(0, 7)",0,7,"[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ...","[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ..."
7,double,"(1, 2)",1,2,"[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ...","[ttt, ttg, ctt, ctg, att, atg, gtt, tct, tcg, ..."
8,double,"(1, 3)",1,3,"[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ...","[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ..."
9,double,"(1, 4)",1,4,"[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ...","[ttt, ttg, ctt, ctg, att, atg, gtt, gtg, tct, ..."


In [17]:
# Exploding both codon lists in turn yields every pairing of an alternative codon
# at the first position with an alternative codon at the second position
doubles_exp1 = doubles_piv.explode('alt_codons1')
doubles_exp2 = doubles_exp1.explode('alt_codons2')
# explode() repeats row labels, so rebuild a clean 0..n-1 index
doubles_df = doubles_exp2.reset_index(drop=True)
doubles_df

Unnamed: 0,seq_type,combination,aa_pos1,aa_pos2,alt_codons1,alt_codons2
0,double,"(0, 1)",0,1,ttt,ttt
1,double,"(0, 1)",0,1,ttt,ttg
2,double,"(0, 1)",0,1,ttt,ctt
3,double,"(0, 1)",0,1,ttt,ctg
4,double,"(0, 1)",0,1,ttt,att
...,...,...,...,...,...,...
27557,double,"(6, 7)",6,7,ggg,cgg
27558,double,"(6, 7)",6,7,ggg,agt
27559,double,"(6, 7)",6,7,ggg,agg
27560,double,"(6, 7)",6,7,ggg,ggt


In [18]:
# Columns holding the positions (aa_pos1, aa_pos2) and the codons (alt_codons1, alt_codons2)
aa_pos_col = [col for col in doubles_df.columns if 'aa_pos' in col]
aa_cod_col = [col for col in doubles_df.columns if 'alt_codons' in col]
# One {position: codon} dictionary per double mutant, in the format expected by get_nt_seq
doubles_df['mutations'] = [
    dict(zip(positions, codons))
    for positions, codons in zip(
        doubles_df[aa_pos_col].itertuples(index=False, name=None),
        doubles_df[aa_cod_col].itertuples(index=False, name=None),
    )
]
doubles_df.head(10)

Unnamed: 0,seq_type,combination,aa_pos1,aa_pos2,alt_codons1,alt_codons2,mutations
0,double,"(0, 1)",0,1,ttt,ttt,"{0: 'ttt', 1: 'ttt'}"
1,double,"(0, 1)",0,1,ttt,ttg,"{0: 'ttt', 1: 'ttg'}"
2,double,"(0, 1)",0,1,ttt,ctt,"{0: 'ttt', 1: 'ctt'}"
3,double,"(0, 1)",0,1,ttt,ctg,"{0: 'ttt', 1: 'ctg'}"
4,double,"(0, 1)",0,1,ttt,att,"{0: 'ttt', 1: 'att'}"
5,double,"(0, 1)",0,1,ttt,atg,"{0: 'ttt', 1: 'atg'}"
6,double,"(0, 1)",0,1,ttt,gtt,"{0: 'ttt', 1: 'gtt'}"
7,double,"(0, 1)",0,1,ttt,gtg,"{0: 'ttt', 1: 'gtg'}"
8,double,"(0, 1)",0,1,ttt,tct,"{0: 'ttt', 1: 'tct'}"
9,double,"(0, 1)",0,1,ttt,tcg,"{0: 'ttt', 1: 'tcg'}"


In [19]:
# Build the full nucleotide sequence of each double mutant, then discard the helper column
doubles_df['nt_seq'] = [get_nt_seq(wtseq, mut) for mut in doubles_df['mutations']]
doubles_df = doubles_df.drop(columns='mutations')
doubles_df.head(10)

Unnamed: 0,seq_type,combination,aa_pos1,aa_pos2,alt_codons1,alt_codons2,nt_seq
0,double,"(0, 1)",0,1,ttt,ttt,ttttttgtgagacgttatacactc
1,double,"(0, 1)",0,1,ttt,ttg,tttttggtgagacgttatacactc
2,double,"(0, 1)",0,1,ttt,ctt,tttcttgtgagacgttatacactc
3,double,"(0, 1)",0,1,ttt,ctg,tttctggtgagacgttatacactc
4,double,"(0, 1)",0,1,ttt,att,tttattgtgagacgttatacactc
5,double,"(0, 1)",0,1,ttt,atg,tttatggtgagacgttatacactc
6,double,"(0, 1)",0,1,ttt,gtt,tttgttgtgagacgttatacactc
7,double,"(0, 1)",0,1,ttt,gtg,tttgtggtgagacgttatacactc
8,double,"(0, 1)",0,1,ttt,tct,ttttctgtgagacgttatacactc
9,double,"(0, 1)",0,1,ttt,tcg,ttttcggtgagacgttatacactc


In [20]:
# Back to long format: aa_pos1/aa_pos2 and alt_codons1/alt_codons2 are stacked into
# aa_pos / alt_codons, and the numeric suffix is stored in mutated_codon (two rows per variant).
# nt_seq serves as the row identifier (i) — NOTE(review): this presumably relies on every
# double-mutant sequence being unique; confirm against the pandas wide_to_long documentation
doubles_long = pd.wide_to_long(doubles_df, stubnames=['aa_pos', 'alt_codons'], i='nt_seq', j='mutated_codon').reset_index()
doubles_long

Unnamed: 0,nt_seq,mutated_codon,seq_type,combination,aa_pos,alt_codons
0,ttttttgtgagacgttatacactc,1,double,"(0, 1)",0,ttt
1,tttttggtgagacgttatacactc,1,double,"(0, 1)",0,ttt
2,tttcttgtgagacgttatacactc,1,double,"(0, 1)",0,ttt
3,tttctggtgagacgttatacactc,1,double,"(0, 1)",0,ttt
4,tttattgtgagacgttatacactc,1,double,"(0, 1)",0,ttt
...,...,...,...,...,...,...
55119,gattgggtgagacgttatgggcgg,2,double,"(6, 7)",7,cgg
55120,gattgggtgagacgttatgggagt,2,double,"(6, 7)",7,agt
55121,gattgggtgagacgttatgggagg,2,double,"(6, 7)",7,agg
55122,gattgggtgagacgttatgggggt,2,double,"(6, 7)",7,ggt


## Import ortholog sequences

In [21]:
# Load the ortholog sequences of the selected locus and tag them with their sequence type
ortho_df = pd.read_csv(ortho_path, header=0).assign(seq_type='ortho')
ortho_df.head(3)

Unnamed: 0,nt_seq,seq_type
0,ggtgttattttgaatcaacaattt,ortho
1,ggtgctattttgaatcaacaattt,ortho
2,gattggatgagacaatctttgttg,ortho


In [22]:
# Compare every ortholog with the WT: get_alt_codons returns (ranks, positions, codons) per sequence,
# and zip(*...) regroups these per-sequence triplets into three columns of lists
ortho_df['mutated_codon'], ortho_df['aa_pos'], ortho_df['alt_codons'] = zip(*ortho_df['nt_seq'].map(lambda x: get_alt_codons(x, wtseq)))
ortho_df.head(3)

Unnamed: 0,nt_seq,seq_type,mutated_codon,aa_pos,alt_codons
0,ggtgttattttgaatcaacaattt,ortho,"[1, 2, 3, 4, 5, 6, 7, 8]","[0, 1, 2, 3, 4, 5, 6, 7]","[ggt, gtt, att, ttg, aat, caa, caa, ttt]"
1,ggtgctattttgaatcaacaattt,ortho,"[1, 2, 3, 4, 5, 6, 7, 8]","[0, 1, 2, 3, 4, 5, 6, 7]","[ggt, gct, att, ttg, aat, caa, caa, ttt]"
2,gattggatgagacaatctttgttg,ortho,"[1, 2, 3, 4, 5]","[2, 4, 5, 6, 7]","[atg, caa, tct, ttg, ttg]"


In [23]:
# Explode the three list columns together: one row per altered codon of each ortholog
ortho_long = ortho_df.explode(['mutated_codon', 'aa_pos', 'alt_codons']).reset_index(drop=True)
ortho_long

Unnamed: 0,nt_seq,seq_type,mutated_codon,aa_pos,alt_codons
0,ggtgttattttgaatcaacaattt,ortho,1,0,ggt
1,ggtgttattttgaatcaacaattt,ortho,2,1,gtt
2,ggtgttattttgaatcaacaattt,ortho,3,2,att
3,ggtgttattttgaatcaacaattt,ortho,4,3,ttg
4,ggtgttattttgaatcaacaattt,ortho,5,4,aat
...,...,...,...,...,...
384,gattgggttaatagatgtattatt,ortho,2,3,aat
385,gattgggttaatagatgtattatt,ortho,3,4,aga
386,gattgggttaatagatgtattatt,ortho,4,5,tgt
387,gattgggttaatagatgtattatt,ortho,5,6,att


## Concatenate all dataframes

In [24]:
# Single-row dataframe holding the WT sequence
wt_df = pd.DataFrame({'seq_type': ['WT'], 'nt_seq': [wtseq]})
wt_df

Unnamed: 0,seq_type,nt_seq
0,WT,gattgggtgagacgttatacactc


In [25]:
# Stack WT, single mutants, double mutants and orthologs into one table with a fresh index;
# columns missing from a given dataframe (e.g. 'combination' outside the doubles) are left empty
var_df = pd.concat([wt_df, singles_df, doubles_long, ortho_long], ignore_index = True)
var_df

Unnamed: 0,seq_type,nt_seq,aa_pos,alt_codons,mutated_codon,combination
0,WT,gattgggtgagacgttatacactc,,,,
1,single,ttttgggtgagacgttatacactc,0,ttt,1,
2,single,ttgtgggtgagacgttatacactc,0,ttg,1,
3,single,ctttgggtgagacgttatacactc,0,ctt,1,
4,single,ctgtgggtgagacgttatacactc,0,ctg,1,
...,...,...,...,...,...,...
55760,ortho,gattgggttaatagatgtattatt,3,aat,2,
55761,ortho,gattgggttaatagatgtattatt,4,aga,3,
55762,ortho,gattgggttaatagatgtattatt,5,tgt,4,
55763,ortho,gattgggttaatagatgtattatt,6,att,5,


In [26]:
# Save the variant table; the index is written as the first (unnamed) column
var_df.to_csv(outpath_f)

## Checking the number of unique variants

In [37]:
# Reload the saved table, using the first column as the index
# NOTE: aa_pos and mutated_codon come back as floats in the output below (the WT row has no value for them)
var_df = pd.read_csv(outpath_f, index_col=0)
var_df

Unnamed: 0,seq_type,nt_seq,aa_pos,alt_codons,mutated_codon,combination
0,WT,gattgggtgagacgttatacactc,,,,
1,single,ttttgggtgagacgttatacactc,0.0,ttt,1.0,
2,single,ttgtgggtgagacgttatacactc,0.0,ttg,1.0,
3,single,ctttgggtgagacgttatacactc,0.0,ctt,1.0,
4,single,ctgtgggtgagacgttatacactc,0.0,ctg,1.0,
...,...,...,...,...,...,...
55760,ortho,gattgggttaatagatgtattatt,3.0,aat,2.0,
55761,ortho,gattgggttaatagatgtattatt,4.0,aga,3.0,
55762,ortho,gattgggttaatagatgtattatt,5.0,tgt,4.0,
55763,ortho,gattgggttaatagatgtattatt,6.0,att,5.0,


In [38]:
def get_aa_seq(seq, cdic):
    """Translate a nucleotide sequence codon by codon.

    Parameters
    ----------
    seq : str
        Nucleotide sequence, read as consecutive 3-nt codons.
    cdic : dict
        Maps each codon to the string it translates to.

    Returns
    -------
    str
        Concatenation of the translations of all codons of ``seq``.

    Raises
    ------
    KeyError
        If a codon of ``seq`` is not a key of ``cdic``.
    """
    residues = []
    for start in range(0, len(seq), 3):
        codon = seq[start:start + 3]
        residues.append(cdic[codon])
    return ''.join(residues)

In [39]:
# Sanity check: translate a hard-coded nucleotide sequence with the codon dictionary
get_aa_seq('gattgggtgagacgttatacactc', codon_dic)

'DWVRRYTL'

In [40]:
# Translate every variant so that unique protein sequences can be counted
var_df['aa_seq'] = [get_aa_seq(seq, codon_dic) for seq in var_df['nt_seq']]
var_df

Unnamed: 0,seq_type,nt_seq,aa_pos,alt_codons,mutated_codon,combination,aa_seq
0,WT,gattgggtgagacgttatacactc,,,,,DWVRRYTL
1,single,ttttgggtgagacgttatacactc,0.0,ttt,1.0,,FWVRRYTL
2,single,ttgtgggtgagacgttatacactc,0.0,ttg,1.0,,LWVRRYTL
3,single,ctttgggtgagacgttatacactc,0.0,ctt,1.0,,LWVRRYTL
4,single,ctgtgggtgagacgttatacactc,0.0,ctg,1.0,,LWVRRYTL
...,...,...,...,...,...,...,...
55760,ortho,gattgggttaatagatgtattatt,3.0,aat,2.0,,DWVNRCII
55761,ortho,gattgggttaatagatgtattatt,4.0,aga,3.0,,DWVNRCII
55762,ortho,gattgggttaatagatgtattatt,5.0,tgt,4.0,,DWVNRCII
55763,ortho,gattgggttaatagatgtattatt,6.0,att,5.0,,DWVNRCII


In [41]:
# Number of distinct amino-acid and nucleotide sequences per sequence type
# (long-format rows repeat nt_seq for doubles and orthologs, hence nunique rather than a row count)
var_df.groupby('seq_type')[['aa_seq','nt_seq']].nunique().reset_index()

Unnamed: 0,seq_type,aa_seq,nt_seq
0,WT,1,1
1,double,11361,27562
2,ortho,63,63
3,single,161,251
