This notebook provides scripts for preparing RelaVDEP training files. It includes three essential scripts for data processing, and the final project structure is as follows:

```
RelaVDEP
├── figures
├── notebook
│   ├── 1_prepare.ipynb (We are here)
│   ├── 2_train_rm.ipynb
├── relavdep
│   ├── data
│   │   ├── fasta
│   │   │   ├── TARGET.fasta (Wild-type protein sequence)
│   │   ├── fitness
│   │   │   ├── TARGET.csv (Mutation data)
│   │   ├── params
│   │   ├── restraints
│   │   │   ├── TARGET.npz (Mutation site constraint)
├── environment.yml
├── README.md
```

## 1. Wild-type protein sequence

In [1]:
import os
import sys
sys.path.insert(0, '../relavdep')
import pandas as pd
import numpy as np

In [2]:
save_path = '../relavdep/data/fasta'
os.makedirs(save_path, exist_ok=True)

def generate_fasta(name, sequence):
    """Write a single-record FASTA file to ``{save_path}/{name}.fasta``.

    Args:
        name: Record identifier; used both as the header line (after '>')
            and as the file stem.
        sequence: Sequence string written on the line following the header.
    """
    with open(f'{save_path}/{name}.fasta', 'w') as f:
        f.write(f'>{name}\n')
        # Write the argument, not a notebook-level variable, so the function
        # works for any sequence and does not depend on kernel state.
        f.write(sequence)

In [3]:
# Identifier of the target protein; used as the FASTA header and file stem below.
name = 'TARGET'
# Wild-type sequence of the target, one letter per residue.
wt_seq = 'MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK'

# Write {name}.fasta into the FASTA directory.
generate_fasta(name=name, sequence=wt_seq)

In [4]:
# check that the FASTA file for the current target was written
fasta_file = f'../relavdep/data/fasta/{name}.fasta'
os.path.exists(fasta_file)

True

## 2. Mutation data

Two alternative scripts are available for processing mutation data; use the one that matches your data source: (1) deep mutational scanning (DMS) data from the ProteinGym dataset, or (2) your own experimental data.

In [5]:
# Directory holding the raw mutation data and the processed CSV files.
mut_data_path = '../relavdep/data/fitness'
# Create the fitness directory itself, so the cells below can read from and
# write to it on a fresh checkout.
os.makedirs(mut_data_path, exist_ok=True)

### (1) DMS data processing

In [6]:
dms_name = 'GFP_AEQVI_Sarkisyan_2016.csv'

# Load the DMS table, drop the score-bin column, and rename the sequence and
# score columns to `sequence` and `label` before saving it as {name}.csv.
dms_data = (
    pd.read_csv(f'{mut_data_path}/{dms_name}')
    .drop(columns='DMS_score_bin')
    .rename(columns={'mutated_sequence': 'sequence', 'DMS_score': 'label'})
)
dms_data.to_csv(f'{mut_data_path}/{name}.csv', index=False)
dms_data.head()

Unnamed: 0,mutant,sequence,label
0,K3R:V55A:Q94R:A110T:D117G:M153K:D216A,MSRGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,1.30103
1,K3Q:V16A:I167T:L195Q,MSQGEELFTGVVPILAELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,3.13735
2,K3Q:Y143C:N164D:S205P:A227T,MSQGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,1.553913
3,K3Q:Y143N:V193A,MSQGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,3.404237
4,K3R,MSRGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,3.738586


In [7]:
# check that the processed CSV for the current target was written
fitness_file = f'../relavdep/data/fitness/{name}.csv'
os.path.exists(fitness_file)

True

### (2) Experimental data processing

In [8]:
# File name of the experimental data spreadsheet (read from `mut_data_path` in the next cell).
exp_name = 'AmeR.xlsx'
# Wild-type sequence that the substitutions listed in the spreadsheet are applied to.
exp_wt_seq = 'MNKTIDQVRKGDRKSDLPVRRRPRRSAEETRRDILAKAEELFRERGFNAVAIADIASALNMSPANVFKHFSSKNALVDAIGFGQIGVFERQICPLDKSHAPLDRLRHLARNLMEQHHQDHFKHIRVFIQILMTAKQDMKCGDYYKSVIAKLLAEIIRDGVEAGLYIATDIPVLAETVLHALTSVIHPVLIAQEDIGNLATRCDQLVDLIDAGLRNPLAK'

In [9]:
def apply_mutations(wt_seq, mutant, sep=','):
    """Return `wt_seq` with the substitutions listed in `mutant` applied.

    Args:
        wt_seq: Wild-type sequence.
        mutant: Substitutions joined by `sep`, each written as
            <wild-type residue><1-based position><new residue>, e.g. 'A12V,K45R'.
        sep: Separator between individual substitutions.

    Returns:
        The mutated sequence as a string.

    Raises:
        ValueError: If a substitution is malformed, its position lies outside
            the sequence, or its wild-type residue does not match `wt_seq`.
    """
    residues = list(wt_seq)
    for mut in mutant.split(sep):
        mut = mut.strip()  # tolerate 'A12V, K45R'
        if len(mut) < 3:
            raise ValueError(f"Malformed mutation '{mut}' in '{mutant}'")
        pos = int(mut[1:-1])
        # Without this check, position 0 or a negative number would silently
        # wrap around and modify the end of the sequence.
        if not 1 <= pos <= len(residues):
            raise ValueError(
                f"Position {pos} in '{mut}' is outside the sequence (length {len(residues)})")
        # A mismatch here usually means the numbering is shifted relative to wt_seq.
        if wt_seq[pos - 1] != mut[0]:
            raise ValueError(
                f"Mutation '{mut}' expects '{mut[0]}' at position {pos}, "
                f"but the wild-type sequence has '{wt_seq[pos - 1]}'")
        residues[pos - 1] = mut[-1]
    return ''.join(residues)


exp_data = pd.read_excel(f'{mut_data_path}/{exp_name}')
sequences = [apply_mutations(exp_wt_seq, mutant) for mutant in exp_data['mutant']]
# Place the full mutated sequence right after the first column.
exp_data.insert(1, 'sequence', sequences)
exp_data.to_csv(f'{mut_data_path}/AmeR.csv', index=False)
# Last expression of the cell, so the preview is actually displayed.
exp_data.head()

In [10]:
# check that the processed experimental data file was written
amer_file = '../relavdep/data/fitness/AmeR.csv'
os.path.exists(amer_file)

True

## 3. Mutation site constraint

In [11]:
# Residue index <-> one-letter code lookup tables (indices 0..19 follow the
# order of the string below).
int2A = dict(enumerate('ARNDCQEGHILKMFPSTWYV'))
A2int = {letter: index for index, letter in int2A.items()}

# Sites that must not be mutated (positions are 1-based)
illegal_pos = [65, 66, 67]    # active sites
illegal_mut = ['F64L']        # mutated site

# Sites that may be mutated (positions are 1-based)
legal_pos = [42, 62, 72, 84, 87, 105, 163]
legal_mut = ['G228E', 'Q184K']

In [12]:
# illegal actions
illegal = []

if illegal_pos:
    for pos in illegal_pos:
        for res in range(20):
            action = (pos - 1) * 20 + res + 1
            illegal.append(action)

if illegal_mut:
    for mut in illegal_mut:
        pos = int(mut[1:-1])
        res = list(A2int.keys()).index(mut[0])
        action = (pos - 1) * 20 + res + 1
        illegal.append(action)

In [13]:
# legal actions
legal = []

if legal_pos:
    for pos in legal_pos:
        for res in range(20):
            action = (pos - 1) * 20 + res + 1
            legal.append(action)

if legal_mut:
    for mut in legal_mut:
        pos = int(mut[1:-1])
        res = list(A2int.keys()).index(mut[-1])
        action = (pos - 1) * 20 + res + 1
        legal.append(action)  

In [14]:
# Make sure the output directory exists before writing, as is done for the
# FASTA directory; np.savez does not create missing parent directories.
restraint_path = '../relavdep/data/restraints'
os.makedirs(restraint_path, exist_ok=True)

# Store both action lists in one archive under the keys `illegal` and `legal`.
np.savez(f'{restraint_path}/{name}.npz', illegal=illegal, legal=legal)

In [15]:
# check that the restraint file for the current target was written
# (uses `name`, matching the path the archive is saved under, rather than a
# hard-coded 'TARGET')
os.path.exists(f'../relavdep/data/restraints/{name}.npz')

True