# Notes

# Configs & Imports

In [12]:
from os import path
from pathlib import Path

from pandas import DataFrame, read_csv
from numpy import NaN

from Bio import SeqIO

from IPython.display import display

from tqdm import tqdm

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Where the raw data is stored and where processed data will be deposited
data_path = Path('') / '..' / 'data' / 'sav'

neutral_path = data_path / 'labels' / 'neutral'
neutral_files = neutral_path.rglob('*.effect')

effect_path = data_path / 'labels' / 'effect'
effect_files = effect_path.rglob('*.effect')

split_path = Path('') / '..' / 'splits' / 'sav'

# Obtain original datasets

In [3]:
# Function to encapsulate the fasta data files
def getProteinsFromFASTAFile(filePath):
    columns = ["id", "sequence", "neutral", "effect"]
    dataset = DataFrame(columns = columns)
    
    for protein in tqdm(SeqIO.parse(filePath, "fasta")):
        dataset = dataset.append({"id": protein.id,
                                  "sequence": str(protein.seq)}, ignore_index=True)
    
    return dataset

In [4]:
# If available we use the FASTA file converted to CSV because
# processing this FASTA file takes too long.
if path.exists(split_path / 'sequences_dataframe.csv'):
    data = read_csv(split_path / 'sequences_dataframe.csv')
else:
    data = getProteinsFromFASTAFile(data_path / "sequences.fasta")
    data.to_csv(split_path / 'sequences_dataframe.csv', index=False)

9660it [07:36, 21.18it/s]


In [5]:
# Get the neutral SAVs
neutral_sav = dict()
for path in tqdm(neutral_files):
    name = str(path).split('/')[5]
    protein_name = name.split('@')[0]
    
    with open(path, 'r') as f:
        variations = f.read().splitlines()
        neutral_sav[protein_name] = list(map(lambda x: x.split(' ')[0], variations))

4233it [01:22, 51.28it/s] 


In [6]:
# Get the effect SAVs
effect_sav = dict()
for path in tqdm(effect_files):
    name = str(path).split('/')[5]
    protein_name = name.split('@')[0]
    
    with open(path, 'r') as f:
        variations = f.read().splitlines()
        effect_sav[protein_name] = list(map(lambda x: x.split(' ')[0], variations))

7387it [01:24, 87.29it/s] 


In [8]:
# Modify the ID of the proteins and add effect and neutral SAV
for i in tqdm(range(len(data))):
    id = data.iloc[i]["id"].split('@')[0]
    data["id"].iloc[i] = id
    
    if id in set(neutral_sav.keys()):
        data["neutral"].iloc[i] = neutral_sav[id]
    
    if id in set(effect_sav.keys()):
        data["effect"].iloc[i] = effect_sav[id]

100%|██████████████████████████████████████| 9660/9660 [00:55<00:00, 175.07it/s]


In [9]:
# Check the dataset
data

Unnamed: 0,id,sequence,neutral,effect
0,A40493,MEVSPLQPVNENMQVNKIKKNEDAKKRLSVERIYQKKTQLEHILLR...,,"[R449Q, R486K]"
1,S17875,MPFPVTTQGSQQTQPPQKHYGITSPISLAAPKETDCLLTQKLVETL...,[K96H],"[D125A, D125S, D128A, F100A, F100I, F98N, F98S..."
2,S37460,MRRNSRARLGVSLLLVAGALGLGAAPSTAADTPPAAPSAIPAPSAY...,,"[H315V, H340A, S328A, S328G]"
3,DCLBHP,MSELDAKLNKLGVDRIAISPYKQWTRGYMEPGNIGNGYVTGLKVDA...,,"[D199N, D54N, D64N, G59D, I60A, I60V, Y63F]"
4,AJECDS,MGNNVVVLGTQWGDEGKGKIVDLLTERAKYVVRYQGGHNAGHTLVI...,,"[D334E, D334N, D334Q, G13V, G16V, G18V, I20T, ..."
...,...,...,...,...
9655,A940612,RVTLSSKPQALATPNKEEHGKRKKKGKGLGKKRDPCLRKYKDFCIH...,"[K16A, K21A, K23A, K24A, K25A, K27A, K53A, R22...",
9656,A984244,NTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAG...,"[A37E, D42A, E16A, E26A, E29A, E33A, E36A, E39...",
9657,A000292,MAQTPAFDKPKVELHVHLDGSIKPETILYYGRRRGIALPANTAEGL...,"[A131D, D143A, E128D, G134N]",
9658,A983849,MPEISLRHVVSGSSQDSTHCAENLLKADTYRKWRAATAGEKTISVV...,[Q221L],


# Statistics

In [26]:
# Number of proteins with neutral and/or effect SAVs
count_neutral_proteins = len(data.loc[data["neutral"].notnull()])
count_effect_proteins = len(data.loc[data["effect"].notnull()])
count_only_neutral_proteins = len(data.loc[data["neutral"].notnull() & data["effect"].isnull()])
count_only_effect_proteins = len(data.loc[data["neutral"].isnull() & data["effect"].notnull()])
count_neutral_and_effect_proteins = len(data.loc[data["neutral"].notnull() & data["effect"].notnull()])


display("Number of proteins with neutral SAVs: " + str(count_neutral_proteins))
display("Number of proteins with effect SAVs: " + str(count_effect_proteins))
display("Number of proteins with only neutral SAVs: " + str(count_only_neutral_proteins))
display("Number of proteins with only effect SAVs: " + str(count_only_effect_proteins))
display("Number of proteins with neutral and effect SAVs: " + str(count_neutral_and_effect_proteins))

'Number of proteins with neutral SAVs: 4299'

'Number of proteins with effect SAVs: 7491'

'Number of proteins with only neutral SAVs: 2169'

'Number of proteins with only effect SAVs: 5361'

'Number of proteins with neutral and effect SAVs: 2130'

In [63]:
# Number of residues and number of residues with neutral or effect SAV
count_aa = [len(data.iloc[i].sequence) for i in range(len(data))]
neutral_savs = [data["neutral"].iloc[i] for i in range(len(data)) if data["neutral"].iloc[i] is not NaN]
effect_savs = [data["effect"].iloc[i] for i in range(len(data)) if data["effect"].iloc[i] is not NaN]

display("Total number of AAs: " + str(sum(count_aa)))
display("Size of the smallest protein: " + str(min(count_aa)))
display("Size of the largest protein: " + str(max(count_aa)))
display("Number of neutral SAVs: " + str(sum([len(protein_neutral_savs) for protein_neutral_savs in neutral_savs])))
display("Number of effect SAVs: " + str(sum([len(protein_effect_savs) for protein_effect_savs in effect_savs])))

'Total number of AAs: 4983949'

'Size of the smallest protein: 28'

'Size of the largest protein: 4967'

'Number of neutral SAVs: 40418'

'Number of effect SAVs: 62124'