# Notes

Used data from:
- Paper "Protein embeddings and deep learning predict binding residues for various ligand classes" by Littmann et al. available at: https://github.com/Rostlab/bindPredict

**Data explanation**:
- Development set
    - DevSet1014: 1014 proteins (13.999 binding residues + 156.684 non-binding residues)
        - Metal ions: 455 proteins (2.374 binding residues + 77.404 non-binding residues)
        - Nucleic acids: 108 proteins (2.689 binding residues + 15.582 non-binding residues)
        - Small molecules: 606 proteins (9.281 binding residues + 94.119 non-binding residues)
    - TestSet300: 300 proteins (5.869 binding residues + 56.820 non-binding residues)
        - Metal ions: 122 proteins (881 binding residues + 26.763 non-binding residues)
        - Nucleic acids: 66 proteins (1.470 binding residues + 14.698 non-binding residues)
        - Small molecules: 220 proteins (3.906 binding residues + 42.629 non-binding residues)
- Independent set
    - TestSetNew46: 46 proteins (575 binding residues + 6.652 non-binding residues)
        - Metal ions: 15 proteins (77 binding residues + 2.198 non-binding residues)
        - Nucleic acids: 10 proteins (77 binding residues + 874 non-binding residues)
        - Small molecules: 25 proteins (425 binding residues + 3.259 non-binding residues)

**Proposed splits**:
- `one_vs_many`: train on proteins with only 1 type of ligand, test on proteins with 2 and 3 types of ligands
- `two_vs_many`: train on proteins with 1 or 2 types of ligand, test on proteins with 3 types of ligand
- `three_vs_many`: train on proteins with 1 or 2 types of ligand from original training set, test on original test sets TestSet300 and TestSetNew46 mixed.

# Configs & Imports

In [53]:
from pathlib import Path

from pandas import DataFrame, read_csv
from matplotlib import pyplot as plt
from numpy import NaN

from Bio import SeqIO

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Where the raw data is stored and where processed data will be deposited
data_path = Path('') / '..' / 'data' / 'bind'
split_path = Path('') / '..' / 'splits' / 'bind'

# Development set (DevSet1014 & estSet300)
sequences_path = data_path / 'development_set' / 'all.fasta'
binding_residues_metal_path = data_path / 'development_set' / 'binding_residues_2.5_metal.txt'
binding_residues_nuclear_path = data_path / 'development_set' / 'binding_residues_2.5_nuclear.txt'
binding_residues_small_path = data_path / 'development_set' / 'binding_residues_2.5_small.txt'
test_set_path = data_path / 'development_set' / 'uniprot_test.txt'

# Independent (test) set (TestSetNew46)
independent_sequences_path = data_path / 'independent_set' / 'indep_set.fasta'
independent_binding_residues_metal_path = data_path / 'independent_set' / 'binding_residues_metal.txt'
independent_binding_residues_nuclear_path = data_path / 'independent_set' / 'binding_residues_nuclear.txt'
independent_binding_residues_small_path = data_path / 'independent_set' / 'binding_residues_small.txt'
independent_set_path = data_path / 'independent_set' / 'indep_set.txt'

# Obtain original datasets

In [3]:
# Function to encapsulate reading from fasta files
def getProteinsFromFASTAFile(filePath):
    columns = ["id", "sequence"]
    dataset = DataFrame(columns = columns)
    
    for protein in SeqIO.parse(filePath, "fasta"):
        dataset = dataset.append({"id": protein.id,
                                  "sequence": str(protein.seq)}, ignore_index=True)
    
    return dataset

In [4]:
# Obtain data from sets
sequences = getProteinsFromFASTAFile(sequences_path)
binding_residues_metal = read_csv(binding_residues_metal_path, sep="	", names=["id", "binding residues"])
binding_residues_nuclear = read_csv(binding_residues_nuclear_path, sep="	", names=["id", "binding residues"])
binding_residues_small = read_csv(binding_residues_small_path, sep="	", names=["id", "binding residues"])
test_set = read_csv(test_set_path, names=["id"])

independent_sequences = getProteinsFromFASTAFile(independent_sequences_path)
independent_binding_residues_metal = read_csv(independent_binding_residues_metal_path, sep="	", names=["id", "binding residues"])
independent_binding_residues_nuclear = read_csv(independent_binding_residues_nuclear_path, sep="	", names=["id", "binding residues"])
independent_binding_residues_small = read_csv(independent_binding_residues_small_path, sep="	", names=["id", "binding residues"])
independent_set = read_csv(independent_set_path, names=["id"])

In [5]:
# Let's construct a single development dataset with all the information together
sequences_with_binding_info = DataFrame(columns = ["id", "sequence", "metal", "nuclear", "small", "set"])

for index, row in sequences.iterrows():
    # Determine the binding type and the binding residues
    metal, nuclear, small = NaN, NaN, NaN
    if row["id"] in list(binding_residues_metal["id"]):
        metal = binding_residues_metal.iloc[list(binding_residues_metal["id"]).index(row["id"])]["binding residues"]
    
    if row["id"] in list(binding_residues_nuclear["id"]):
        nuclear = binding_residues_nuclear.iloc[list(binding_residues_nuclear["id"]).index(row["id"])]["binding residues"]
    
    if row["id"] in list(binding_residues_small["id"]):
        small = binding_residues_small.iloc[list(binding_residues_small["id"]).index(row["id"])]["binding residues"]
    
    # Train/validation or test
    train_or_test = "train"
    if row["id"] in list(test_set["id"]):
        train_or_test = "test"
    
    new_sequence = {"id": row["id"], 
                    "sequence": row["sequence"], 
                    "metal": metal,
                    "nuclear": nuclear,
                    "small": small,
                    "set": train_or_test}
    sequences_with_binding_info = sequences_with_binding_info.append(new_sequence, ignore_index = True)

In [16]:
# Let's construct a single development dataset with all the information together
independent_sequences_with_binding_info = DataFrame(columns = ["id", "sequence", "metal", "nuclear", "small", "set"])

for index, row in independent_sequences.iterrows():
    # Determine the binding type and the binding residues
    metal, nuclear, small = NaN, NaN, NaN
    if row["id"] in list(independent_binding_residues_metal["id"]):
        metal = independent_binding_residues_metal.iloc[list(independent_binding_residues_metal["id"]).index(row["id"])]["binding residues"]
    
    if row["id"] in list(independent_binding_residues_nuclear["id"]):
        nuclear = independent_binding_residues_nuclear.iloc[list(independent_binding_residues_nuclear["id"]).index(row["id"])]["binding residues"]
    
    if row["id"] in list(independent_binding_residues_small["id"]):
        small = independent_binding_residues_small.iloc[list(independent_binding_residues_small["id"]).index(row["id"])]["binding residues"]
    
    # Train/validation or test
    train_or_test = "train"
    if row["id"] in list(test_set["id"]):
        train_or_test = "test"
    
    new_sequence = {"id": row["id"], 
                    "sequence": row["sequence"], 
                    "metal": metal,
                    "nuclear": nuclear,
                    "small": small,
                    "set": train_or_test}
    independent_sequences_with_binding_info = independent_sequences_with_binding_info.append(new_sequence, ignore_index = True)

In [20]:
# Let's mix both test set to acquire a bigger test set since the unique difference is the acquisition moment
full_set = sequences_with_binding_info.append(independent_sequences_with_binding_info)
full_set

Unnamed: 0,id,sequence,metal,nuclear,small,set
0,Q5LL55,MSETWLPTLVTATPQEGFDLAVKLSRIAVKKTQPDAQVRDTLRAVY...,,,"53,46,32,42,25,60,56,43,57,31,54,22,34,40,26,2...",train
1,H9L4N9,MQINIQGHHIDLTDSMQDYVHSKFDKLERFFDHINHVQVILRVEKL...,516242605564,,,train
2,O34738,MKSWKVKEIVIMSVISIVFAVVYLLFTHFGNVLAGMFGPIAYEPIY...,,,"100,46,104,88,63,42,77,135,91,108,27,101,134,1...",train
3,P39579,MDFKQEVLDVLAEVCQDDIVKENPDIEIFEEGLLDSFGTVELLLAI...,,,37364061,train
4,P01887,MARSVTLVFLVLVSLTGLYAIQKTPQIQVYSRHPPENGKPNILNCY...,8377,,30117118116,train
...,...,...,...,...,...,...
41,Q9KDJ7,MSDEKKILGEERRSLLIKWLKASDTPLTGAELAKRTNVSRQVIVQD...,15015291,,13013410514114283150889192152,train
42,Q9LFM3,MGAGREVSVSLDGVRDKNLMQLKILNTVLFPVRYNDKYYADAIAAG...,,,"129,79,80,81,82,83,88,89,90,91,92,93,94,114,11...",train
43,Q9SJ89,MNLQAVSCSFGFLSSPLGVTPRTSFRRFVIRAKTEPSEKSVEIMRK...,,,698586104106109115116117119120,train
44,U2EQ00,MAWLILIIAGIFEVVWAIALKYSNGFTRLIPSMITLIGMLISFYLL...,,,"64,65,68,13,16,25,26,27,29,33,98,36,39,40,43,4...",train


In [21]:
# Get proteins with only 1, with 2 and with 3 type of ligand
one_ligand = DataFrame([row for index, row in full_set.iterrows() if row.isna().sum() == 2])
two_ligand = DataFrame([row for index, row in full_set.iterrows() if row.isna().sum() == 1])
three_ligand = DataFrame([row for index, row in full_set.iterrows() if row.isna().sum() == 0])

# one_vs_many

In [65]:
one_vs_many = DataFrame(columns = ["sequence", 
                                   "target metal binding residues", 
                                   "target nuclear binding residues",
                                   "target small binding residues", 
                                   "set",
                                   "validation"])

# Training set
tmp = one_ligand[["sequence", "metal", "nuclear", "small"]]
tmp.rename(columns = {"metal": "target metal binding residues", 
                      "nuclear": "target nuclear binding residues", 
                      "small": "target small binding residues"}, inplace = True)
tmp.insert(2, "set", ["train"]*len(one_ligand))
one_vs_many = one_vs_many.append(tmp)


# Validation set
val_sample_indexs = list(one_vs_many.sample(frac=0.1, random_state=1234).index)
validation = [True if i in val_sample_indexs else NaN for i in range(len(one_vs_many))]
one_vs_many.validation = validation


# Test set
tmp = two_ligand.append(three_ligand)[["sequence", "metal", "nuclear", "small"]]
tmp.rename(columns = {"metal": "target metal binding residues", 
                      "nuclear": "target nuclear binding residues", 
                      "small": "target small binding residues"}, inplace = True)
tmp.insert(2, "set", ["test"]*len(tmp))
one_vs_many = one_vs_many.append(tmp)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [None]:
# Let's plot some statistics



In [None]:
one_vs_many.to_csv(split_path / 'splits' / 'one_vs_many.csv', index = False)

# two_vs_many

In [90]:
two_vs_many = DataFrame(columns = ["sequence", 
                                   "target metal binding residues", 
                                   "target nuclear binding residues",
                                   "target small binding residues", 
                                   "set",
                                   "validation"])

# Training set
tmp = one_ligand[["sequence", "metal", "nuclear", "small"]]
tmp.rename(columns = {"metal": "target metal binding residues", 
                      "nuclear": "target nuclear binding residues", 
                      "small": "target small binding residues"}, inplace = True)
tmp.insert(2, "set", ["train"]*len(one_ligand))
two_vs_many = two_vs_many.append(tmp)

tmp = two_ligand[["sequence", "metal", "nuclear", "small"]]
tmp.rename(columns = {"metal": "target metal binding residues", 
                      "nuclear": "target nuclear binding residues", 
                      "small": "target small binding residues"}, inplace = True)
tmp.insert(2, "set", ["train"]*len(two_ligand))
two_vs_many = two_vs_many.append(tmp)


# Validation set
val_sample_indexs = list(two_vs_many.sample(frac=0.1, random_state=1234).index)
validation = [True if i in val_sample_indexs else NaN for i in range(len(two_vs_many))]
two_vs_many.validation = validation


# Test set
tmp = three_ligand[["sequence", "metal", "nuclear", "small"]]
tmp.rename(columns = {"metal": "target metal binding residues", 
                      "nuclear": "target nuclear binding residues", 
                      "small": "target small binding residues"}, inplace = True)
tmp.insert(2, "set", ["test"]*len(tmp))
two_vs_many = two_vs_many.append(tmp)

In [None]:
# Let's plot some statistics



In [None]:
two_vs_many.to_csv(split_path / 'splits' / 'two_vs_many.csv', index = False)

# three_vs_many

In [86]:
three_vs_many = full_set

three_vs_many.rename(columns = {"metal": "target metal binding residues", 
                      "nuclear": "target nuclear binding residues", 
                      "small": "target small binding residues"}, inplace = True)

# Let's specify validation set
val_sample_indexs = list(three_vs_many.query("set == 'train'").sample(frac=0.1, random_state=1234).index)
validation = [True if i in val_sample_indexs else NaN for i in range(len(three_vs_many))]
three_vs_many.insert(2, "validation", validation)

In [None]:
# Let's plot some statistics



In [None]:
three_vs_many.to_csv(split_path / 'splits' / 'three_vs_many.csv', index = False)