Split the MOAD dataset into 4 subsets (each one a collection of clusters):
1. Complexes contained in PDBBind
2. Training set for new confidence model
3. Validation for self-distillation
4. Testing for self-distillation

In [6]:
import sys
import pickle

In [7]:
### Read all MOAD clusters
# The pickle holds a mapping whose values are collections of cluster
# identifiers (one collection per existing split).
with open("../data/BindingMOAD_2020_ab_processed_biounit/train_val_test_ecod_t_group_plus_mmseqs30_no_membrane.pkl", "rb") as f:
    split_clusters = pickle.load(f)

# Pool the clusters of every split into a single set.
all_moad_clusters = set()
for split in split_clusters.values():
    all_moad_clusters.update(split)
    


### PDBBind clusters

def read_strings_from_txt(path):
    """Read a text file into a list of strings, one element per line.

    Trailing whitespace (including the line terminator) is removed from
    every line; leading whitespace and empty lines are kept.
    """
    with open(path) as handle:
        return [raw_line.rstrip() for raw_line in handle]
        
# Complex identifiers listed in the two PDBBind split files (train + val,
# judging by the file names), one identifier per line.
complexes_pdbbind = (
    read_strings_from_txt('/data/rsg/nlp/gcorso/ligbind/data/splits/timesplit_no_lig_overlap_train')
    + read_strings_from_txt('/data/rsg/nlp/gcorso/ligbind/data/splits/timesplit_no_lig_overlap_val')
)

# NOTE: pickle.load can execute arbitrary code; only load files you trust.
# Mapping from complex identifier to cluster label. A complex that is missing
# from the mapping raises KeyError below.
with open('/data/rsg/nlp/gcorso/ligbind/data/BindingMOAD_2020_ab_processed_biounit/ecod_t_group_binding_site_assignment_dict_major_domain.pkl', 'rb') as f:
    pdbbind_to_cluster = pickle.load(f)
clusters_pdbbind = {pdbbind_to_cluster[c] for c in complexes_pdbbind}

# Mapping from cluster label to its ligand entries; keep only the part of
# each entry before the first '.'.
with open("/data/rsg/nlp/gcorso/ligbind/data/BindingMOAD_2020_ab_processed_biounit/superligand_clusters_ecod_t_group_plus_mmseqs30_filtered_train_rscc85_rsr25_testval_rscc90_rsr20_no_membrane.pkl", "rb") as f:
    cluster_to_ligands = pickle.load(f)
cluster_to_ligands = {k: [s.split('.')[0] for s in v] for k, v in cluster_to_ligands.items()}

# Partition the MOAD clusters by whether PDBBind also covers them.
overlap_clusters = all_moad_clusters & clusters_pdbbind
moad_only_clusters = all_moad_clusters - clusters_pdbbind

overlap_ligand_cnt = sum(len(cluster_to_ligands[x]) for x in overlap_clusters)
moad_only_ligand_cnt = sum(len(cluster_to_ligands[x]) for x in moad_only_clusters)

print('There are', len(all_moad_clusters), 'clusters from MOAD')
print('There are', len(clusters_pdbbind), 'clusters from PDBBind')
print('There are', len(overlap_clusters), 'clusters in both MOAD and PDBBind, with', overlap_ligand_cnt, 'ligands.')
# The original message read "clusters in both MOAD only", which is wrong.
print('There are', len(moad_only_clusters), 'clusters in MOAD only, with', moad_only_ligand_cnt, 'ligands.')

There are 525 clusters from MOAD
There are 487 clusters from PDBBind
There are 346 clusters in both MOAD and PDBBind, with  40772 ligands.
There are 179 clusters in both MOAD only, with  2282 ligands.


Generate new splits

In [8]:
import os
import random

new_split = dict()

# Sort before shuffling: set iteration order is not guaranteed to be the same
# from one interpreter run to the next, so the seeded shuffle is only
# reproducible when it starts from a deterministic ordering.
moad_only_clusters = sorted(moad_only_clusters)
n = len(moad_only_clusters)

random.seed(0)
random.shuffle(moad_only_clusters)

# First half of the shuffled MOAD-only clusters -> val, remainder -> test.
half = n // 2
val, test = moad_only_clusters[:half], moad_only_clusters[half:]
new_split['PDBBind'] = overlap_clusters
new_split['val'] = set(val)
new_split['test'] = set(test)

# Create the output directory first so the dump does not fail on a fresh
# checkout where ./splits does not exist yet.
split_path = './splits/MOAD_generalisation_splits.pkl'
os.makedirs(os.path.dirname(split_path), exist_ok=True)
with open(split_path, 'wb') as f:
    pickle.dump(new_split, f)

val_ligand_cnt = sum(len(cluster_to_ligands[x]) for x in val)
test_ligand_cnt = sum(len(cluster_to_ligands[x]) for x in test)
print(f'There are:\n{overlap_ligand_cnt} ligands in the PDBBind split,\n{val_ligand_cnt} ligands in the val split,\n{test_ligand_cnt} ligands in the test split.')



There are:
40772 ligands in the PDBBind split,
737 ligands in the val split,
1545 ligands in the test split.


In [9]:
# Collect, for every validation cluster, the set of distinct 4-character
# prefixes of its ligand entries (used as PDB ids below).
pdbids = {
    cluster: {ligand[:4] for ligand in cluster_to_ligands[cluster]}
    for cluster in val
}

pdbids

{'barrel domain in D-aminoacid aminotransferase-like PLP-dependent enzymes': {'5i5w'},
 'Cobalamin adenosyltransferase': {'2nt8'},
 'Phosphofructokinase N-terminal domain': {'2qv7',
  '4i4i',
  '4i7e',
  '4pfk',
  '5xz7',
  '5xz8',
  '5xz9',
  '5xza'},
 'Heterocyclase TruD C-terminal domain': {'6cib', '6pe3'},
 'ERO1-like': {'3ahq'},
 'TrmB C-terminal domain-like': {'2f5t'},
 'Cysteine methyltransferase effector NleE': {'4r29'},
 'Aminomethyltransferase beta-barrel domain': {'1ha3', '2hcj'},
 'Hypothetical protein TT1679': {'6np3'},
 'Viral glycoprotein, central and dimerisation domains': {'6egu'},
 'FomD-like': {'5zdn'},
 'Ribonuclease Rh-like': {'1iyb',
  '1j1f',
  '1j1g',
  '1v9h',
  '1vd3',
  '4dw3',
  '4dw4'},
 'mannan-binding lectin MVL': {'1zhs'},
 'Capsid protein protrusion (P) domain': {'4x1z'},
 'Peridinin-chlorophyll protein': {'2c9e', '2x1z', '2x21', '3iis', '3iiu'},
 'Barrel domain in heme-dependent catalases': {'1dgb',
  '1dgf',
  '1dgg',
  '1dgh',
  '4b7f',
  '4b7g',
  '

In [23]:
import argparse
import sys
import requests # this is used to access json files

# Select the urllib flavour matching the running interpreter.
# Compare the numeric version tuple rather than the version string: a string
# comparison such as "10.0" > "3" is lexicographic and would be False.
PY3 = sys.version_info[0] >= 3
if PY3:
    import urllib.request as urllib2
else:
    import urllib2

# Base URL of the PDBe REST API and the path of its UniProt mapping endpoint.
SERVER_URL = "https://www.ebi.ac.uk/pdbe/api"
UNIPROT = "/mappings/uniprot"

def get_request(url, arg, pretty=False, timeout=30):
    """Query an endpoint under SERVER_URL and return the UniProt ids for ``arg``.

    Args:
        url: Endpoint path below SERVER_URL, e.g. "/mappings/uniprot".
            Leading and trailing slashes are ignored.
        arg: Identifier appended to the endpoint (a PDB id such as "1ivv").
            It is also the key looked up in the JSON response.
        pretty: Sent as the lower-cased ``pretty`` query parameter.
        timeout: Seconds to wait for the server before ``requests`` gives up,
            so a stalled connection cannot block the notebook forever.

    Returns:
        A list with the keys of ``response[arg]["UniProt"]``; it may be empty.

    Raises:
        KeyError: if the response has no entry for ``arg`` or that entry has
            no "UniProt" key.
    """
    # Strip slashes around the pieces so the URL has no doubled "//", e.g.
    # https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/1ivv?pretty=true
    full_url = "%s/%s/%s?pretty=%s" % (
        SERVER_URL.rstrip("/"),
        url.strip("/"),
        arg,
        str(pretty).lower(),
    )

    json_results = requests.get(full_url, timeout=timeout).json()

    # The keys of the "UniProt" mapping are the accessions for this entry.
    uniprot_ids = list(json_results[arg]["UniProt"].keys())
    if len(uniprot_ids) != 1:
        # Report entries that map to zero or to several UniProt accessions.
        print(arg, uniprot_ids)

    return uniprot_ids

def get_uniprot(pdbid):
    """Return the list of UniProt ids that get_request reports for ``pdbid``.

    Thin wrapper that queries the UNIPROT endpoint with ``pretty`` enabled.
    """
    return get_request(UNIPROT, pdbid, True)

In [24]:
# Sanity check: look up the UniProt id(s) mapped to a single PDB entry.
get_uniprot('4i4i')

['P00512']

In [25]:
# For every validation cluster, gather the UniProt ids of all its PDB entries.
# Ids are concatenated per PDB entry, so the same id can appear more than once.
uniprot_ids = {}

for cluster in val:
    uniprot_ids[cluster] = [
        uniprot
        for pdbid in pdbids[cluster]
        for uniprot in get_uniprot(pdbid)
    ]

uniprot_ids

4b7g []
4b7h []
3sqg ['D1JBK4', 'D1JBK2', 'D1JBK3']
3m1v ['P11558', 'P11560', 'P11562']
5n1q ['A0A247D6X3', 'A0A247D6X4', 'A0A247D6X5']
5g0r ['P11558', 'P11560', 'P11562']
5a8r ['P58815', 'D9PXZ6', 'P58816']
5a0y ['P11558', 'P11560', 'P11562']
5a8w ['H7CHY2', 'A0A1C7D1E4', 'A0A1C7D1E5']
5aqk ['P11142', 'Q99933']
4zh1 ['P01024', 'P08603']
6czh []
6w70 []
4x28 ['I6YCA3', 'I6Y3Q0']
6wy9 ['D1AB78', 'D1AB76']
6cxt ['F2K074', 'F2K077']
4hdr ['F6MZ55', 'F6MZ56']
4yry ['Q9X1X4', 'Q9X1X5']
5a1s []
5ysn ['P0AEJ6', 'P19636']
4jrb ['P42212', 'Q9UAM5']
6in7 ['P38107', 'Q06198']
4fju []
2bpb ['Q9LA16', 'Q9LA15']


{'barrel domain in D-aminoacid aminotransferase-like PLP-dependent enzymes': ['O15382'],
 'Cobalamin adenosyltransferase': ['Q50EJ2'],
 'Phosphofructokinase N-terminal domain': ['P00512',
  'Q6GFF9',
  'P99165',
  'P99165',
  'P00512',
  'Q2FXM8',
  'P00512',
  'P99165'],
 'Heterocyclase TruD C-terminal domain': ['Q58494', 'Q8TZ25'],
 'ERO1-like': ['Q96HE7'],
 'TrmB C-terminal domain-like': ['Q7LYW4'],
 'Cysteine methyltransferase effector NleE': ['Q7DBA6'],
 'Aminomethyltransferase beta-barrel domain': ['P0CE47', 'Q5SHN6'],
 'Hypothetical protein TT1679': ['Q47030'],
 'Viral glycoprotein, central and dimerisation domains': ['P03518'],
 'FomD-like': ['D2SNF7'],
 'Ribonuclease Rh-like': ['Q7XZV5',
  'Q96662',
  'P23540',
  'P23540',
  'Q96662',
  'P23540',
  'Q9SSV1'],
 'mannan-binding lectin MVL': ['Q9RHG4'],
 'Capsid protein protrusion (P) domain': ['I7FLU3'],
 'Peridinin-chlorophyll protein': ['P80484',
  'P80484',
  'O76183',
  'P80484',
  'P80484'],
 'Barrel domain in heme-dependen