In [2]:
import pickle

import pandas as pd
import numpy as np
import os
from tqdm import tqdm

from data.const import DATA_FOLDER

### Load processed metadata

#### Metadata properties:
1. pH range
2. Solution NMR
3. AA length > 20
4. Protein sequence doesn't contain missing AAs
5. entity_sequence doesn't contain an RNA sequence by mistake
6. Every protein has chemical shift info for C_alpha (at least 6 CAs)
7. entity_sequence contain ions

In [24]:
# Load the processed metadata table from DATA_FOLDER and display it.
# NOTE(review): the rendered output shows an "Unnamed: 0" column, so the CSV was
# presumably written with its index; `index_col=0` would drop it — confirm.
metadata_processed = pd.read_csv(f"{DATA_FOLDER}/all_metadata_processed.csv")
metadata_processed # entities as set (original note; meaning unclear — TODO confirm)

Unnamed: 0,bmrb_id,expt_method,expt_method_subtype,pH,temperature,num_components,num_monomers,pdb_refs,entities,entity_seq,refDB
0,10002,NMR,.,,,1.0,56.0,,['F-spondin TSR 4'],GSIPCLLSPWSEWSDCSVTCGKGMRTRQRMLKSLAELGDCNEDLEQ...,True
1,10004,NMR,.,,,1.0,190.0,,"[""2'-5' RNA ligase-like protein""]",MRAFIAIDVSESVRDALVRAQDYIGSKEAKIKFVERENFHITLKFL...,False
2,10005,NMR,.,,,1.0,54.0,['2D49'],['ChBD'],MATCATAWSSSSVYTNGGTVSYNGRNYTAKWWTQNERPGTSDVWAD...,True
3,10006,NMR,.,,,1.0,120.0,['1WFU'],['FN3 domain'],GSSGSSGMEPHKVVPLSKPHPPVVGKVTHHSIELYWDLEQKEKRQG...,False
4,10008,NMR,.,,,1.0,123.0,['1WFT'],['FN3 domain'],GSSGSSGPGAPSTVRISKNVDGIHLSWEPPTSPSGNILEYSAYLAI...,False
...,...,...,...,...,...,...,...,...,...,...,...
7778,7426,NMR,solution,,,1.0,227.0,"['1hy5', '1JYA', '1L2W']",['YopE'],MKISSFISTSLPLPTSVSGSSSVGEMSGRSVSQQTSDQYANNLAGR...,False
7779,7430,NMR,solution,6.5,298.00,1.0,157.0,,['ERp18'],MHHHHHHMSDGHNGLGKGFGDHIHWRTLEDGKKEAAASGLPLMVII...,False
7780,7432,NMR,solution,8.0,298.00,1.0,128.0,"['1fpo', '1r9p']",['IscU(D39A)'],MAYSEKVIDHYENPRNVGSFDNNDENVGSGMVGAPACGAVMKLQIK...,False
7781,7433,NMR,solution,6.5,298.15,1.0,125.0,,['Extracellular CD147 Isoform-3'],GSHMMGTANIQLHGPPRVKAVKSSEHINEGETAMLVCKSESVPPVT...,False


pLDDT score satisfies: at least 80% of residues have b_factor > 75

In [27]:
# Load the table stored in all_metadata_processed_high_bfactor.csv and display it.
# This rebinds `metadata_processed`; any later cell that reads this name sees
# whichever load cell ran last.
metadata_processed = pd.read_csv(f"{DATA_FOLDER}/all_metadata_processed_high_bfactor.csv")
metadata_processed

Unnamed: 0,bmrb_id,expt_method,expt_method_subtype,pH,temperature,num_components,num_monomers,pdb_refs,entities,entity_seq,refDB
0,10002,NMR,.,,,1.0,56.0,,['F-spondin TSR 4'],GSIPCLLSPWSEWSDCSVTCGKGMRTRQRMLKSLAELGDCNEDLEQ...,True
1,10004,NMR,.,,,1.0,190.0,,"[""2'-5' RNA ligase-like protein""]",MRAFIAIDVSESVRDALVRAQDYIGSKEAKIKFVERENFHITLKFL...,False
2,10005,NMR,.,,,1.0,54.0,['2D49'],['ChBD'],MATCATAWSSSSVYTNGGTVSYNGRNYTAKWWTQNERPGTSDVWAD...,True
3,10008,NMR,.,,,1.0,123.0,['1WFT'],['FN3 domain'],GSSGSSGPGAPSTVRISKNVDGIHLSWEPPTSPSGNILEYSAYLAI...,False
4,10009,NMR,.,,,1.0,74.0,['1WFW'],['SH3 domain'],GSSGSSGSTMTVIKDYYALKENEICVSQGEVVQVLAVNQQNMCLVY...,True
...,...,...,...,...,...,...,...,...,...,...,...
4952,7424,NMR,solution,7.5,298.00,5.0,148.0,,"['calmodulin', 'CALCIUM ION, 1', 'CALCIUM ION,...",ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEA...,False
4953,7425,NMR,solution,7.5,298.00,5.0,148.0,,"['calmodulin', 'CALCIUM ION, 1', 'CALCIUM ION,...",ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEA...,False
4954,7430,NMR,solution,6.5,298.00,1.0,157.0,,['ERp18'],MHHHHHHMSDGHNGLGKGFGDHIHWRTLEDGKKEAAASGLPLMVII...,False
4955,7432,NMR,solution,8.0,298.00,1.0,128.0,"['1fpo', '1r9p']",['IscU(D39A)'],MAYSEKVIDHYENPRNVGSFDNNDENVGSGMVGAPACGAVMKLQIK...,False


#### Metadata properties:
1. pH range
2. Solution NMR
3. AA length > 20
4. Protein sequence doesn't contain missing AAs
5. entity_sequence doesn't contain an RNA sequence by mistake
6. Every protein has chemical shift info for C_alpha
7. Number of entities==1

In [None]:
# Load the table stored in all_metadata_processed_1E.csv and show its (rows, columns).
# This rebinds `metadata_processed`; any later cell that reads this name sees
# whichever load cell ran last.
metadata_processed = pd.read_csv(f"{DATA_FOLDER}/all_metadata_processed_1E.csv")
metadata_processed.shape

pLDDT score satisfies: at least 80% of residues have b_factor > 75

In [None]:
# Load the table stored in all_metadata_processed_1E_high_bfactor.csv and show
# its (rows, columns).
# This rebinds `metadata_processed`; any later cell that reads this name sees
# whichever load cell ran last.
metadata_processed = pd.read_csv(f"{DATA_FOLDER}/all_metadata_processed_1E_high_bfactor.csv")
metadata_processed.shape

### Create valid sequences and bmrb refs files

In [28]:
# Export the (bmrb_id, entity_seq) table and the BMRB ID list for the
# high-B-factor subset.
#
# The output file names are fixed to *_high_bfactor, so the input is read
# explicitly from the matching CSV instead of relying on `metadata_processed`.
# That name is rebound by several load cells above, and on a top-to-bottom run
# the last one to execute loads the *_1E_high_bfactor table, which would
# silently end up in files named *_high_bfactor.
metadata_high_bfactor = pd.read_csv(f"{DATA_FOLDER}/all_metadata_processed_high_bfactor.csv")

# Sequences keyed by BMRB ID; index=False keeps the row index out of the CSV.
valid_exp_sequences = metadata_high_bfactor[["bmrb_id", "entity_seq"]]
valid_exp_sequences.to_csv(f"{DATA_FOLDER}/valid-exp-sequences_high_bfactor.csv", index=False)

# Single-column frame holding only the BMRB IDs.
bmrb_refs = metadata_high_bfactor[["bmrb_id"]]
bmrb_refs.to_csv(f"{DATA_FOLDER}/bmrb_refs_high_bfactor.csv", index=False)

### Get BMRB IDs

In [29]:
# BMRB IDs of the test split: the `bmrb_id` column of
# clusters_high_bfactor/test_set.csv as a plain Python list.
test_set_bmrb_ids = pd.read_csv(f'{DATA_FOLDER}/clusters_high_bfactor/test_set.csv')['bmrb_id'].tolist()

In [30]:
# Print every test-split BMRB ID converted to a string.
print([str(bmrb_id) for bmrb_id in test_set_bmrb_ids])

['1062', '1375', '4060', '4078', '4081', '4087', '4091', '4101', '4110', '4117', '4121', '4136', '4150', '4161', '4206', '4197', '4214', '4216', '4217', '4223', '4224', '4227', '4249', '4264', '19197', '4276', '4306', '4318', '19737', '4358', '4373', '4376', '4393', '4397', '4438', '4448', '4455', '4496', '36077', '4637', '4648', '4677', '4701', '4706', '4740', '4775', '4797', '4814', '4819', '4820', '4829', '4836', '4876', '4579', '4944', '4957', '4972', '4984', '5014', '5075', '5090', '5106', '5115', '5155', '5165', '30323', '5202', '5210', '5225', '5309', '4649', '5368', '5393', '5402', '5459', '5468', '5471', '5485', '5491', '5589', '5594', '5615', '5622', '5626', '5678', '15256', '5692', '5704', '5706', '5729', '5754', '5764', '51419', '5792', '5794', '5798', '5800', '5817', '5820', '5844', '5861', '5879', '5907', '5918', '5947', '5959', '5974', '5976', '5991', '5999', '6010', '6012', '6053', '6072', '6073', '19742', '6081', '6114', '6128', '6142', '6171', '6178', '6181', '6188', 

In [31]:
# BMRB IDs of the train split: the `bmrb_id` column of
# clusters_high_bfactor/train_set.csv as a plain Python list.
train_set_bmrb_ids = pd.read_csv(f'{DATA_FOLDER}/clusters_high_bfactor/train_set.csv')['bmrb_id'].tolist()

In [32]:
# Print the first 1000 train-split BMRB IDs converted to strings.
print([str(bmrb_id) for bmrb_id in train_set_bmrb_ids[:1000]])

['5359', '397', '447', '1642', '2371', '4105', '1657', '4022', '4029', '4034', '4035', '4036', '4037', '4038', '4040', '4045', '4047', '4049', '4050', '4064', '4864', '4070', '4367', '4082', '4084', '4085', '4090', '4092', '4094', '4102', '4108', '4109', '4111', '4113', '4116', '4122', '4126', '4127', '4132', '4144', '4156', '4160', '4162', '4168', '15554', '27945', '4188', '4205', '4215', '4228', '4239', '4251', '4254', '4255', '4259', '4265', '4267', '4269', '4272', '4282', '4297', '4299', '4301', '11114', '4309', '4311', '4326', '4330', '4333', '4334', '4335', '4339', '4315', '4342', '4353', '4360', '4369', '4371', '4381', '4382', '4386', '4389', '4391', '4395', '4405', '4407', '4421', '4425', '18854', '4431', '4437', '4445', '4447', '4449', '4451', '4457', '4460', '4467', '4470', '4475', '4478', '4486', '4492', '4514', '4558', '4560', '4561', '4571', '4577', '4590', '4603', '4617', '4619', '4636', '4661', '4664', '4666', '4668', '4670', '4678', '4681', '4688', '4697', '4698', '4704

### Get RefDB

In [None]:
# Disabled alternative: derive the RefDB IDs from the `refDB` flag of the loaded
# metadata table. The active cell below reads them from refDB_test.csv and
# refDB_train.csv instead.
# refDB_ids_list = metadata_processed.query('refDB==True')['bmrb_id'].tolist()

In [None]:
# Collect the RefDB BMRB IDs from both split files, test IDs first and then
# train IDs, into one flat list.
refDB_ids_list = [
    bmrb_id
    for csv_path in (f"{DATA_FOLDER}/refDB_test.csv", f"{DATA_FOLDER}/refDB_train.csv")
    for bmrb_id in pd.read_csv(csv_path)['bmrb_id'].tolist()
]

In [None]:
# Print the combined RefDB ID list built in the previous cell.
print(refDB_ids_list)

### BMRB2IDX

In [33]:
import json
import numpy as np

def build_bmrb2idx(train_metadata, save_path="bmrb2idx.json"):
    """
    Build a contiguous index over the training BMRB IDs and save it as JSON.

    train_metadata : must cover ONLY the training samples. Either
                     - a pandas DataFrame with a "bmrb_id" column, or
                     - an iterable of row-like objects (dicts, Series) that
                       support row["bmrb_id"], e.g. df.to_dict("records").
    save_path      : file name, relative to DATA_FOLDER, that the mapping is
                     written to.

    Returns a dict {bmrb_id: idx} with int keys. IDs are de-duplicated and
    sorted ascending, so idx runs over 0 … N_unique-1.

    Note: JSON object keys are always strings, so the file on disk holds
    {"397": 0, ...}; convert the keys back with int() after json.load.
    """
    if isinstance(train_metadata, pd.DataFrame):
        # Iterating a DataFrame yields its column labels, not its rows, so the
        # ID column is read directly.
        raw_ids = train_metadata["bmrb_id"]
    else:
        raw_ids = (row["bmrb_id"] for row in train_metadata)

    # int() normalises numpy integers / numeric strings to plain Python ints;
    # the set drops duplicate IDs before indices are assigned.
    bmrb_ids = sorted({int(raw_id) for raw_id in raw_ids})
    bmrb2idx = {bmrb_id: idx for idx, bmrb_id in enumerate(bmrb_ids)}

    with open(f"{DATA_FOLDER}/{save_path}", "w") as fp:
        json.dump(bmrb2idx, fp)

    return bmrb2idx

In [34]:
# Read the train split and build the BMRB-ID -> index mapping from its rows.
# `to_dict("records")` turns the frame into a list of per-row dicts, the form
# build_bmrb2idx indexes with d["bmrb_id"]. With the default save_path the
# mapping is also written to DATA_FOLDER/bmrb2idx.json.
train_entries = pd.read_csv(f"{DATA_FOLDER}/clusters_high_bfactor/train_set.csv")
bmrb2idx = build_bmrb2idx(train_entries.to_dict("records"))

In [35]:
# Display the full mapping (int BMRB ID -> index, in ascending ID order).
bmrb2idx

{397: 0,
 447: 1,
 1642: 2,
 1657: 3,
 1875: 4,
 2371: 5,
 4022: 6,
 4029: 7,
 4034: 8,
 4035: 9,
 4036: 10,
 4037: 11,
 4038: 12,
 4040: 13,
 4045: 14,
 4047: 15,
 4049: 16,
 4050: 17,
 4064: 18,
 4070: 19,
 4082: 20,
 4084: 21,
 4085: 22,
 4090: 23,
 4092: 24,
 4093: 25,
 4094: 26,
 4102: 27,
 4105: 28,
 4108: 29,
 4109: 30,
 4111: 31,
 4113: 32,
 4116: 33,
 4122: 34,
 4126: 35,
 4127: 36,
 4132: 37,
 4144: 38,
 4156: 39,
 4160: 40,
 4162: 41,
 4168: 42,
 4188: 43,
 4205: 44,
 4215: 45,
 4228: 46,
 4232: 47,
 4239: 48,
 4251: 49,
 4254: 50,
 4255: 51,
 4259: 52,
 4265: 53,
 4267: 54,
 4269: 55,
 4272: 56,
 4282: 57,
 4297: 58,
 4299: 59,
 4301: 60,
 4305: 61,
 4309: 62,
 4311: 63,
 4315: 64,
 4326: 65,
 4327: 66,
 4330: 67,
 4333: 68,
 4334: 69,
 4335: 70,
 4339: 71,
 4342: 72,
 4353: 73,
 4360: 74,
 4367: 75,
 4369: 76,
 4371: 77,
 4381: 78,
 4382: 79,
 4386: 80,
 4389: 81,
 4391: 82,
 4395: 83,
 4405: 84,
 4407: 85,
 4421: 86,
 4425: 87,
 4431: 88,
 4437: 89,
 4445: 90,
 4447: 91,
