In [1]:
import ast
import os
import pickle

import numpy as np
import pandas as pd
from tqdm import tqdm

from data.const import DATA_FOLDER

### Load Data

In [3]:
# Load the gt_cs table; later cells group it by `bmrb_id` and inspect its `atom` column.
gt_cs_path = f"{DATA_FOLDER}/gt_cs.csv"
gt_cs = pd.read_csv(gt_cs_path)

In [None]:
# Display the loaded gt_cs table for a quick visual check.
gt_cs

In [None]:
# Load the raw per-entry metadata table.
# NOTE(review): the preprocessing cell below re-reads this same file, so this load looks redundant.
metadata = pd.read_csv(f"{DATA_FOLDER}/all_metadata.csv")

### Basic preprocessing (filtering)

In [None]:
# Basic filtering of the raw metadata table. Produces `metadata_processed`.
# The cell re-reads the raw CSV, so it can be re-run safely on its own.

MAX_VALID_PH = 15           # pH values above this are treated as invalid
MIN_CHEMICAL_SHIFTS = 20    # an entry needs strictly more than this many rows in gt_cs
MIN_SEQUENCE_LENGTH = 20    # sequences shorter than this are dropped

metadata = pd.read_csv(f"{DATA_FOLDER}/all_metadata.csv")

# Drop rows without a usable sequence: missing values and the literal ".".
# The NaN filter used to be assigned to an unused `metadata_` variable, so it
# was never applied to `metadata`.
has_sequence = metadata["entity_seq"].notna() & ~metadata["entity_seq"].isin(["."])
# .copy() so the pH assignment below edits this frame rather than a slice of the raw one.
metadata = metadata[has_sequence].copy()

# Any pH > 15 doesn't make sense: replace it with NaN.
# The guard skips the assignment entirely when nothing is out of range.
implausible_ph = metadata["pH"] > MAX_VALID_PH
if implausible_ph.any():
    metadata.loc[implausible_ph, "pH"] = np.nan

# Keep only entries that list at most one entity.
# `entities` holds the string form of a Python list, hence literal_eval.
# NOTE(review): with this filter active, the "keep ions" section further down
# (which works on entries with more than one entity) has nothing to process.
# Confirm which variant of the pipeline is intended.
num_entities = metadata["entities"].apply(lambda raw: len(ast.literal_eval(raw)))
metadata = metadata[num_entities <= 1]

# Remove any experiment subtype mentioning "solid", "x-ray" or "magic" (case-insensitive).
# na=False keeps rows whose subtype is missing instead of raising on them.
is_excluded_subtype = metadata["expt_method_subtype"].str.contains(
    r"solid|x-ray|magic", case=False, na=False
)
metadata = metadata[~is_excluded_subtype]

# Keep only BMRB entries with more than MIN_CHEMICAL_SHIFTS rows in gt_cs.
cs_counts = gt_cs.groupby("bmrb_id", as_index=False).size()
_bmrb_ids = cs_counts.loc[cs_counts["size"] > MIN_CHEMICAL_SHIFTS, "bmrb_id"]
metadata = metadata[metadata["bmrb_id"].isin(_bmrb_ids)]

# Filter out sequences that contain an "X" or a "*" character.
entity_seq = metadata["entity_seq"]
has_x_or_star = entity_seq.str.contains("X", regex=False) | entity_seq.str.contains(
    "*", regex=False
)
metadata = metadata[~has_x_or_star]

# Filter out sequences shorter than MIN_SEQUENCE_LENGTH residues.
entity_seq_long = metadata["entity_seq"].str.len() >= MIN_SEQUENCE_LENGTH
metadata_processed = metadata[entity_seq_long]

# Manually excluded entries.
not_wanted = [34365, 25264]  # solid and theoretical
metadata_processed = metadata_processed[~metadata_processed["bmrb_id"].isin(not_wanted)]
metadata_processed.shape

### entity sequence filtering: keep ions

In [None]:
# Flag multi-entity entries whose `entities` text mentions an ion.
# Adds a boolean `has_ion` column to `metadata_multiple_entites`.

# Lower-case substrings that mark an entry as containing an ion.
# NOTE(review): these are plain substring matches, so short keys also hit
# ordinary words (e.g. 'ca' is contained in 'calmodulin'). Confirm this
# looseness is intended.
ION_KEYWORDS = (
    'ion', 'zn', 'calcium', 'ca', 'fe(ii)', 'fe(iii)',
    'zinc', 'mg', 'magnesium', 'hg', 'cu1',
)

# `entities` holds the string form of a Python list, hence literal_eval.
entities = metadata_processed["entities"]
num_entities = entities.apply(lambda x: len(ast.literal_eval(x)))

# .copy() so that adding `has_ion` does not write into a slice of metadata_processed.
metadata_multiple_entites = metadata_processed[num_entities > 1].copy()

# One pass over the groups. As before, only the first row's `entities` string
# of each bmrb_id is inspected, and the verdict applies to every row of that id.
ion_bmrb_ids = [
    bmrb_id
    for bmrb_id, mdata in tqdm(metadata_multiple_entites.groupby('bmrb_id'))
    if any(keyword in mdata['entities'].iloc[0].lower() for keyword in ION_KEYWORDS)
]
metadata_multiple_entites['has_ion'] = metadata_multiple_entites['bmrb_id'].isin(ion_bmrb_ids)

In [None]:
# Collect the bmrb_ids of multi-entity entries that were not flagged as
# containing an ion, then remove those ids from metadata_processed.
without_ion = metadata_multiple_entites["has_ion"].eq(False)
not_needed_list = metadata_multiple_entites.loc[without_ion, "bmrb_id"].tolist()

keep_entry = ~metadata_processed["bmrb_id"].isin(not_needed_list)
metadata_processed = metadata_processed[keep_entry]
metadata_processed.shape

In [None]:
# Display the table after the multi-entity / ion filtering step.
metadata_processed

### RNA filtering

In [None]:
# Start with every entry marked as "no RNA"; the next cell flips the flag where needed.
# assign() returns a new frame, which avoids writing a column into a frame that
# was itself derived by filtering (a possible SettingWithCopyWarning).
metadata_processed = metadata_processed.assign(has_rna=False)

In [None]:
# Mark every row of a bmrb_id as RNA-related when the first row's `entities`
# string of that id contains "rna" (case-insensitive substring match).
rna_bmrb_ids = [
    bmrb_id
    for bmrb_id, group in tqdm(metadata_processed.groupby('bmrb_id'))
    if 'rna' in group['entities'].iloc[0].lower()
]
is_rna_entry = metadata_processed['bmrb_id'].isin(rna_bmrb_ids)
metadata_processed.loc[is_rna_entry, 'has_rna'] = True

In [None]:
# Drop entries flagged as RNA whose sequence contains all of "C", "G" and "U".
# Flagged entries whose sequence lacks any one of those letters are kept.
seq = metadata_processed["entity_seq"]
rna_like = (
    (metadata_processed["has_rna"] == True)
    & seq.str.contains("C")
    & seq.str.contains("G")
    & seq.str.contains("U")
)
metadata_processed = metadata_processed[~rna_like]

In [None]:
# Drop every remaining entry whose sequence contains the letter "U".
contains_u = metadata_processed["entity_seq"].str.contains("U")
metadata_processed = metadata_processed[~contains_u]

In [None]:
# Remove the helper column now that the RNA filtering is done.
# A non-inplace drop with errors="ignore" lets this cell be re-run without a
# KeyError and avoids mutating a frame that was derived by filtering.
metadata_processed = metadata_processed.drop(columns="has_rna", errors="ignore")
metadata_processed

In [None]:
# Persist the filtered table (without the index) to the "_1E" output file.
processed_1e_path = f"{DATA_FOLDER}/all_metadata_processed_1E.csv"
metadata_processed.to_csv(processed_1e_path, index=False)

### add refDB flag

In [None]:
# Add a boolean `refDB` column: True when the entry's bmrb_id has a file in
# the af-bmrb-h-v3 folder. The result is written back to the same CSV.
# NOTE(review): the cell above saves to all_metadata_processed_1E.csv, whereas
# this cell reloads all_metadata_processed.csv. On a fresh run that discards
# the in-memory filtering done so far. Confirm which file is intended.
metadata_processed = pd.read_csv(f"{DATA_FOLDER}/all_metadata_processed.csv")

# Number of trailing characters stripped from each file name to obtain the id.
# NOTE(review): presumably a 5-character extension; verify against the folder contents.
REFDB_SUFFIX_LEN = 5

refDB = []
for file_name in os.listdir(f"{DATA_FOLDER}/af-bmrb-h-v3"):
    try:
        refDB.append(int(file_name[:-REFDB_SUFFIX_LEN]))
    except ValueError:
        # Skip directory entries that do not follow the "<integer id><suffix>"
        # pattern instead of aborting the whole cell.
        continue

# isin() replaces a per-row `x in list` scan.
refdb_flag = metadata_processed["bmrb_id"].isin(refDB)

metadata_processed["refDB"] = refdb_flag
metadata_processed.to_csv(f"{DATA_FOLDER}/all_metadata_processed.csv", index=False)

### Filter out BMRB entries with fewer than 6 CA atom records

In [None]:
# Find the bmrb_ids that have enough CA rows in gt_cs.
# Result: `contains_ca` (columns bmrb_id, has_CA) and `bmrb_contains_ca_list`.

# Minimum number of gt_cs rows whose atom name contains "CA" for an entry to be kept.
MIN_CA_RECORDS = 6

# Vectorized substring test. astype(str) mirrors the previous str(atom)
# conversion, so missing atom names become "nan" and never match.
is_ca_row = gt_cs["atom"].astype(str).str.contains("CA", regex=False)

contains_ca = (
    is_ca_row.groupby(gt_cs["bmrb_id"])
    .sum()
    .ge(MIN_CA_RECORDS)
    .rename("has_CA")
    .reset_index()
)

bmrb_contains_ca_list = contains_ca.loc[contains_ca["has_CA"], "bmrb_id"].tolist()
len(bmrb_contains_ca_list)

In [None]:
# metadata_processed = pd.read_csv(f"{DATA_FOLDER}/all_metadata_processed_1E.csv")

In [None]:
# Keep only entries whose bmrb_id passed the CA check, then overwrite the processed CSV.
has_enough_ca = metadata_processed["bmrb_id"].isin(bmrb_contains_ca_list)
metadata_processed = metadata_processed[has_enough_ca]

processed_path = f"{DATA_FOLDER}/all_metadata_processed.csv"
metadata_processed.to_csv(processed_path, index=False)


In [None]:
# (rows, columns) remaining after the CA filter.
metadata_processed.shape

### Check pLDDT

In [4]:
# Load the per-entry b-factor analysis and keep the rows whose
# `meets_criteria` flag is set.
bfactor_csv = f'{DATA_FOLDER}/bfactor_analysis.csv'
results = pd.read_csv(bfactor_csv)

passes_criteria = results['meets_criteria']
proteins_meeting_criteria = results.loc[passes_criteria]

In [5]:
# Display the full b-factor analysis table (all rows, not only those meeting the criteria).
results

Unnamed: 0,bmrb_id,percentage_high_bfactor,percentage_high_seq_cov,meets_criteria
0,25,100.000000,97.163121,True
1,25338,86.057692,61.057692,True
2,4373,99.107143,60.714286,True
3,27856,99.200000,84.000000,True
4,16372,90.647482,58.273381,True
...,...,...,...,...
13805,26619,99.090909,50.909091,True
13806,11461,100.000000,32.307692,True
13807,30677,100.000000,74.820144,True
13808,15653,84.375000,63.020833,True


In [6]:
# Distinct bmrb_ids among the rows that meet the criteria, in first-seen order.
criteria_ids = proteins_meeting_criteria['bmrb_id']
bfactor_proteins_list = criteria_ids.drop_duplicates().tolist()
len(bfactor_proteins_list)

8862

In [7]:
# Reload the processed metadata from disk and keep the entries whose bmrb_id
# is in the list of proteins that met the b-factor criteria.
processed_csv = f"{DATA_FOLDER}/all_metadata_processed.csv"
metadata_processed = pd.read_csv(processed_csv)

in_bfactor_list = metadata_processed['bmrb_id'].isin(bfactor_proteins_list)
metadata_high_bfactor = metadata_processed[in_bfactor_list]


In [8]:
# Display the entries that passed the b-factor criteria filter.
metadata_high_bfactor

Unnamed: 0,bmrb_id,expt_method,expt_method_subtype,pH,temperature,num_components,num_monomers,pdb_refs,entities,entity_seq,refDB
0,10002,NMR,.,,,1.0,56.0,,['F-spondin TSR 4'],GSIPCLLSPWSEWSDCSVTCGKGMRTRQRMLKSLAELGDCNEDLEQ...,True
1,10004,NMR,.,,,1.0,190.0,,"[""2'-5' RNA ligase-like protein""]",MRAFIAIDVSESVRDALVRAQDYIGSKEAKIKFVERENFHITLKFL...,False
2,10005,NMR,.,,,1.0,54.0,['2D49'],['ChBD'],MATCATAWSSSSVYTNGGTVSYNGRNYTAKWWTQNERPGTSDVWAD...,True
4,10008,NMR,.,,,1.0,123.0,['1WFT'],['FN3 domain'],GSSGSSGPGAPSTVRISKNVDGIHLSWEPPTSPSGNILEYSAYLAI...,False
5,10009,NMR,.,,,1.0,74.0,['1WFW'],['SH3 domain'],GSSGSSGSTMTVIKDYYALKENEICVSQGEVVQVLAVNQQNMCLVY...,True
...,...,...,...,...,...,...,...,...,...,...,...
7776,7424,NMR,solution,7.5,298.00,5.0,148.0,,"['calmodulin', 'CALCIUM ION, 1', 'CALCIUM ION,...",ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEA...,False
7777,7425,NMR,solution,7.5,298.00,5.0,148.0,,"['calmodulin', 'CALCIUM ION, 1', 'CALCIUM ION,...",ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEA...,False
7779,7430,NMR,solution,6.5,298.00,1.0,157.0,,['ERp18'],MHHHHHHMSDGHNGLGKGFGDHIHWRTLEDGKKEAAASGLPLMVII...,False
7780,7432,NMR,solution,8.0,298.00,1.0,128.0,"['1fpo', '1r9p']",['IscU(D39A)'],MAYSEKVIDHYENPRNVGSFDNNDENVGSGMVGAPACGAVMKLQIK...,False


In [9]:
# Write the b-factor-filtered subset (without the index) to its own CSV.
high_bfactor_csv = f"{DATA_FOLDER}/all_metadata_processed_high_bfactor.csv"
metadata_high_bfactor.to_csv(high_bfactor_csv, index=False)

### STATISTICS

In [None]:
# Number of entries per experiment-method subtype in the processed metadata.
metadata_processed["expt_method_subtype"].value_counts()