In [1]:
import os
import numpy as np
import pandas as pd
import tqdm
from joblib import Parallel, delayed

In [2]:
import re
def affinity_text_transform(text:str, trg_indicator:str='KD'):
    """
    将PDBbind v2020中的Affinity Data转为KD值，单位为M。
    text的格式：
    [Kd,Ki]=[数字][nM,uM,fM]
    Example:
    src,trg = affinity_text_transform('Kd=0.17uM')
    """
    
    assert trg_indicator in ['KD','dG']
    
    text = text.replace('>','=').replace('<','=')
    pattern = '(\w*)=(\d+\.?\d*)(\w*)'
    src_indicator,value,unit = re.findall(pattern, text)[0]
    assert src_indicator in ['Kd','Ki','IC50'], f"Error: {src_metric} not in ['Kd','Ki','IC50']"
        
    value = float(value)
    if unit=='fM':
        value = value*1e-15# (M/L)
    elif unit=='pM':
        value = value*1e-12# (M/L)
    elif unit=='nM':
        value = value*1e-9# (M/L)
    elif unit=='uM':
        value = value*1e-6# (M/L)
    elif unit=='mM':
        value = value*1e-3# (M/L)
    else:
        print(text,unit)
        return None, None
    
    src = {'indicator':src_indicator, 'value':value, 'unit':'M'}
    
    if src_indicator in ['Kd','Ki']:
        # Kd = Ki / (1 + [L]/Km),  根据该方程，当配体浓度远小于Km时，可以近似认为Kd ≈ Ki
        if trg_indicator=='KD':
            trg = {'indicator':'KD', 'value':value, 'unit':'M'}
        elif trg_indicator=='dG':
            dG = float(format( (8.314/4184)*(273.15 + 25.0) * np.log(value), '.4g'))
            trg = {'indicator':'dG', 'value':dG, 'unit':'kcal/mol'}
    elif src_indicator=='IC50':
        trg = None
    else:
        trg = None
    return src, trg

def lower_chain(chains:str):
    chains = [chain.strip() for chain in chains.split(',')]
    chains = ','.join([chain[1].lower() if len(chain)==2 and chain[0]=='/' else chain for chain in chains])
    return chains

In [3]:
save_dir = './processed_data'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

In [4]:
# root_dir = r"E:\1.THPRD\1.AI_LifeScience\3.AntiBody\data\PDBbindCN"
root_dir = "source_data"
filepath = os.path.join(root_dir, r'PDBbind-CN_v2020_PP_20231108_annotated.xlsx')
assert os.path.exists(filepath), f'{filepath} not exists!'
df = pd.read_excel(filepath,index_col=0, header=1, dtype={'Pubmed ID':str, 'PDB code':str})

# read

In [5]:
# 原始的PDBbind v2020(未被我们标注过和筛选过的)
filepath = os.path.join(root_dir, r'PDBbind-CN_v2020_PP_20231108.xlsx')
assert os.path.exists(filepath), f'{filepath} not exists!'
df_org = pd.read_excel(filepath,index_col=0, header=1, dtype={'Pubmed ID':str, 'PDB code':str})

# Screening phase1

In [6]:
# 第一批在标注阶段(因为无法确定受体链和配体链)就被删除的样本：
unannotated_pdbs = set(df_org['PDB code'].tolist()) - set(df['PDB code'].tolist())
df_unannotated = df_org.loc[df_org['PDB code'].apply(lambda x: x in unannotated_pdbs),:]
df_unannotated.insert(loc=0,column='Reason for Deletion',value=np.nan)
df_unannotated.loc[:,'Reason for Deletion'] = 'unable to annotate ligand and receptor chains'

In [7]:
# 删除亲和力非Ki或Kd值的样本
for ind in df.index:
    src,trg = affinity_text_transform( df.loc[ind,'Affinity Data'], 'KD' )
    if trg is not None:
        df.loc[ind,'KD(M)'] = format(trg['value'],'.4') # 单位为M/L

# 记录(亲和力非Ki或Kd值的样本)
df_notKD_dG = df[df['KD(M)'].isna()]
df_notKD_dG.insert(loc=0,column='Reason for Deletion',value=np.nan)
df_notKD_dG.loc[:,'Reason for Deletion'] = 'cannot concert affinity to KD(M)'

# save as excel
df_delphase1 = pd.concat([df_unannotated, df_notKD_dG], axis=0)
df_delphase1.to_excel( os.path.join(save_dir,'PDBbind v2020(samples_deleted_phase1).xlsx') )
print("Records deleted by scrrening phase 1")
display(df_delphase1)

# 删除(亲和力非Ki或Kd值的样本)
df = df[~df['KD(M)'].isna()]
print("Records after scrrening phase 1")
display(df)

Records deleted by scrrening phase 1


Unnamed: 0_level_0,Reason for Deletion,PDB code,Subset,Complex Type,Resolution,Affinity Data,pKd pKi pIC50,Release Year,Protein Name,Ligand Name,...,XLOGP3,open banel LogP,HB donor,HB acceptor,Rotatable bonds,Canonical SMILES,Ligand Chains,Protein Chains,Paired_Chains,KD(M)
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
50,unable to annotate ligand and receptor chains,1e3u,general,Protein-Protein,1.66,Kd=1uM,6.00,2001,BETA-LACTAMASE OXA-10 dimer,BETA-LACTAMASE OXA-10,...,,,,,,,,,,
104,unable to annotate ligand and receptor chains,1hqr,general,Protein-Protein,3.2,Kd=0.10uM,7.00,2001,MHC class II,SAGs,...,,,,,,,,,,
106,unable to annotate ligand and receptor chains,1hxy,general,Protein-Protein,2.6,Kd=0.5nM,9.30,2001,STAPHYLOCOCCAL ENTEROTOXIN H,HUMAN MHC CLASS II(HLA-DR1),...,,,,,,,,,,
139,unable to annotate ligand and receptor chains,1kbh,general,Protein-Protein,NMR,Kd=34nM,7.47,2002,CREB-BINDING PROTEIN,nuclear receptor coactivator ACTR,...,,,,,,,,,,
244,unable to annotate ligand and receptor chains,1rvj,general,Protein-Protein,2.75,Kd=4uM,5.40,2004,dark-adapted DN(L213),RH(H177) revertant RC in the trigonal form,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2643,cannot concert affinity to KD(M),6oan,general,Protein-Protein,2.9,IC50=4.88nM,8.31,2019,"Duffy binding surface protein region II, DBP",human neutralizing antibody 053054,...,0.0,0.0,,,,,"b, d","a, c",b_a\nd_c,
2780,cannot concert affinity to KD(M),6dgf,general,Protein-Protein,2.34,IC50=9.5nM,8.02,2019,"Ubiquitin carboxyl-terminal hydrolase 2, USP2","Ubiquitin Variant, UbV 2.6",...,0.0,0.0,,,,,b,a,,
2788,cannot concert affinity to KD(M),6d68,general,Protein-Protein,2.36,IC50=250nM,6.60,2019,"Ubiquitin-conjugating enzyme E2 G1, Ube2G1",Ubiquitin Variant Ubv.G1.1,...,0.0,0.0,,,,,"c,d","a,b",,
2789,cannot concert affinity to KD(M),6d4p,general,Protein-Protein,2.11,IC50=65nM,7.19,2019,"Ubiquitin-conjugating enzyme E2 D1, Ube2D1",Ubiquitin Variant Ubv.D1.1,...,0.0,0.0,,,,,c,a,,


Records after scrrening phase 1


Unnamed: 0_level_0,PDB code,Subset,Complex Type,Resolution,Affinity Data,pKd pKi pIC50,Release Year,Protein Name,Ligand Name,Ligand Chains,...,No. of atoms,No. of bonds,Polar Surface Area,XLOGP3,open banel LogP,HB donor,HB acceptor,Rotatable bonds,Canonical SMILES,KD(M)
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1a22,general,Protein-Protein,2.6,Kd=0.34nM,9.47,1998,GROWTH HORMONE RECEPTOR,G120R mutant human growth hormone (hGH),a,...,,,,,,,,,,3.4e-10
2,1a2k,general,Protein-Protein,2.5,Kd=150nM,6.82,1998,Ran GTPase,Nuclear Transport Factor 2,"a,b",...,,,,,,,,,,1.5e-07
3,1a3b,general,Protein-Protein,1.8,Kd=0.3nM,9.52,1998,thrombin alpha,21-mer,i,...,,,,,,,,,,3e-10
4,1a4y,general,Protein-Protein,2,Ki=1fM,15.00,1998,ANGIOGENIN,RIBONUCLEASE INHIBITOR,"a,d",...,,,,,,,,,,1e-15
5,1acb,general,Protein-Protein,2,Kd=0.2nM,9.70,1993,bovine alpha-chymotrypsin,leech (Hirudo medicinalis) protein proteinase ...,i,...,,,,,,,,,,2e-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2848,6um5,general,Protein-Protein,4.2,Kd=532nM,6.27,2019,CH848 10.17DT Env,HIV-1 neutralizing antibody DH270 UCA3,"c, g, k, d, h, l",...,,,,0.0,0.0,,,,,5.32e-07
2849,6umt,general,Protein-Protein,1.99,Kd=2.6nM,8.59,2019,"Programmed cell death protein 1, human PD-1 (N...","Programmed cell death 1 ligand 2, PD-L2 IgV",b,...,,,,0.0,0.0,,,,,2.6e-09
2850,6uvo,general,Protein-Protein,2.9,Kd=579pM,9.24,2019,"RSV G central conserved domain, RSV G (157-197)",broadly neutralizing human monoclonal antibody...,"l,h",...,,,,0.0,0.0,,,,,5.79e-10
2851,6uys,general,Protein-Protein,1.59,Kd=1.6uM,5.80,2019,K37-acetylated SUMO1,phosphorylated PML-SIM,"b,d",...,,,,0.0,0.0,,,,,1.6e-06


# rename

In [8]:
df['Source Data Set'] = 'PDBbind v2020'
df.loc[:,['Model','Mutations','Affinity Method','Temperature(K)','Structure Method','Affinity Release Date']] = np.nan

df.rename(columns={
    'ID': 'Index',
    'PDB code': 'PDB',
    'Release Year': 'PDB Release Date',
    'Pubmed ID': 'Affinity PubMed ID',#'PDB PubMed ID',
    'Protein Name': 'Receptor Name',
    'Protein Chains': 'Receptor Chains',
    'Resolution': 'Resolution(Å)',
},inplace=True)

df['PDB'] = df['PDB'].apply(lambda x: x.upper())
# df['Ligand Chains'] = df['Ligand Chains'].apply(lambda x: x.upper() if isinstance(x,str) else x)
# df['Receptor Chains'] = df['Receptor Chains'].apply(lambda x: x.upper() if isinstance(x,str) else x)

df['Affinity PubMed ID'] = df['Affinity PubMed ID'].astype(str)
df['Paired_Chains'] = df['Paired_Chains'].apply(lambda x: [pair for pair in x.split('\n')] if isinstance(x,str) else x)

# split pairs

In [9]:
rows = []
for ind in df.index:
    if isinstance(df.loc[ind,'Paired_Chains'], list):
        for pair in df.loc[ind,'Paired_Chains']:
            row = df.loc[[ind],:]
            row['Ligand Chains'] = pair.split('_')[0]
            row['Receptor Chains'] = pair.split('_')[1]
            rows.append(row)
    else:
        row = df.loc[[ind],:]
        rows.append(row)
        
df = pd.concat(rows,axis=0 )
df.reset_index(drop=True, inplace=True)

## upper & lower chains
'/{CHARACTOR}' is 'charactor' indeed

In [10]:
# upper
df['Ligand Chains'] = df['Ligand Chains'].apply(lambda x: x.upper())
df['Receptor Chains'] = df['Receptor Chains'].apply(lambda x: x.upper())
# lower
df['Ligand Chains'] = df['Ligand Chains'].apply(lambda x: lower_chain(x))
df['Receptor Chains'] = df['Receptor Chains'].apply(lambda x: lower_chain(x))
assert len(df[df['Ligand Chains'].apply(lambda x: '/' in x)])==0
assert len(df[df['Receptor Chains'].apply(lambda x: '/' in x)])==0

# Screening phage2

In [11]:
# 精筛：筛除受体链、配体链不是氨基酸链的样本
"""
# 串行版
from utils import check_chains
data_dir = r"E:\1.THPRD\1.AI_LifeScience\3.AntiBody\data\PDBbind v2020\PP" 
keep_inds = []
for pdb in tqdm.tqdm(set(df.PDB.tolist()),'Filtering PDBs that has non-aa-chains in complex'):
    pdb_file = os.path.join( data_dir, f'{pdb.lower()}.ent.pdb' )
    df_tmp = df[df['PDB']==pdb]
    for ind in df_tmp.index:
        ligand_chains = [chain.strip() for chain in df.loc[ind,'Ligand Chains'].split(',')]
        receptor_chains = [chain.strip() for chain in df.loc[ind,'Receptor Chains'].split(',')]
        result = check_chains(pdb_file, ligand_chains, receptor_chains)
        if all(result.values()):
            keep_inds.append(ind)
        else:
            print(f"Delete {pdb}, because: {result}")
            
drop_inds = [ind for ind in df.index.tolist() if ind not in keep_inds]
df_drop = df.loc[drop_inds]
pdbs_drop = set(df_drop['PDB'].values.tolist())
print(f'Number of dropped PDBs: {len(pdbs_drop)}, which is:\n{pdbs_drop}')
df = df.loc[keep_inds]
print('After the filtering, we get df:')
display(df)
"""

# 并行版
from utils import check_chains
data_dir = "processed_data/PDB/PDBbind v2020" 

def is_pp_complex(pdb):
    inds = []
    pdb_file = os.path.join( data_dir, f'{pdb.lower()}.ent.pdb' )
    df_tmp = df[df['PDB']==pdb]
    for ind in df_tmp.index:
        ligand_chains = [chain.strip() for chain in df.loc[ind,'Ligand Chains'].split(',')]
        receptor_chains = [chain.strip() for chain in df.loc[ind,'Receptor Chains'].split(',')]
        result = check_chains(pdb_file, ligand_chains, receptor_chains)
        if all(result.values()):
            inds.append(ind)
        else:
            pass
    return inds

pdbs = set(df['PDB'].values)
keep_inds = Parallel(n_jobs=-1)(delayed(is_pp_complex)(pdb) for pdb in tqdm.tqdm(pdbs,desc='Screening protein-protein complexes'))
keep_inds = [ind for sublist in keep_inds for ind in sublist]# 展平
drop_inds = [ind for ind in df.index.tolist() if ind not in keep_inds]
df_notPPC = df.loc[drop_inds]
df_notPPC.insert(loc=0,column='Reason for Deletion',value=np.nan)
df_notPPC['Reason for Deletion'] = 'not Protein-Protein-Complex'

# save deleted records as an excel file
df_delphase1 = df_notPPC
df_delphase1.to_excel( os.path.join(save_dir,'PDBbind v2020(samples_deleted_phase2).xlsx') )
pdbs_drop = set(df_notPPC['PDB'].values.tolist())
print(f"Deleted  {len(pdbs_drop)} PDBs in screening phase2 due to 'not Protein-Protein-Complex', they are:\n{pdbs_drop}")

df = df.loc[keep_inds]
print('After the screening phase 2, we get:')
display(df)

Screening protein-protein complexes: 100%|██████████| 2727/2727 [00:41<00:00, 65.80it/s]


Deleted  6 PDBs in screening phase2 due to 'not Protein-Protein-Complex', they are:
{'6AND', '6ANA', '4R8I', '3SWP', '6ANI', '4WB2'}
After the screening phase 2, we get:


Unnamed: 0,PDB,Subset,Complex Type,Resolution(Å),Affinity Data,pKd pKi pIC50,PDB Release Date,Receptor Name,Ligand Name,Ligand Chains,...,Rotatable bonds,Canonical SMILES,KD(M),Source Data Set,Model,Mutations,Affinity Method,Temperature(K),Structure Method,Affinity Release Date
2335,4JMF,general,Protein-Protein,2.1,Kd=0.298uM,6.53,2014,Exoenzyme T (residues 28-77),Probable chaperone SpcS,"B,C",...,,,2.98e-07,PDBbind v2020,,,,,,
3910,6DCN,general,Protein-Protein,2.44,Kd=1.4uM,5.85,2018,"BCL-xl protein, BCL2L1","Beclin 1 BH3 domain T108pThr, BECN1 pThr","D,C",...,,,1.4e-06,PDBbind v2020,,,,,,
1156,2Z3R,general,Protein-Protein,2,Kd=38pM,10.42,2007,IL-15Ra,IL-15,A,...,,,3.8e-11,PDBbind v2020,,,,,,
1157,2Z3R,general,Protein-Protein,2,Kd=38pM,10.42,2007,IL-15Ra,IL-15,C,...,,,3.8e-11,PDBbind v2020,,,,,,
1158,2Z3R,general,Protein-Protein,2,Kd=38pM,10.42,2007,IL-15Ra,IL-15,E,...,,,3.8e-11,PDBbind v2020,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2317,4IYP,general,Protein-Protein,2.8,Kd=0.3uM,6.52,2013,"Immunoglobulin-binding protein 1, alpha 4",catalytic subunit of protein phosphatase 2A (P...,C,...,,,3e-07,PDBbind v2020,,,,,,
2268,4HH3,general,Protein-Protein,1.75,Kd=1.3uM,5.89,2013,"Transcriptional regulator, PpsR",AppA protein,C,...,,,1.3e-06,PDBbind v2020,,,,,,
3050,5FO8,general,Protein-Protein,2.4,Kd=0.35uM,6.46,2016,Human Complement C3b,"MEMBRANE COFACTOR PROTEIN, MCP (CCP1-4)",C,...,,,3.5e-07,PDBbind v2020,,,,,,
1802,3TVM,general,Protein-Protein,2.8,Kd=51.9nM,7.28,2012,CD1d-SMC124,iNKT TCR,"C,D",...,,,5.19e-08,PDBbind v2020,,,,,,


# PDB info

In [12]:
"""
# 串行版
from utils import get_pdb_info
pdbs = set(df['PDB'].values)
for pdb in tqdm.tqdm(pdbs,desc="Fetch PDB infomation..."):
    results = get_pdb_info(pdb)
    if results['Resolution']!=None:
        df.loc[df[df.PDB==pdb].index,'Resolution(Å)'] = results['Resolution'].replace('Å','').strip()
    if results['Method'] != None:
        df.loc[df[df.PDB==pdb].index,'Structure Method'] = results['Method']
    if results['Release Date'] != None:
        df.loc[df[df.PDB==pdb].index,'PDB Release Date'] = results['Release Date']   
    #这是PDB的PubMed ID, 不是亲和力的PubMed ID
    if results['PubMed ID'] != None:
        df.loc[df[df.PDB==pdb].index,'PDB PubMed ID'] = results['PubMed ID']
"""      
    
# # 并行版
# from utils import get_pdb_info

# pdbs = set(df['PDB'].values)
# # 获取pdb信息
# infos = Parallel(n_jobs=-1)(delayed(get_pdb_info)(pdb) for pdb in tqdm.tqdm(pdbs,desc='Fetching PDB infomations...'))
# infos = dict([(info['PDB'], info) for info in infos])
# # 填写到df中
# for pdb in tqdm.tqdm(pdbs,desc="Fill in the PDB infomations..."):
#     info = infos[pdb]
#     if info['Resolution']!=None:
#         df.loc[df[df.PDB==pdb].index,'Resolution(Å)'] = info['Resolution'].replace('Å','').strip()
#     if info['Method'] != None:
#         df.loc[df[df.PDB==pdb].index,'Structure Method'] = info['Method']
#     if info['Release Date'] != None:
#         df.loc[df[df.PDB==pdb].index,'PDB Release Date'] = info['Release Date']   
#     # 注意：这是PDB的PubMed ID, 不是亲和力的PubMed ID
#     if info['PubMed ID'] != None:
#         df.loc[df[df.PDB==pdb].index,'PDB PubMed ID'] = info['PubMed ID']
        
        
        
# 并行版
from utils import get_pdb_info

df['PDB PubMed ID'] = np.nan
pdbs = set(df.loc[df['PDB PubMed ID'].isna(), 'PDB'].values.tolist())
N = len(pdbs)
early_stop = 0
while N>0 and early_stop<5:
    # 获取pdb信息
    infos = Parallel(n_jobs=-1)(delayed(get_pdb_info)(pdb) for pdb in tqdm.tqdm(pdbs,desc=f'Fetching PDB infomation of {len(pdbs)} PDBs...'))
    infos = dict([(info['PDB'], info) for info in infos])
    # 填写到df中
    for pdb in tqdm.tqdm(pdbs,desc="Fill in the PDB infomations..."):
        info = infos[pdb]
        if info['Resolution']!=None:
            df.loc[df[df.PDB==pdb].index,'Resolution(Å)'] = info['Resolution'].replace('Å','').strip()
        if info['Method'] != None:
            df.loc[df[df.PDB==pdb].index,'Structure Method'] = info['Method']
        if info['Release Date'] != None:
            df.loc[df[df.PDB==pdb].index,'PDB Release Date'] = info['Release Date']   
        # 注意：这是PDB的PubMed ID, 不是亲和力的PubMed ID
        if info['PubMed ID'] != None:
            df.loc[df[df.PDB==pdb].index,'PDB PubMed ID'] = info['PubMed ID']
            
    pdbs = set(df.loc[df['PDB PubMed ID'].isna(), 'PDB'].values.tolist())
    if len(pdbs) < N:
        early_stop = 0
        N = len(pdbs)
    else:
        early_stop += 1

N_notfound = df['PDB PubMed ID'].isna().sum()
if N_notfound==0:
    print(f">>> 成功找到所有Affinity PubMed ID的Release Date！")
else:
    print(f">>> 还有{N_notfound}个Affinity PubMed ID的Release Date没找到。他们是：{set(df.loc[df['PDB PubMed ID'].isna(),'PDB'].values.tolist())}")

Fetching PDB infomation of 2721 PDBs...: 100%|██████████| 2721/2721 [01:20<00:00, 33.92it/s]
Fill in the PDB infomations...: 100%|██████████| 2721/2721 [00:15<00:00, 174.02it/s]
Fetching PDB infomation of 19 PDBs...: 100%|██████████| 19/19 [00:00<00:00, 6073.14it/s]
Fill in the PDB infomations...: 100%|██████████| 19/19 [00:00<00:00, 186.66it/s]
Fetching PDB infomation of 19 PDBs...: 100%|██████████| 19/19 [00:00<00:00, 6057.45it/s]
Fill in the PDB infomations...: 100%|██████████| 19/19 [00:00<00:00, 185.26it/s]
Fetching PDB infomation of 19 PDBs...: 100%|██████████| 19/19 [00:00<00:00, 6014.02it/s]
Fill in the PDB infomations...: 100%|██████████| 19/19 [00:00<00:00, 192.09it/s]
Fetching PDB infomation of 19 PDBs...: 100%|██████████| 19/19 [00:00<00:00, 7291.10it/s]
Fill in the PDB infomations...: 100%|██████████| 19/19 [00:00<00:00, 193.49it/s]
Fetching PDB infomation of 19 PDBs...: 100%|██████████| 19/19 [00:00<00:00, 6730.72it/s]
Fill in the PDB infomations...: 100%|██████████| 19/1

>>> 还有45个Affinity PubMed ID的Release Date没找到。他们是：{'3RVW', '2PTC', '6MM6', '1X1X', '4HX3', '2TGP', '1X1Y', '5FT8', '3OHM', '1X1W', '3V3K', '1X1U', '6MGP', '3RVV', '4E41', '6DLD', '1A2K', '3BZD', '5V5I'}





# affinity info

In [13]:
def get_pubdate(pubmed_id:str, dst_datetime_format:str='%Y-%m-%d'):
    """
    根据输入的pumbed_id， 查询PubMed数据库，获取其发表日期。
    Reference: https://www.ncbi.nlm.nih.gov/pmc/tools/get-metadata/
    """
    url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id={pubmed_id}&retmode=json"
    try:
        response = requests.get(url)
        date_str = response.json()['result'][pubmed_id]['pubdate']
        return {'PMID':pubmed_id, 'publish date':date_str}
    except:
        return {'PMID':pubmed_id, 'publish date':None}

In [14]:
# 根据Affinity PubMed ID查询Affinity Release Date

# 并行版：
from utils import get_pubdate

# df['Affinity Release Date'] = np.nan
PMIDs = set(df.loc[~df['Affinity PubMed ID'].isna() * df['Affinity Release Date'].isna(), 'Affinity PubMed ID'].values.tolist())
N = len(PMIDs)
early_stop = 0
while N>0 and early_stop<5:
    # 获取pdb信息
    pubdate_infos = Parallel(n_jobs=8)(delayed(get_pubdate)(PMID) for PMID in tqdm.tqdm(PMIDs,desc='Fetching affinity release datas...'))
    # 填写到df中
    for pubdate_info in tqdm.tqdm(pubdate_infos,desc="Fill in affinity release dates..."):
        PMID,pubdate = pubdate_info['PMID'], pubdate_info['publish date']
        # 是亲和力的PubMed ID
        df.loc[df['Affinity PubMed ID']==PMID, 'Affinity Release Date'] = pubdate
    
    PMIDs = set(df.loc[~df['Affinity PubMed ID'].astype(float).isna() * df['Affinity Release Date'].isna(), 'Affinity PubMed ID'].values.tolist())
    if len(PMIDs) < N:
        early_stop = 0
        N = len(PMIDs)
    else:
        early_stop += 1
    
N_notfound = df['Affinity Release Date'].isna().sum() - df['Affinity PubMed ID'].astype(float).isna().sum()
if N_notfound==0:
    print(f">>> 成功找到所有Affinity PubMed ID的Release Date！")
else:
    print(f">>> 还有{N_notfound}个Affinity PubMed ID的Release Date没找到。")

Fetching affinity release datas...: 100%|██████████| 2016/2016 [08:09<00:00,  4.12it/s]
Fill in affinity release dates...: 100%|██████████| 2016/2016 [00:01<00:00, 1093.02it/s]
Fetching affinity release datas...: 100%|██████████| 859/859 [03:09<00:00,  4.54it/s]
Fill in affinity release dates...: 100%|██████████| 859/859 [00:00<00:00, 1083.51it/s]
Fetching affinity release datas...: 100%|██████████| 396/396 [01:36<00:00,  4.10it/s]
Fill in affinity release dates...: 100%|██████████| 396/396 [00:00<00:00, 1025.29it/s]
Fetching affinity release datas...: 100%|██████████| 161/161 [00:30<00:00,  5.23it/s]
Fill in affinity release dates...: 100%|██████████| 161/161 [00:00<00:00, 942.75it/s]
Fetching affinity release datas...: 100%|██████████| 76/76 [00:12<00:00,  5.86it/s]
Fill in affinity release dates...: 100%|██████████| 76/76 [00:00<00:00, 819.28it/s]
Fetching affinity release datas...: 100%|██████████| 32/32 [00:04<00:00,  7.73it/s]
Fill in affinity release dates...: 100%|██████████| 3

>>> 成功找到所有Affinity PubMed ID的Release Date！





# sorted

In [15]:
inds = sorted(df.index.tolist())
df = df.loc[inds]
df.reset_index(drop=True, inplace=True)

# save

In [16]:
df = df.loc[:,[
    'PDB', 'Source Data Set', 'Model', 'Mutations', 'Ligand Chains', 'Receptor Chains', 'Ligand Name', 'Receptor Name', 
    'KD(M)', 'Affinity Method', 'Structure Method', 'Temperature(K)', 'Resolution(Å)', 
    'PDB PubMed ID', 'PDB Release Date', 'Affinity PubMed ID', 'Affinity Release Date', 
]]
display(df)
df.to_excel(os.path.join(save_dir,'PDBbind v2020.xlsx'))

Unnamed: 0,PDB,Source Data Set,Model,Mutations,Ligand Chains,Receptor Chains,Ligand Name,Receptor Name,KD(M),Affinity Method,Structure Method,Temperature(K),Resolution(Å),PDB PubMed ID,PDB Release Date,Affinity PubMed ID,Affinity Release Date
0,1A22,PDBbind v2020,,,A,B,G120R mutant human growth hormone (hGH),GROWTH HORMONE RECEPTOR,3.4e-10,,X-RAY DIFFRACTION,,2.60,9571026,1998-04-29,9571026,1998 Apr 17
1,1A2K,PDBbind v2020,,,"A,B",C,Nuclear Transport Factor 2,Ran GTPase,1.5e-07,,,,2.5,,1998-04-29,9533885,1998 Apr 3
2,1A2K,PDBbind v2020,,,"A,B",D,Nuclear Transport Factor 2,Ran GTPase,1.5e-07,,,,2.5,,1998-04-29,9533885,1998 Apr 3
3,1A2K,PDBbind v2020,,,"A,B",E,Nuclear Transport Factor 2,Ran GTPase,1.5e-07,,,,2.5,,1998-04-29,9533885,1998 Apr 3
4,1A3B,PDBbind v2020,,,I,"L,H",21-mer,thrombin alpha,3e-10,,X-RAY DIFFRACTION,,1.80,9772168,1998-06-03,8272424,1993 Nov
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3980,6UMT,PDBbind v2020,,,B,A,"Programmed cell death 1 ligand 2, PD-L2 IgV","Programmed cell death protein 1, human PD-1 (N...",2.6e-09,,X-RAY DIFFRACTION,,1.99,31727844,2019-11-27,31727844,2019 Dec 3
3981,6UVO,PDBbind v2020,,,"L,H",D,broadly neutralizing human monoclonal antibody...,"RSV G central conserved domain, RSV G (157-197)",5.79e-10,,X-RAY DIFFRACTION,,2.90,31852779,2019-12-18,31852779,2020 Feb 28
3982,6UYS,PDBbind v2020,,,B,A,phosphorylated PML-SIM,K37-acetylated SUMO1,1.6e-06,,X-RAY DIFFRACTION,,1.59,31879127,2019-11-27,31879127,2020 Feb 4
3983,6UYS,PDBbind v2020,,,D,C,phosphorylated PML-SIM,K37-acetylated SUMO1,1.6e-06,,X-RAY DIFFRACTION,,1.59,31879127,2019-11-27,31879127,2020 Feb 4
