# Affinity Benchmark v5.5

In [1]:
import os
import numpy as np
import pandas as pd
import tqdm
from joblib import Parallel, delayed

In [2]:
save_dir = './processed_data'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

In [3]:
root_dir = r"source_data"
filepath = os.path.join(root_dir, r"Affinity Benchmark v5.5.xlsx")
assert os.path.exists(filepath), f'{filepath} not exists!'
df = pd.read_excel(filepath,header=0,converters={'Kd(M)':str})
display(df)

Unnamed: 0,Complex_PDB(a),Class(b),Unbound_PDB(a),Component 1,Unbound_PDB(a).1,Component 2,reference (c),Kd(M),∆G (kcal/ mol)(d),I_rmsd (Å)(e),∆ASA(Å2)(f),Method,Temp (°C),pH,BM version
0,1A2K_C:AB,OG,1QG4_A,Ran GTPase-GDP,1OUN_AB,Nuclear transport factor 2,"Chaillan-Huntington C, Braslavsky CV, Kuhlmann...",1.5e-07,9.3,1.11,1603.0,ITC,25,7.5,2.0
1,1ACB_E:I,EI,4CHA_ABC,Chymotrypsin,1EGL_A,Eglin C,"Ascenzi P, Amiconi G, Menegatti E, Guarneri M,...",2e-10,13.1,1.08,1544.0,spectro-scopy,21,8,2.0
2,1AHW_AB:C,A,1FGN_LH,Fab 5g9,1TFH_A,Tissue factor,"Huang M, Syed R, Stura EA, Stone MJ, Stefanko ...",3.4e-09,11.6,0.69,1899.0,Inhibition,ambient,not stated,2.0
3,1AK4_A:D,OX,2CPL_A,Cyclophilin,1E6J_P,HIV capsid,"Yoo S, Myszka DG, Yeh C, McMurray M, Hill CP, ...",1.6e-05,6.4,1.33,1029.0,ITC,20,6.5,2.0
4,1AKJ_AB:DE,OX,2CLR_DE,MHC class 1 HLA-A2,1CD8_AB,T-cell CD8 coreceptor,"Wyer JR, Willcox BE, Gao GF, Gerth UC, Davis S...",0.00013,5.3,1.14,1995.0,SPR,25,7.4,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
280,1R8S_A:E,OG,1HUR_A,Arf1 GTPase,1R8M_E,Sec 7 domain,,,,3.73,2986.0,,,,3.0
281,1RKE_A:B,OX,1SYQ_A,Vinculin head,3MYI_A,Vinculin tail,,,,4.25,2614.0,,,,5.0
282,1Y64_A:B,OX,2FXU_A,Actin,1UX5_A,BNI1 protein,,,,4.69,2745.0,,,,3.0
283,2J7P_A:D,OX,1NG1_A,SRP GTPase Ffh,2IYL_D,Cell division protein FtsY,,,,2.67,3008.0,,,,4.0


In [4]:
df['Ligand Chains'] = df['Complex_PDB(a)'].apply(lambda x: ','.join([c for c in x.split('_')[1].split(':')[0]]))
df['Receptor Chains'] = df['Complex_PDB(a)'].apply(lambda x: ','.join([c for c in x.split('_')[1].split(':')[1]]))

df.rename(columns={
    'Complex_PDB(a)': 'PDB',
    'Component 1': 'Ligand Name',
    'Component 2': 'Receptor Name',
    'reference(c)': 'Reference',
    'Kd(M)': 'KD(M)',
    'Method': 'Affinity Method',
    'Temp (°C)': 'Temperature(K)',
},inplace=True)


df['PDB'] = df['PDB'].apply(lambda x: x.split('_')[0])
df['Temperature(K)'] = df['Temperature(K)'].apply(lambda x: float(x) if x not in ['ambient','not stated'] else np.nan)
df['Temperature(K)'] = df['Temperature(K)'].apply(lambda x: x+275.15 if x<275 else x)

df['Source Data Set'] = 'Affinity Benchmark v5.5'
df.loc[:,[
    'Model','Release Date','Mutations','Affinity Method','Structure Method','Resolution(Å)', 
    'Affinity PubMed ID', 'Affinity Release Date',
]] = np.nan

# download PDB

In [5]:
from utils import download_pdb
    
save_dir = r"./processed_data/PDB/Affinity Benchmark v5.5"
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
for pdb in tqdm.tqdm(set(df['PDB'].values.tolist())):
    if not os.path.exists( os.path.join(save_dir, f"{pdb}.pdb") ):
        file_path = download_pdb(pdb_code=pdb, save_dir=save_dir)
        if file_path:
            pass
        else:
            print(f"下载 {pdb}.pdb 失败，请检查PDB代码是否正确。")
    else:
        pass


100%|██████████| 279/279 [00:00<00:00, 88659.05it/s]


# Screening

## phase1

In [6]:
# 删除KD(M)为nan的样本
df_delphase1 = df[df['KD(M)'].isna()]
df_delphase1.insert(loc=0,column='Reason for Deletion',value=np.nan)
df_delphase1.loc[:,'Reason for Deletion'] = 'no affinity label'

df = df[~df['KD(M)'].isna()]

## phase2

In [7]:
# 精筛：筛除受体链、配体链不是氨基酸链的样本

"""
# 串行版
df.reset_index(inplace=True, drop=True)
from utils import check_chains
assert len(set(df.index.values.tolist())) == len(df)
data_dir = save_dir 
keep_inds = []
for pdb in tqdm.tqdm(set(df.PDB.tolist()),'Iterate over all PDBs'):
    pdb_file = os.path.join( data_dir, f'{pdb.upper()}.pdb' )
    df_tmp = df[df['PDB']==pdb]
    for ind in df_tmp.index:
        ligand_chains = [chain.strip() for chain in df_tmp.loc[ind,'Ligand Chains'].split(',')]
        receptor_chains = [chain.strip() for chain in df_tmp.loc[ind,'Receptor Chains'].split(',')]
        result = check_chains(pdb_file, ligand_chains, receptor_chains)
        if all(result.values()):
            keep_inds.append(ind)
        else:
            print(f"Delete {pdb}, because: {result}")
drop_inds = [ind for ind in df.index.tolist() if ind not in keep_inds]
df_drop = df.loc[drop_inds]
pdbs_drop = set(df_drop['PDB'].values.tolist())
print(f'Number of dropped PDBs: {len(pdbs_drop)}, which is:\n{pdbs_drop}')
df = df.loc[keep_inds]
print('After the filtering, we get df:')
display(df)
"""


# 并行版
from utils import check_chains
data_dir = "processed_data/PDB/Affinity Benchmark v5.5" 

def is_pp_complex(pdb):
    inds = []
    pdb_file = os.path.join( data_dir, f'{pdb.upper()}.pdb' )
    df_tmp = df[df['PDB']==pdb]
    for ind in df_tmp.index:
        ligand_chains = [chain.strip() for chain in df.loc[ind,'Ligand Chains'].split(',')]
        receptor_chains = [chain.strip() for chain in df.loc[ind,'Receptor Chains'].split(',')]
        result = check_chains(pdb_file, ligand_chains, receptor_chains)
        if all(result.values()):
            inds.append(ind)
        else:
            pass
    return inds

pdbs = set(df['PDB'].values)
keep_inds = Parallel(n_jobs=-1)(delayed(is_pp_complex)(pdb) for pdb in tqdm.tqdm(pdbs,desc='Screening protein-protein complexes'))
keep_inds = [ind for sublist in keep_inds for ind in sublist]# 展平
del_inds = [ind for ind in df.index.tolist() if ind not in keep_inds]

df_delphase2 = df.loc[del_inds]
pdbs_del = set(df_delphase2['PDB'].values.tolist())
print(f'Number of dropped PDBs: {len(pdbs_del)}, which is:\n{pdbs_del}')
df_delphase2.insert(loc=0,column='Reason for Deletion',value=np.nan)
df_delphase2['Reason for Deletion'] = 'not Protein-Protein-Complex'

df = df.loc[keep_inds]
print('After the screening, we get df:')
display(df)

Screening protein-protein complexes: 100%|██████████| 206/206 [00:02<00:00, 72.80it/s]


Number of dropped PDBs: 0, which is:
set()
After the screening, we get df:


Unnamed: 0,PDB,Class(b),Unbound_PDB(a),Ligand Name,Unbound_PDB(a).1,Receptor Name,reference (c),KD(M),∆G (kcal/ mol)(d),I_rmsd (Å)(e),...,Ligand Chains,Receptor Chains,Source Data Set,Model,Release Date,Mutations,Structure Method,Resolution(Å),Affinity PubMed ID,Affinity Release Date
63,1KTZ,OR,1TGK_A,TGF-beta,1M9Z_A,TGF-beta receptor,"De Crescenzo G, Hinck CS, Shu Z, Zuniga J, Yan...",2.9e-07,8.9,0.39,...,A,B,Affinity Benchmark v5.5,,,,,,,
57,1JTG,EI,3GMU_B,beta-lactamase inhibitor protein,1ZG4_A,beta-lactamase TEM-1,"Albeck S, Schreiber G (1999) Biophysical chara...",4e-10,12.8,0.49,...,B,A,Affinity Benchmark v5.5,,,,,,,
241,4ETQ,AA,4EBQ_HL,LA5,4E9O_X,vaccinia D8L IMV,"Matho et al., 2012",1.8e-10,-13.29,0.47,...,"H,L",C,Affinity Benchmark v5.5,,,,,,,
16,1DE4,OX,1A6Z_AB,hemochromatosis protein HFE,1CX8_AB,Transferrin receptor ectodom.,"West AP, Giannetti AM, Herr AB, Bennett MJ, Na...",6.8e-08,9.8,2.59,...,"A,B","C,F",Affinity Benchmark v5.5,,,,,,,
56,1JPS,A,1JPT_HL,Fab D3H44,1TFH_B,Tissue factor,"Presta L, Sims P, Meng YG, Moran P, Bullens S,...",1e-10,13.6,0.51,...,"H,L",T,Affinity Benchmark v5.5,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,1AHW,A,1FGN_LH,Fab 5g9,1TFH_A,Tissue factor,"Huang M, Syed R, Stura EA, Stone MJ, Stefanko ...",3.4e-09,11.6,0.69,...,"A,B",C,Affinity Benchmark v5.5,,,,,,,
174,2W9E,A,2W9D_HL,ICSM 18 Fab fragment,1QM1_A,Prion protein fragment,"Antonyuk, S. V., Trevitt, C. R., Strange, R. W...",1.3e-10,-13.49,,...,"H,L",A,Affinity Benchmark v5.5,,,,,,,
37,1GL1,EI,4CHA_ABC,Alpha-chymotrypsin,1PMC_A(6),PMP-C (LCMI II),"Kellenberger C, Boudier C, Bermudez I, Bieth J...",2e-10,13.2,1.20,...,A,I,Affinity Benchmark v5.5,,,,,,,
15,1CBW,NC,4CHA_ABC,Chymotrypsin,9PTI_A,BPTI,"Castro MJ, Anderson S (1996) Alanine point-mut...",1.1e-08,10.7,0.74,...,"A,B,C",D,Affinity Benchmark v5.5,,,,,,,


## save deleted samples as an excel file

In [8]:
save_dir = './processed_data'
df_del = pd.concat([df_delphase1,df_delphase2], axis=0) if len(df_delphase2)>0 else df_delphase1
df_del.to_excel( os.path.join(save_dir,'Affinity Benchmark v5.5(samples_deleted).xlsx') )

# PDB info

In [9]:
"""
# 串行版
from utils import get_pdb_info
pdbs = set(df['PDB'].values)
for pdb in tqdm.tqdm(pdbs):
    results = get_pdb_info(pdb)
    if results['Resolution']!=None:
        df.loc[df[df.PDB==pdb].index,'Resolution(Å)'] = results['Resolution'].replace('Å','').strip()
    if results['Method'] != None:
        df.loc[df[df.PDB==pdb].index,'Structure_Method'] = results['Method']
    if results['Release Date'] != None:
        df.loc[df[df.PDB==pdb].index,'PDB Release Date'] = results['Release Date']   
    #这是PDB的PubMed ID, 不是亲和力的PubMed ID
    if results['PubMed ID'] != None:
        df.loc[df[df.PDB==pdb].index,'PDB PubMed ID'] = results['PubMed ID']
"""
        
        
# 并行版
from utils import get_pdb_info
pdbs = set(df['PDB'].values)
# 获取pdb信息
infos = Parallel(n_jobs=-1)(delayed(get_pdb_info)(pdb) for pdb in tqdm.tqdm(pdbs,desc='Fetching PDB infomations...'))
infos = dict([(info['PDB'], info) for info in infos])
# 填写到df中
for pdb in tqdm.tqdm(pdbs,desc="Fill in the PDB infomations..."):
    info = infos[pdb]
    if info['Resolution']!=None:
        df.loc[df[df.PDB==pdb].index,'Resolution(Å)'] = info['Resolution'].replace('Å','').strip()
    if info['Method'] != None:
        df.loc[df[df.PDB==pdb].index,'Structure Method'] = info['Method']
    if info['Release Date'] != None:
        df.loc[df[df.PDB==pdb].index,'PDB Release Date'] = info['Release Date']   
    #这是PDB的PubMed ID, 不是亲和力的PubMed ID
    if info['PubMed ID'] != None:
        df.loc[df[df.PDB==pdb].index,'PDB PubMed ID'] = info['PubMed ID']

Fetching PDB infomations...: 100%|██████████| 206/206 [00:03<00:00, 53.82it/s]
Fill in the PDB infomations...: 100%|██████████| 206/206 [00:00<00:00, 251.68it/s]


# affinity info
根据'Reference'去PubMed查询Affinity PubMed ID, 继而查询Affinity Release Date

In [10]:
"""
# 串行版
from utils import get_PubMedID_by_title, get_pubdate
for ind in tqdm.tqdm(df.index):
    pmid = get_PubMedID_by_title(title=df.loc[ind, 'reference (c)'])
    if isinstance(pmid,str):
        df.loc[ind, 'Affinity PubMed ID'] = pmid
    #  Affinity Release Date
    try:
        pubdate = get_pubdate(str(pmid), dst_datetime_format='%Y-%m-%d')
        df.loc[ind, 'Affinity Release Date' ] = pubdate
    except:
        pass
"""    
    
# 并行版
from utils import get_PubMedID_by_title, get_pubdate

# Affinity PubMed ID
queries = set(df.loc[~df['reference (c)'].isna() * df['Affinity PubMed ID'].isna(), 'reference (c)'].values.tolist())
N = len(queries)
early_stop = 0
while N>0 and early_stop<5:
    # 获取pmid信息
    pmid_infos = Parallel(n_jobs=8)(delayed(get_PubMedID_by_title)(query) for query in 
                                     tqdm.tqdm(queries,desc='Fetching affinity PubMed IDs...'))
    query2pmids = dict([(pmid_info['title'], pmid_info['PMID']) for pmid_info in pmid_infos])
    # 填写到df中
    for query in tqdm.tqdm(queries,desc="Fill in the Affinity PubMed ID..."):
        # PubMed ID
        pmid = query2pmids[query]
        if isinstance(pmid,str):
            df.loc[df['reference (c)']==query, 'Affinity PubMed ID'] = pmid     
    queries = set(df.loc[~df['reference (c)'].isna() * df['Affinity PubMed ID'].isna(), 'reference (c)'].values.tolist())
    # early stop
    if len(queries) < N:
        early_stop = 0
        N = len(queries)
    else:
        early_stop += 1
    
N_notfound = df['Affinity PubMed ID'].isna().sum() - df['reference (c)'].isna().sum()
if N_notfound==0:
    print(f">>> 成功找到所有reference (c)的Affinity PubMed ID！")
else:
    print(f">>> 还有{N_notfound}个reference (c)的Affinity PubMed ID没找到。")
# Lin et al., 2018这类query输入给  get_PubMedID_by_title 后放回额PMID是None
# 此时将query本身赋值给 'Affinity PubMed ID'
df.loc[df['Affinity PubMed ID'].isna(),'Affinity PubMed ID'] = df.loc[df['Affinity PubMed ID'].isna(),'reference (c)']
  
    
# Affinity Release Date
df['Affinity Release Date'] = np.nan
queries = set(df.loc[~df['Affinity PubMed ID'].isna() * df['Affinity Release Date'].isna(), 'Affinity PubMed ID'].values.tolist())
N = len(queries)
early_stop = 0
while N>0 and early_stop<5:
    # 获取pmid信息
    pubdate_infos = Parallel(n_jobs=8)(delayed(get_pubdate)(str(query), dst_datetime_format='%Y-%m-%d') for query in 
                                        tqdm.tqdm(queries,desc='Fetching Affinity Release Date...'))
    query2dates = dict([(pubdate_info['PMID'],pubdate_info['publish date']) for pubdate_info in pubdate_infos])
    # 填写到df中
    for query in tqdm.tqdm(queries,desc="Fill in Affinity Release Dates..."):
        # Affinity Release Date
        if query2dates[query] is not None:
            df.loc[df['Affinity PubMed ID']==query, 'Affinity Release Date' ] = query2dates[query]     
    queries = set(df.loc[~df['Affinity PubMed ID'].isna() * df['Affinity Release Date'].isna(), 'Affinity PubMed ID'].values.tolist())
    # early stop
    if len(queries) < N:
        early_stop = 0
        N = len(queries)
    else:
        early_stop += 1
        
N_notfound = df['Affinity Release Date'].isna().sum() - df['Affinity PubMed ID'].isna().sum()
if N_notfound==0:
    print(f">>> 成功找到所有Affinity PubMed ID的Release Date！")
else:
    print(f">>> 还有{N_notfound}个Affinity PubMed ID的Release Date没找到。")

Fetching affinity PubMed IDs...: 100%|██████████| 202/202 [00:53<00:00,  3.75it/s]
Fill in the Affinity PubMed ID...: 100%|██████████| 202/202 [00:00<00:00, 2986.72it/s]
Fetching affinity PubMed IDs...: 100%|██████████| 36/36 [00:03<00:00,  9.37it/s]
Fill in the Affinity PubMed ID...: 100%|██████████| 36/36 [00:00<00:00, 4078.08it/s]
Fetching affinity PubMed IDs...: 100%|██████████| 28/28 [00:02<00:00, 10.36it/s]
Fill in the Affinity PubMed ID...: 100%|██████████| 28/28 [00:00<00:00, 69409.29it/s]
Fetching affinity PubMed IDs...: 100%|██████████| 28/28 [00:02<00:00, 11.08it/s]
Fill in the Affinity PubMed ID...: 100%|██████████| 28/28 [00:00<00:00, 130634.61it/s]
Fetching affinity PubMed IDs...: 100%|██████████| 28/28 [00:02<00:00, 10.69it/s]
Fill in the Affinity PubMed ID...: 100%|██████████| 28/28 [00:00<00:00, 79084.52it/s]
Fetching affinity PubMed IDs...: 100%|██████████| 28/28 [00:02<00:00, 11.27it/s]
Fill in the Affinity PubMed ID...: 100%|██████████| 28/28 [00:00<00:00, 136400.13

>>> 还有28个reference (c)的Affinity PubMed ID没找到。


Fetching Affinity Release Date...: 100%|██████████| 202/202 [00:35<00:00,  5.68it/s]
Fill in Affinity Release Dates...: 100%|██████████| 202/202 [00:00<00:00, 5162.82it/s]
Fetching Affinity Release Date...: 100%|██████████| 119/119 [00:20<00:00,  5.76it/s]
Fill in Affinity Release Dates...: 100%|██████████| 119/119 [00:00<00:00, 5329.31it/s]
Fetching Affinity Release Date...: 100%|██████████| 68/68 [00:11<00:00,  5.72it/s]
Fill in Affinity Release Dates...: 100%|██████████| 68/68 [00:00<00:00, 4086.55it/s]
Fetching Affinity Release Date...: 100%|██████████| 42/42 [00:05<00:00,  8.17it/s]
Fill in Affinity Release Dates...: 100%|██████████| 42/42 [00:00<00:00, 6136.51it/s]
Fetching Affinity Release Date...: 100%|██████████| 36/36 [00:05<00:00,  6.91it/s]
Fill in Affinity Release Dates...: 100%|██████████| 36/36 [00:00<00:00, 7485.00it/s]
Fetching Affinity Release Date...: 100%|██████████| 32/32 [00:04<00:00,  7.06it/s]
Fill in Affinity Release Dates...: 100%|██████████| 32/32 [00:00<00:0

>>> 还有28个Affinity PubMed ID的Release Date没找到。





# save

In [11]:
df.reset_index(drop=True, inplace=True)
df = df.loc[:,[
    'PDB', 'Source Data Set', 'Model', 'Mutations', 'Ligand Chains', 'Receptor Chains', 'Ligand Name', 'Receptor Name', 
    'KD(M)', 'Affinity Method', 'Structure Method', 'Temperature(K)', 'Resolution(Å)', 
    'PDB PubMed ID', 'PDB Release Date', 'Affinity PubMed ID', 'Affinity Release Date', 
]]

df.to_excel('./processed_data/Affinity Benchmark v5.5.xlsx')
display(df)

Unnamed: 0,PDB,Source Data Set,Model,Mutations,Ligand Chains,Receptor Chains,Ligand Name,Receptor Name,KD(M),Affinity Method,Structure Method,Temperature(K),Resolution(Å),PDB PubMed ID,PDB Release Date,Affinity PubMed ID,Affinity Release Date
0,1KTZ,Affinity Benchmark v5.5,,,A,B,TGF-beta,TGF-beta receptor,2.9e-07,,X-RAY DIFFRACTION,300.15,2.15,11850637,2002-02-27,16300789,2006 Jan 6
1,1JTG,Affinity Benchmark v5.5,,,B,A,beta-lactamase inhibitor protein,beta-lactamase TEM-1,4e-10,,X-RAY DIFFRACTION,300.15,1.73,11573088,2001-10-17,9890878,1999 Jan 5
2,4ETQ,Affinity Benchmark v5.5,,,"H,L",C,LA5,vaccinia D8L IMV,1.8e-10,,X-RAY DIFFRACTION,,2.10,22623786,2012-06-06,23152530,2013 Feb
3,1DE4,Affinity Benchmark v5.5,,,"A,B","C,F",hemochromatosis protein HFE,Transferrin receptor ectodom.,6.8e-08,,X-RAY DIFFRACTION,,2.80,10638746,2000-01-19,11800564,2001 Oct 19
4,1JPS,Affinity Benchmark v5.5,,,"H,L",T,Fab D3H44,Tissue factor,1e-10,,X-RAY DIFFRACTION,,1.85,11601848,2002-02-03,11307801,2001 Mar
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202,1AHW,Affinity Benchmark v5.5,,,"A,B",C,Fab 5g9,Tissue factor,3.4e-09,,X-RAY DIFFRACTION,,3.00,9480775,1998-02-25,9480775,1998 Feb 6
203,2W9E,Affinity Benchmark v5.5,,,"H,L",A,ICSM 18 Fab fragment,Prion protein fragment,1.3e-10,,X-RAY DIFFRACTION,,2.90,19204296,2009-02-03,19204296,2009 Feb 24
204,1GL1,Affinity Benchmark v5.5,,,A,I,Alpha-chymotrypsin,PMP-C (LCMI II),2e-10,,X-RAY DIFFRACTION,300.15,2.10,11495915,2001-11-28,7592720,1995 Oct 27
205,1CBW,Affinity Benchmark v5.5,,,"A,B,C",D,Chymotrypsin,BPTI,1.1e-08,,X-RAY DIFFRACTION,297.15,2.60,9300481,1997-07-23,8784199,1996 Sep 3
