In [1]:
import os
import numpy as np
import pandas as pd
import tqdm
from joblib import Parallel, delayed

In [2]:
save_dir = './processed_data'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

In [3]:
root_dir = r'source_data'
filepath = os.path.join(root_dir, r'skempi_v2.csv')
assert os.path.exists(filepath), f'{filepath} not exists!'
df = pd.read_csv(filepath, sep=";", na_values=['n.b'])
print(f"df.shape: {df.shape}")
display(df.head())

df.shape: (7085, 29)


Unnamed: 0,#Pdb,Mutation(s)_PDB,Mutation(s)_cleaned,iMutation_Location(s),Hold_out_type,Hold_out_proteins,Affinity_mut (M),Affinity_mut_parsed,Affinity_wt (M),Affinity_wt_parsed,...,koff_mut_parsed,koff_wt (s^(-1)),koff_wt_parsed,dH_mut (kcal mol^(-1)),dH_wt (kcal mol^(-1)),dS_mut (cal mol^(-1) K^(-1)),dS_wt (cal mol^(-1) K^(-1)),Notes,Method,SKEMPI version
0,1CSE_E_I,LI45G,LI38G,COR,Pr/PI,Pr/PI,5.26e-11,5.26e-11,1.12e-12,1.12e-12,...,,,,,,,,,IASP,1
1,1CSE_E_I,LI45S,LI38S,COR,Pr/PI,Pr/PI,8.33e-12,8.33e-12,1.12e-12,1.12e-12,...,,,,,,,,,IASP,1
2,1CSE_E_I,LI45P,LI38P,COR,Pr/PI,Pr/PI,1.02e-07,1.02e-07,1.12e-12,1.12e-12,...,,,,,,,,,IASP,1
3,1CSE_E_I,LI45I,LI38I,COR,Pr/PI,Pr/PI,1.72e-10,1.72e-10,1.12e-12,1.12e-12,...,,,,,,,,,IASP,1
4,1CSE_E_I,LI45D,LI38D,COR,Pr/PI,Pr/PI,1.92e-09,1.92e-09,1.12e-12,1.12e-12,...,,,,,,,,,IASP,1


In [4]:
df['Source Data Set'] = 'SKEMPI v2.0'

df.rename(columns={
    '#Pdb': 'PDB',
#     'Mutation(s)_cleaned': 'Mutations',
#     'Mutation(s)_PDB': 'Mutations',
    'Affinity_mut_parsed': 'KD(M)',
    'Protein 1': 'Ligand Name', 
    'Protein 2': 'Receptor Name',
    'Temperature': 'Temperature(K)',
    'Reference': 'Affinity PubMed ID',
    'Method': 'Affinity Method',
},inplace=True)

# 添加WT
df_WT = df.groupby(by='PDB').apply(lambda x: x.iloc[0])
# df_WT['Mutations'] = ''
df_WT['Mutation(s)_cleaned'] = ''
df_WT['Mutation(s)_PDB'] = ''
df_WT['KD(M)'] = df_WT['Affinity_wt_parsed']
df_WT.reset_index(drop=True,inplace=True)
print(f"df_WT.shape: {df_WT.shape}")
df = pd.concat((df_WT,df), axis=0)

# KD(M)
print(f"Before this step, there are {df['KD(M)'].isna().sum()} records with unlabeled affinity")
for index, kd, kd_mut in zip(df.index, df['KD(M)'],df['Affinity_mut (M)']):
    if np.isnan(kd):
        try:
            kd_mut = np.isnan(float(kd_mut))
            df.loc[index,'KD(M)'] = kd_mut
        except:
            pass
print(f"Before this step, there are {df['KD(M)'].isna().sum()} records with unlabeled affinity")


# 添加chain id, mutations, PDB...
df.loc[:,['Model','Affinity Release Date','Resolution(Å)']] = np.nan
df['Ligand Chains'] = df['PDB'].apply(lambda x: ','.join([c for c in x.split('_')[1]]))
df['Receptor Chains'] = df['PDB'].apply(lambda x: ','.join([c for c in x.split('_')[2]]))
df['PDB'] = df['PDB'].apply(lambda x: x[:4])
df['Mutation(s)_cleaned'] = df['Mutation(s)_cleaned'].apply( lambda x: ', '.join([f'{mut[1]}_{mut[0]}{mut[2:]}' for mut in x.split(',') if mut!='']) )
df['Mutation(s)_PDB'] = df['Mutation(s)_PDB'].apply( lambda x: ', '.join([f'{mut[1]}_{mut[0]}{mut[2:]}' for mut in x.split(',') if mut!='']) )
print(f"df.shape: {df.shape}")

print(f"Number of unique PDBs in df: {len(set(df['PDB'].tolist()))}")

df_WT.shape: (348, 30)
Before this step, there are 287 records with unlabeled affinity
Before this step, there are 57 records with unlabeled affinity
df.shape: (7433, 35)
Number of unique PDBs in df: 345


# Manually fix errors

In [5]:
# SKEMPI v2.0的5E9D的标注错误,原标为A,B_C,D,E,应该改为A,B,C_D,E。
ind = df[df['PDB']=='5E9D'].index
df.loc[ind,'Ligand Chains'] = 'A,B,C'
df.loc[ind,'Receptor Chains'] = 'D,E'

# Screening

## Screen out those without affinity labels

In [6]:
df_delphase1 = df[df['KD(M)'].isna()]
df_delphase1.insert(loc=0,column='Reason for Deletion',value=np.nan)
df_delphase1['Reason for Deletion'] = 'without affinity labels'
df = df[~df['KD(M)'].isna()]

pdbs_drop = set(df_delphase1['PDB'].values.tolist())
print(f"Deleted {len(df_delphase1)} records with {len(pdbs_drop)} unique PDBs in screening phase1 due to 'without affinity labels', they are:\n{pdbs_drop}")

Deleted 57 records with 7 unique PDBs in screening phase1 due to 'without affinity labels', they are:
{'4UYP', '1MHP', '1B41', '2B2X', '4CPA', '1FSS', '4UYQ'}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_delphase1['Reason for Deletion'] = 'without affinity labels'


## Screening out samples that are not Protein-Protein-Complex

In [7]:
# ** SKEMPI v2.0数据集全是蛋白蛋白复合体，因此不许执行这一步筛选。 **

# # 精筛：筛除受体链、配体链不是氨基酸链的样本
# df.reset_index(inplace=True, drop=True)

# from utils import check_chains
# assert len(set(df.index.values.tolist())) == len(df)
# # data_dir = r"E:\1.THPRD\1.AI_LifeScience\3.AntiBody\我们的创新\1.RDE-PPdG\参考文献\公开数据集\SKEMPI v2.0\SKEMPI2_PDBs\PDBs" 
# data_dir = r"./source_data/SKEMPI v2.0/PDBs" 
# keep_inds = []
# for pdb in tqdm.tqdm(set(df.PDB.tolist()),'Iterate over all PDBs'):
#     pdb_file = os.path.join( data_dir, f'{pdb.upper()}.pdb' )
#     df_tmp = df[df['PDB']==pdb]
#     for ind in df_tmp.index:
#         ligand_chains = [chain.strip() for chain in df_tmp.loc[ind,'Ligand Chains'].split(',')]
#         receptor_chains = [chain.strip() for chain in df_tmp.loc[ind,'Receptor Chains'].split(',')]
#         result = check_chains(pdb_file, ligand_chains, receptor_chains)
#         if all(result.values()):
#             keep_inds.append(ind)
#         else:
#             print(f"Delete {pdb}, because: {result}")
# drop_inds = [ind for ind in df.index.tolist() if ind not in keep_inds]
# df_delphase2 = df.loc[drop_inds]
# df_delphase2['Reason for Deletion'] = 'not Protein-Protein-Complex'

# pdbs_drop = set(df_drop['PDB'].values.tolist())
# print(f'Number of dropped PDBs: {len(pdbs_drop)}, which is:\n{pdbs_drop}')
# df = df.loc[keep_inds]
# print('After the filtering, we get df:')
# display(df)

## save deleted samples as an excel file

In [8]:
# save as excel file
df_del = df_delphase1# pd.concat( [df_delphase1, df_delphase2], axis=0 )
df_del.to_excel( os.path.join(save_dir,'SKEMPI v2.0(samples_deleted).xlsx') )

# PDB info

In [10]:
# # 串行版
# from utils import get_pdb_info
# pdbs = set(df['PDB'].values)
# for pdb in tqdm.tqdm_notebook(pdbs):
#     results = get_pdb_info(pdb)
#     if results['Resolution']!=None:
#         df.loc[df[df.PDB==pdb].index,'Resolution(Å)'] = results['Resolution'].replace('Å','').strip()
#     if results['Method'] != None:
#         df.loc[df[df.PDB==pdb].index,'Structure Method'] = results['Method']
#     if results['Release Date'] != None:
#         df.loc[df[df.PDB==pdb].index,'PDB Release Date'] = results['Release Date']
#     # 这是PDB的PubMed ID, 不是亲和力的PubMed ID
#     if results['PubMed ID'] != None:
#         df.loc[df[df.PDB==pdb].index,'PDB PubMed ID'] = results['PubMed ID']

        
        
# 并行版
from utils import get_pdb_info

df['PDB PubMed ID'] = np.nan
pdbs = set(df.loc[df['PDB PubMed ID'].isna(), 'PDB'].values.tolist())
N = len(pdbs)
try_times = 0
while N>0 and try_times<20:
    # 获取pdb信息
    infos = Parallel(n_jobs=-1)(delayed(get_pdb_info)(pdb) for pdb in tqdm.tqdm(pdbs,desc=f'Fetching PDB infomation of {len(pdbs)} PDBs...'))
    infos = dict([(info['PDB'], info) for info in infos])
    # 填写到df中
    for pdb in tqdm.tqdm(pdbs,desc="Fill in the PDB infomations..."):
        info = infos[pdb]
        if info['Resolution']!=None:
            df.loc[df[df.PDB==pdb].index,'Resolution(Å)'] = info['Resolution'].replace('Å','').strip()
        if info['Method'] != None:
            df.loc[df[df.PDB==pdb].index,'Structure Method'] = info['Method']
        if info['Release Date'] != None:
            df.loc[df[df.PDB==pdb].index,'PDB Release Date'] = info['Release Date']   
        # 注意：这是PDB的PubMed ID, 不是亲和力的PubMed ID
        if info['PubMed ID'] != None:
            df.loc[df[df.PDB==pdb].index,'PDB PubMed ID'] = info['PubMed ID']
            
    try_times += 1
    pdbs = set(df.loc[df['PDB PubMed ID'].isna(), 'PDB'].values.tolist())
    N = len(pdbs)

N_notfound = df['PDB PubMed ID'].isna().sum()
if N_notfound==0:
    print(f">>> 成功找到所有Affinity PubMed ID的Release Date！")
else:
    print(f">>> 还有{N_notfound}个record的Release Date没找到。他们的Affinity PubMed ID是：{set(df.loc[df['Resolution(Å)'].isna(),'PDB'].values.tolist())}")

Fetching PDB infomation of 345 PDBs...: 100%|██████████| 345/345 [00:11<00:00, 30.90it/s]
Fill in the PDB infomations...: 100%|██████████| 345/345 [00:04<00:00, 83.22it/s]
Fetching PDB infomation of 16 PDBs...: 100%|██████████| 16/16 [00:00<00:00, 2364.40it/s]
Fill in the PDB infomations...: 100%|██████████| 16/16 [00:00<00:00, 87.96it/s]
Fetching PDB infomation of 16 PDBs...: 100%|██████████| 16/16 [00:00<00:00, 5719.67it/s]
Fill in the PDB infomations...: 100%|██████████| 16/16 [00:00<00:00, 86.96it/s]
Fetching PDB infomation of 16 PDBs...: 100%|██████████| 16/16 [00:00<00:00, 4933.02it/s]
Fill in the PDB infomations...: 100%|██████████| 16/16 [00:00<00:00, 83.72it/s]
Fetching PDB infomation of 16 PDBs...: 100%|██████████| 16/16 [00:00<00:00, 5692.50it/s]
Fill in the PDB infomations...: 100%|██████████| 16/16 [00:00<00:00, 97.54it/s]
Fetching PDB infomation of 16 PDBs...: 100%|██████████| 16/16 [00:00<00:00, 4773.37it/s]
Fill in the PDB infomations...: 100%|██████████| 16/16 [00:00<0

>>> 还有51个record的Release Date没找到。他们的Affinity PubMed ID是：{'2KSO', '1KBH'}





# affinity info

In [11]:
# # 根据Affinity PubMed ID查询Affinity Release Date
# from utils import get_pubdate
# for pubmed_id in tqdm.tqdm_notebook(set(df['Affinity PubMed ID'].values)):
#     try:
#         pubdate = get_pubdate(str(pubmed_id), dst_datetime_format='%Y-%m-%d')
#         df.loc[df[df['Affinity PubMed ID']==pubmed_id].index, 'Affinity Release Date' ] = pubdate
#     except:
#         print(f"Cound not fetch pubdate for pubmed_id=='{pubmed_id}'")
        
        
        
# 根据Affinity PubMed ID查询Affinity Release Date
# 并行版：
from utils import get_pubdate
df['Affinity Release Date'] = np.nan
PMIDs = set(df.loc[~df['Affinity PubMed ID'].isna() * df['Affinity Release Date'].isna(), 'Affinity PubMed ID'].values.tolist())
N = len(PMIDs)
try_times = 0
while N>0 and try_times<20:
    # 获取pdb信息
    pubdate_infos = Parallel(n_jobs=8)(delayed(get_pubdate)(PMID) for PMID in tqdm.tqdm(PMIDs,desc=f'Fetching affinity release datas of {len(PMIDs)} papers...'))
    # 填写到df中
    for pubdate_info in tqdm.tqdm(pubdate_infos,desc="Fill in affinity release dates..."):
        PMID, pubdate = pubdate_info['PMID'], pubdate_info['publish date']
        # 是亲和力的PubMed ID
        df.loc[df['Affinity PubMed ID']==PMID, 'Affinity Release Date'] = pubdate
    
    PMIDs = set(df.loc[~df['Affinity PubMed ID'].isna() * df['Affinity Release Date'].isna(), 'Affinity PubMed ID'].values.tolist())
    N = len(PMIDs)
    try_times += 1
    
N_notfound = df['Affinity Release Date'].isna().sum() - df['Affinity PubMed ID'].isna().sum()
if N_notfound==0:
    print(f">>> 成功找到所有Affinity PubMed ID的Release Date！")
else:
    tmp_pmids = set(df.loc[df['Affinity Release Date'].isna(),'Affinity PubMed ID'].values.tolist())
    print(f">>> 还有{N_notfound}个record的亲和力Release Date没找到。他们属于{len(tmp_pmids)}篇不同的paper,分别是：")
    print('\n'.join(tmp_pmids))

Fetching affinity release datas of 295 papers...: 100%|██████████| 295/295 [00:54<00:00,  5.46it/s]
Fill in affinity release dates...: 100%|██████████| 295/295 [00:00<00:00, 697.17it/s]
Fetching affinity release datas of 158 papers...: 100%|██████████| 158/158 [00:27<00:00,  5.70it/s]
Fill in affinity release dates...: 100%|██████████| 158/158 [00:00<00:00, 656.10it/s]
Fetching affinity release datas of 79 papers...: 100%|██████████| 79/79 [00:12<00:00,  6.11it/s]
Fill in affinity release dates...: 100%|██████████| 79/79 [00:00<00:00, 564.00it/s]
Fetching affinity release datas of 37 papers...: 100%|██████████| 37/37 [00:03<00:00,  9.82it/s]
Fill in affinity release dates...: 100%|██████████| 37/37 [00:00<00:00, 537.76it/s]
Fetching affinity release datas of 15 papers...: 100%|██████████| 15/15 [00:00<00:00, 5497.12it/s]
Fill in affinity release dates...: 100%|██████████| 15/15 [00:00<00:00, 488.60it/s]
Fetching affinity release datas of 9 papers...: 100%|██████████| 9/9 [00:00<00:00, 

>>> 还有338个record的亲和力Release Date没找到。他们属于2篇不同的paper,分别是：
Water-mediated interaction at a protein-protein interface, Chemical Physics, Volume 307, Issues 2-3, 2004
Stephen Ming-teh Lu, PhD Thesis, Purdue University, 2000





# save

In [16]:
df.reset_index(drop=True, inplace=True)
df = df.loc[:,[
    'PDB', 'Source Data Set', 'Model', 'Mutation(s)_cleaned', 'Mutation(s)_PDB', 'Ligand Chains', 'Receptor Chains', 'Ligand Name', 'Receptor Name', 
    'KD(M)', 'Affinity Method', 'Structure Method', 'Temperature(K)', 'Resolution(Å)', 
    'PDB PubMed ID', 'PDB Release Date', 'Affinity PubMed ID', 'Affinity Release Date', 
]]
display(df)
df.to_excel(os.path.join(save_dir,'SKEMPI v2.0.xlsx'))

Unnamed: 0,PDB,Source Data Set,Model,Mutation(s)_cleaned,Mutation(s)_PDB,Ligand Chains,Receptor Chains,Ligand Name,Receptor Name,KD(M),Affinity Method,Structure Method,Temperature(K),Resolution(Å),PDB PubMed ID,PDB Release Date,Affinity PubMed ID,Affinity Release Date
0,1A22,SKEMPI v2.0,,,,A,B,Human growth hormone,hGH binding protein,0.0,SPR,X-RAY DIFFRACTION,298,1.20,3301348,1988-07-16,7504735,1993 Dec 5
1,1A4Y,SKEMPI v2.0,,,,A,B,Ribonuclease inhibitor,Angiogenin,0.0,SFPF,X-RAY DIFFRACTION,298,1.20,3301348,1988-07-16,9050852,1997 Mar 4
2,1ACB,SKEMPI v2.0,,,,E,I,Bovine alpha-chymotrypsin,Eglin c,0.0,IASP,X-RAY DIFFRACTION,294,1.20,3301348,1988-07-16,9048543,1997 Feb 18
3,1AHW,SKEMPI v2.0,,,,"A,B",C,Immunoglobulin fab 5G9,Tissue factor,0.0,IASP,X-RAY DIFFRACTION,298(assumed),1.20,3301348,1988-07-16,9480775,1998 Feb 6
4,1AK4,SKEMPI v2.0,,,,A,D,Cyclophilin A,HIV-1 capsid protein,0.000012,SPR,X-RAY DIFFRACTION,298(assumed),1.20,3301348,1988-07-16,9223641,1997 Jun 27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7371,3QIB,SKEMPI v2.0,,P_K8R,P_K9R,"A,B,P","C,D",I-Ek plus MCC peptide,2B4 TCR,0.00024,SPR,X-RAY DIFFRACTION,298,2.70,21490152,2011-04-27,12152083,2002 Aug 1
7372,3QIB,SKEMPI v2.0,,P_T11A,P_T12A,"A,B,P","C,D",I-Ek plus MCC peptide,2B4 TCR,0.0011,SPR,X-RAY DIFFRACTION,298,2.70,21490152,2011-04-27,12152083,2002 Aug 1
7373,3QIB,SKEMPI v2.0,,P_T11S,P_T12S,"A,B,P","C,D",I-Ek plus MCC peptide,2B4 TCR,0.000034,SPR,X-RAY DIFFRACTION,298,2.70,21490152,2011-04-27,12152083,2002 Aug 1
7374,3QIB,SKEMPI v2.0,,P_T11N,P_T12N,"A,B,P","C,D",I-Ek plus MCC peptide,2B4 TCR,0.000043,SPR,X-RAY DIFFRACTION,298,2.70,21490152,2011-04-27,12152083,2002 Aug 1


In [18]:
df['Affinity Method'].value_counts()

Affinity Method
SPR         2798
FL          1464
ITC          616
IASP         518
RA           371
SFFL         365
SP           349
ELISA        220
IAFL         190
KinExA       134
IARA         133
BI            72
CSPRIA        45
ELFA          26
ESMA          23
SE            14
SFPF          12
SPR,SFFL      10
IAGE           9
EMSA           7
Name: count, dtype: int64