In [1]:
import pandas as pd
import numpy as np
import tqdm

In [2]:
import os
# Directory that receives every spreadsheet written by this notebook.
save_dir = './processed_data'
# exist_ok=True creates the directory only when needed and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(save_dir, exist_ok=True)

In [3]:
# Load the SAbDab summary table.
# The .xlsx file is a cached conversion of the tab-separated summary; when the
# cache is missing it is rebuilt from the .tsv file with the same stem.
root_dir = r"./source_data"
filepath = os.path.join(root_dir, r"sabdab_summary_all-by20240725.xlsx")
# `filepath` already ends in .xlsx, so the TSV path has to be derived explicitly
# (replacing '.tsv' inside it would be a no-op).
tsv_path = os.path.splitext(filepath)[0] + '.tsv'
if os.path.exists(filepath):
    # index_col=0 restores the index column written by to_excel below.
    df = pd.read_excel(filepath, dtype={'pmid':str}, index_col=0)
else:
    assert os.path.exists(tsv_path), f'neither {filepath} nor {tsv_path} exists!'
    # pmid is read as text so the identifiers are not coerced to numbers.
    df = pd.read_csv(tsv_path, sep='\t', dtype={'pmid':str})
    df.to_excel(filepath)
display(df)

Unnamed: 0,pdb,Hchain,Lchain,model,antigen_chain,antigen_type,antigen_het_name,antigen_name,short_header,date,...,scfv,engineered,heavy_subclass,light_subclass,light_ctype,affinity,delta_g,affinity_method,temperature,pmid
0,8udz,C,D,0,B,protein,,transforming growth factor beta-1 proprotein,SIGNALING PROTEIN/IMMUNE SYSTEM,07/17/24,...,False,True,IGHV3,IGLV6,Lambda,,,,,
1,8udz,E,F,0,B | A,protein | protein,NA | NA,transforming growth factor beta-1 proprotein |...,SIGNALING PROTEIN/IMMUNE SYSTEM,07/17/24,...,False,True,IGHV3,IGLV6,Lambda,,,,,
2,8vyl,F,,0,D | C,protein | protein,NA | NA,hemoglobin subunit beta | hemoglobin subunit a...,OXYGEN TRANSPORT,07/17/24,...,False,True,IGHV1,,,,,,,
3,8vyl,E,,0,B | A,protein | protein,NA | NA,hemoglobin subunit beta | hemoglobin subunit a...,OXYGEN TRANSPORT,07/17/24,...,False,True,IGHV1,,,,,,,
4,8q5y,R,L,0,D,protein,,spike glycoprotein,VIRAL PROTEIN,07/17/24,...,False,True,IGHV3,IGKV1,Kappa,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17270,6ejm,H,h,0,B,protein,,cd81 antigen,CELL ADHESION,09/22/17,...,True,True,unknown,unknown,unknown,8.600000e-10,-12.367557,SPR,,TBD
17271,7lo6,J,I,0,C,protein,,envelope glycoprotein bg505 sosip.664 gp120,VIRAL PROTEIN/IMMUNE SYSTEM,02/09/21,...,False,True,IGHV1,IGKV3,Kappa,,,,,
17272,3vi3,H,L,0,D,protein,,integrin beta-1,CELL ADHESION/IMMUNE SYSTEM,09/21/11,...,False,True,IGHV1,IGKV2,Kappa,,,,,
17273,6zdg,F,G,0,D,protein,,spike glycoprotein,VIRAL PROTEIN,06/14/20,...,False,True,IGHV3,IGKV1,Kappa,,,,,


# Screening

## Affinity label

In [4]:
# Sanity check: no record may carry a delta_g value while its affinity is missing.
assert len(df[df['affinity'].isna() & df['delta_g'].notna()]) == 0

In [5]:
# Deletion phase 1: records without an affinity value are set aside,
# tagged with the reason they were removed.
df_delphase1 = df.loc[df['affinity'].isna()]
df_delphase1.insert(0, 'Reason for Deletion', 'no affinity label')

# Only records that do have an affinity value continue through the pipeline.
df = df.loc[df['affinity'].notna()]

## Protein Protein Complex

In [6]:
# Deletion phase 2 (initial screening): remove samples whose antigen is not
# made only of amino-acid chains, i.e. whose antigen_type is one of the
# combinations listed below.
non_protein_antigen_types = [
    'Hapten', 'carbohydrate', 'carbohydrate | protein', 'carbohydrate | protein | protein',
    'nucleic-acid', 'nucleic-acid | nucleic-acid', 'nucleic-acid | nucleic-acid | nucleic-acid', 'protein | nucleic-acid']
del_flags = df['antigen_type'].isin(non_protein_antigen_types)

# Take an explicit copy and insert the label directly: creating the column as
# float NaN and then overwriting it with a string through .loc forces a dtype
# upcast and writes into a filtered slice.
df_delphase2 = df[del_flags].copy()
df_delphase2.insert(loc=0, column='Reason for Deletion', value='not Protein-Protein-Complex')

df = df[~del_flags]

## no antigen chain

In [7]:
# Deletion phase 3: records that do not name any antigen chain.
df_delphase3 = df.loc[df['antigen_chain'].isna()]
df_delphase3.insert(0, 'Reason for Deletion', 'no antigen chain')

# Keep the records whose antigen chain is annotated.
df = df.loc[df['antigen_chain'].notna()]

In [8]:
# Deletion phase 4: a few samples have only an H chain (or only an L chain)
# and their antigen_chain is that very same chain; remove such samples.
# NOTE(review): the reason label below repeats 'no antigen chain' from the
# previous phase — confirm that is intended.
light_chain_is_antigen = df['Hchain'].isna() & (df['antigen_chain'] == df['Lchain'])
heavy_chain_is_antigen = df['Lchain'].isna() & (df['antigen_chain'] == df['Hchain'])
df_delphase4 = df[light_chain_is_antigen | heavy_chain_is_antigen]
df_delphase4.insert(0, 'Reason for Deletion', 'no antigen chain')

df = df.drop(index=df_delphase4.index)

## Manually delete records with chain annotation errors

In [9]:
# Deletion phase 5: manually remove erroneous samples, identified by PDB id.
del_pdbs = ['5j74', '1h8s', '1zea', '3gm0', '3gkz', '4gqp', '5j74', '6df1']
in_del_pdbs = df['pdb'].isin(del_pdbs)
df_delphase5 = df.loc[in_del_pdbs, :]
df_delphase5.insert(0, 'Reason for Deletion', 'chain annotation error')

df = df[~in_del_pdbs]

## Save deleted samples as an Excel file

In [10]:
# Stack the five deletion phases row-wise and write them, with their
# 'Reason for Deletion' column, to a single spreadsheet.
deleted_phases = [df_delphase1, df_delphase2, df_delphase3, df_delphase4, df_delphase5]
df_del = pd.concat(deleted_phases, axis=0)
deleted_path = os.path.join(save_dir, 'SAbDab(samples_deleted).xlsx')
df_del.to_excel(deleted_path)

# rename columns

In [11]:
# Map the SAbDab column names onto the output schema. rename() returns a new
# frame, so the column assignments below never write into a filtered slice of
# the table loaded earlier.
df = df.rename(columns={
    'pdb': 'PDB',
    'antigen_chain': 'Receptor Chains',
    'affinity': 'KD(M)',
    'affinity_method': 'Affinity Method',
    'temperature': 'Temperature(K)',
    'antigen_name': 'Receptor Name',
    'resolution': 'Resolution(Å)',
    'pmid': 'Affinity PubMed ID',
    'date': 'Affinity Release Date',
    'model': 'Model',
})

# Antigen chains arrive as "A | B"; store them as "A, B".
df['Receptor Chains'] = df['Receptor Chains'].apply(lambda x: ', '.join(x.split(' | ')))

# A missing heavy or light chain becomes '' so it can be skipped when joining.
heavy_chains = df['Hchain'].fillna('')
light_chains = df['Lchain'].fillna('')
# Every chain id must be empty or a single character. A subset test is used
# because a table in which all rows have (or all rows lack) a chain is valid.
assert set(heavy_chains.apply(len)) <= {0, 1}
assert set(light_chains.apply(len)) <= {0, 1}
# Ligand = the antibody chains that are present, heavy chain first.
df['Ligand Chains'] = [
    ', '.join(chain for chain in pair if chain != '')
    for pair in zip(heavy_chains, light_chains)
]

df = df.drop(columns=['Hchain', 'Lchain'])

# NOTE(review): astype(str) presumably renders a missing PubMed id as the
# literal text 'nan' — confirm that downstream consumers expect this.
df['Affinity PubMed ID'] = df['Affinity PubMed ID'].astype(str)
df['Mutations'] = np.nan
# str.upper raises on a non-string id instead of silently producing NaN.
df['PDB'] = df['PDB'].map(str.upper)
# Missing temperatures stay NaN; every other value is shifted by 273.15
# (i.e. the source value is treated as degrees Celsius).
df['Temperature(K)'] = df['Temperature(K)'].apply(
    lambda x: np.nan if pd.isna(x) or x == 'None' else 273.15 + float(x)
)
df['Ligand Name'] = 'antibody'
df['Source Data Set'] = 'SAbDab'
df

Unnamed: 0,PDB,Model,Receptor Chains,antigen_type,antigen_het_name,Receptor Name,short_header,Affinity Release Date,compound,organism,...,light_ctype,KD(M),delta_g,Affinity Method,Temperature(K),Affinity PubMed ID,Ligand Chains,Mutations,Ligand Name,Source Data Set
4880,6FE4,0,A,protein,,shiga-like toxin 2 subunit b,TOXIN,12/29/17,Crystal structure of the complex between Shiga...,ENTEROBACTERIA PHAGE 933W; VICUGNA PACOS,...,,9.600000e-09,-10.938139,SPR,,TBD,F,,antibody,SAbDab
4887,5W08,0,C,protein,,hemagglutinin ha1,VIRAL PROTEIN/IMMUNE SYSTEM,05/30/17,A/Texas/50/2012(H3N2) Influenza hemagglutinin ...,INFLUENZA A VIRUS (A/TEXAS/50/2012(H3N2)); HOM...,...,Lambda,4.000000e-08,-10.092596,SPR,,TBD,"K, L",,antibody,SAbDab
4901,2NY3,0,A,protein,,envelope glycoprotein gp120,VIRAL PROTEIN/IMMUNE SYSTEM,11/20/06,"HIV-1 gp120 Envelope Glycoprotein (K231C, T257...",HUMAN IMMUNODEFICIENCY VIRUS 1; HOMO SAPIENS,...,Kappa,8.150000e-07,,SPR,,17301785,"D, C",,antibody,SAbDab
4905,6H7O,0,B,protein,,beta-1 adrenergic receptor,ELECTRON TRANSPORT,07/31/18,ACTIVATED TURKEY BETA1 ADRENOCEPTOR WITH BOUND...,ESCHERICHIA COLI (STRAIN K12); MELEAGRIS GALLO...,...,,1.400000e-10,-13.443087,Unknown,,TBD,D,,antibody,SAbDab
4911,1NCA,0,N,protein,,influenza a subtype n9 neuraminidase,HYDROLASE(O-GLYCOSYL),01/21/92,REFINED CRYSTAL STRUCTURE OF THE INFLUENZA VIR...,INFLUENZA A VIRUS; MUS MUSCULUS,...,Kappa,8.300000e-09,-11.020000,Other,298.15,9692956,"H, L",,antibody,SAbDab
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17242,3IDX,0,G,protein,,hiv-1 hxbc2 gp120 core,IMMUNE SYSTEM,07/22/09,Crystal structure of HIV-gp120 core in complex...,HUMAN IMMUNODEFICIENCY VIRUS 1; HOMO SAPIENS,...,Kappa,4.040000e-09,-11.450945,SPR,,TBD,"H, L",,antibody,SAbDab
17253,4PS4,0,A,protein,,interleukin-13,IMMUNE SYSTEM,03/06/14,Crystal structure of the complex between IL-13...,HOMO SAPIENS,...,Kappa,5.700000e-12,-15.339731,SPR,,TBD,"H, L",,antibody,SAbDab
17257,5KVD,0,E,protein,,zika envelope diii,VIRAL PROTEIN/IMMUNE SYSTEM,07/14/16,"Zika specific antibody, ZV-2, bound to ZIKA en...",ZIKA VIRUS; MUS MUSCULUS,...,Kappa,2.660000e-07,-8.970066,SPR,,TBD,"H, L",,antibody,SAbDab
17258,2X89,0,F,protein,,beta-2-microglobulin,IMMUNE SYSTEM,03/07/10,Structure of the Beta2_microglobulin involved ...,HOMO SAPIENS; CAMELUS DROMEDARIUS,...,,4.400000e-08,-10.030000,SPR,,21220305,B,,antibody,SAbDab


# PDB info

In [12]:
from utils import get_pdb_info

# Look up structure metadata once per distinct PDB id and copy it onto every
# row of that entry. A field that comes back as None leaves the rows untouched.
pdbs = set(df['PDB'].values)

# Iterate in sorted order so the (slow) loop is reproducible between runs.
for pdb in tqdm.tqdm(sorted(pdbs)):
    results = get_pdb_info(pdb)
    inds = df[df.PDB==pdb].index
    if results['Resolution'] is not None:
        # Strip the unit so only the number text is stored.
        df.loc[inds,'Resolution(Å)'] = results['Resolution'].replace('Å','').strip()
    if results['Method'] is not None:
        df.loc[inds,'Structure Method'] = results['Method']
    if results['Release Date'] is not None:
        df.loc[inds,'PDB Release Date'] = results['Release Date']
    # This is the PubMed ID of the PDB entry, not of the affinity measurement.
    if results['PubMed ID'] is not None:
        df.loc[inds,'PDB PubMed ID'] = results['PubMed ID']

100%|██████████| 600/600 [17:01<00:00,  1.70s/it]


# affinity info

In [13]:
# Reformat the affinity release date from MM/DD/YY to YYYY-MM-DD.
from datetime import datetime
df['Affinity Release Date'] = df['Affinity Release Date'].map(
    lambda raw_date: datetime.strptime(raw_date, '%m/%d/%y').strftime('%Y-%m-%d')
)

# save

In [14]:
# Columns of the final table, in output order.
output_columns = [
    'PDB', 'Source Data Set', 'Model', 'Mutations', 'Ligand Chains', 'Receptor Chains', 'Ligand Name', 'Receptor Name',
    'KD(M)', 'Affinity Method', 'Structure Method', 'Temperature(K)', 'Resolution(Å)',
    'PDB PubMed ID', 'PDB Release Date', 'Affinity PubMed ID', 'Affinity Release Date',
]
# Renumber the rows from 0, keep only the output columns, then write and show the result.
df = df.reset_index(drop=True)[output_columns]
output_path = os.path.join(save_dir, 'SAbDab.xlsx')
df.to_excel(output_path, index=True)
display(df)

Unnamed: 0,PDB,Source Data Set,Model,Mutations,Ligand Chains,Receptor Chains,Ligand Name,Receptor Name,KD(M),Affinity Method,Structure Method,Temperature(K),Resolution(Å),PDB PubMed ID,PDB Release Date,Affinity PubMed ID,Affinity Release Date
0,6FE4,SAbDab,0,,F,A,antibody,shiga-like toxin 2 subunit b,9.600000e-09,SPR,X-RAY DIFFRACTION,,3.00,29494518,2018-03-07,TBD,2017-12-29
1,5W08,SAbDab,0,,"K, L",C,antibody,hemagglutinin ha1,4.000000e-08,SPR,X-RAY DIFFRACTION,,2.60,29343437,2018-02-14,TBD,2017-05-30
2,2NY3,SAbDab,0,,"D, C",A,antibody,envelope glycoprotein gp120,8.150000e-07,SPR,X-RAY DIFFRACTION,,2.00,17301785,2007-02-06,17301785,2006-11-20
3,6H7O,SAbDab,0,,D,B,antibody,beta-1 adrenergic receptor,1.400000e-10,Unknown,X-RAY DIFFRACTION,,2.80,31072904,2018-10-17,TBD,2018-07-31
4,1NCA,SAbDab,0,,"H, L",N,antibody,influenza a subtype n9 neuraminidase,8.300000e-09,Other,X-RAY DIFFRACTION,298.15,2.50,1381757,1994-01-31,9692956,1992-01-21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1064,3IDX,SAbDab,0,,"H, L",G,antibody,hiv-1 hxbc2 gp120 core,4.040000e-09,SPR,X-RAY DIFFRACTION,,2.50,19965434,2009-11-17,TBD,2009-07-22
1065,4PS4,SAbDab,0,,"H, L",A,antibody,interleukin-13,5.700000e-12,SPR,X-RAY DIFFRACTION,,2.80,20226193,2014-03-19,TBD,2014-03-06
1066,5KVD,SAbDab,0,,"H, L",E,antibody,zika envelope diii,2.660000e-07,SPR,X-RAY DIFFRACTION,,1.65,27475895,2016-08-03,TBD,2016-07-14
1067,2X89,SAbDab,0,,B,F,antibody,beta-2-microglobulin,4.400000e-08,SPR,X-RAY DIFFRACTION,,2.16,21220305,2011-01-19,21220305,2010-03-07
