In [1]:
import os
import numpy as np
import pandas as pd
import tqdm

In [2]:
save_dir = './processed_data'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

In [3]:
root_dir = r"source_data"
filepath = os.path.join(root_dir, r'ATLAS.xlsx')
assert os.path.exists(filepath), f'{filepath} not exists!'
df = pd.read_excel(filepath,index_col=0,dtype={'PMID':str})

# PDB: 若有true_PDB，则取true_PDB赋值给PDB，否则取template_PDB赋值给PDB
inds = df[~df['true_PDB'].isna()].index
df.loc[inds,'PDB'] = df.loc[inds,'true_PDB']
inds =  df[df['true_PDB'].isna()&~df['template_PDB'].isna()].index
df.loc[inds,'PDB'] = df.loc[inds,'template_PDB']
## 取值template_PDB的，要留意template_PDB是否和MT序列一致，不一致的话要标注其中突变##
assert sum(df['PDB'].isna())==0

print(f"Original dataset has {len(df)} samples with {len(set(df['PDB'].tolist()))} unique PDBs")

Original dataset has 694 samples with 87 unique PDBs


In [4]:
df['Kd_microM'] = df['Kd_microM'].apply(lambda x: x if x!='n.d.' else np.nan)
df['KD(M)'] = df['Kd_microM'].apply(lambda x: float(str(x).split('>')[-1].split('+/-')[0].strip())*1E-6)
df = df[~df['KD(M)'].isna()]

print(f"The dataset has {len(df)} samples with known affinity, among which, there are {len(set(df['PDB'].tolist()))} unique PDBs")

The dataset has 637 samples with known affinity, among which, there are 84 unique PDBs


In [5]:
df['Source Data Set'] = 'ATLAS'
df['Model'] = np.nan
df['Affinity Release Date'] = np.nan

In [6]:
df.rename( columns={
    'Exp. Method': 'Affinity Method',
    'PMID': 'Affinity PubMed ID',
    'TCRname': 'Receptor Name',
    'Temperature_K': 'Temperature(K)',
    'Resolution': 'Resolution(Å)',
},inplace=True)

df['Affinity PubMed ID'] = df['Affinity PubMed ID'].astype(str)
df['Temperature(K)'] = df['Temperature(K)'].apply(lambda x: x if x!='\\N' else np.nan)
df['Resolution(Å)'] = df['Resolution(Å)'].apply(lambda x: x if x!='\\N' else np.nan)
df['Ligand Name'] = df['PEPseq']+'-'+df['MHCname']

# Add Mutattians

## TCR_mut

In [7]:
TCR_chain_convert = {'A':'D', 'B':'E'}
df['TCR_mut_chain'].fillna(value='', inplace=True)
df['TCR_mut_chain'] = df['TCR_mut_chain'].apply(lambda x: x if x not in ['\\N','N/A'] else '')
df['TCR_mut'] = df['TCR_mut'].apply(lambda x: '' if x=='WT' else x)
for ind in df.index:
    if df.loc[ind,'TCR_mut']=='':
        # assert df.loc[ind,'TCR_mut_chain']=='', f"{df.loc[ind,'PDB']}: TCR_mut_chain must be empty while TCR_mut is empty"
        pass
    else:
        mut_chains = [chain.strip() for chain in df.loc[ind,'TCR_mut_chain'].split('|')]
        muts = [mut.strip() for mut in df.loc[ind,'TCR_mut'].split('|')]
        df.loc[ind,'TCR_mut'] = ', '.join([f'{TCR_chain_convert[mut_chain]}_{mut}' for mut_chain, mut in zip(mut_chains,muts)])
assert len(df[df['TCR_mut'].apply(lambda x: '|' in x if isinstance(x,str) else False)].loc[:,['TCR_mut']]) == 0, "Error: TCR_mut with '|' is note allowed!"

## MHC_mut

In [8]:
MHC_chain_convert = {'A':'A', 'B':'B'}
df['MHC_mut_chain'].fillna(value='', inplace=True)
df['MHC_mut_chain'] = df['MHC_mut_chain'].apply(lambda x: x if x not in ['\\N','N/A'] else '')
df['MHC_mut'] = df['MHC_mut'].apply(lambda x: '' if x=='WT' else x)
for ind in df.index:
    if df.loc[ind,'MHC_mut_chain']=='':
        # assert df.loc[ind,'TCR_mut_chain']=='', f"{df.loc[ind,'PDB']}: TCR_mut_chain must be empty while TCR_mut is empty"
        pass
    else:
        mut_chains = [chain.strip() for chain in df.loc[ind,'MHC_mut_chain'].split('|')]
        muts = [mut.strip() for mut in df.loc[ind,'MHC_mut'].split('|')]
        df.loc[ind,'MHC_mut'] = ', '.join([f'{mut_chain}_{mut}' for mut_chain, mut in zip(mut_chains,muts)])
assert len(df[df['MHC_mut'].apply(lambda x: '|' in x if isinstance(x,str) else False)].loc[:,['MHC_mut']]) == 0, "Error: Mut_mut with '|' is note allowed!"

## PEP_mut

In [10]:
df['PEP_mut'] = df['PEP_mut'].apply( lambda x: ', '.join([x.strip('') for mut in x.split('|') if x!='WT']) )
# # 读取pdb文件，根据序列，找出其中的多肽链
df['PEPseq'] = df['PEPseq'].apply(lambda x: x.strip())

# ATLAS数据集中的多肽链默认是C
df['PEP_mut'] = df['PEP_mut'].apply(lambda x: ', '.join([f'C_{mut.strip()}' for mut in x.split(',') if mut!='']) )

## merge

In [None]:
for ind in df.index:
    muts = [df.loc[ind,'MHC_mut'], df.loc[ind,'PEP_mut'], df.loc[ind,'TCR_mut']] 
    muts = list(filter(lambda x : x !='', muts))
    df.loc[ind,'Mutations'] = ', '.join( muts ).strip(', ')
df[df['Mutations']==''].loc[:,'Mutations'] = np.nan

# Screening

In [10]:
# 精筛：筛除受体链、配体链不是氨基酸链的样本
df.reset_index(inplace=True, drop=True)

from utils import check_chains
assert len(set(df.index.values.tolist())) == len(df)
data_dir = r"processed_data/PDB/ATLAS"
keep_inds = []
for pdb in tqdm.tqdm(set(df.PDB.tolist()),'Iterate over all PDBs'):
    pdb_file = os.path.join( data_dir, f'{pdb.upper()}.pdb' )
    df_tmp = df[df['PDB']==pdb]
    for ind in df_tmp.index:
        ligand_chains = [chain.strip() for chain in df_tmp.loc[ind,'Ligand Chains'].split(',')]
        receptor_chains = [chain.strip() for chain in df_tmp.loc[ind,'Receptor Chains'].split(',')]
        result = check_chains(pdb_file, ligand_chains, receptor_chains)
        if all(result.values()):
            keep_inds.append(ind)
        else:
            print(f"Delete {pdb}, because: {result}")
drop_inds = [ind for ind in df.index.tolist() if ind not in keep_inds]
df_drop = df.loc[drop_inds]
pdbs_drop = set(df_drop['PDB'].values.tolist())
print(f'Number of dropped PDBs: {len(pdbs_drop)}, which is:\n{pdbs_drop}')
df = df.loc[keep_inds]
print('After the filtering, we get df:')
display(df)

Iterate over all PDBs: 100%|██████████| 84/84 [00:44<00:00,  1.88it/s]

Number of dropped PDBs: 0, which is:
set()
After the filtering, we get df:





Unnamed: 0,Receptor Name,MHCname,MHCname_PDB,MHC_mut,MHC_mut_chain,TCR_mut,Kd_microM,Kon_per_M_per_s,Koff_per_s,Kd_wt/Kd_mut,...,Coupling Method,Analyte,PDB,KD(mM),KD(M),Source Data Set,Model,Affinity Release Date,Ligand Name,Mutations
430,S13,HLA-DQA1*03:01 | HLA-DQB1*03:02,HLA-DQA1*03:01 | HLA-DQB1*03:02,,,,1.05+/-0.1,,,,...,indirect,S13 WT,4Z7U,1.05+/-0.1,1.050000e-06,ATLAS,,,EGSFQPSQE-HLA-DQA1*03:01 | HLA-DQB1*03:02,
459,JR5.1,HLA-DQA1*05:01 | HLA-DQB1*02:01,HLA-DQA1*05:01 | HLA-DQB1*02:01,,,,79.4+/-2.9,,,,...,indirect,JR5.1 WT,4OZF,79.4+/-2.9,7.940000e-05,ATLAS,,,PQPELPYPQ-HLA-DQA1*05:01 | HLA-DQB1*02:01,
162,A6,HLA-A*02:01,HLA-A*02:01,,,,48 +/- 4,\N,\N,2.92,...,indirect,A6 WT,3PWP,48 +/- 4,4.800000e-05,ATLAS,,,LLYGFVNYI-HLA-A*02:01,C_G2L
163,A6,HLA-A*02:01,HLA-A*02:01,,,,73 +/- 6,\N,\N,1.92,...,indirect,A6 WT,3PWP,73 +/- 6,7.300000e-05,ATLAS,,,LLYGFVNYV-HLA-A*02:01,"C_G2L, C_I9V"
164,A6,HLA-A*02:01,HLA-A*02:01,,,,140 +/- 14,\N,\N,1,...,indirect,A6 WT,3PWP,140 +/- 14,1.400000e-04,ATLAS,,,LGYGFVNYI-HLA-A*02:01,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
390,DMF5,HLA-A*02:01,HLA-A*02:01,A_E166A,A,,32.2,\N,\N,1.24,...,direct,ELAGIGILTV-HLA-A*02:01 E166A,4L3E,32.2,3.220000e-05,ATLAS,,,ELAGIGILTV-HLA-A*02:01,A_E166A
391,DMF5,HLA-A*02:01,HLA-A*02:01,A_Q155A,A,D_Y50A,77.1,\N,\N,0.52,...,direct,ELAGIGILTV-HLA-A*02:01 Q155A,4L3E,77.1,7.710000e-05,ATLAS,,,ELAGIGILTV-HLA-A*02:01,"A_Q155A, D_Y50A"
392,DMF5,HLA-A*02:01,HLA-A*02:01,A_E166A,A,D_N52A,26.1,\N,\N,1.53,...,direct,ELAGIGILTV-HLA-A*02:01 E166A,4L3E,26.1,2.610000e-05,ATLAS,,,ELAGIGILTV-HLA-A*02:01,"A_E166A, D_N52A"
523,DMF5,HLA-A*02:01,HLA-A*02:01,,,"D_D26Y, E_L98W",.043+/-.007,,,255.81,...,direct,ELAGIGILTV-HLA-A*02:01,4L3E,.043+/-.007,4.300000e-08,ATLAS,,,ELAGIGILTV-HLA-A*02:01,"D_D26Y, E_L98W"


# pdb info

In [11]:
from utils import get_pdb_info
pdbs = set(df['PDB'].values)

for pdb in tqdm.tqdm(pdbs, desc='Adding PDB informations'):
    results = get_pdb_info(pdb)
    if results['Resolution']!=None:
        df.loc[df[df.PDB==pdb].index,'Resolution(Å)'] = results['Resolution'].replace('Å','').strip()
    if results['Method'] != None:
        df.loc[df[df.PDB==pdb].index,'Structure Method'] = results['Method']
    if results['Release Date'] != None:
        df.loc[df[df.PDB==pdb].index,'PDB Release Date'] = results['Release Date']   
    # 这是PDB的PubMed ID, 不是亲和力的PubMed ID
    if results['PubMed ID'] != None:
        df.loc[df[df.PDB==pdb].index,'PDB PubMed ID'] = results['PubMed ID']

Adding PDB informations: 100%|██████████| 84/84 [02:14<00:00,  1.60s/it]


# affinity info

In [12]:
# 根据Affinity PubMed ID查询Affinity Release Date
from utils import get_pubdate
for pubmed_id in tqdm.tqdm(set(df['Affinity PubMed ID'].values), desc='Adding affinity informations'):
    try:
        pubdate = get_pubdate(str(pubmed_id), dst_datetime_format='%Y-%m-%d')
        df.loc[df[df['Affinity PubMed ID']==pubmed_id].index, 'Affinity Release Date'] = pubdate
    except:
        pass

Adding affinity informations: 100%|██████████| 67/67 [01:18<00:00,  1.17s/it]


# save

In [13]:
df.reset_index(drop=True, inplace=True)
df = df.loc[:,[
    'PDB', 'Source Data Set', 'Model', 'Mutations', 'Ligand Chains', 'Receptor Chains', 'Ligand Name', 'Receptor Name', 'KD(M)', 
    'Affinity Method', 'Structure Method', 'Temperature(K)', 'Resolution(Å)', 
    'PDB PubMed ID', 'PDB Release Date', 'Affinity PubMed ID', 'Affinity Release Date',
    
]]
display(df)
save_dir = './processed_data'
df.to_excel( os.path.join(save_dir,'ATLAS.xlsx') )

Unnamed: 0,PDB,Source Data Set,Model,Mutations,Ligand Chains,Receptor Chains,Ligand Name,Receptor Name,KD(M),Affinity Method,Structure Method,Temperature(K),Resolution(Å),PDB PubMed ID,PDB Release Date,Affinity PubMed ID,Affinity Release Date
0,4Z7U,ATLAS,,,"C,A,B","D,E",EGSFQPSQE-HLA-DQA1*03:01 | HLA-DQB1*03:02,S13,1.050000e-06,Biacore 3000,X-RAY DIFFRACTION,298.15,2.70,25948817,2015-06-03,23063329,2012-10-19
1,4OZF,ATLAS,,,"C,A,B","D,E",PQPELPYPQ-HLA-DQA1*05:01 | HLA-DQB1*02:01,JR5.1,7.940000e-05,Biacore 3000,X-RAY DIFFRACTION,298.15,2.70,24777060,2014-04-16,24777060,2014-05-01
2,3PWP,ATLAS,,C_G2L,"C,A","D,E",LLYGFVNYI-HLA-A*02:01,A6,4.800000e-05,Biacore 3000,X-RAY DIFFRACTION,298.15,2.69,21282516,2011-03-09,21282516,2011-03-01
3,3PWP,ATLAS,,"C_G2L, C_I9V","C,A","D,E",LLYGFVNYV-HLA-A*02:01,A6,7.300000e-05,Biacore 3000,X-RAY DIFFRACTION,298.15,2.69,21282516,2011-03-09,21282516,2011-03-01
4,3PWP,ATLAS,,,"C,A","D,E",LGYGFVNYI-HLA-A*02:01,A6,1.400000e-04,Biacore 3000,X-RAY DIFFRACTION,298.15,2.69,21282516,2011-03-09,21282516,2011-03-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
632,4L3E,ATLAS,,A_E166A,"C,A","D,E",ELAGIGILTV-HLA-A*02:01,DMF5,3.220000e-05,Biacore T100,X-RAY DIFFRACTION,298.15,2.56,24550723,2014-06-11,23736024,2013-01-01
633,4L3E,ATLAS,,"A_Q155A, D_Y50A","C,A","D,E",ELAGIGILTV-HLA-A*02:01,DMF5,7.710000e-05,Biacore T100,X-RAY DIFFRACTION,298.15,2.56,24550723,2014-06-11,23736024,2013-01-01
634,4L3E,ATLAS,,"A_E166A, D_N52A","C,A","D,E",ELAGIGILTV-HLA-A*02:01,DMF5,2.610000e-05,Biacore T100,X-RAY DIFFRACTION,298.15,2.56,24550723,2014-06-11,23736024,2013-01-01
635,4L3E,ATLAS,,"D_D26Y, E_L98W","C,A","D,E",ELAGIGILTV-HLA-A*02:01,DMF5,4.300000e-08,Biacore 3000,X-RAY DIFFRACTION,298.15,2.56,24550723,2014-06-11,22611242,2012-06-15


In [14]:
# df[ df['Mutations'].apply(lambda x: ', ,' in x if isinstance(x,str) else False) ]