In [1]:
import os
import re
import pandas as pd
from Bio.PDB import *
from Bio import SeqIO
import nglview as nv
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from biofunctions.biofunctions import *
pd.set_option('display.max_columns', 100)

The objective of this notebook is to obtain the dataset that contains the epitope residue sequences based on the distance betweem CDR residues and antigen residues.

# 1. Get Metadata

## 1.1 Load summary file

In [2]:
metadata = pd.read_csv('sabdab-data/20221001_0807534_summary.tsv', sep='\t')
len(metadata.pdb.unique())

740

# 1.2 Filter out non-protein antigens

After inspecting the metadata table, this is the list of antigen types:
* nan
* 'peptide'
* 'Hapten'
* 'protein'
* 'protein | protein'
* 'carbohydrate | protein'
* 'nucleic-acid'
* 'carbohydrate'
* 'carbohydrate | protein | protein'
* 'protein | peptide'
* 'nucleic-acid | nucleic-acid | nucleic-acid'
* 'peptide | peptide | peptide'

Therefore we only keep the ones that have a peptide or a protein

In [3]:
metadata.antigen_type.unique()

array([nan, 'peptide', 'Hapten', 'protein', 'protein | protein',
       'carbohydrate | protein', 'nucleic-acid', 'carbohydrate',
       'carbohydrate | protein | protein', 'protein | peptide',
       'nucleic-acid | nucleic-acid | nucleic-acid',
       'peptide | peptide | peptide'], dtype=object)

In [4]:
metadata = metadata[~metadata['antigen_type'].isin(['Hapten','nucleid-acid','carbohydrate','nucleic-acid | nucleic-acid | nucleic-acid'])]
len(metadata.pdb.unique())

646

# 2. Get Ab-Ag residue distances
1. From every PDB calculate the distance between the alpha carbon (CA) of the Ab residues in CDR regions and the CA of every Ag residue, but keep only those under 15 Angstroms (Å). 15Å was chosen because, based on previous data exploration, it is believed that paratope-epitope contacts occur within this distance.
2. From all the calculated distances under 15Å most of the Ab-Ag residue comparison will be outside of the paratope-epitope region; therfore, based on the distance distributions, we will narrow down Ab-Ag residues pairs to get the Ab-Ag residues of interest.


## 2.1 Get PDB IDs

In [5]:
pdb_ids = metadata.pdb.unique()
len(pdb_ids)

646

## 2.2 Calculate distances and keep distances under 15Å

In [6]:
# TODO: This code is a brute-force code. Make code more efficient.
count = 0
pending_pdbs = {'pdb':[],'error':[]} #PDBs that can't be processed because an error occurs
n_closest_ag_ress = 6
for pdb in pdb_ids:
    print(f'{count}.- Analyzing pdb {pdb}')
    # Parse pdb
    pdb_parser = PDBParser()
    structure = pdb_parser.get_structure(pdb, "sabdab-data/imgt/{}.pdb".format(pdb))

    # Get chains
    chain_label_list = []
    for model in structure.get_models():
        for chain in model.get_chains():
            chain_label_list.append(chain.get_id())

    # The label given to the H and L chains in the PDB
    H_labels = list(metadata[(metadata['pdb']==pdb)&(~metadata.Hchain.isna())].Hchain.values)
    L_labels = list(metadata[(metadata['pdb']==pdb)&(~metadata.Lchain.isna())].Lchain.values)

    Ab_labels = H_labels + L_labels

    if not metadata[(metadata['pdb']==pdb)&(~metadata.antigen_chain.isnull())].empty:
        Ag_labels = list(metadata[metadata['pdb']==pdb].antigen_chain.values)
        if len(Ag_labels) == 1:
            Ag_labels = re.findall('[A-Z]',Ag_labels[0])
    else:
        Ag_labels = []
        for label in chain_label_list:
            if label not in H_labels and label not in L_labels:
                Ag_labels.append(label)

    try:
        comparisons_dict = get_ab_ag_distances(model, Ab_labels, Ag_labels, 15)

        comparisons_df = pd.DataFrame(comparisons_dict)

        if comparisons_df.empty:
            print(f'Empty comparisons_df with pdb {pdb}')
        else:
            comparisons_df['ab_ress_seqid'] = comparisons_df.ab_res + '-' + comparisons_df.ab_seqid.astype(str)
            comparisons_df['ag_ress_seqid'] = comparisons_df.ag_res + '-' + comparisons_df.ag_seqid.astype(str)
            comparisons_df['pdb'] = pdb
            distances_df = comparisons_df.groupby('ab_ress_seqid').apply(lambda df: df.nlargest(n_closest_ag_ress, columns='distances')).reset_index(level=[0,1], drop=True)
            distances_df.sort_values(['ab_ress_seqid','distances','ag_ress_seqid'],inplace=True)
            if not os.path.exists('distances.csv'):
                distances_df.to_csv('distances.csv',index=False, )
            else:
                distances_df.to_csv('distances.csv',mode='a',index=False,header=False)
    
    except Exception as e:
        pending_pdbs['pdb'].append(pdb)
        pending_pdbs['error'].append(e)

    print('\n')
    if count%50 == 0:
        print(f'Analyzed {count} PDBs so far.')
    count += 1

pending_pdbs_df = pd.DataFrame(pending_pdbs)
pending_pdbs_df.to_csv('pending_pdbs.csv')
print('Finished...')

0.- Analyzing pdb 1mhh
0 total comparisons
Finished...
Empty comparisons_df with pdb 1mhh


Analyzed 0 PDBs so far.
1.- Analyzing pdb 1hh9
100 comparisons made so far.
167 total comparisons
Finished...


2.- Analyzing pdb 2r56
100 comparisons made so far.
200 comparisons made so far.
300 comparisons made so far.
400 comparisons made so far.
500 comparisons made so far.
594 total comparisons
Finished...


3.- Analyzing pdb 5wux
100 comparisons made so far.
200 comparisons made so far.
300 comparisons made so far.
400 comparisons made so far.
500 comparisons made so far.
600 comparisons made so far.
700 comparisons made so far.
800 comparisons made so far.
900 comparisons made so far.
944 total comparisons
Finished...


4.- Analyzing pdb 4xi5
100 comparisons made so far.
200 comparisons made so far.
270 total comparisons
Finished...


5.- Analyzing pdb 4k9e
100 comparisons made so far.
200 comparisons made so far.
300 comparisons made so far.
400 comparisons made so far.
406 total compar

These are the PDBs that were not processed due to errors.

In [7]:
pending_pdbs_df.error.unique()

array([KeyError('CA'), KeyError('CA'), KeyError('CA'), KeyError('CA'),
       KeyError('CA'), KeyError('CA'), KeyError('CA'), KeyError('CA'),
       KeyError('D | C'), KeyError('CA'), KeyError('B | A'),
       KeyError('CA'), KeyError('CA'), KeyError('CA'), KeyError('CA'),
       KeyError('CA'), KeyError('CA'),
       IndexError('Too many levels: Index has only 1 level, not 2'),
       KeyError(nan), KeyError('CA'), KeyError('CA'), KeyError('CA'),
       KeyError('CA'), KeyError('CA'), KeyError('A | C'), KeyError('CA'),
       KeyError('CA'), KeyError('D | C'), KeyError('CA'), KeyError('CA'),
       KeyError('B | A'), KeyError('CA'), KeyError('CA'), KeyError(nan),
       KeyError(nan), KeyError('Y | X'), KeyError('CA'), KeyError('CA'),
       KeyError('CA'), KeyError('CA'), KeyError('CA'), KeyError('CA'),
       KeyError('D | F'), KeyError(nan), KeyError(nan), KeyError('CA'),
       KeyError('CA'), KeyError('CA'), KeyError('E | G'),
       IndexError('Too many levels: Index has only 1 

# 3. Check distances

In [22]:
distances_df = pd.read_csv('distances.csv')

In [23]:
len(distances_df)

65402

In [24]:
distances_df.head(20)

Unnamed: 0,ab_label,ab_res,ab_seqid,ag_label,ag_res,ag_seqid,distances,ab_ress_seqid,ag_ress_seqid,pdb
0,A,ALA,57,C,GLU,5,11.268807,ALA-57,GLU-5,1hh9
1,A,ALA,57,C,ALA,2,12.260887,ALA-57,ALA-2,1hh9
2,A,ALA,57,C,LEU,7,12.740383,ALA-57,LEU-7,1hh9
3,A,ALA,57,C,THR,3,13.943336,ALA-57,THR-3,1hh9
4,A,ALA,57,C,PRO,4,14.627425,ALA-57,PRO-4,1hh9
5,A,ALA,57,C,ASP,1,14.756581,ALA-57,ASP-1,1hh9
6,A,ARG,56,C,LEU,7,9.369978,ARG-56,LEU-7,1hh9
7,A,ARG,56,C,THR,3,10.328367,ARG-56,THR-3,1hh9
8,A,ARG,56,C,PRO,4,10.817346,ARG-56,PRO-4,1hh9
9,A,ARG,56,C,ASP,1,12.066611,ARG-56,ASP-1,1hh9


In [25]:
distances_df.groupby(['pdb','ab_ress_seqid']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,ab_label,ab_res,ab_seqid,ag_label,ag_res,ag_seqid,distances,ag_ress_seqid
pdb,ab_ress_seqid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1bj1,ASN-36,6,6,6,6,6,6,6,6
1bj1,ASN-57,6,6,6,6,6,6,6,6
1bj1,GLU-64,6,6,6,6,6,6,6,6
1bj1,GLY-38,6,6,6,6,6,6,6,6
1bj1,GLY-63,6,6,6,6,6,6,6,6
...,...,...,...,...,...,...,...,...,...
6mnf,SER-59,6,6,6,6,6,6,6,6
6mnf,SER-62,6,6,6,6,6,6,6,6
6mnf,THR-58,4,4,4,4,4,4,4,4
6mnf,THR-63,5,5,5,5,5,5,5,5


# 4. Add amino acid letter code

## 4.1 Load amino_acids_df

In [26]:
amino_acids_df = pd.read_csv('amino_acids.csv',index_col=0)
amino_acids_df.head()

Unnamed: 0,amino acid,3 letter code,letter code
0,alanine,ala,A
1,arginine,arg,R
2,asparagine,asn,N
3,aspartic acid,asp,D
4,cysteine,cys,C


In [27]:
amino_acids_df['3 letter code'] = amino_acids_df['3 letter code'].str.upper()
amino_acids_df.head()

Unnamed: 0,amino acid,3 letter code,letter code
0,alanine,ALA,A
1,arginine,ARG,R
2,asparagine,ASN,N
3,aspartic acid,ASP,D
4,cysteine,CYS,C


## 4.2 Merge distances_df and amino_acids_df to get letter code for ab_res

In [28]:
distances_df = pd.merge(left=distances_df, right=amino_acids_df[['3 letter code','letter code']], 
left_on='ab_res',right_on='3 letter code', how='left')
distances_df.drop('3 letter code', axis=1, inplace=True)
distances_df.rename({'letter code':'ab_letter'},axis=1,inplace=True)

In [29]:
distances_df.head()

Unnamed: 0,ab_label,ab_res,ab_seqid,ag_label,ag_res,ag_seqid,distances,ab_ress_seqid,ag_ress_seqid,pdb,ab_letter
0,A,ALA,57,C,GLU,5,11.268807,ALA-57,GLU-5,1hh9,A
1,A,ALA,57,C,ALA,2,12.260887,ALA-57,ALA-2,1hh9,A
2,A,ALA,57,C,LEU,7,12.740383,ALA-57,LEU-7,1hh9,A
3,A,ALA,57,C,THR,3,13.943336,ALA-57,THR-3,1hh9,A
4,A,ALA,57,C,PRO,4,14.627425,ALA-57,PRO-4,1hh9,A


## 4.2 Merge distances_df and amino_acids_df to get letter code for ab_res

In [30]:
distances_df = pd.merge(left=distances_df, right=amino_acids_df[['3 letter code','letter code']], 
left_on='ag_res',right_on='3 letter code', how='left')
distances_df.drop('3 letter code', axis=1, inplace=True)
distances_df.rename({'letter code':'ag_letter'},axis=1,inplace=True)

In [31]:
distances_df.head()

Unnamed: 0,ab_label,ab_res,ab_seqid,ag_label,ag_res,ag_seqid,distances,ab_ress_seqid,ag_ress_seqid,pdb,ab_letter,ag_letter
0,A,ALA,57,C,GLU,5,11.268807,ALA-57,GLU-5,1hh9,A,E
1,A,ALA,57,C,ALA,2,12.260887,ALA-57,ALA-2,1hh9,A,A
2,A,ALA,57,C,LEU,7,12.740383,ALA-57,LEU-7,1hh9,A,L
3,A,ALA,57,C,THR,3,13.943336,ALA-57,THR-3,1hh9,A,T
4,A,ALA,57,C,PRO,4,14.627425,ALA-57,PRO-4,1hh9,A,P


## 4.3 Add ab_letter_seqid and ag_letter_seqid columns

In [32]:
distances_df['ab_letter_seqid'] = distances_df.ab_letter + '-' + distances_df.ab_seqid.astype(str)
distances_df['ag_letter_seqid'] = distances_df.ag_letter + '-' + distances_df.ag_seqid.astype(str)


In [33]:
distances_df.sort_values(['pdb','ab_label','ab_seqid','ag_label','ag_seqid'],inplace=True)

In [34]:
distances_df.head(20)

Unnamed: 0,ab_label,ab_res,ab_seqid,ag_label,ag_res,ag_seqid,distances,ab_ress_seqid,ag_ress_seqid,pdb,ab_letter,ag_letter,ab_letter_seqid,ag_letter_seqid
8248,H,TYR,28,W,GLN,79,14.298218,TYR-28,GLN-79,1bj1,Y,Q,Y-28,Q-79
8249,H,TYR,28,W,MET,81,14.552553,TYR-28,MET-81,1bj1,Y,M,Y-28,M-81
8252,H,TYR,28,W,GLN,89,14.830989,TYR-28,GLN-89,1bj1,Y,Q,Y-28,Q-89
8225,H,THR,29,V,TYR,21,14.333929,THR-29,TYR-21,1bj1,T,Y,T-29,Y-21
8227,H,THR,29,W,LYS,48,14.978921,THR-29,LYS-48,1bj1,T,K,T-29,K-48
8223,H,THR,29,W,ILE,80,13.96092,THR-29,ILE-80,1bj1,T,I,T-29,I-80
8212,H,PHE,30,V,PHE,17,13.223936,PHE-30,PHE-17,1bj1,F,F,F-30,F-17
8216,H,PHE,30,W,GLY,88,14.412254,PHE-30,GLY-88,1bj1,F,G,F-30,G-88
8215,H,PHE,30,W,GLY,92,14.345451,PHE-30,GLY-92,1bj1,F,G,F-30,G-92
8233,H,THR,35,V,LYS,16,14.343775,THR-35,LYS-16,1bj1,T,K,T-35,K-16


In [35]:
distances_df.to_csv('distances.csv')