In [1]:
import os
import re
import json
import pandas as pd
from Bio.PDB import *
from Bio import SeqIO
import nglview as nv
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from biofunctions.biofunctions import *
pd.set_option('display.max_columns', 100)

**The objective of this notebook is to obtain the Ab-Ag residues that are part of the paratope-epitope interaction.**

**This interaction is defined by two parameters:**
1. The distance from the **CA** of the Ab and Ag residues is less that **max_dist**
2. The Number of Ag residues that are considered to be in contact to a given Ab residue in a CDR is defined by **n_closest_ress**.



# 1. Load metadata and amino acids dictionary

## 1.1 Load summary file with the metadata

In [2]:
metadata = pd.read_csv('sabdab-data/20221001_0807534_summary.tsv', sep='\t')
len(metadata.pdb.unique())

740

# 1.2 Filter out non-protein antigens

After inspecting the metadata table, this is the list of antigen types:
* nan
* 'peptide'
* 'Hapten'
* 'protein'
* 'protein | protein'
* 'carbohydrate | protein'
* 'nucleic-acid'
* 'carbohydrate'
* 'carbohydrate | protein | protein'
* 'protein | peptide'
* 'nucleic-acid | nucleic-acid | nucleic-acid'
* 'peptide | peptide | peptide'

Therefore we only keep the ones that have a peptide or a protein

In [3]:
metadata.antigen_type.unique()

array([nan, 'peptide', 'Hapten', 'protein', 'protein | protein',
       'carbohydrate | protein', 'nucleic-acid', 'carbohydrate',
       'carbohydrate | protein | protein', 'protein | peptide',
       'nucleic-acid | nucleic-acid | nucleic-acid',
       'peptide | peptide | peptide'], dtype=object)

In [4]:
filter_out = ['Hapten',
              'nucleid-acid',
              'carbohydrate',
              'nucleic-acid | nucleic-acid | nucleic-acid']
metadata = metadata[~metadata['antigen_type'].isin(filter_out)]
len(metadata.pdb.unique())

646

## 1.3 Load amino acids dictionary

In [5]:
with open('amino_acids.json','r') as f:
    amino_acids_dict = json.load(f)

# 2. Get Ab-Ag residue distances
From every PDB calculate the distance between the alpha carbon (CA) of the Ab residues in CDR regions and the CA of every Ag residue, but keep only those under 15 Angstroms (Å). 15Å was chosen because, based on previous data exploration. It is believed that paratope-epitope interactions occur within this distance.



## 2.1 Get PDB IDs

In [6]:
pdb_ids = metadata.pdb.unique()
len(pdb_ids)

646

## 2.2 Calculate distances and keep distances under 15Å

Some terms:
* **n_closest_ag_ress:** How many Ag residues are considered close to any given CDR Ab residue
* **seqid:** The sequence id of the Ab and AG residues given by the IMGT numbering system.
* **pdb_dict:** Dictionary with the informacion of the closest Ab-Ag residues. I has de following information
    * pdbs: a dictionary key
        * pdb: a pdb id
            * interactions_dict: one of the outputs of get_closest_ress() function
            

In [7]:
n_closest_ag_ress = 6
max_dist = 15
count = 1
pdb_dict = {'pdbs':{}}

#PDBs that can't be processed because an error occurs. Will be added later to the analysis
pending_pdbs = {'pdb':[],'error':[]} 
lambda_func = lambda df: df.nlargest(n_closest_ag_ress, columns='distance')
for pdb in pdb_ids:
    print(f'{count}.- Analyzing pdb {pdb}')
    
    if pdb not in pdb_dict['pdbs']:
        pdb_dict['pdbs'][pdb] = {}
        
    # Parse pdb
    pdb_parser = PDBParser()
    structure = pdb_parser.get_structure(pdb, f'sabdab-data/imgt/{pdb}.pdb')

    # Get chains
    chain_label_list = []
    for model in structure.get_models():
        for chain in model.get_chains():
            chain_label_list.append(chain.get_id())

    # Find all labels given to the Heavy and Light chains in the PDB
    H_labels = list(metadata[(metadata['pdb']==pdb)&(~metadata.Hchain.isna())].Hchain.values)
    L_labels = list(metadata[(metadata['pdb']==pdb)&(~metadata.Lchain.isna())].Lchain.values)

    Ab_labels = H_labels + L_labels
    
    # Find all labels given to the Antigen in the PDB
    if not metadata[(metadata['pdb']==pdb)&(~metadata.antigen_chain.isnull())].empty:
        Ag_labels = list(metadata[metadata['pdb']==pdb].antigen_chain.values)

        if len(Ag_labels) == 1:
            Ag_labels = re.findall('[A-Z]',Ag_labels[0])

    else:
        Ag_labels = []

        for label in chain_label_list:

            if label not in H_labels and label not in L_labels:
                Ag_labels.append(label)
         
    try:
        closest_ress_dict, interactions_dict = get_closest_ress(
                model, Ab_labels, Ag_labels, 
                max_dist, amino_acids_dict)
        
        pdb_dict['pdbs'][pdb] = interactions_dict

        temp_df = pd.DataFrame(closest_ress_dict)

        if temp_df.empty:
            print(f'Empty temp_df with pdb {pdb}')

        else:
            temp_df['ab_ress_seqid'] = temp_df.ab_res + '-' + temp_df.ab_seqid.astype(str)
            temp_df['ag_ress_seqid'] = temp_df.ag_res + '-' + temp_df.ag_seqid.astype(str)
            closest_ress_df = temp_df.groupby('ab_ress_seqid').apply(lambda_func).reset_index(level=[0,1], 
                    drop=True)
            closest_ress_df.sort_values(['ab_ress_seqid','distance','ag_ress_seqid'],inplace=True)
            closest_ress_df['pdb'] = pdb
            
            if not os.path.exists('closest_residues.csv'):
                closest_ress_df.to_csv('closest_residues.csv',index=False)
            else:
                closest_ress_df.to_csv('closest_residues.csv',mode='a',index=False,header=False)
    
    except Exception as e:
        del pdb_dict['pdbs'][pdb]
        pending_pdbs['pdb'].append(pdb)
        pending_pdbs['error'].append(e)
        print(e)

    print('\n')
    if count%50 == 0:
        print(f'Analyzed {count} PDBs so far.')
    count += 1

with open('pdb_dict.json','w') as f:
    json.dump(pdb_dict,f)
    
pending_pdbs_df = pd.DataFrame(pending_pdbs)
pending_pdbs_df.to_csv('pending_pdbs.csv')

all_closest_ress_df = pd.read_csv('closest_residues.csv')
all_closest_ress_df = all_closest_ress_df.sort_values(['pdb','ab_label','ab_seqid','ag_label','ag_seqid'])
all_closest_ress_df.to_csv('closest_residues.csv',index=False)

1.- Analyzing pdb 1mhh
0 total comparisons
Finished processing pdb.
Empty temp_df with pdb 1mhh


2.- Analyzing pdb 1hh9
100 comparisons made so far.
200 comparisons made so far.
268 total comparisons
Finished processing pdb.


3.- Analyzing pdb 2r56
100 comparisons made so far.
200 comparisons made so far.
300 comparisons made so far.
400 comparisons made so far.
500 comparisons made so far.
600 comparisons made so far.
700 comparisons made so far.
800 comparisons made so far.
900 comparisons made so far.
1000 comparisons made so far.
1090 total comparisons
Finished processing pdb.


4.- Analyzing pdb 5wux
100 comparisons made so far.
200 comparisons made so far.
300 comparisons made so far.
400 comparisons made so far.
500 comparisons made so far.
600 comparisons made so far.
700 comparisons made so far.
800 comparisons made so far.
900 comparisons made so far.
1000 comparisons made so far.
1100 comparisons made so far.
1200 comparisons made so far.
1300 comparisons made so far.
1305

These are the PDBs that were not processed due to errors.

In [8]:
pending_pdbs_df.error.unique()

array([KeyError(' DT'), KeyError('CA'), KeyError('CA'), KeyError('CA'),
       KeyError('CA'), KeyError('  G'), KeyError('CA'), KeyError('CA'),
       KeyError('D | C'), KeyError('  G'), KeyError('B | A'),
       KeyError('CA'), KeyError(' DT'), KeyError('CA'), KeyError('CA'),
       KeyError('CA'), KeyError('CA'),
       IndexError('Too many levels: Index has only 1 level, not 2'),
       KeyError(nan), KeyError('CA'), KeyError('CA'), KeyError('CA'),
       KeyError('CA'), KeyError('CA'), KeyError('A | C'), KeyError('  G'),
       KeyError('CA'), KeyError('D | C'), KeyError('CA'), KeyError('CA'),
       KeyError('B | A'), KeyError('CA'), KeyError('  G'), KeyError(nan),
       KeyError(nan), KeyError('Y | X'), KeyError('CA'), KeyError('CA'),
       KeyError('CA'), KeyError('CA'), KeyError('CA'), KeyError('CA'),
       KeyError('D | F'), KeyError('UNK'), KeyError('UNK'),
       KeyError('UNK'), KeyError(nan), KeyError(nan), KeyError(' DC'),
       KeyError('  G'), KeyError('  G'), KeyEr

# 3. Check closest residues

In [9]:
closest_ress_df = pd.read_csv('closest_residues.csv')

In [10]:
len(closest_ress_df)

120472

In [11]:
closest_ress_df.head(20)

Unnamed: 0,ab_label,ab_res,ab_seqid,ab_letter,ab_letter_seqid,cdr,ab_list_idx,ag_label,ag_res,ag_seqid,ag_letter,ag_letter_seqid,ag_list_idx,distance,ab_ress_seqid,ag_ress_seqid,pdb
0,H,TYR,28,Y,Y-28,CDR1,26,W,GLN,79,Q,Q-79,65,14.298218,TYR-28,GLN-79,1bj1
1,H,TYR,28,Y,Y-28,CDR1,26,W,MET,81,M,M-81,67,14.552553,TYR-28,MET-81,1bj1
2,H,TYR,28,Y,Y-28,CDR1,26,W,GLN,89,Q,Q-89,75,14.830989,TYR-28,GLN-89,1bj1
3,H,THR,29,T,T-29,CDR1,27,V,TYR,21,Y,Y-21,7,14.333929,THR-29,TYR-21,1bj1
4,H,THR,29,T,T-29,CDR1,27,W,LYS,48,K,K-48,34,14.978921,THR-29,LYS-48,1bj1
5,H,THR,29,T,T-29,CDR1,27,W,ILE,80,I,I-80,66,13.96092,THR-29,ILE-80,1bj1
6,H,PHE,30,F,F-30,CDR1,28,V,PHE,17,F,F-17,3,13.223936,PHE-30,PHE-17,1bj1
7,H,PHE,30,F,F-30,CDR1,28,W,GLY,88,G,G-88,74,14.412254,PHE-30,GLY-88,1bj1
8,H,PHE,30,F,F-30,CDR1,28,W,GLY,92,G,G-92,78,14.345451,PHE-30,GLY-92,1bj1
9,H,THR,35,T,T-35,CDR1,29,V,LYS,16,K,K-16,2,14.343775,THR-35,LYS-16,1bj1


In [12]:
closest_ress_df.groupby(['pdb','ab_ress_seqid']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,ab_label,ab_res,ab_seqid,ab_letter,ab_letter_seqid,cdr,ab_list_idx,ag_label,ag_res,ag_seqid,ag_letter,ag_letter_seqid,ag_list_idx,distance,ag_ress_seqid
pdb,ab_ress_seqid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1bj1,ALA-105,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
1bj1,ASN-36,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
1bj1,ASN-57,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
1bj1,ASP-116,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
1bj1,GLN-105,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6mnf,SER-59,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
6mnf,SER-62,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
6mnf,THR-58,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
6mnf,THR-63,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
