In [53]:
import os
import pandas as pd
import json
from Bio.PDB import *
from Bio import SeqIO
import nglview as nv
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from biofunctions.pdb_handler import PDBHandler
pd.set_option('display.max_columns', 100)

The objective of this notebook is to build the dataset of epitope residue sequences, selected according to the distance between CDR residues and antigen residues.

# 1. Load distances as a dictionary

In [54]:
# Load distances.csv into a DataFrame; index_col=0 uses the first CSV column as the row index.
distances_df = pd.read_csv('distances.csv',index_col=0)

In [55]:
# Preview the first 10 rows to check which columns are available.
distances_df.head(10)

Unnamed: 0,ab_label,ab_res,ab_seqid,ag_label,ag_res,ag_seqid,distances,ab_ress_seqid,ag_ress_seqid,pdb,ab_letter,ag_letter,ab_letter_seqid,ag_letter_seqid
8248,H,TYR,28,W,GLN,79,14.298218,TYR-28,GLN-79,1bj1,Y,Q,Y-28,Q-79
8249,H,TYR,28,W,MET,81,14.552553,TYR-28,MET-81,1bj1,Y,M,Y-28,M-81
8252,H,TYR,28,W,GLN,89,14.830989,TYR-28,GLN-89,1bj1,Y,Q,Y-28,Q-89
8225,H,THR,29,V,TYR,21,14.333929,THR-29,TYR-21,1bj1,T,Y,T-29,Y-21
8227,H,THR,29,W,LYS,48,14.978921,THR-29,LYS-48,1bj1,T,K,T-29,K-48
8223,H,THR,29,W,ILE,80,13.96092,THR-29,ILE-80,1bj1,T,I,T-29,I-80
8212,H,PHE,30,V,PHE,17,13.223936,PHE-30,PHE-17,1bj1,F,F,F-30,F-17
8216,H,PHE,30,W,GLY,88,14.412254,PHE-30,GLY-88,1bj1,F,G,F-30,G-88
8215,H,PHE,30,W,GLY,92,14.345451,PHE-30,GLY-92,1bj1,F,G,F-30,G-92
8233,H,THR,35,V,LYS,16,14.343775,THR-35,LYS-16,1bj1,T,K,T-35,K-16


## 1.1 Read distances.csv line by line to create pdb_dict.json

In [63]:
# Build a nested dictionary of antibody/antigen contacts from distances.csv:
#
#   pdb_dict['pdbs'][pdb]['ab_labels'][ab_label]['ab_letters'] -> list of
#       {'ab_letter', 'ab_seqid', 'ab_letter_seqid', 'ag_contacts': [...]}
#
# Each entry of 'ag_contacts' holds 'ag_letter', 'ag_seqid',
# 'ag_letter_seqid' and 'ag_label' for one antigen residue.
# The file is streamed row by row instead of being loaded in one go.
pdb_dict = {'pdbs':{}}

# Number of data rows processed (the header row is not counted).
count = 0

# The context manager closes the file even if a malformed row raises mid-loop.
with open('distances.csv', 'r') as file:
    # Map every column name to its position so fields are looked up by name.
    header = file.readline().rstrip('\n').split(',')
    header_to_idx = {value:i for i,value in enumerate(header)}

    # Iterating over the file object yields one row at a time until EOF.
    for line in file:
        # rstrip (rather than slicing off the last character) keeps the final
        # field intact when the last row has no trailing newline.
        line = line.rstrip('\n')
        if not line:
            # Skip blank lines instead of failing on a missing column.
            continue
        # NOTE: a plain split assumes that no field contains a quoted comma.
        values = line.split(',')

        ab_label = values[header_to_idx['ab_label']]
        ab_res = values[header_to_idx['ab_res']]
        ab_seqid = values[header_to_idx['ab_seqid']]
        ab_letter = values[header_to_idx['ab_letter']]
        ag_label = values[header_to_idx['ag_label']]
        ag_res = values[header_to_idx['ag_res']]
        ag_seqid = values[header_to_idx['ag_seqid']]
        ag_letter = values[header_to_idx['ag_letter']]
        distances = values[header_to_idx['distances']]
        ab_ress_seqid = values[header_to_idx['ab_ress_seqid']]
        ag_ress_seqid = values[header_to_idx['ag_ress_seqid']]
        ab_letter_seqid = values[header_to_idx['ab_letter_seqid']]
        ag_letter_seqid = values[header_to_idx['ag_letter_seqid']]
        pdb = values[header_to_idx['pdb']]

        if pdb not in pdb_dict['pdbs']:
            pdb_dict['pdbs'][pdb] = {'ab_labels':{}}
        ab_labels = pdb_dict['pdbs'][pdb]['ab_labels']

        ag_letter_dict = {'ag_letter':ag_letter,
                          'ag_seqid':ag_seqid,
                          'ag_letter_seqid':ag_letter_seqid,
                          'ag_label':ag_label}
        ab_letter_dict = {'ab_letter':ab_letter,
                          'ab_seqid':ab_seqid,
                          'ab_letter_seqid':ab_letter_seqid,
                          'ag_contacts':[ag_letter_dict]}

        if ab_label not in ab_labels:
            # First row seen for this antibody chain.
            ab_labels[ab_label] = {'ab_letters':[]}
        ab_letters = ab_labels[ab_label]['ab_letters']

        # Rows for the same antibody residue are only merged when they are
        # adjacent in the file: the comparison is against the most recently
        # added residue of this chain, not against every earlier residue.
        if ab_letters and ab_letters[-1]['ab_letter_seqid'] == ab_letter_seqid:
            # Same antibody residue as the previous row: add one more contact.
            ab_letters[-1]['ag_contacts'].append(ag_letter_dict)
        else:
            # New antibody residue: start a new entry with its first contact.
            ab_letters.append(ab_letter_dict)

        count += 1

# NOTE(review): the section title says this cell creates pdb_dict.json, but
# nothing here writes that file, and the next section loads it from disk.
# Confirm where it is produced before relying on a fresh Restart & Run All.

# 2. Make contact matrix per pdb

In [64]:
# Reload the nested contact dictionary from its JSON file on disk.
with open('pdb_dict.json','r') as json_file:
    pdb_dict = json.load(json_file)

In [65]:
# Identifier of the entry examined in this section; used below to locate
# the structure file and to select this entry's contacts in pdb_dict.
pdb = '1hh9'

In [66]:
# Parse the structure file of the selected entry from the local
# sabdab-data/imgt directory.
# NOTE(review): PDBParser presumably comes from the `from Bio.PDB import *` star import — confirm.
pdb_parser = PDBParser()
structure = pdb_parser.get_structure(pdb, "sabdab-data/imgt/{}.pdb".format(pdb))

In [67]:
# Create an nglview widget for the parsed structure; the bare name on the
# last line makes the notebook render it as the cell output.
view = nv.show_biopython(structure)
view

NGLWidget()

In [68]:
# Hand this entry's contact dictionary to the project's PDBHandler.
# NOTE(review): PDBHandler lives in biofunctions.pdb_handler and is not
# shown here — confirm what structure it expects.
pdb_handler = PDBHandler(pdb_dict['pdbs'][pdb])

In [70]:
# Shortcut to the contact dictionary of the selected entry.
contacts_dict = pdb_dict['pdbs'][pdb]


In [None]:
def get_full_seq(res_list:list,chain_label:str,n=6)->str:
    '''
    Return the sequence covering a contact region plus ``n`` flanking residues.

    Placeholder: the extraction is not implemented yet, so the function
    currently ignores its arguments and always returns an empty string.

    Intended behavior: take the sorted sequence ids of consecutive contact
    residues, widen the window to ``min - n`` .. ``max + n`` (falling back
    to ``min`` / ``max`` when the widened bound would run past the first or
    last residue), load the PDB and return the complete sequence of that
    window.

    Parameters
    ----------
    res_list : list
        Sorted sequence ids of the residues in a contact region, either in
        the paratope or in the epitope.
    chain_label : str
        Label of the chain the residues belong to in the structure.
    n : int, default 6
        Number of residues to add before the first and after the last
        contact residue.

    Returns
    -------
    str
        The extracted sequence; always ``''`` until the body is implemented.
    '''
    # TODO: implement the window extraction described in the docstring.
    return ''

In [36]:
# Inspect the top-level keys of the selected entry's contact dictionary.
contacts_dict.keys()

dict_keys(['ab_labels'])

In [71]:
# List the antibody chain labels that have recorded contacts for this entry.
contacts_dict['ab_labels'].keys()

dict_keys(['A', 'B'])

In [72]:
# Antibody chain to inspect; must be one of the 'ab_labels' keys.
chain_label = 'A'
# NOTE(review): presumably the flanking-residue count meant for get_full_seq — confirm.
n=6

In [73]:
# Collect the sequence id of every antibody residue recorded for the
# selected chain, in the order the residues are stored.
chain_entry = contacts_dict['ab_labels'][chain_label]
seqids = [residue['ab_seqid'] for residue in chain_entry['ab_letters']]

In [74]:
# Display the collected sequence ids (they are stored as strings).
seqids

['28', '29', '36', '37', '38', '56', '57', '65']

In [None]:
from itertools import groupby
from operator import itemgetter

# Toy example: split a sorted list of integers into runs of consecutive values.
data = [1, 4, 5, 6, 10, 15, 16, 17, 18, 22, 25, 26, 27, 28]

# Inside a run of consecutive integers, (position - value) stays constant,
# so it works as the grouping key. Python 3 removed tuple-unpacking lambda
# parameters, hence the explicit indexing of each (position, value) pair.
consecutive_runs = [
    # map() is lazy in Python 3; list() materialises each run.
    list(map(itemgetter(1), group))
    for _, group in groupby(enumerate(data), lambda pair: pair[0] - pair[1])
]

for run in consecutive_runs:
    print(run)