In [1]:
import pandas as pd
from Bio.PDB import *
from Bio import SeqIO
import nglview as nv
pd.set_option('display.max_columns', 100)

# 1. Get metadata

## 1.1 Load file

In [2]:
metadata = pd.read_csv('sabdab-data/20221001_0807534_summary.tsv', sep='\t')
len(metadata.pdb.unique())

740

# 1.2 Filter out structure with Hapten Antigen

Haptens are small molecules and we are interested in protein antigens

In [3]:
metadata = metadata[metadata['antigen_type']!='Hapten']
len(metadata.pdb.unique())

672

# 2. Get the metadata of a single structure as an example

idx pdb
0   1mhh
100 4liq


In [4]:
pdb = metadata.pdb.unique()[0]
print(pdb)

1mhh


In [5]:

metadata[metadata['pdb']==pdb]

Unnamed: 0,pdb,Hchain,Lchain,model,antigen_chain,antigen_type,antigen_het_name,antigen_name,short_col,date,compound,organism,heavy_species,light_species,antigen_species,authors,resolution,method,r_free,r_factor,scfv,engineered,heavy_subclass,light_subclass,light_ctype,affinity,delta_g,affinity_method,temperature,pmid
0,1mhh,D,C,0,,,,,IMMUNE SYSTEM,08/20/02,Antibody-antigen complex,FINEGOLDIA MAGNA; MUS MUSCULUS,mus musculus,mus musculus,,"Graille, M., Stura, E.A.",2.1,X-RAY DIFFRACTION,0.247,0.197,False,False,IGHV9,IGKV8,Kappa,1e-09,-12.278197,Unknown,,TBD
1,1mhh,B,A,0,,,,,IMMUNE SYSTEM,08/20/02,Antibody-antigen complex,FINEGOLDIA MAGNA; MUS MUSCULUS,mus musculus,mus musculus,,"Graille, M., Stura, E.A.",2.1,X-RAY DIFFRACTION,0.247,0.197,False,False,IGHV9,IGKV8,Kappa,1e-09,-12.278197,Unknown,,TBD


## 2.1 Parse structure

In [6]:
pdb_parser = PDBParser()

In [7]:
structure = pdb_parser.get_structure(pdb, "sabdab-data/imgt/{}.pdb".format(pdb))

## 2.2 View structure

In [8]:
view = nv.show_biopython(structure)
view

NGLWidget()

## 2.3 Extract substructures from structure

In [58]:
for model in structure.get_models():
    print(model)
    for chain in model.get_chains():
        print(chain)
        ppb = CaPPBuilder()
        for pp in ppb.build_peptides(chain):
            print(pp)
            print(pp.get_sequence())

<Model id=0>
<Chain id=A>
<Polypeptide start=1 end=159>
MISLIAALAVDRVIGMENAMPWNLPADLAWFKRNTLNKPVIMGRHTWESIGRPLPGRKNIILSSQPGTDDRVTWVKSVDEAIAACGDVPEIMVIGGGRVYEQFLPKAQKLYLTHIDAEVEGDTHFPDYEPDDWESVFSEFHDADAQNSHSYCFEILERR
<Chain id=B>
<Polypeptide start=1 end=128>
QVQLQESGGGLVQAGGSLRLSCKASGIIFSVYKMTWYRQAPGKERELVALITTNNNTMTVDSVKGRFTISRDNVQNTVYLEMNNLKPEDTAVYYCNANRGLAGPAYWGQGTQVTVSS


In [20]:
print(chain)
seq = ''
for res in chain.get_residues():
    seq += res.get_resname()

print(seq)

<Chain id=C>
ASPILEVALMETSERGLNSERPROSERSERLEUALAVALSERALAGLYGLULYSVALTHRMETSERCYSLYSSERSERGLNSERLEULEUASNSERARGTHRARGLYSASNTYRLEUALATRPTYRGLNGLNLYSPROGLYGLNSERPROLYSVALLEUILETYRTRPALASERTHRARGGLUSERGLYVALPROASPARGPHETHRGLYARGGLYSERGLYTHRASPPHETHRLEUTHRILESERSERVALGLNALAGLUASPGLNALAVALTYRTYRCYSLYSGLNALATYRILEPROPROLEUTHRPHEGLYALAGLYTHRLYSLEUGLULEULYSARGALAASPALAALAPROTHRVALSERILEPHEPROPROSERSERGLUGLNLEUTHRSERGLYGLYALASERVALVALCYSPHELEUASNASNPHETYRPROLYSASPILEASNVALLYSTRPLYSILEASPGLYSERGLUARGGLNASNGLYVALLEUASNSERTRPTHRASPGLNASPSERLYSASPSERTHRTYRSERMETSERSERTHRLEUTHRLEUTHRLYSASPGLUTYRGLUARGHISASNSERTYRTHRCYSGLUALATHRHISLYSTHRSERTHRSERPROILEVALLYSSERPHEASNARGASNGLUAEAHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOH

In [12]:
len(seq)/3

411.0

In [13]:
len(pp.get_sequence())

219

# 2.4 Get the distance between 2 chains

In [14]:
Ag = model['F']#Antigen
H = model['C']#LChain

In [15]:
h_ppb = CaPPBuilder()
for h_pp in h_ppb.build_peptides(H):
    h_seq = h_pp.get_sequence()
    print(h_seq)
h_len = len(h_seq)
print(h_len)

DIVMSQSPSSLAVSAGEKVTMSCKSSQSLLNSRTRKNYLAWYQQKPGQSPKVLIYWASTRESGVPDRFTGRGSGTDFTLTISSVQAEDQAVYYCKQAYIPPLTFGAGTKLELKRADAAPTVSIFPPSSEQLTSGGASVVCFLNNFYPKDINVKWKIDGSERQNGVLNSWTDQDSKDSTYSMSSTLTLTKDEYERHNSYTCEATHKTSTSPIVKSFNRNE
219


In [16]:
ag_ppb = CaPPBuilder()
for ag_pp in ag_ppb.build_peptides(Ag):
    ag_seq = ag_pp.get_sequence()
    ag_start, ag_end = ag_pp.get_start_end()
    print(ag_seq)
ag_len = len(ag_seq)
print(ag_len, ag_start,ag_end)

EVTIKVNLIFADGKIQTAEFKGTFEEATAEAYRYAALLAKVNGEWTADLEDGGNHMNIKFAG
62 1820 1881


In [17]:
ag_pp

<Polypeptide start=1820 end=1881>

In [18]:
distances = []
h_ress = []
ag_ress = []
h_seqid = []
ag_seqid = []
ab = 1
comparison_num = 0
for h_res in H.get_residues():
    h_res_name = h_res.get_resname()
    if h_res_name != 'HOH' and h_res_name != 'AEA':
        h_atom = h_res['CA']
        ag = ag_start 
        for ag_res in Ag.get_residues():
            ag_res_name = ag_res.get_resname()
            print(ag_res_name, ag)
            if ag_res_name != 'HOH' and ag_res_name != 'AEA':
                ag_atom = ag_res['CA']
                dist = h_atom - ag_atom
                h_ress.append(h_res_name)
                ag_ress.append(ag_res_name)
                h_seqid.append(ab)
                ag_seqid.append(ag)
                ag += 1
                distances.append(dist) 
                comparison_num += 1
    ab += 1
            


GLU 1820
VAL 1821
THR 1822
ILE 1823
LYS 1824
VAL 1825
ASN 1826
LEU 1827
ILE 1828
PHE 1829
ALA 1830
ASP 1831
GLY 1832
LYS 1833
ILE 1834
GLN 1835
THR 1836
ALA 1837
GLU 1838
PHE 1839
LYS 1840
GLY 1841
THR 1842
PHE 1843
GLU 1844
GLU 1845
ALA 1846
THR 1847
ALA 1848
GLU 1849
ALA 1850
TYR 1851
ARG 1852
TYR 1853
ALA 1854
ALA 1855
LEU 1856
LEU 1857
ALA 1858
LYS 1859
VAL 1860
ASN 1861
GLY 1862
GLU 1863
TRP 1864
THR 1865
ALA 1866
ASP 1867
LEU 1868
GLU 1869
ASP 1870
GLY 1871
GLY 1872
ASN 1873
HIS 1874
MET 1875
ASN 1876
ILE 1877
LYS 1878
PHE 1879
ALA 1880
GLY 1881
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
GLU 1820
VAL 1821
THR 1822
ILE 1823
LYS 1824
VAL 1825
ASN 1826
LEU 1827
ILE 1828
PHE 1829
ALA 1830
ASP 1831
GLY 1832
LYS 1833
ILE 1834
GLN 1835
THR 1836
ALA 1837
GLU 1838
PHE 1839
LYS 1840
GLY 1841
THR 1842
P

In [19]:
compar_df = pd.DataFrame({'h_ress':h_ress,'h_seqid':h_seqid,'ag_ress':ag_ress,'ag_seqid':ag_seqid,'distances':distances})
compar_df['ag_ress_seqid'] = compar_df.ag_ress + '-' + compar_df.ag_seqid.astype(str)
compar_df.head()

Unnamed: 0,h_ress,h_seqid,ag_ress,ag_seqid,distances,ag_ress_seqid
0,ASP,1,GLU,1820,23.690613,GLU-1820
1,ASP,1,VAL,1821,26.037287,VAL-1821
2,ASP,1,THR,1822,26.619587,THR-1822
3,ASP,1,ILE,1823,27.920298,ILE-1823
4,ASP,1,LYS,1824,30.233057,LYS-1824


In [20]:
compar_df.sort_values(['distances']).head(20)

Unnamed: 0,h_ress,h_seqid,ag_ress,ag_seqid,distances,ag_ress_seqid
637,LEU,11,ALA,1837,4.334361,ALA-1837
759,VAL,13,GLN,1835,4.939712,GLN-1835
515,SER,9,PHE,1839,4.992039,PHE-1839
698,ALA,12,THR,1836,5.121083,THR-1836
576,SER,10,GLU,1838,5.144905,GLU-1838
514,SER,9,GLU,1838,5.380033,GLU-1838
453,PRO,8,PHE,1839,5.493803,PHE-1839
575,SER,10,ALA,1837,5.513206,ALA-1837
697,ALA,12,GLN,1835,5.648639,GLN-1835
636,LEU,11,THR,1836,5.776641,THR-1836


In [59]:
ress_paratope = compar_df.ab_ress_seqid.unique()
ress_paratope

array(['TYR-37', 'ASP-38', 'ILE-56', 'TRP-57', 'THR-58', 'ASP-59',
       'GLY-63', 'GLY-64', 'THR-65', 'GLU-27', 'ASP-28', 'VAL-29',
       'ASN-36', 'THR-37', 'TYR-38', 'ALA-56', 'ALA-57'], dtype=object)

In [61]:
sorted_compar_df = compar_df.sort_values(['distances'])
sorted_compar_df.head(30)

Unnamed: 0,ab_label,ab_res,ab_seqid,ag_label,ag_res,ag_seqid,distances,ab_ress_seqid,ag_ress_seqid
197,L,TYR,38,E,PRO,369,8.192389,TYR-38,PRO-369
167,L,ASN,36,E,SER,367,8.372197,ASN-36,SER-367
64,H,GLY,64,E,PRO,373,8.557732,GLY-64,PRO-373
25,H,TRP,57,E,LYS,372,8.582211,TRP-57,LYS-372
137,L,VAL,29,E,GLY,318,8.615442,VAL-29,GLY-318
149,L,VAL,29,E,PRO,369,8.699121,VAL-29,PRO-369
63,H,GLY,64,E,LYS,372,9.219422,GLY-64,LYS-372
169,L,ASN,36,E,PRO,369,9.399396,ASN-36,PRO-369
89,H,THR,65,E,LYS,372,9.516323,THR-65,LYS-372
98,H,THR,65,E,GLU,403,9.52795,THR-65,GLU-403


In [65]:
sorted_compar_df[sorted_compar_df.ab_ress_seqid.isin(ress_paratope)].sort_values(['ab_seqid','ag_seqid']).head(20)

Unnamed: 0,ab_label,ab_res,ab_seqid,ag_label,ag_res,ag_seqid,distances,ab_ress_seqid,ag_ress_seqid
112,L,GLU,27,E,GLU,317,12.789067,GLU-27,GLU-317
113,L,GLU,27,E,GLY,318,10.982326,GLU-27,GLY-318
114,L,GLU,27,E,LEU,319,11.618443,GLU-27,LEU-319
115,L,GLU,27,E,ASN,320,11.970379,GLU-27,ASN-320
116,L,GLU,27,E,SER,367,14.015538,GLU-27,SER-367
117,L,GLU,27,E,LEU,368,14.787553,GLU-27,LEU-368
118,L,GLU,27,E,PRO,369,13.178085,GLU-27,PRO-369
119,L,GLU,27,E,ARG,370,14.499504,GLU-27,ARG-370
120,L,ASP,28,E,GLU,317,12.968072,ASP-28,GLU-317
121,L,ASP,28,E,GLY,318,10.194002,ASP-28,GLY-318


In [66]:
view = nv.show_biopython(structure)
view

NGLWidget()

# 3. Example 2

In [4]:
pdb = metadata.pdb.unique()[100]
print(pdb)

1mhh


In [5]:
metadata[metadata['pdb']==pdb]

Unnamed: 0,pdb,Hchain,Lchain,model,antigen_chain,antigen_type,antigen_het_name,antigen_name,short_col,date,compound,organism,heavy_species,light_species,antigen_species,authors,resolution,method,r_free,r_factor,scfv,engineered,heavy_subclass,light_subclass,light_ctype,affinity,delta_g,affinity_method,temperature,pmid
0,1mhh,D,C,0,,,,,IMMUNE SYSTEM,08/20/02,Antibody-antigen complex,FINEGOLDIA MAGNA; MUS MUSCULUS,mus musculus,mus musculus,,"Graille, M., Stura, E.A.",2.1,X-RAY DIFFRACTION,0.247,0.197,False,False,IGHV9,IGKV8,Kappa,1e-09,-12.278197,Unknown,,TBD
1,1mhh,B,A,0,,,,,IMMUNE SYSTEM,08/20/02,Antibody-antigen complex,FINEGOLDIA MAGNA; MUS MUSCULUS,mus musculus,mus musculus,,"Graille, M., Stura, E.A.",2.1,X-RAY DIFFRACTION,0.247,0.197,False,False,IGHV9,IGKV8,Kappa,1e-09,-12.278197,Unknown,,TBD


In [6]:
H_label = metadata[metadata['pdb']==pdb].Hchain.values[0]
L_label = metadata[metadata['pdb']==pdb].Lchain.values[0]
Ag_label = metadata[metadata['pdb']==pdb].antigen_chain.values[0]
print(f'H lable: {H_label}\nL label: {L_label}\nAntigen label: {Ag_label}')

H lable: D
L label: C
Antigen label: nan


In [7]:
label_2_chain_dict = {H_label:'H_ch',L_label:'L_ch',Ag_label:'Ag_ch'}
label_2_chain_dict

{'D': 'H_ch', 'C': 'L_ch', nan: 'Ag_ch'}

## 3.0 Save fasta for tests with ANARCI

In [10]:
pdbfile = f'sabdab-data/original_pdb/{pdb}.pdb'
with open(pdbfile) as handle:
    for record in SeqIO.parse(handle, 'pdb-seqres'):
        print(record.id)
        chain_label = record.id.split(':')[-1]
        with open(f'sabdab-data/fasta/{pdb}_{chain_label}.fasta', 'w') as output_handle:
           SeqIO.write(record, output_handle, 'fasta')

1MHH:A
1MHH:B
1MHH:C
1MHH:D
1MHH:E
1MHH:F


## 3.1 Parse structure

In [11]:
pdb_parser = PDBParser()

In [12]:
structure = pdb_parser.get_structure(pdb, "sabdab-data/imgt/{}.pdb".format(pdb))

## 3.2 View structure

In [13]:
view = nv.show_biopython(structure)
view

NGLWidget()

## 3.3 Extract substructures from structure

In [16]:
pp_chains_dict = {H_label:[],L_label:[],Ag_label:[]}
pp_chains_dict

{'D': [], 'C': [], nan: []}

In [15]:
for model in structure.get_models():
    print(model)
    for chain in model.get_chains():
        ppb = CaPPBuilder()
        chain_id = chain.get_id()
        print(f'chain: {chain}, type: {label_2_chain_dict[chain_id]}')
        for pp in ppb.build_peptides(chain):
            start, end = pp.get_start_end()
            pp_chains_dict[chain_id].append((pp,start,end))

            print(f'sequence:{pp.get_sequence()}')
pp_chains_dict


<Model id=0>


KeyError: 'E'

## 3.4 Get the distance between 2 chains

The IMGT unique numbering provides a standardized delimitation of the framework regions (FR1-IMGT: positions 1 to 26, FR2-IMGT: 39 to 55, FR3-IMGT: 66 to 104 and FR4-IMGT: 118 to 128) and of the complementarity determining regions: CDR1-IMGT: 27 to 38, CDR2-IMGT: 56 to 65 and CDR3-IMGT: 105 to 117

In [14]:
Ab_labels_list = [H_label, L_label]
Ag_labels_list = [Ag_label]

In [15]:
def seqid_is_cdr(ab_res_seqid):
    '''
    Returns whether an antibody residue position is in a CDR region.

    Parameters
    ----------
    ab_res_pos : int
        IMGT sequence identifier (obtained from the get_id() function of the residue object.)

    Output
    ------
    is_cdr = bool
        True if the antibody residue sequence id is in a CDR region and False otherwise.

    '''
    cdr1_start = 27
    cdr1_end = 38
    cdr2_start = 56
    cdr2_end = 65
    cdr3_start = 105
    cdr3_end = 117
    if cdr1_start <= ab_res_seqid and ab_res_seqid <= cdr1_end:
        is_cdr = True
    elif cdr2_start <= ab_res_seqid and ab_res_seqid <= cdr2_end:
        is_cdr = True
    elif cdr2_start <= ab_res_seqid and ab_res_seqid <= cdr2_end:
        is_cdr = True
    else:
        is_cdr = False
    return is_cdr

In [46]:
distances = []
ab_ress = []
ag_ress = []
ab_seqids = []
ag_seqids = []
ab_labels = []
ag_labels = []
comparison_num = 0
largest_distance = 0
for ab_label in Ab_labels_list:
    Ab_ch = model[ab_label]
    for ab_res in Ab_ch.get_residues():
        ab_res_name = ab_res.get_resname()
        ab_res_het_tag, ab_seqid, ab_insertion = ab_res.get_id()
        if ab_res_het_tag == ' ' or ab_res_het_tag == '':
            ab_atom = ab_res['CA']
            for ag_label in Ag_labels_list:
                Ag_ch = model[ag_label]
                for ag_res in Ag_ch.get_residues():
                    ag_res_name = ag_res.get_resname()
                    ag_res_het_tag, ag_seqid, ag_insertion = ag_res.get_id()
                    if ag_res_het_tag == ' ' or ag_res_het_tag == '':
                        ag_atom = ag_res['CA']
                        dist = ab_atom - ag_atom
                        if dist > largest_distance:
                            largest_distance = dist
                        is_cdr = seqid_is_cdr(ab_seqid)
                        if is_cdr and (ab_label in Ab_labels_list) and (dist<=15):
                            #print(ab_label,ab_res_name, ab_res.get_id(), is_cdr)
                            ab_ress.append(ab_res_name)
                            ag_ress.append(ag_res_name)
                            ab_seqids.append(ab_seqid)
                            ag_seqids.append(ag_seqid)
                            ab_labels.append(ab_label)
                            ag_labels.append(ag_label)
                            distances.append(dist) 
                            comparison_num += 1
                            if comparison_num % 100 == 0:
                                print(f'{comparison_num} comparisons made so far.')

print(f'{comparison_num} total comparisons')
print('Finished...')



100 comparisons made so far.
200 comparisons made so far.
207 total comparisons
Finished...


In [23]:
compar_df = pd.DataFrame({'ab_label':ab_labels,'ab_res':ab_ress,'ab_seqid':ab_seqids,'ag_label':ag_labels,'ag_res':ag_ress,'ag_seqid':ag_seqids,'distances':distances})

In [24]:
compar_df['ab_ress_seqid'] = compar_df.ab_res + '-' + compar_df.ab_seqid.astype(str)
compar_df['ag_ress_seqid'] = compar_df.ag_res + '-' + compar_df.ag_seqid.astype(str)
compar_df.head()

Unnamed: 0,ab_label,ab_res,ab_seqid,ag_label,ag_res,ag_seqid,distances,ab_ress_seqid,ag_ress_seqid
0,H,TYR,37,E,ARG,370,14.871644,TYR-37,ARG-370
1,H,TYR,37,E,LYS,372,13.327469,TYR-37,LYS-372
2,H,ASP,38,E,GLY,316,13.063834,ASP-38,GLY-316
3,H,ASP,38,E,PRO,369,14.989226,ASP-38,PRO-369
4,H,ASP,38,E,ARG,370,12.575559,ASP-38,ARG-370


In [26]:
compar_df.sort_values(['ab_seqid','distances','ag_seqid']).head(50)

Unnamed: 0,ab_label,ab_res,ab_seqid,ag_label,ag_res,ag_seqid,distances,ab_ress_seqid,ag_ress_seqid
113,L,GLU,27,E,GLY,318,10.982326,GLU-27,GLY-318
114,L,GLU,27,E,LEU,319,11.618443,GLU-27,LEU-319
115,L,GLU,27,E,ASN,320,11.970379,GLU-27,ASN-320
112,L,GLU,27,E,GLU,317,12.789067,GLU-27,GLU-317
118,L,GLU,27,E,PRO,369,13.178085,GLU-27,PRO-369
116,L,GLU,27,E,SER,367,14.015538,GLU-27,SER-367
119,L,GLU,27,E,ARG,370,14.499504,GLU-27,ARG-370
117,L,GLU,27,E,LEU,368,14.787553,GLU-27,LEU-368
123,L,ASP,28,E,ASN,320,9.839213,ASP-28,ASN-320
121,L,ASP,28,E,GLY,318,10.194002,ASP-28,GLY-318


Keep just the closest 6 Ag residues to each CDR residue

In [33]:
compar_df.groupby('ab_ress_seqid').apply(lambda p: p.nlargest(6, columns='distances')).reset_index(level=[0,1], drop=True).sort_values(['ab_seqid','distances','ag_seqid']).head(50)

Unnamed: 0,ab_label,ab_res,ab_seqid,ag_label,ag_res,ag_seqid,distances,ab_ress_seqid,ag_ress_seqid
36,L,GLU,27,E,ASN,320,11.970379,GLU-27,ASN-320
35,L,GLU,27,E,GLU,317,12.789067,GLU-27,GLU-317
34,L,GLU,27,E,PRO,369,13.178085,GLU-27,PRO-369
33,L,GLU,27,E,SER,367,14.015538,GLU-27,SER-367
32,L,GLU,27,E,ARG,370,14.499504,GLU-27,ARG-370
31,L,GLU,27,E,LEU,368,14.787553,GLU-27,LEU-368
18,L,ASP,28,E,LYS,322,13.595772,ASP-28,LYS-322
17,L,ASP,28,E,ARG,370,13.623747,ASP-28,ARG-370
16,L,ASP,28,E,LEU,366,13.649034,ASP-28,LEU-366
15,L,ASP,28,E,ALA,352,13.946301,ASP-28,ALA-352


In [27]:
view = nv.show_biopython(structure)

view

NGLWidget()