In [1]:
import pandas as pd
from Bio.PDB import *
import nglview as nv

pd.set_option('display.max_columns', 100)

# 1. Get metadata

## 1.1 Load file

In [2]:
metadata = pd.read_csv('sabdab-data/20221001_0807534_summary.tsv', sep='\t')
len(metadata.pdb.unique())

740

# 1.2 Filter out structure with Hapten Antigen

In [3]:
metadata = metadata[metadata['antigen_type']!='Hapten']
len(metadata.pdb.unique())

672

# 2. Get the metadata of a single structure as an example

idx pdb
0   1mhh
100 4liq


In [4]:
pdb = metadata.pdb.unique()[0]
print(pdb)

1mhh


In [5]:

metadata[metadata['pdb']==pdb]

Unnamed: 0,pdb,Hchain,Lchain,model,antigen_chain,antigen_type,antigen_het_name,antigen_name,short_header,date,compound,organism,heavy_species,light_species,antigen_species,authors,resolution,method,r_free,r_factor,scfv,engineered,heavy_subclass,light_subclass,light_ctype,affinity,delta_g,affinity_method,temperature,pmid
0,1mhh,D,C,0,,,,,IMMUNE SYSTEM,08/20/02,Antibody-antigen complex,FINEGOLDIA MAGNA; MUS MUSCULUS,mus musculus,mus musculus,,"Graille, M., Stura, E.A.",2.1,X-RAY DIFFRACTION,0.247,0.197,False,False,IGHV9,IGKV8,Kappa,1e-09,-12.278197,Unknown,,TBD
1,1mhh,B,A,0,,,,,IMMUNE SYSTEM,08/20/02,Antibody-antigen complex,FINEGOLDIA MAGNA; MUS MUSCULUS,mus musculus,mus musculus,,"Graille, M., Stura, E.A.",2.1,X-RAY DIFFRACTION,0.247,0.197,False,False,IGHV9,IGKV8,Kappa,1e-09,-12.278197,Unknown,,TBD


## 2.1 Parse structure

In [6]:
pdb_parser = PDBParser()

In [7]:
structure = pdb_parser.get_structure(pdb, "sabdab-data/imgt/{}.pdb".format(pdb))

## 2.2 View structure

In [44]:
view = nv.show_biopython(structure)
view

NGLWidget()

## 2.3 Extract substructures from structure

In [9]:
for model in structure.get_models():
    print(model)
    for chain in model.get_chains():
        print(chain)
        ppb = CaPPBuilder()
        for pp in ppb.build_peptides(chain):
            print(pp)
            print(pp.get_sequence())

<Model id=0>
<Chain id=E>
<Polypeptide start=820 end=882>
EVTIKVNLIFADGKIQTAEFKGTFEEATAEAYRYAALLAKVNGEWTADLEDGGNHMNIKFAGK
<Chain id=F>
<Polypeptide start=1820 end=1881>
EVTIKVNLIFADGKIQTAEFKGTFEEATAEAYRYAALLAKVNGEWTADLEDGGNHMNIKFAG
<Chain id=B>
<Polypeptide start=1 end=146>
QIQLVQSGPELKKPGETVKISCKASGYTFTDFSMHWVNQAPGKGLNWMGWVNTETGEPTYADDFKGRFAFSLETSASTAYLQINSLKNEDTATYFCARFLLRQYFDVWGAGTTVTVSSAKTTPPSVYPLAPGSAAQ
<Polypeptide start=147 end=225>
SMVTLGCLVKGYFPEPVTVTWNSGSLSSGVHTFPAVLQSDLYTLSSSVTVPSSTWPSETVTCNVAHPASSTKVDKKIVP
<Chain id=A>
<Polypeptide start=1 end=233>
DIVMSQSPSSLAVSAGEKVTMSCKSSQSLLNSRTRKNYLAWYQQKPGQSPKVLIYWASTRESGVPDRFTGRGSGTDFTLTISSVQAEDQAVYYCKQAYIPPLTFGAGTKLELKRADAAPTVSIFPPSSEQLTSGGASVVCFLNNFYPKDINVKWKIDGSERQNGVLNSWTDQDSKDSTYSMSSTLTLTKDEYERHNSYTCEATHKTSTSPIVKSFNRNE
<Chain id=D>
<Polypeptide start=1 end=144>
QIQLVQSGPELKKPGETVKISCKASGYTFTDFSMHWVNQAPGKGLNWMGWVNTETGEPTYADDFKGRFAFSLETSASTAYLQINSLKNEDTATYFCARFLLRQYFDVWGAGTTVTVSSAKTTPPSVYPLAPGSA
<Polypeptide start=145 end=224>
NSM

In [10]:
print(chain)
seq = ''
for res in chain.get_residues():
    seq += res.get_resname()

print(seq)

<Chain id=C>
ASPILEVALMETSERGLNSERPROSERSERLEUALAVALSERALAGLYGLULYSVALTHRMETSERCYSLYSSERSERGLNSERLEULEUASNSERARGTHRARGLYSASNTYRLEUALATRPTYRGLNGLNLYSPROGLYGLNSERPROLYSVALLEUILETYRTRPALASERTHRARGGLUSERGLYVALPROASPARGPHETHRGLYARGGLYSERGLYTHRASPPHETHRLEUTHRILESERSERVALGLNALAGLUASPGLNALAVALTYRTYRCYSLYSGLNALATYRILEPROPROLEUTHRPHEGLYALAGLYTHRLYSLEUGLULEULYSARGALAASPALAALAPROTHRVALSERILEPHEPROPROSERSERGLUGLNLEUTHRSERGLYGLYALASERVALVALCYSPHELEUASNASNPHETYRPROLYSASPILEASNVALLYSTRPLYSILEASPGLYSERGLUARGGLNASNGLYVALLEUASNSERTRPTHRASPGLNASPSERLYSASPSERTHRTYRSERMETSERSERTHRLEUTHRLEUTHRLYSASPGLUTYRGLUARGHISASNSERTYRTHRCYSGLUALATHRHISLYSTHRSERTHRSERPROILEVALLYSSERPHEASNARGASNGLUAEAHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOH

In [11]:
len(seq)/3

411.0

In [12]:
len(pp.get_sequence())

219

# 2.4 Get the distance between 2 chains

In [13]:
Ag = model['F']#Antigen
Ab = model['C']#LChain

In [14]:
ab_ppb = CaPPBuilder()
for ab_pp in ab_ppb.build_peptides(Ab):
    ab_seq = ab_pp.get_sequence()
    print(ab_seq)
ab_len = len(ab_seq)
print(ab_len)

DIVMSQSPSSLAVSAGEKVTMSCKSSQSLLNSRTRKNYLAWYQQKPGQSPKVLIYWASTRESGVPDRFTGRGSGTDFTLTISSVQAEDQAVYYCKQAYIPPLTFGAGTKLELKRADAAPTVSIFPPSSEQLTSGGASVVCFLNNFYPKDINVKWKIDGSERQNGVLNSWTDQDSKDSTYSMSSTLTLTKDEYERHNSYTCEATHKTSTSPIVKSFNRNE
219


In [15]:
ag_ppb = CaPPBuilder()
for ag_pp in ag_ppb.build_peptides(Ag):
    ag_seq = ag_pp.get_sequence()
    ag_start, ag_end = ag_pp.get_start_end()
    print(ag_seq)
ag_len = len(ag_seq)
print(ag_len, ag_start,ag_end)

EVTIKVNLIFADGKIQTAEFKGTFEEATAEAYRYAALLAKVNGEWTADLEDGGNHMNIKFAG
62 1820 1881


In [16]:
ag_pp

<Polypeptide start=1820 end=1881>

In [17]:
distances = []
ab_ress = []
ag_ress = []
ab_seqid = []
ag_seqid = []
ab = 1
comparison_num = 0
for ab_res in Ab.get_residues():
    ab_res_name = ab_res.get_resname()
    if ab_res_name != 'HOH' and ab_res_name != 'AEA':
        ab_atom = ab_res['CA']
        ag = ag_start 
        for ag_res in Ag.get_residues():
            ag_res_name = ag_res.get_resname()
            print(ag_res_name, ag)
            if ag_res_name != 'HOH' and ag_res_name != 'AEA':
                ag_atom = ag_res['CA']
                dist = ab_atom - ag_atom
                ab_ress.append(ab_res_name)
                ag_ress.append(ag_res_name)
                ab_seqid.append(ab)
                ag_seqid.append(ag)
                ag += 1
                distances.append(dist) 
                comparison_num += 1
    ab += 1
            


GLU 1820
VAL 1821
THR 1822
ILE 1823
LYS 1824
VAL 1825
ASN 1826
LEU 1827
ILE 1828
PHE 1829
ALA 1830
ASP 1831
GLY 1832
LYS 1833
ILE 1834
GLN 1835
THR 1836
ALA 1837
GLU 1838
PHE 1839
LYS 1840
GLY 1841
THR 1842
PHE 1843
GLU 1844
GLU 1845
ALA 1846
THR 1847
ALA 1848
GLU 1849
ALA 1850
TYR 1851
ARG 1852
TYR 1853
ALA 1854
ALA 1855
LEU 1856
LEU 1857
ALA 1858
LYS 1859
VAL 1860
ASN 1861
GLY 1862
GLU 1863
TRP 1864
THR 1865
ALA 1866
ASP 1867
LEU 1868
GLU 1869
ASP 1870
GLY 1871
GLY 1872
ASN 1873
HIS 1874
MET 1875
ASN 1876
ILE 1877
LYS 1878
PHE 1879
ALA 1880
GLY 1881
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
GLU 1820
VAL 1821
THR 1822
ILE 1823
LYS 1824
VAL 1825
ASN 1826
LEU 1827
ILE 1828
PHE 1829
ALA 1830
ASP 1831
GLY 1832
LYS 1833
ILE 1834
GLN 1835
THR 1836
ALA 1837
GLU 1838
PHE 1839
LYS 1840
GLY 1841
THR 1842
P

ASN 1826
LEU 1827
ILE 1828
PHE 1829
ALA 1830
ASP 1831
GLY 1832
LYS 1833
ILE 1834
GLN 1835
THR 1836
ALA 1837
GLU 1838
PHE 1839
LYS 1840
GLY 1841
THR 1842
PHE 1843
GLU 1844
GLU 1845
ALA 1846
THR 1847
ALA 1848
GLU 1849
ALA 1850
TYR 1851
ARG 1852
TYR 1853
ALA 1854
ALA 1855
LEU 1856
LEU 1857
ALA 1858
LYS 1859
VAL 1860
ASN 1861
GLY 1862
GLU 1863
TRP 1864
THR 1865
ALA 1866
ASP 1867
LEU 1868
GLU 1869
ASP 1870
GLY 1871
GLY 1872
ASN 1873
HIS 1874
MET 1875
ASN 1876
ILE 1877
LYS 1878
PHE 1879
ALA 1880
GLY 1881
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
GLU 1820
VAL 1821
THR 1822
ILE 1823
LYS 1824
VAL 1825
ASN 1826
LEU 1827
ILE 1828
PHE 1829
ALA 1830
ASP 1831
GLY 1832
LYS 1833
ILE 1834
GLN 1835
THR 1836
ALA 1837
GLU 1838
PHE 1839
LYS 1840
GLY 1841
THR 1842
PHE 1843
GLU 1844
GLU 1845
ALA 1846
THR 1847
ALA 1848
G

ILE 1828
PHE 1829
ALA 1830
ASP 1831
GLY 1832
LYS 1833
ILE 1834
GLN 1835
THR 1836
ALA 1837
GLU 1838
PHE 1839
LYS 1840
GLY 1841
THR 1842
PHE 1843
GLU 1844
GLU 1845
ALA 1846
THR 1847
ALA 1848
GLU 1849
ALA 1850
TYR 1851
ARG 1852
TYR 1853
ALA 1854
ALA 1855
LEU 1856
LEU 1857
ALA 1858
LYS 1859
VAL 1860
ASN 1861
GLY 1862
GLU 1863
TRP 1864
THR 1865
ALA 1866
ASP 1867
LEU 1868
GLU 1869
ASP 1870
GLY 1871
GLY 1872
ASN 1873
HIS 1874
MET 1875
ASN 1876
ILE 1877
LYS 1878
PHE 1879
ALA 1880
GLY 1881
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
GLU 1820
VAL 1821
THR 1822
ILE 1823
LYS 1824
VAL 1825
ASN 1826
LEU 1827
ILE 1828
PHE 1829
ALA 1830
ASP 1831
GLY 1832
LYS 1833
ILE 1834
GLN 1835
THR 1836
ALA 1837
GLU 1838
PHE 1839
LYS 1840
GLY 1841
THR 1842
PHE 1843
GLU 1844
GLU 1845
ALA 1846
THR 1847
ALA 1848
GLU 1849
ALA 1850
T

ALA 1866
ASP 1867
LEU 1868
GLU 1869
ASP 1870
GLY 1871
GLY 1872
ASN 1873
HIS 1874
MET 1875
ASN 1876
ILE 1877
LYS 1878
PHE 1879
ALA 1880
GLY 1881
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
GLU 1820
VAL 1821
THR 1822
ILE 1823
LYS 1824
VAL 1825
ASN 1826
LEU 1827
ILE 1828
PHE 1829
ALA 1830
ASP 1831
GLY 1832
LYS 1833
ILE 1834
GLN 1835
THR 1836
ALA 1837
GLU 1838
PHE 1839
LYS 1840
GLY 1841
THR 1842
PHE 1843
GLU 1844
GLU 1845
ALA 1846
THR 1847
ALA 1848
GLU 1849
ALA 1850
TYR 1851
ARG 1852
TYR 1853
ALA 1854
ALA 1855
LEU 1856
LEU 1857
ALA 1858
LYS 1859
VAL 1860
ASN 1861
GLY 1862
GLU 1863
TRP 1864
THR 1865
ALA 1866
ASP 1867
LEU 1868
GLU 1869
ASP 1870
GLY 1871
GLY 1872
ASN 1873
HIS 1874
MET 1875
ASN 1876
ILE 1877
LYS 1878
PHE 1879
ALA 1880
GLY 1881
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
H

THR 1822
ILE 1823
LYS 1824
VAL 1825
ASN 1826
LEU 1827
ILE 1828
PHE 1829
ALA 1830
ASP 1831
GLY 1832
LYS 1833
ILE 1834
GLN 1835
THR 1836
ALA 1837
GLU 1838
PHE 1839
LYS 1840
GLY 1841
THR 1842
PHE 1843
GLU 1844
GLU 1845
ALA 1846
THR 1847
ALA 1848
GLU 1849
ALA 1850
TYR 1851
ARG 1852
TYR 1853
ALA 1854
ALA 1855
LEU 1856
LEU 1857
ALA 1858
LYS 1859
VAL 1860
ASN 1861
GLY 1862
GLU 1863
TRP 1864
THR 1865
ALA 1866
ASP 1867
LEU 1868
GLU 1869
ASP 1870
GLY 1871
GLY 1872
ASN 1873
HIS 1874
MET 1875
ASN 1876
ILE 1877
LYS 1878
PHE 1879
ALA 1880
GLY 1881
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
GLU 1820
VAL 1821
THR 1822
ILE 1823
LYS 1824
VAL 1825
ASN 1826
LEU 1827
ILE 1828
PHE 1829
ALA 1830
ASP 1831
GLY 1832
LYS 1833
ILE 1834
GLN 1835
THR 1836
ALA 1837
GLU 1838
PHE 1839
LYS 1840
GLY 1841
THR 1842
PHE 1843
GLU 1844
G

In [18]:
comparisons_df = pd.DataFrame({'ab_ress':ab_ress,'ab_seqid':ab_seqid,'ag_ress':ag_ress,'ag_seqid':ag_seqid,'distances':distances})
comparisons_df['ag_ress_seqid'] = comparisons_df.ag_ress + '-' + comparisons_df.ag_seqid.astype(str)
comparisons_df.head()

Unnamed: 0,ab_ress,ab_seqid,ag_ress,ag_seqid,distances,ag_ress_seqid
0,ASP,1,GLU,1820,23.690613,GLU-1820
1,ASP,1,VAL,1821,26.037287,VAL-1821
2,ASP,1,THR,1822,26.619587,THR-1822
3,ASP,1,ILE,1823,27.920298,ILE-1823
4,ASP,1,LYS,1824,30.233057,LYS-1824


In [19]:
comparisons_df.sort_values(['distances']).head(20)

Unnamed: 0,ab_ress,ab_seqid,ag_ress,ag_seqid,distances,ag_ress_seqid
637,LEU,11,ALA,1837,4.334361,ALA-1837
759,VAL,13,GLN,1835,4.939712,GLN-1835
515,SER,9,PHE,1839,4.992039,PHE-1839
698,ALA,12,THR,1836,5.121083,THR-1836
576,SER,10,GLU,1838,5.144905,GLU-1838
514,SER,9,GLU,1838,5.380033,GLU-1838
453,PRO,8,PHE,1839,5.493803,PHE-1839
575,SER,10,ALA,1837,5.513206,ALA-1837
697,ALA,12,GLN,1835,5.648639,GLN-1835
636,LEU,11,THR,1836,5.776641,THR-1836


In [20]:
ress_paratope = comparisons_df.sort_values(['distances']).ag_ress_seqid.unique()[0:8]
ress_paratope

array(['ALA-1837', 'GLN-1835', 'PHE-1839', 'THR-1836', 'GLU-1838',
       'LYS-1840', 'GLU-1849', 'TYR-1853'], dtype=object)

In [21]:
sorted_comparisons_df = comparisons_df.sort_values(['distances']).head(30)
sorted_comparisons_df

Unnamed: 0,ab_ress,ab_seqid,ag_ress,ag_seqid,distances,ag_ress_seqid
637,LEU,11,ALA,1837,4.334361,ALA-1837
759,VAL,13,GLN,1835,4.939712,GLN-1835
515,SER,9,PHE,1839,4.992039,PHE-1839
698,ALA,12,THR,1836,5.121083,THR-1836
576,SER,10,GLU,1838,5.144905,GLU-1838
514,SER,9,GLU,1838,5.380033,GLU-1838
453,PRO,8,PHE,1839,5.493803,PHE-1839
575,SER,10,ALA,1837,5.513206,ALA-1837
697,ALA,12,GLN,1835,5.648639,GLN-1835
636,LEU,11,THR,1836,5.776641,THR-1836


In [22]:
sorted_comparisons_df[sorted_comparisons_df.ag_ress_seqid.isin(ress_paratope)].sort_values(['ag_seqid','ab_seqid'])

Unnamed: 0,ab_ress,ab_seqid,ag_ress,ag_seqid,distances,ag_ress_seqid
635,LEU,11,GLN,1835,7.244271,GLN-1835
697,ALA,12,GLN,1835,5.648639,GLN-1835
759,VAL,13,GLN,1835,4.939712,GLN-1835
821,SER,14,GLN,1835,7.235299,GLN-1835
636,LEU,11,THR,1836,5.776641,THR-1836
698,ALA,12,THR,1836,5.121083,THR-1836
760,VAL,13,THR,1836,6.898151,THR-1836
451,PRO,8,ALA,1837,6.276825,ALA-1837
513,SER,9,ALA,1837,7.275739,ALA-1837
575,SER,10,ALA,1837,5.513206,ALA-1837


In [23]:
view = nv.show_biopython(structure)
view

NGLWidget()

## 3. Example 2

In [24]:
pdb = metadata.pdb.unique()[100]
print(pdb)

4liq


In [25]:
metadata[metadata['pdb']==pdb]

Unnamed: 0,pdb,Hchain,Lchain,model,antigen_chain,antigen_type,antigen_het_name,antigen_name,short_header,date,compound,organism,heavy_species,light_species,antigen_species,authors,resolution,method,r_free,r_factor,scfv,engineered,heavy_subclass,light_subclass,light_ctype,affinity,delta_g,affinity_method,temperature,pmid
206,4liq,H,L,0,E,protein,,macrophage colony-stimulating factor 1 receptor,IMMUNE SYSTEM,07/03/13,Structure of the extracellular domain of human...,MUS MUSCULUS; HOMO SAPIENS,mus musculus,mus musculus,homo sapiens,"Benz, J., Gorr, I.H., Hertenberger, H., Ries, ...",2.6,X-RAY DIFFRACTION,0.237,0.186,False,True,IGHV1,IGKV1,Kappa,2e-10,-13.231763,SPR,,TBD


In [26]:
H_ch = metadata[metadata['pdb']==pdb].Hchain.values[0]
L_ch = metadata[metadata['pdb']==pdb].Lchain.values[0]
Ag_ch = metadata[metadata['pdb']==pdb].antigen_chain.values[0]
print(f'H chain: {H_ch}\nL chain: {L_ch}\nAntigen chain: {Ag_ch}')

H chain: H
L chain: L
Antigen chain: E


In [27]:
chain_2_struct_dict = {H_ch:'H_ch',L_ch:'L_ch',Ag_ch:'Ag_ch'}
chain_2_struct_dict

{'H': 'H_ch', 'L': 'L_ch', 'E': 'Ag_ch'}

## 3.1 Parse structure

In [28]:
pdb_parser = PDBParser()

In [29]:
structure = pdb_parser.get_structure(pdb, "sabdab-data/imgt/{}.pdb".format(pdb))

## 3.2 View structure

In [30]:
view = nv.show_biopython(structure)
view

NGLWidget()

## 3.3 Extract substructures from structure

In [31]:
pp_chains = {H_ch:[],L_ch:[],Ag_ch:[]}
pp_chains

{'H': [], 'L': [], 'E': []}

In [32]:
for model in structure.get_models():
    print(model)
    for chain in model.get_chains():
        ppb = CaPPBuilder()
        chain_id = chain.get_id()
        print(f'chain: {chain}, type: {chain_2_struct_dict[chain_id]}')
        for pp in ppb.build_peptides(chain):
            pp_chains[chain_id].append(pp)
            print(f'sequence:{pp.get_sequence()}')
pp_chains

<Model id=0>
chain: <Chain id=E>, type: Ag_ch
sequence:IPVIEPSVPELVVKPGATVTLRCVGNGSVEWDGPPSPHWTLYSDGSSSILSTNNATFQNTGTYRCTEPG
sequence:SAAIHLYVKDPARPWNVLAQEVVVFEDQDALLPCLLTDPVLEAGVSLVRV
sequence:PLMRHTNYSFSPWHGFTIHRAKFIQSQDYQCSALMGGRKVMSISIRLKVQKVIPGPPALTLVPAELVRIRGEAAQIVCSASSVDVNFDVFLQHNNTKLAIPQQSDFHNNRYQKVLTLNLDQVDFQHAGNYSCVASNVQGKHSTSMFFRVVESAYLNLSSEQNLIQEVTVGEGLNLKVMVEAYPGLQGFNWTYLGPFSDHQPE
sequence:KLANAT
sequence:TYRHTFTLSLPRLKPSEAGRYSFLARNPGGWRALTFELTLRYPPEVSVIWTFINGSGTLLCAASGYPQPNVTWLQCSGHTDRCDEAQVLQVWDDPYPEVLSQEPFHKVTVQSLLTVETLEHNQTYECRAHNSVGSGSWAFIP
chain: <Chain id=H>, type: H_ch
sequence:QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYDISWVRQAPGQGLEWMGVIWTDGGTNYAQKLQGRVTMTTDTSTSTAYMELRSLRSDDTAVYYCARDQRLYFDVWGQGTTVTVSSASTKGPSVFPLAPSS
sequence:GTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKKVEPK
chain: <Chain id=L>, type: L_ch
sequence:DIQMTQSPSSLSASVGDRVTITCRASEDVNTYVSWYQQKPGKAPKLLIYAASNRYTGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSFSYPTFGQGTKLEIKRTVAAPSVFIFPPSDEQLKSGTAS

{'H': [<Polypeptide start=1 end=143>, <Polypeptide start=144 end=224>],
 'L': [<Polypeptide start=1 end=234>],
 'E': [<Polypeptide start=20 end=88>,
  <Polypeptide start=94 end=143>,
  <Polypeptide start=147 end=348>,
  <Polypeptide start=350 end=355>,
  <Polypeptide start=359 end=500>]}

## 3.4 Get the distance between 2 chains

In [33]:
Ag = model['E']#Antigen
Ab = model['L']#LChain

In [34]:
ab_ppb = CaPPBuilder()
for ab_pp in ab_ppb.build_peptides(Ab):
    ab_seq = ab_pp.get_sequence()
    print(ab_seq)
ab_len = len(ab_seq)
print(ab_len)

DIQMTQSPSSLSASVGDRVTITCRASEDVNTYVSWYQQKPGKAPKLLIYAASNRYTGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSFSYPTFGQGTKLEIKRTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSKADYEKHKVYACEVTHQGLSSPVTKSFNRGEC
213


In [35]:
ag_ppb = CaPPBuilder()
for ag_pp in ag_ppb.build_peptides(Ag):
    ag_seq = ag_pp.get_sequence()
    ag_start, ag_end = ag_pp.get_start_end()
    print(ag_seq)
ag_len = len(ag_seq)
print(ag_len, ag_start,ag_end)

IPVIEPSVPELVVKPGATVTLRCVGNGSVEWDGPPSPHWTLYSDGSSSILSTNNATFQNTGTYRCTEPG
SAAIHLYVKDPARPWNVLAQEVVVFEDQDALLPCLLTDPVLEAGVSLVRV
PLMRHTNYSFSPWHGFTIHRAKFIQSQDYQCSALMGGRKVMSISIRLKVQKVIPGPPALTLVPAELVRIRGEAAQIVCSASSVDVNFDVFLQHNNTKLAIPQQSDFHNNRYQKVLTLNLDQVDFQHAGNYSCVASNVQGKHSTSMFFRVVESAYLNLSSEQNLIQEVTVGEGLNLKVMVEAYPGLQGFNWTYLGPFSDHQPE
KLANAT
TYRHTFTLSLPRLKPSEAGRYSFLARNPGGWRALTFELTLRYPPEVSVIWTFINGSGTLLCAASGYPQPNVTWLQCSGHTDRCDEAQVLQVWDDPYPEVLSQEPFHKVTVQSLLTVETLEHNQTYECRAHNSVGSGSWAFIP
142 359 500


In [36]:
ag_pp

<Polypeptide start=359 end=500>

In [37]:
distances = []
ab_ress = []
ag_ress = []
ab_seqid = []
ag_seqid = []
ab = 1
comparison_num = 0
for ab_res in Ab.get_residues():
    ab_res_name = ab_res.get_resname()
    ab_res_het_tag = ab_res.get_id()[0]
    if ab_res_het_tag == ' ' or ab_res_het_tag == '':
        ab_atom = ab_res['CA']
        ag = ag_start 
        print(ag_start)
        for ag_res in Ag.get_residues():
            ag_res_name = ag_res.get_resname()
            ag_res_het_tag = ag_res.get_id()[0]
            if ag_res_het_tag == ' ' or ag_res_het_tag == '':
                ag_atom = ag_res['CA']
                dist = ab_atom - ag_atom
                ab_ress.append(ab_res_name)
                ag_ress.append(ag_res_name)
                ab_seqid.append(ab)
                ag_seqid.append(ag)
                ag += 1
                distances.append(dist) 
                comparison_num += 1
    ab += 1
            


359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359
359


In [38]:
comparisons_df = pd.DataFrame({'ab_ress':ab_ress,'ab_seqid':ab_seqid,'ag_ress':ag_ress,'ag_seqid':ag_seqid,'distances':distances})
comparisons_df['ag_ress_seqid'] = comparisons_df.ag_ress + '-' + comparisons_df.ag_seqid.astype(str)
print(len(comparisons_df))
comparisons_df.head()

99897


Unnamed: 0,ab_ress,ab_seqid,ag_ress,ag_seqid,distances,ag_ress_seqid
0,ASP,1,ILE,359,64.917503,ILE-359
1,ASP,1,PRO,360,62.998077,PRO-360
2,ASP,1,VAL,361,63.987133,VAL-361
3,ASP,1,ILE,362,61.655251,ILE-362
4,ASP,1,GLU,363,62.17783,GLU-363


In [39]:
comparisons_df.sort_values(['distances']).head(20)

Unnamed: 0,ab_ress,ab_seqid,ag_ress,ag_seqid,distances,ag_ress_seqid
43438,SER,93,GLY,649,4.292464,GLY-649
43437,SER,93,GLU,648,4.349379,GLU-648
43016,PHE,92,PRO,696,4.472806,PRO-696
43486,SER,93,ARG,697,4.803949,ARG-697
42969,PHE,92,GLY,649,5.117987,GLY-649
43017,PHE,92,ARG,697,5.70049,ARG-697
43905,TYR,94,GLY,647,5.845009,GLY-647
43906,TYR,94,GLU,648,5.957078,GLU-648
43485,SER,93,PRO,696,5.992612,PRO-696
43436,SER,93,GLY,647,6.166421,GLY-647


In [40]:
ress_paratope = comparisons_df.sort_values(['distances']).ag_ress_seqid.unique()[0:8]
ress_paratope

array(['GLY-649', 'GLU-648', 'PRO-696', 'ARG-697', 'GLY-647', 'LEU-695',
       'LEU-698', 'LEU-650'], dtype=object)

In [41]:
sorted_comparisons_df = comparisons_df.sort_values(['distances']).head(30)
sorted_comparisons_df

Unnamed: 0,ab_ress,ab_seqid,ag_ress,ag_seqid,distances,ag_ress_seqid
43438,SER,93,GLY,649,4.292464,GLY-649
43437,SER,93,GLU,648,4.349379,GLU-648
43016,PHE,92,PRO,696,4.472806,PRO-696
43486,SER,93,ARG,697,4.803949,ARG-697
42969,PHE,92,GLY,649,5.117987,GLY-649
43017,PHE,92,ARG,697,5.70049,ARG-697
43905,TYR,94,GLY,647,5.845009,GLY-647
43906,TYR,94,GLU,648,5.957078,GLU-648
43485,SER,93,PRO,696,5.992612,PRO-696
43436,SER,93,GLY,647,6.166421,GLY-647


In [42]:
sorted_comparisons_df[sorted_comparisons_df.ag_ress_seqid.isin(ress_paratope)].sort_values(['ag_seqid','ab_seqid'])

Unnamed: 0,ab_ress,ab_seqid,ag_ress,ag_seqid,distances,ag_ress_seqid
42967,PHE,92,GLY,647,8.98881,GLY-647
43436,SER,93,GLY,647,6.166421,GLY-647
43905,TYR,94,GLY,647,5.845009,GLY-647
44374,PRO,95,GLY,647,9.015257,GLY-647
42968,PHE,92,GLU,648,7.354068,GLU-648
43437,SER,93,GLU,648,4.349379,GLU-648
43906,TYR,94,GLU,648,5.957078,GLU-648
13422,VAL,29,GLY,649,8.615442,GLY-649
42500,SER,91,GLY,649,8.348326,GLY-649
42969,PHE,92,GLY,649,5.117987,GLY-649


In [43]:
view = nv.show_biopython(structure)
view

NGLWidget()