In [1]:
import pandas as pd
from Bio.PDB import *
import nglview as nv

pd.set_option('display.max_columns', 100)

# 1. Get metadata

## 1.1 Load file

In [2]:
metadata = pd.read_csv('sabdab-data/20221001_0807534_summary.tsv', sep='\t')
len(metadata.pdb.unique())

740

# 1.2 Filter out structure with Hapten Antigen

In [3]:
metadata = metadata[metadata['antigen_type']!='Hapten']
len(metadata.pdb.unique())

672

# 2. Get the metadata of a single structure as an example

idx pdb
0   1mhh
100 4liq


In [4]:
pdb = metadata.pdb.unique()[0]
print(pdb)

1mhh


In [5]:

metadata[metadata['pdb']==pdb]

Unnamed: 0,pdb,Hchain,Lchain,model,antigen_chain,antigen_type,antigen_het_name,antigen_name,short_header,date,compound,organism,heavy_species,light_species,antigen_species,authors,resolution,method,r_free,r_factor,scfv,engineered,heavy_subclass,light_subclass,light_ctype,affinity,delta_g,affinity_method,temperature,pmid
0,1mhh,D,C,0,,,,,IMMUNE SYSTEM,08/20/02,Antibody-antigen complex,FINEGOLDIA MAGNA; MUS MUSCULUS,mus musculus,mus musculus,,"Graille, M., Stura, E.A.",2.1,X-RAY DIFFRACTION,0.247,0.197,False,False,IGHV9,IGKV8,Kappa,1e-09,-12.278197,Unknown,,TBD
1,1mhh,B,A,0,,,,,IMMUNE SYSTEM,08/20/02,Antibody-antigen complex,FINEGOLDIA MAGNA; MUS MUSCULUS,mus musculus,mus musculus,,"Graille, M., Stura, E.A.",2.1,X-RAY DIFFRACTION,0.247,0.197,False,False,IGHV9,IGKV8,Kappa,1e-09,-12.278197,Unknown,,TBD


## 2.1 Parse structure

In [6]:
pdb_parser = PDBParser()

In [7]:
structure = pdb_parser.get_structure(pdb, "sabdab-data/imgt/{}.pdb".format(pdb))

## 2.2 View structure

In [8]:
view = nv.show_biopython(structure)
view

NGLWidget()

## 2.3 Extract substructures from structure

In [9]:
for model in structure.get_models():
    print(model)
    for chain in model.get_chains():
        print(chain)
        ppb = CaPPBuilder()
        for pp in ppb.build_peptides(chain):
            print(pp)
            print(pp.get_sequence())

<Model id=0>
<Chain id=E>
<Polypeptide start=820 end=882>
EVTIKVNLIFADGKIQTAEFKGTFEEATAEAYRYAALLAKVNGEWTADLEDGGNHMNIKFAGK
<Chain id=F>
<Polypeptide start=1820 end=1881>
EVTIKVNLIFADGKIQTAEFKGTFEEATAEAYRYAALLAKVNGEWTADLEDGGNHMNIKFAG
<Chain id=B>
<Polypeptide start=1 end=146>
QIQLVQSGPELKKPGETVKISCKASGYTFTDFSMHWVNQAPGKGLNWMGWVNTETGEPTYADDFKGRFAFSLETSASTAYLQINSLKNEDTATYFCARFLLRQYFDVWGAGTTVTVSSAKTTPPSVYPLAPGSAAQ
<Polypeptide start=147 end=225>
SMVTLGCLVKGYFPEPVTVTWNSGSLSSGVHTFPAVLQSDLYTLSSSVTVPSSTWPSETVTCNVAHPASSTKVDKKIVP
<Chain id=A>
<Polypeptide start=1 end=233>
DIVMSQSPSSLAVSAGEKVTMSCKSSQSLLNSRTRKNYLAWYQQKPGQSPKVLIYWASTRESGVPDRFTGRGSGTDFTLTISSVQAEDQAVYYCKQAYIPPLTFGAGTKLELKRADAAPTVSIFPPSSEQLTSGGASVVCFLNNFYPKDINVKWKIDGSERQNGVLNSWTDQDSKDSTYSMSSTLTLTKDEYERHNSYTCEATHKTSTSPIVKSFNRNE
<Chain id=D>
<Polypeptide start=1 end=144>
QIQLVQSGPELKKPGETVKISCKASGYTFTDFSMHWVNQAPGKGLNWMGWVNTETGEPTYADDFKGRFAFSLETSASTAYLQINSLKNEDTATYFCARFLLRQYFDVWGAGTTVTVSSAKTTPPSVYPLAPGSA
<Polypeptide start=145 end=224>
NSM

In [10]:
print(chain)
seq = ''
for res in chain.get_residues():
    seq += res.get_resname()

print(seq)

<Chain id=C>
ASPILEVALMETSERGLNSERPROSERSERLEUALAVALSERALAGLYGLULYSVALTHRMETSERCYSLYSSERSERGLNSERLEULEUASNSERARGTHRARGLYSASNTYRLEUALATRPTYRGLNGLNLYSPROGLYGLNSERPROLYSVALLEUILETYRTRPALASERTHRARGGLUSERGLYVALPROASPARGPHETHRGLYARGGLYSERGLYTHRASPPHETHRLEUTHRILESERSERVALGLNALAGLUASPGLNALAVALTYRTYRCYSLYSGLNALATYRILEPROPROLEUTHRPHEGLYALAGLYTHRLYSLEUGLULEULYSARGALAASPALAALAPROTHRVALSERILEPHEPROPROSERSERGLUGLNLEUTHRSERGLYGLYALASERVALVALCYSPHELEUASNASNPHETYRPROLYSASPILEASNVALLYSTRPLYSILEASPGLYSERGLUARGGLNASNGLYVALLEUASNSERTRPTHRASPGLNASPSERLYSASPSERTHRTYRSERMETSERSERTHRLEUTHRLEUTHRLYSASPGLUTYRGLUARGHISASNSERTYRTHRCYSGLUALATHRHISLYSTHRSERTHRSERPROILEVALLYSSERPHEASNARGASNGLUAEAHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOHHOH

In [11]:
len(seq)/3

411.0

In [12]:
len(pp.get_sequence())

219

# 2.4 Get the distance between 2 chains

In [13]:
Ag = model['F']#Antigen
H = model['C']#LChain

In [14]:
h_ppb = CaPPBuilder()
for h_pp in h_ppb.build_peptides(H):
    h_seq = h_pp.get_sequence()
    print(h_seq)
h_len = len(h_seq)
print(h_len)

DIVMSQSPSSLAVSAGEKVTMSCKSSQSLLNSRTRKNYLAWYQQKPGQSPKVLIYWASTRESGVPDRFTGRGSGTDFTLTISSVQAEDQAVYYCKQAYIPPLTFGAGTKLELKRADAAPTVSIFPPSSEQLTSGGASVVCFLNNFYPKDINVKWKIDGSERQNGVLNSWTDQDSKDSTYSMSSTLTLTKDEYERHNSYTCEATHKTSTSPIVKSFNRNE
219


In [15]:
ag_ppb = CaPPBuilder()
for ag_pp in ag_ppb.build_peptides(Ag):
    ag_seq = ag_pp.get_sequence()
    ag_start, ag_end = ag_pp.get_start_end()
    print(ag_seq)
ag_len = len(ag_seq)
print(ag_len, ag_start,ag_end)

EVTIKVNLIFADGKIQTAEFKGTFEEATAEAYRYAALLAKVNGEWTADLEDGGNHMNIKFAG
62 1820 1881


In [16]:
ag_pp

<Polypeptide start=1820 end=1881>

In [17]:
distances = []
h_ress = []
ag_ress = []
h_seqid = []
ag_seqid = []
ab = 1
comparison_num = 0
for h_res in H.get_residues():
    h_res_name = h_res.get_resname()
    if h_res_name != 'HOH' and h_res_name != 'AEA':
        h_atom = h_res['CA']
        ag = ag_start 
        for ag_res in Ag.get_residues():
            ag_res_name = ag_res.get_resname()
            print(ag_res_name, ag)
            if ag_res_name != 'HOH' and ag_res_name != 'AEA':
                ag_atom = ag_res['CA']
                dist = h_atom - ag_atom
                h_ress.append(h_res_name)
                ag_ress.append(ag_res_name)
                h_seqid.append(ab)
                ag_seqid.append(ag)
                ag += 1
                distances.append(dist) 
                comparison_num += 1
    ab += 1
            


GLU 1820
VAL 1821
THR 1822
ILE 1823
LYS 1824
VAL 1825
ASN 1826
LEU 1827
ILE 1828
PHE 1829
ALA 1830
ASP 1831
GLY 1832
LYS 1833
ILE 1834
GLN 1835
THR 1836
ALA 1837
GLU 1838
PHE 1839
LYS 1840
GLY 1841
THR 1842
PHE 1843
GLU 1844
GLU 1845
ALA 1846
THR 1847
ALA 1848
GLU 1849
ALA 1850
TYR 1851
ARG 1852
TYR 1853
ALA 1854
ALA 1855
LEU 1856
LEU 1857
ALA 1858
LYS 1859
VAL 1860
ASN 1861
GLY 1862
GLU 1863
TRP 1864
THR 1865
ALA 1866
ASP 1867
LEU 1868
GLU 1869
ASP 1870
GLY 1871
GLY 1872
ASN 1873
HIS 1874
MET 1875
ASN 1876
ILE 1877
LYS 1878
PHE 1879
ALA 1880
GLY 1881
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
GLU 1820
VAL 1821
THR 1822
ILE 1823
LYS 1824
VAL 1825
ASN 1826
LEU 1827
ILE 1828
PHE 1829
ALA 1830
ASP 1831
GLY 1832
LYS 1833
ILE 1834
GLN 1835
THR 1836
ALA 1837
GLU 1838
PHE 1839
LYS 1840
GLY 1841
THR 1842
P

GLU 1863
TRP 1864
THR 1865
ALA 1866
ASP 1867
LEU 1868
GLU 1869
ASP 1870
GLY 1871
GLY 1872
ASN 1873
HIS 1874
MET 1875
ASN 1876
ILE 1877
LYS 1878
PHE 1879
ALA 1880
GLY 1881
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
HOH 1882
GLU 1820
VAL 1821
THR 1822
ILE 1823
LYS 1824
VAL 1825
ASN 1826
LEU 1827
ILE 1828
PHE 1829
ALA 1830
ASP 1831
GLY 1832
LYS 1833
ILE 1834
GLN 1835
THR 1836
ALA 1837
GLU 1838
PHE 1839
LYS 1840
GLY 1841
THR 1842
PHE 1843
GLU 1844
GLU 1845
ALA 1846
THR 1847
ALA 1848
GLU 1849
ALA 1850
TYR 1851
ARG 1852
TYR 1853
ALA 1854
ALA 1855
LEU 1856
LEU 1857
ALA 1858
LYS 1859
VAL 1860
ASN 1861
GLY 1862
GLU 1863
TRP 1864
THR 1865
ALA 1866
ASP 1867
LEU 1868
GLU 1869
ASP 1870
GLY 1871
GLY 1872
ASN 1873
HIS 1874
MET 1875
ASN 1876
ILE 1877
LYS 1878
PHE 1879
ALA 1880
GLY 1881
HOH 1882
HOH 1882
HOH 1882
HOH 1882
H

In [18]:
comparisons_df = pd.DataFrame({'h_ress':h_ress,'h_seqid':h_seqid,'ag_ress':ag_ress,'ag_seqid':ag_seqid,'distances':distances})
comparisons_df['ag_ress_seqid'] = comparisons_df.ag_ress + '-' + comparisons_df.ag_seqid.astype(str)
comparisons_df.head()

Unnamed: 0,h_ress,h_seqid,ag_ress,ag_seqid,distances,ag_ress_seqid
0,ASP,1,GLU,1820,23.690613,GLU-1820
1,ASP,1,VAL,1821,26.037287,VAL-1821
2,ASP,1,THR,1822,26.619587,THR-1822
3,ASP,1,ILE,1823,27.920298,ILE-1823
4,ASP,1,LYS,1824,30.233057,LYS-1824


In [19]:
comparisons_df.sort_values(['distances']).head(20)

Unnamed: 0,h_ress,h_seqid,ag_ress,ag_seqid,distances,ag_ress_seqid
637,LEU,11,ALA,1837,4.334361,ALA-1837
759,VAL,13,GLN,1835,4.939712,GLN-1835
515,SER,9,PHE,1839,4.992039,PHE-1839
698,ALA,12,THR,1836,5.121083,THR-1836
576,SER,10,GLU,1838,5.144905,GLU-1838
514,SER,9,GLU,1838,5.380033,GLU-1838
453,PRO,8,PHE,1839,5.493803,PHE-1839
575,SER,10,ALA,1837,5.513206,ALA-1837
697,ALA,12,GLN,1835,5.648639,GLN-1835
636,LEU,11,THR,1836,5.776641,THR-1836


In [20]:
ress_paratope = comparisons_df.sort_values(['distances']).ag_ress_seqid.unique()[0:8]
ress_paratope

array(['ALA-1837', 'GLN-1835', 'PHE-1839', 'THR-1836', 'GLU-1838',
       'LYS-1840', 'GLU-1849', 'TYR-1853'], dtype=object)

In [21]:
sorted_comparisons_df = comparisons_df.sort_values(['distances']).head(30)
sorted_comparisons_df

Unnamed: 0,h_ress,h_seqid,ag_ress,ag_seqid,distances,ag_ress_seqid
637,LEU,11,ALA,1837,4.334361,ALA-1837
759,VAL,13,GLN,1835,4.939712,GLN-1835
515,SER,9,PHE,1839,4.992039,PHE-1839
698,ALA,12,THR,1836,5.121083,THR-1836
576,SER,10,GLU,1838,5.144905,GLU-1838
514,SER,9,GLU,1838,5.380033,GLU-1838
453,PRO,8,PHE,1839,5.493803,PHE-1839
575,SER,10,ALA,1837,5.513206,ALA-1837
697,ALA,12,GLN,1835,5.648639,GLN-1835
636,LEU,11,THR,1836,5.776641,THR-1836


In [22]:
sorted_comparisons_df[sorted_comparisons_df.ag_ress_seqid.isin(ress_paratope)].sort_values(['ag_seqid','h_seqid'])

Unnamed: 0,h_ress,h_seqid,ag_ress,ag_seqid,distances,ag_ress_seqid
635,LEU,11,GLN,1835,7.244271,GLN-1835
697,ALA,12,GLN,1835,5.648639,GLN-1835
759,VAL,13,GLN,1835,4.939712,GLN-1835
821,SER,14,GLN,1835,7.235299,GLN-1835
636,LEU,11,THR,1836,5.776641,THR-1836
698,ALA,12,THR,1836,5.121083,THR-1836
760,VAL,13,THR,1836,6.898151,THR-1836
451,PRO,8,ALA,1837,6.276825,ALA-1837
513,SER,9,ALA,1837,7.275739,ALA-1837
575,SER,10,ALA,1837,5.513206,ALA-1837


In [23]:
view = nv.show_biopython(structure)
view

NGLWidget()

## 3. Example 2

In [24]:
pdb = metadata.pdb.unique()[100]
print(pdb)

4liq


In [25]:
metadata[metadata['pdb']==pdb]

Unnamed: 0,pdb,Hchain,Lchain,model,antigen_chain,antigen_type,antigen_het_name,antigen_name,short_header,date,compound,organism,heavy_species,light_species,antigen_species,authors,resolution,method,r_free,r_factor,scfv,engineered,heavy_subclass,light_subclass,light_ctype,affinity,delta_g,affinity_method,temperature,pmid
206,4liq,H,L,0,E,protein,,macrophage colony-stimulating factor 1 receptor,IMMUNE SYSTEM,07/03/13,Structure of the extracellular domain of human...,MUS MUSCULUS; HOMO SAPIENS,mus musculus,mus musculus,homo sapiens,"Benz, J., Gorr, I.H., Hertenberger, H., Ries, ...",2.6,X-RAY DIFFRACTION,0.237,0.186,False,True,IGHV1,IGKV1,Kappa,2e-10,-13.231763,SPR,,TBD


In [26]:
H_label = metadata[metadata['pdb']==pdb].Hchain.values[0]
L_label = metadata[metadata['pdb']==pdb].Lchain.values[0]
Ag_label = metadata[metadata['pdb']==pdb].antigen_chain.values[0]
print(f'H lable: {H_label}\nL label: {L_label}\nAntigen label: {Ag_label}')

H lable: H
L label: L
Antigen label: E


In [27]:
label_2_chain_dict = {H_label:'H_ch',L_label:'L_ch',Ag_label:'Ag_ch'}
label_2_chain_dict

{'H': 'H_ch', 'L': 'L_ch', 'E': 'Ag_ch'}

## 3.1 Parse structure

In [28]:
pdb_parser = PDBParser()

In [29]:
structure = pdb_parser.get_structure(pdb, "sabdab-data/imgt/{}.pdb".format(pdb))

## 3.2 View structure

In [30]:
view = nv.show_biopython(structure)
view

NGLWidget()

## 3.3 Extract substructures from structure

In [31]:
pp_chains_dict = {H_label:[],L_label:[],Ag_label:[]}
pp_chains_dict

{'H': [], 'L': [], 'E': []}

In [32]:
for model in structure.get_models():
    print(model)
    for chain in model.get_chains():
        ppb = CaPPBuilder()
        chain_id = chain.get_id()
        print(f'chain: {chain}, type: {label_2_chain_dict[chain_id]}')
        for pp in ppb.build_peptides(chain):
            start, end = pp.get_start_end()
            pp_chains_dict[chain_id].append((pp,start,end))

            print(f'sequence:{pp.get_sequence()}')
pp_chains_dict

<Model id=0>
chain: <Chain id=E>, type: Ag_ch
sequence:IPVIEPSVPELVVKPGATVTLRCVGNGSVEWDGPPSPHWTLYSDGSSSILSTNNATFQNTGTYRCTEPG
sequence:SAAIHLYVKDPARPWNVLAQEVVVFEDQDALLPCLLTDPVLEAGVSLVRV
sequence:PLMRHTNYSFSPWHGFTIHRAKFIQSQDYQCSALMGGRKVMSISIRLKVQKVIPGPPALTLVPAELVRIRGEAAQIVCSASSVDVNFDVFLQHNNTKLAIPQQSDFHNNRYQKVLTLNLDQVDFQHAGNYSCVASNVQGKHSTSMFFRVVESAYLNLSSEQNLIQEVTVGEGLNLKVMVEAYPGLQGFNWTYLGPFSDHQPE
sequence:KLANAT
sequence:TYRHTFTLSLPRLKPSEAGRYSFLARNPGGWRALTFELTLRYPPEVSVIWTFINGSGTLLCAASGYPQPNVTWLQCSGHTDRCDEAQVLQVWDDPYPEVLSQEPFHKVTVQSLLTVETLEHNQTYECRAHNSVGSGSWAFIP
chain: <Chain id=H>, type: H_ch
sequence:QVQLVQSGAEVKKPGASVKVSCKASGYTFTSYDISWVRQAPGQGLEWMGVIWTDGGTNYAQKLQGRVTMTTDTSTSTAYMELRSLRSDDTAVYYCARDQRLYFDVWGQGTTVTVSSASTKGPSVFPLAPSS
sequence:GTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKKVEPK
chain: <Chain id=L>, type: L_ch
sequence:DIQMTQSPSSLSASVGDRVTITCRASEDVNTYVSWYQQKPGKAPKLLIYAASNRYTGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSFSYPTFGQGTKLEIKRTVAAPSVFIFPPSDEQLKSGTAS

{'H': [(<Polypeptide start=1 end=143>, 1, 143),
  (<Polypeptide start=144 end=224>, 144, 224)],
 'L': [(<Polypeptide start=1 end=234>, 1, 234)],
 'E': [(<Polypeptide start=20 end=88>, 20, 88),
  (<Polypeptide start=94 end=143>, 94, 143),
  (<Polypeptide start=147 end=348>, 147, 348),
  (<Polypeptide start=350 end=355>, 350, 355),
  (<Polypeptide start=359 end=500>, 359, 500)]}

## 3.4 Get the distance between 2 chains

In [33]:
Ab_labels_list = [H_label, L_label]
Ag_labels_list = [Ag_label]

In [34]:
distances = []
ab_ress = []
ag_ress = []
ab_seqids = []
ag_seqids = []
ab_labels = []
ag_labels = []
comparison_num = 0
gratest_distance = 0
for ab_label in Ab_labels_list:
    Ab_ch = model[ab_label]
    for ab_res in Ab_ch.get_residues():
        ab_res_name = ab_res.get_resname()
        ab_res_het_tag, ab_seqid, ab_insertion = ab_res.get_id()
        if ab_res_het_tag == ' ' or ab_res_het_tag == '':
            ab_atom = ab_res['CA']
            for ag_label in Ag_labels_list:
                Ag_ch = model[ag_label]
                for ag_res in Ag_ch.get_residues():
                    ag_res_name = ag_res.get_resname()
                    ag_res_het_tag, ag_seqid, ag_insertion = ag_res.get_id()
                    if ag_res_het_tag == ' ' or ag_res_het_tag == '':
                        ag_atom = ag_res['CA']
                        dist = ab_atom - ag_atom
                        if dist > gratest_distance:
                            gratest_distance = dist
                        if dist <= 15:
                            ab_ress.append(ab_res_name)
                            ag_ress.append(ag_res_name)
                            ab_seqids.append(ab_seqid)
                            ag_seqids.append(ag_seqid)
                            ab_labels.append(ab_label)
                            ag_labels.append(ag_labels)
                            distances.append(dist) 
                            comparison_num += 1
                            if comparison_num % 100 == 0:
                                print(f'{comparison_num} comparisons made so far.')

print('Finished...')



100 comparisons made so far.
200 comparisons made so far.
300 comparisons made so far.
400 comparisons made so far.
500 comparisons made so far.
600 comparisons made so far.
700 comparisons made so far.
Finished...


In [35]:
gratest_distance

114.911606

In [37]:
len(ab_labels), len(ab_ress), len(ab_seqids), len(ag_labels), len(ag_ress), len()

(780, 780)

In [38]:
comparisons_df = pd.DataFrame({'ab_label':ab_labels,'ab_res':ab_ress,'ab_seqid':ab_seqids,'ag_label':ag_labels,'ag_res':ag_ress,'ag_seqid':ag_seqids,'distances':distances})
comparisons_df['ag_ress_seqid'] = comparisons_df.ag_res + '-' + comparisons_df.ag_seqid.astype(str)


In [39]:
len(comparisons_df)

780

In [40]:
comparisons_df.columns

Index(['ab_label', 'ab_res', 'ab_seqid', 'ag_label', 'ag_res', 'ag_seqid',
       'distances', 'ag_ress_seqid'],
      dtype='object')

In [None]:
comparisons_df.sort_values(['distances']).head(20)

Unnamed: 0,ab_ress,ab_seqid,ag_ress,ag_seqid,distances,ag_ress_seqid
34359,GLY,74,ARG,481,42.033504,ARG-481
34357,GLY,74,LEU,479,42.06221,LEU-479
34356,GLY,74,PRO,478,42.353378,PRO-478
14659,SER,32,LEU,479,42.494301,LEU-479
12782,SER,28,PRO,478,42.704926,PRO-478
12783,SER,28,LEU,479,42.723446,LEU-479
33890,SER,73,ARG,481,42.807564,ARG-481
14661,SER,32,ARG,481,42.962662,ARG-481
12313,GLN,27,PRO,478,43.081909,PRO-478
14658,SER,32,PRO,478,43.102478,PRO-478


In [None]:
ress_paratope = comparisons_df.sort_values(['distances']).ag_ress_seqid.unique()[0:8]
ress_paratope

array(['ARG-481', 'LEU-479', 'PRO-478', 'MET-480', 'HIS-482', 'VAL-477',
       'ARG-476', 'ILE-501'], dtype=object)

In [None]:
sorted_comparisons_df = comparisons_df.sort_values(['distances']).head(30)
sorted_comparisons_df

Unnamed: 0,ab_ress,ab_seqid,ag_ress,ag_seqid,distances,ag_ress_seqid
34359,GLY,74,ARG,481,42.033504,ARG-481
34357,GLY,74,LEU,479,42.06221,LEU-479
34356,GLY,74,PRO,478,42.353378,PRO-478
14659,SER,32,LEU,479,42.494301,LEU-479
12782,SER,28,PRO,478,42.704926,PRO-478
12783,SER,28,LEU,479,42.723446,LEU-479
33890,SER,73,ARG,481,42.807564,ARG-481
14661,SER,32,ARG,481,42.962662,ARG-481
12313,GLN,27,PRO,478,43.081909,PRO-478
14658,SER,32,PRO,478,43.102478,PRO-478


In [None]:
sorted_comparisons_df[sorted_comparisons_df.ag_ress_seqid.isin(ress_paratope)].sort_values(['ag_seqid','h_seqid'])

Unnamed: 0,ab_ress,ab_seqid,ag_ress,ag_seqid,distances,ag_ress_seqid
12313,GLN,27,PRO,478,43.081909,PRO-478
12782,SER,28,PRO,478,42.704926,PRO-478
14658,SER,32,PRO,478,43.102478,PRO-478
33887,SER,73,PRO,478,44.002808,PRO-478
34356,GLY,74,PRO,478,42.353378,PRO-478
34825,THR,75,PRO,478,43.403671,PRO-478
35294,ASP,76,PRO,478,44.418274,PRO-478
12314,GLN,27,LEU,479,43.433979,LEU-479
12783,SER,28,LEU,479,42.723446,LEU-479
13721,LEU,30,LEU,479,44.090824,LEU-479


In [None]:
view = nv.show_biopython(structure)
view

NGLWidget()