# Preparations

Code prerequisites:
- obabel

In [1]:
%load_ext autoreload
%autoreload 2

In [8]:
import typing as tp

import pandas as pd
import numpy as np

import requests
import re
import time
from copy import deepcopy
import pickle

from pathlib import Path

from collections import OrderedDict, namedtuple
from itertools import combinations

import shutil

import functools

from support import exception_handler, \
get_molecules_by_query, get_subtructure_smiles, \
download_ligand, download_ligands, mol2_to_smiles, \
get_pdb_by_smiles, get_pdb_by_ligand_identifier, \
get_list_of_pdb_by_smiles, get_list_of_pdb_by_ligand_identifier, \
do_atom_replacement, get_activity_df_by_pdb, get_subunits_by_pdb_and_ligand_id, \
get_uniprot_by_pdb_id_entity_id, get_uniprot_by_pdb_id, pdb2pubmed


## Folders setup

In [9]:
data_folder = Path("./data")
data_folder.mkdir(exist_ok = True)

downloads_folder = Path("files")
flourine_raw_ligands_folder = downloads_folder / "F-raw-ligands"

flourine_hydrogen_replacements_folder = downloads_folder / "F-H-replacements"

# Getting flourine ligands

## Getting identifier for each ligand that contains fluorine

In [5]:
all_fluorine_ligands = get_molecules_by_query("F")
all_fluorine_ligands

Attempt to get full number of entries:
Query 'F', (1, 1)  successed!
Full number of entries is: 5104
Query 'F', (0, 100)  successed!
Query 'F', (100, 100)  successed!
Query 'F', (200, 100)  successed!
Query 'F', (300, 100)  successed!
Query 'F', (400, 100)  successed!
Query 'F', (500, 100)  successed!
Query 'F', (600, 100)  successed!
Query 'F', (700, 100)  successed!
Query 'F', (800, 100)  successed!
Query 'F', (900, 100)  successed!
Query 'F', (1000, 100)  successed!
Query 'F', (1100, 100)  successed!
Query 'F', (1200, 100)  successed!
Query 'F', (1300, 100)  successed!
Query 'F', (1400, 100)  successed!
Query 'F', (1500, 100)  successed!
Query 'F', (1600, 100)  successed!
Query 'F', (1700, 100)  successed!
Query 'F', (1800, 100)  successed!
Query 'F', (1900, 100)  successed!
Query 'F', (2000, 100)  successed!
Query 'F', (2100, 100)  successed!
Query 'F', (2200, 100)  successed!
Query 'F', (2300, 100)  successed!
Query 'F', (2400, 100)  successed!
Query 'F', (2500, 100)  successed!
Q

Unnamed: 0,identifier,score
0,,1.000000
1,AF3,0.250000
2,FAH,0.200000
3,ETF,0.166667
4,W6X,0.142857
...,...,...
99,PRD_001193,0.010989
0,XR4,0.009901
1,C9V,0.009346
2,PRD_000995,0.008929


In [6]:
try:
    all_fluorine_ligands.to_pickle(data_folder / "flourine_raw_ligands.pkl")
except NameError:
    all_fluorine_ligands = pd.read_pickle(data_folder / "flourine_raw_ligands.pkl")

## Download mol2 ligands

In [10]:
path_maps = download_ligands(list(all_fluorine_ligands["identifier"]),
                             flourine_raw_ligands_folder,
                             downloads_folder)
path_maps

<Response [404]>
Ligand  not found (mol2 format) 
AF3_ideal.mol2 ligand already was saved to: files/F-raw-ligands/AF3_ideal.mol2
FAH_ideal.mol2 ligand already was saved to: files/F-raw-ligands/FAH_ideal.mol2
ETF_ideal.mol2 ligand already was saved to: files/F-raw-ligands/ETF_ideal.mol2
W6X_ideal.mol2 ligand already was saved to: files/F-raw-ligands/W6X_ideal.mol2
TFA_ideal.mol2 ligand already was saved to: files/F-raw-ligands/TFA_ideal.mol2
TFS_ideal.mol2 ligand already was saved to: files/F-raw-ligands/TFS_ideal.mol2
W6Z_ideal.mol2 ligand already was saved to: files/F-raw-ligands/W6Z_ideal.mol2
9O0_ideal.mol2 ligand already was saved to: files/F-raw-ligands/9O0_ideal.mol2
4FB_ideal.mol2 ligand already was saved to: files/F-raw-ligands/4FB_ideal.mol2
V7M_ideal.mol2 ligand already was saved to: files/F-raw-ligands/V7M_ideal.mol2
FBA_ideal.mol2 ligand already was saved to: files/F-raw-ligands/FBA_ideal.mol2
FFP_ideal.mol2 ligand already was saved to: files/F-raw-ligands/FFP_ideal.mol2
OB

{'': None,
 'AF3': PosixPath('F-raw-ligands/AF3_ideal.mol2'),
 'FAH': PosixPath('F-raw-ligands/FAH_ideal.mol2'),
 'ETF': PosixPath('F-raw-ligands/ETF_ideal.mol2'),
 'W6X': PosixPath('F-raw-ligands/W6X_ideal.mol2'),
 'TFA': PosixPath('F-raw-ligands/TFA_ideal.mol2'),
 'TFS': PosixPath('F-raw-ligands/TFS_ideal.mol2'),
 'W6Z': PosixPath('F-raw-ligands/W6Z_ideal.mol2'),
 '9O0': PosixPath('F-raw-ligands/9O0_ideal.mol2'),
 '4FB': PosixPath('F-raw-ligands/4FB_ideal.mol2'),
 'V7M': PosixPath('F-raw-ligands/V7M_ideal.mol2'),
 'FBA': PosixPath('F-raw-ligands/FBA_ideal.mol2'),
 'FFP': PosixPath('F-raw-ligands/FFP_ideal.mol2'),
 'OBF': PosixPath('F-raw-ligands/OBF_ideal.mol2'),
 'DFX': PosixPath('F-raw-ligands/DFX_ideal.mol2'),
 '1DQ': PosixPath('F-raw-ligands/1DQ_ideal.mol2'),
 '6Y5': PosixPath('F-raw-ligands/6Y5_ideal.mol2'),
 'U5V': PosixPath('F-raw-ligands/U5V_ideal.mol2'),
 'DFE': PosixPath('F-raw-ligands/DFE_ideal.mol2'),
 'DFB': PosixPath('F-raw-ligands/DFB_ideal.mol2'),
 '9O3': PosixPath('F

In [11]:
path_maps = {(k): (downloads_folder / v if v is not None else None) for k, v in path_maps.items() }
paths_df = pd.DataFrame.from_dict(path_maps, orient="index").reset_index().rename({0: "path", "index": "identifier"}, axis=1)


### Count nan values and check it

In [12]:
nan_values = paths_df["path"].isna()
print("Num of nan: ", nan_values.sum())
paths_df[nan_values]

Num of nan:  33


Unnamed: 0,identifier,path
0,,
1159,PRD_900082,
1191,PRD_900073,
1205,PRD_900064,
1233,PRD_900050,
1329,PRD_900077,
2912,PRD_000781,
3280,PRD_000338,
3428,PRD_000239,
3593,PRD_000423,


There are protein sequences that are not used as ligands, so we can filter them

In [13]:
filtered_values = paths_df[~nan_values]
filtered_values

Unnamed: 0,identifier,path
1,AF3,files/F-raw-ligands/AF3_ideal.mol2
2,FAH,files/F-raw-ligands/FAH_ideal.mol2
3,ETF,files/F-raw-ligands/ETF_ideal.mol2
4,W6X,files/F-raw-ligands/W6X_ideal.mol2
5,TFA,files/F-raw-ligands/TFA_ideal.mol2
...,...,...
5023,TKY,files/F-raw-ligands/TKY_ideal.mol2
5024,QDQ,files/F-raw-ligands/QDQ_ideal.mol2
5025,VKA,files/F-raw-ligands/VKA_ideal.mol2
5029,XR4,files/F-raw-ligands/XR4_ideal.mol2


In [14]:
all_fluorine_ligands_filled = all_fluorine_ligands.\
                                    set_index("identifier").\
                                    join(filtered_values.set_index("identifier"), how="right")

In [7]:
try:
    all_fluorine_ligands_filled.to_pickle(data_folder / "flourine_filled_ligands.pkl")
except NameError:
    all_fluorine_ligands_filled = pd.read_pickle(data_folder / "flourine_filled_ligands.pkl")

all_fluorine_ligands_filled

Unnamed: 0,identifier,score,path
0,001,0.022222,files/F-raw-ligands/001_ideal.mol2
1,008,0.038462,files/F-raw-ligands/008_ideal.mol2
2,014,0.052632,files/F-raw-ligands/014_ideal.mol2
3,018,0.050000,files/F-raw-ligands/018_ideal.mol2
4,01C,0.047619,files/F-raw-ligands/01C_ideal.mol2
...,...,...,...
5061,ZZO,0.026316,files/F-raw-ligands/ZZO_ideal.mol2
5062,ZZP,0.031250,files/F-raw-ligands/ZZP_ideal.mol2
5063,ZZV,0.034483,files/F-raw-ligands/ZZV_ideal.mol2
5064,ZZW,0.040000,files/F-raw-ligands/ZZW_ideal.mol2


### Convert mol2 files to smiles via openbabel

In [27]:
all_fluorine_ligands_filled["smiles"] = \
    all_fluorine_ligands_filled["path"].apply(lambda x: mol2_to_smiles(str(x)))

1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule co

In [28]:
all_fluorine_ligands_filled["smiles"].isna().sum()

0

In [30]:
all_fluorine_ligands_smiled = all_fluorine_ligands_filled.copy()
all_fluorine_ligands_smiled.to_csv(data_folder / "flourine_smiled_ligands.csv")

In [8]:
all_fluorine_ligands_smiled = pd.read_csv(data_folder / "flourine_smiled_ligands.csv")

In [6]:
try:
    all_fluorine_ligands_smiled.to_pickle(data_folder / "flourine_smiled_ligands.pkl")
except NameError:
    all_fluorine_ligands_smiled = pd.read_pickle(data_folder / "flourine_smiled_ligands.pkl")

In [7]:
all_fluorine_ligands_smiled

Unnamed: 0,identifier,score,path,smiles
0,001,0.022222,files/F-raw-ligands/001_ideal.mol2,c1(cc(c(c(c1)OC)OC)OC)C(F)(F)C(=O)N1CCCC[C@H]1...
1,008,0.038462,files/F-raw-ligands/008_ideal.mol2,Fc1ccccc1C[C@@H](N)CC(=O)N1Cc2ccccc2C[C@H]1C(=O)N
2,014,0.052632,files/F-raw-ligands/014_ideal.mol2,Fc1cc2[nH]c(n3ncc(C(=O)O)c3)nc2cc1Cl
3,018,0.050000,files/F-raw-ligands/018_ideal.mol2,O=[N](O)c1cc(C(F)(F)F)ccc1NCc1cscc1
4,01C,0.047619,files/F-raw-ligands/01C_ideal.mol2,N[C@@H](Cc1c[nH]c2c1cccc2)C(=O)C(F)(F)C(F)(F)F
...,...,...,...,...
5061,ZZO,0.026316,files/F-raw-ligands/ZZO_ideal.mol2,Cc1ccccc1n1c(=O)c2c(C)cccc2nc1Cn1nc(c2c1ncnc2N...
5062,ZZP,0.031250,files/F-raw-ligands/ZZP_ideal.mol2,COC1=C/C(=N\c2nc3ccccc3nc2NS(=O)(=O)c2ccc(F)cc...
5063,ZZV,0.034483,files/F-raw-ligands/ZZV_ideal.mol2,CN(c1c2CN(Cc3ccc(F)cc3)C(=O)c2c(O)c2ncccc12)S(...
5064,ZZW,0.040000,files/F-raw-ligands/ZZW_ideal.mol2,O=C(NO)c1cc2c3ccccc3n(Cc3ccc(F)cc3)c2cn1


## Filter high-fluorinated ligands (more than 8 atoms of fluorine)

In [8]:
over_fluorine_ligands = all_fluorine_ligands_smiled.smiles.str.count("F") <= 8
all_fluorine_ligands_smiled_filtered = all_fluorine_ligands_smiled[over_fluorine_ligands]

## Getting PDB of each fluorinated ligand

### Getting correspondent PDB identifiers for eaсh ligand

In [10]:
all_fluorine_ligands_smiled_pdb = all_fluorine_ligands_smiled_filtered.copy()

In [11]:
all_fluorine_ligands_smiled_pdb = all_fluorine_ligands_smiled_pdb.rename({"pdbs": "pdbs_from_smiles"}, axis = 1)

In [12]:
try:
    all_fluorine_ligands_smiled_pdb.to_pickle(data_folder / "all_fluorine_ligands_smiled_pdb.pkl")
except NameError:
    all_fluorine_ligands_smiled_pdb = pd.read_pickle(data_folder / "all_fluorine_ligands_smiled_pdb.pkl")

In [14]:
all_fluorine_ligands_smiled_pdb_from_id = all_fluorine_ligands_smiled_pdb.copy()
all_fluorine_ligands_smiled_pdb_from_id["pdb_from_identifier"] = np.nan

In [155]:
for ligand in all_fluorine_ligands_smiled_pdb_from_id.itertuples():
    # print(ligand.pdb_from_identifier)
    if pd.isna(ligand.pdb_from_identifier):
        try:
            print(ligand.Index)
            #  Get list of pdb for this ligand
            all_fluorine_ligands_smiled_pdb_from_id.loc[ligand.Index, "pdb_from_identifier"] = \
                "|".join(get_list_of_pdb_by_ligand_identifier(ligand.identifier))
        except KeyboardInterrupt:
            break
        except Exception as e:
            print(e)
            continue

# all_fluorine_ligands_smiled_pdb_from_id["pdb_from_identifier"] = \
#     all_fluorine_ligands_smiled_pdb_from_id["identifier"].apply(get_list_of_pdb_by_ligand_identifier)

0
Query '001', (0, 2000)  successed!
1
Query '008', (0, 2000)  successed!
2
Query '014', (0, 2000)  successed!
3
Query '018', (0, 2000)  successed!
4
Nothing found for  01C
5
Query '020', (0, 2000)  successed!
6
Nothing found for  025
7
Query '032', (0, 2000)  successed!
8
Query '03M', (0, 2000)  successed!
9
Query '03P', (0, 2000)  successed!
10
Query '03Q', (0, 2000)  successed!
11
Query '03R', (0, 2000)  successed!
12
Query '03U', (0, 2000)  successed!
13
Query '03X', (0, 2000)  successed!
14
Query '041', (0, 2000)  successed!
15
Query '044', (0, 2000)  successed!
16
Query '046', (0, 2000)  successed!
17
Query '048', (0, 2000)  successed!
18
Query '04K', (0, 2000)  successed!
19
Query '04R', (0, 2000)  successed!
20
Query '051', (0, 2000)  successed!
21
Query '053', (0, 2000)  successed!
22
Query '054', (0, 2000)  successed!
23
Query '058', (0, 2000)  successed!
24
Query '05O', (0, 2000)  successed!
25
Query '05X', (0, 2000)  successed!
26
Query '063', (0, 2000)  successed!
27
Query

In [156]:
all_fluorine_ligands_smiled_pdb_from_id

Unnamed: 0,identifier,score,path,F_smiles,pdb_from_identifier
0,001,0.022222,files/F-raw-ligands/001_ideal.mol2,c1(cc(c(c(c1)OC)OC)OC)C(F)(F)C(=O)N1CCCC[C@H]1...,1J4R
1,008,0.038462,files/F-raw-ligands/008_ideal.mol2,Fc1ccccc1C[C@@H](N)CC(=O)N1Cc2ccccc2C[C@H]1C(=O)N,2BUC
2,014,0.052632,files/F-raw-ligands/014_ideal.mol2,Fc1cc2[nH]c(n3ncc(C(=O)O)c3)nc2cc1Cl,3OUH
3,018,0.050000,files/F-raw-ligands/018_ideal.mol2,O=[N](O)c1cc(C(F)(F)F)ccc1NCc1cscc1,3H7W
4,01C,0.047619,files/F-raw-ligands/01C_ideal.mol2,N[C@@H](Cc1c[nH]c2c1cccc2)C(=O)C(F)(F)C(F)(F)F,
...,...,...,...,...,...
5061,ZZO,0.026316,files/F-raw-ligands/ZZO_ideal.mol2,Cc1ccccc1n1c(=O)c2c(C)cccc2nc1Cn1nc(c2c1ncnc2N...,2WXH
5062,ZZP,0.031250,files/F-raw-ligands/ZZP_ideal.mol2,COC1=C/C(=N\c2nc3ccccc3nc2NS(=O)(=O)c2ccc(F)cc...,2WXO
5063,ZZV,0.034483,files/F-raw-ligands/ZZV_ideal.mol2,CN(c1c2CN(Cc3ccc(F)cc3)C(=O)c2c(O)c2ncccc12)S(...,3OYD
5064,ZZW,0.040000,files/F-raw-ligands/ZZW_ideal.mol2,O=C(NO)c1cc2c3ccccc3n(Cc3ccc(F)cc3)c2cn1,3OYC


In [157]:
try:
    all_fluorine_ligands_smiled_pdb_from_id.to_pickle(data_folder / "all_fluorine_ligands_smiled_pdb_from_id.pkl")
except NameError:
    all_fluorine_ligands_smiled_pdb_from_id = pd.read_pickle(data_folder / "all_fluorine_ligands_smiled_pdb_from_id.pkl")

In [153]:
get_list_of_pdb_by_ligand_identifier("001")

Query '001', (0, 2000)  successed!


['1J4R']

In [158]:
all_fluorine_ligands_smiled_pdb_from_id

Unnamed: 0,identifier,score,path,F_smiles,pdb_from_identifier
0,001,0.022222,files/F-raw-ligands/001_ideal.mol2,c1(cc(c(c(c1)OC)OC)OC)C(F)(F)C(=O)N1CCCC[C@H]1...,1J4R
1,008,0.038462,files/F-raw-ligands/008_ideal.mol2,Fc1ccccc1C[C@@H](N)CC(=O)N1Cc2ccccc2C[C@H]1C(=O)N,2BUC
2,014,0.052632,files/F-raw-ligands/014_ideal.mol2,Fc1cc2[nH]c(n3ncc(C(=O)O)c3)nc2cc1Cl,3OUH
3,018,0.050000,files/F-raw-ligands/018_ideal.mol2,O=[N](O)c1cc(C(F)(F)F)ccc1NCc1cscc1,3H7W
4,01C,0.047619,files/F-raw-ligands/01C_ideal.mol2,N[C@@H](Cc1c[nH]c2c1cccc2)C(=O)C(F)(F)C(F)(F)F,
...,...,...,...,...,...
5061,ZZO,0.026316,files/F-raw-ligands/ZZO_ideal.mol2,Cc1ccccc1n1c(=O)c2c(C)cccc2nc1Cn1nc(c2c1ncnc2N...,2WXH
5062,ZZP,0.031250,files/F-raw-ligands/ZZP_ideal.mol2,COC1=C/C(=N\c2nc3ccccc3nc2NS(=O)(=O)c2ccc(F)cc...,2WXO
5063,ZZV,0.034483,files/F-raw-ligands/ZZV_ideal.mol2,CN(c1c2CN(Cc3ccc(F)cc3)C(=O)c2c(O)c2ncccc12)S(...,3OYD
5064,ZZW,0.040000,files/F-raw-ligands/ZZW_ideal.mol2,O=C(NO)c1cc2c3ccccc3n(Cc3ccc(F)cc3)c2cn1,3OYC


## Generate H-replacements in ligands containing fluorine

In [17]:
paths_to_replacements = {}

all_fluorine_ligands_replacements = all_fluorine_ligands_smiled_filtered.copy()

In [18]:
def replacement_folder_exists(path):
    return (flourine_hydrogen_replacements_folder / path.stem).exists()


all_fluorine_ligands_replacements["has_replacements"] = all_fluorine_ligands_replacements["path"].\
                                                        apply(Path).\
                                                        apply(replacement_folder_exists)
all_fluorine_ligands_replacements

Unnamed: 0,identifier,score,path,smiles,has_replacements
0,001,0.022222,files/F-raw-ligands/001_ideal.mol2,c1(cc(c(c(c1)OC)OC)OC)C(F)(F)C(=O)N1CCCC[C@H]1...,True
1,008,0.038462,files/F-raw-ligands/008_ideal.mol2,Fc1ccccc1C[C@@H](N)CC(=O)N1Cc2ccccc2C[C@H]1C(=O)N,True
2,014,0.052632,files/F-raw-ligands/014_ideal.mol2,Fc1cc2[nH]c(n3ncc(C(=O)O)c3)nc2cc1Cl,True
3,018,0.050000,files/F-raw-ligands/018_ideal.mol2,O=[N](O)c1cc(C(F)(F)F)ccc1NCc1cscc1,True
4,01C,0.047619,files/F-raw-ligands/01C_ideal.mol2,N[C@@H](Cc1c[nH]c2c1cccc2)C(=O)C(F)(F)C(F)(F)F,True
...,...,...,...,...,...
5061,ZZO,0.026316,files/F-raw-ligands/ZZO_ideal.mol2,Cc1ccccc1n1c(=O)c2c(C)cccc2nc1Cn1nc(c2c1ncnc2N...,True
5062,ZZP,0.031250,files/F-raw-ligands/ZZP_ideal.mol2,COC1=C/C(=N\c2nc3ccccc3nc2NS(=O)(=O)c2ccc(F)cc...,True
5063,ZZV,0.034483,files/F-raw-ligands/ZZV_ideal.mol2,CN(c1c2CN(Cc3ccc(F)cc3)C(=O)c2c(O)c2ncccc12)S(...,True
5064,ZZW,0.040000,files/F-raw-ligands/ZZW_ideal.mol2,O=C(NO)c1cc2c3ccccc3n(Cc3ccc(F)cc3)c2cn1,True


In [19]:
for ligand in all_fluorine_ligands_replacements.itertuples():
    
    p = Path(ligand.path)
    
    if ligand.has_replacements == True:
        paths_to_replacements[ligand.identifier] = flourine_hydrogen_replacements_folder / p.stem
        continue
    
    do_atom_replacement(p, "F", "H", flourine_hydrogen_replacements_folder / p.stem)
    
    paths_to_replacements[ligand.identifier] = flourine_hydrogen_replacements_folder / p.stem
    print(ligand.identifier, ligand.Index)

In [22]:
all_fluorine_ligands_replacements["has_replacements"] = all_fluorine_ligands_replacements["path"].\
                                                        apply(Path).\
                                                        apply(replacement_folder_exists)

In [23]:
all_fluorine_ligands_smiled_with_replacements = all_fluorine_ligands_replacements[all_fluorine_ligands_replacements["has_replacements"] == True].\
    drop("has_replacements", axis = 1)

all_fluorine_ligands_smiled_with_replacements["replacements_folder"] = \
    all_fluorine_ligands_smiled_with_replacements["identifier"].map(paths_to_replacements)

all_fluorine_ligands_smiled_with_replacements

Unnamed: 0,identifier,score,path,smiles,replacements_folder
0,001,0.022222,files/F-raw-ligands/001_ideal.mol2,c1(cc(c(c(c1)OC)OC)OC)C(F)(F)C(=O)N1CCCC[C@H]1...,files/F-H-replacements/001_ideal
1,008,0.038462,files/F-raw-ligands/008_ideal.mol2,Fc1ccccc1C[C@@H](N)CC(=O)N1Cc2ccccc2C[C@H]1C(=O)N,files/F-H-replacements/008_ideal
2,014,0.052632,files/F-raw-ligands/014_ideal.mol2,Fc1cc2[nH]c(n3ncc(C(=O)O)c3)nc2cc1Cl,files/F-H-replacements/014_ideal
3,018,0.050000,files/F-raw-ligands/018_ideal.mol2,O=[N](O)c1cc(C(F)(F)F)ccc1NCc1cscc1,files/F-H-replacements/018_ideal
4,01C,0.047619,files/F-raw-ligands/01C_ideal.mol2,N[C@@H](Cc1c[nH]c2c1cccc2)C(=O)C(F)(F)C(F)(F)F,files/F-H-replacements/01C_ideal
...,...,...,...,...,...
5061,ZZO,0.026316,files/F-raw-ligands/ZZO_ideal.mol2,Cc1ccccc1n1c(=O)c2c(C)cccc2nc1Cn1nc(c2c1ncnc2N...,files/F-H-replacements/ZZO_ideal
5062,ZZP,0.031250,files/F-raw-ligands/ZZP_ideal.mol2,COC1=C/C(=N\c2nc3ccccc3nc2NS(=O)(=O)c2ccc(F)cc...,files/F-H-replacements/ZZP_ideal
5063,ZZV,0.034483,files/F-raw-ligands/ZZV_ideal.mol2,CN(c1c2CN(Cc3ccc(F)cc3)C(=O)c2c(O)c2ncccc12)S(...,files/F-H-replacements/ZZV_ideal
5064,ZZW,0.040000,files/F-raw-ligands/ZZW_ideal.mol2,O=C(NO)c1cc2c3ccccc3n(Cc3ccc(F)cc3)c2cn1,files/F-H-replacements/ZZW_ideal


## Convert each replacement to smiles

In [25]:
ligands_for_conversion = all_fluorine_ligands_smiled_with_replacements.copy()

In [26]:
dfs_replacements = []

In [128]:
for ligand in ligands_for_conversion.itertuples():
    
    d = {}

    for replacement_mol2 in Path(ligand.replacements_folder).iterdir():
        if replacement_mol2.suffix == ".mol2":
            
            replacement_id = replacement_mol2.stem.split('_')[-1]
            d[replacement_mol2] = {"id": replacement_id, "H_smiles": mol2_to_smiles(str(replacement_mol2))}

    df = pd.DataFrame.from_dict(d, orient="index")
    df["F_ligand_id"] = ligand.identifier

    df.reset_index().rename({"index": "path"}, axis=1)
    
    dfs_replacements.append(df)

1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule co

In [129]:
full_df_replacements_table = pd.concat(dfs_replacements)

In [36]:
try:
    full_df_replacements_table.to_pickle(data_folder / "full_df_replacements_table.pkl")
except NameError:
    full_df_replacements_table = pd.read_pickle(data_folder / "full_df_replacements_table.pkl")

In [38]:
full_df_replacements_table

Unnamed: 0,id,H_smiles,F_ligand_id
0,1,c1(cc(c(c(c1)OC)OC)OC)[C@H](F)C(=O)N1CCCC[C@H]...,001
1,0,c1(cc(c(c(c1)OC)OC)OC)[C@@H](F)C(=O)N1CCCC[C@H...,001
2,0,c1ccccc1C[C@@H](N)CC(=O)N1Cc2ccccc2C[C@H]1C(=O)N,008
3,0,Clc1ccc2[nH]c(n3ncc(C(=O)O)c3)nc2c1,014
4,1,O=[N](O)c1cc(C(F)F)ccc1NCc1cscc1,018
...,...,...,...
10469,0,Cc1ccccc1n1c(=O)c2c(C)cccc2nc1Cn1nc(c2c1ncnc2N...,ZZO
10470,0,COC1=C/C(=N\c2nc3ccccc3nc2NS(=O)(=O)c2ccccc2)/...,ZZP
10471,0,CN(c1c2CN(Cc3ccccc3)C(=O)c2c(O)c2ncccc12)S(=O)...,ZZV
10472,0,O=C(NO)c1cc2c3ccccc3n(Cc3ccccc3)c2cn1,ZZW


In [39]:
len(full_df_replacements_table)

10474

### Remove duplicates of smiles

In [41]:
full_df_replacements_table_nodup = full_df_replacements_table.drop_duplicates(subset = "H_smiles")

In [42]:
full_df_replacements_table_nodup

Unnamed: 0,id,H_smiles,F_ligand_id
0,1,c1(cc(c(c(c1)OC)OC)OC)[C@H](F)C(=O)N1CCCC[C@H]...,001
1,0,c1(cc(c(c(c1)OC)OC)OC)[C@@H](F)C(=O)N1CCCC[C@H...,001
2,0,c1ccccc1C[C@@H](N)CC(=O)N1Cc2ccccc2C[C@H]1C(=O)N,008
3,0,Clc1ccc2[nH]c(n3ncc(C(=O)O)c3)nc2c1,014
4,1,O=[N](O)c1cc(C(F)F)ccc1NCc1cscc1,018
...,...,...,...
10469,0,Cc1ccccc1n1c(=O)c2c(C)cccc2nc1Cn1nc(c2c1ncnc2N...,ZZO
10470,0,COC1=C/C(=N\c2nc3ccccc3nc2NS(=O)(=O)c2ccccc2)/...,ZZP
10471,0,CN(c1c2CN(Cc3ccccc3)C(=O)c2c(O)c2ncccc12)S(=O)...,ZZV
10472,0,O=C(NO)c1cc2c3ccccc3n(Cc3ccccc3)c2cn1,ZZW


In [43]:
try:
    full_df_replacements_table_nodup.to_pickle(data_folder / "full_df_replacements_table_nodup.pkl")
except NameError:
    full_df_replacements_table_nodup = pd.read_pickle(data_folder / "full_df_replacements_table_nodup.pkl")


# Get PDB of each flourinated ligand and with hydrogen replacements

## Queries for fluorinated ligands

In [47]:
full_df_replacements_table_nodup_pdb = full_df_replacements_table_nodup.copy()
full_df_replacements_table_nodup_pdb["pdb_list"] = np.nan

In [48]:
full_df_replacements_table_nodup_pdb

Unnamed: 0,id,H_smiles,F_ligand_id,pdb_list
0,1,c1(cc(c(c(c1)OC)OC)OC)[C@H](F)C(=O)N1CCCC[C@H]...,001,
1,0,c1(cc(c(c(c1)OC)OC)OC)[C@@H](F)C(=O)N1CCCC[C@H...,001,
2,0,c1ccccc1C[C@@H](N)CC(=O)N1Cc2ccccc2C[C@H]1C(=O)N,008,
3,0,Clc1ccc2[nH]c(n3ncc(C(=O)O)c3)nc2c1,014,
4,1,O=[N](O)c1cc(C(F)F)ccc1NCc1cscc1,018,
...,...,...,...,...
10469,0,Cc1ccccc1n1c(=O)c2c(C)cccc2nc1Cn1nc(c2c1ncnc2N...,ZZO,
10470,0,COC1=C/C(=N\c2nc3ccccc3nc2NS(=O)(=O)c2ccccc2)/...,ZZP,
10471,0,CN(c1c2CN(Cc3ccccc3)C(=O)c2c(O)c2ncccc12)S(=O)...,ZZV,
10472,0,O=C(NO)c1cc2c3ccccc3n(Cc3ccccc3)c2cn1,ZZW,


In [56]:
for ligand in full_df_replacements_table_nodup_pdb.itertuples():
    # print(ligand.pdb_from_identifier)
    # print(ligand)
    if pd.isna(ligand.H_pdb_list):
        # print("!")
        try:
            print(ligand.Index)
            h_pdbs = get_list_of_pdb_by_smiles(ligand.H_smiles)
            
            if isinstance(h_pdbs, list):
                full_df_replacements_table_nodup_pdb.loc[ligand.Index, "H_pdb_list"] = \
                    "|".join(h_pdbs)
            else:
                full_df_replacements_table_nodup_pdb.loc[ligand.Index, "H_pdb_list"] = h_pdbs
                
        except KeyboardInterrupt:
            break
        # except Exception as e:
        #     print(e)
        #     continue


0
Nothing found for  c1(cc(c(c(c1)OC)OC)OC)[C@H](F)C(=O)N1CCCC[C@H]1C(=O)O[C@H](CCCc1cnccc1)CCCc1ccccc1
1
Nothing found for  c1(cc(c(c(c1)OC)OC)OC)[C@@H](F)C(=O)N1CCCC[C@H]1C(=O)O[C@H](CCCc1cnccc1)CCCc1ccccc1
2
Nothing found for  c1ccccc1C[C@@H](N)CC(=O)N1Cc2ccccc2C[C@H]1C(=O)N
3
Nothing found for  Clc1ccc2[nH]c(n3ncc(C(=O)O)c3)nc2c1
4
Nothing found for  O=[N](O)c1cc(C(F)F)ccc1NCc1cscc1
7
Nothing found for  N[C@@H](Cc1c[nH]c2c1cccc2)C(=O)[C@@H](F)C(F)(F)F
8
Nothing found for  N[C@@H](Cc1c[nH]c2c1cccc2)C(=O)[C@H](F)C(F)(F)F
9
Nothing found for  N[C@@H](Cc1c[nH]c2c1cccc2)C(=O)C(F)(F)C(F)F
12
Nothing found for  FC(F)c1ccc(c(c1)[N](=O)O)NCc1ccco1
13
Nothing found for  C(F)(F)c1ccc(c(c1)[N](=O)O)NCc1ccco1
15
Nothing found for  c1(S(=O)(=O)O)cccc(C(F)F)c1
18
Nothing found for  c1(ccc(c2cnc3[nH]cc(C(=O)c4c(c(ccc4)NS(=O)(=O)CCC)F)c3c2)cc1)Cl
19
Nothing found for  c1(ccc(c2cnc3[nH]cc(C(=O)c4cc(ccc4F)NS(=O)(=O)CCC)c3c2)cc1)Cl
20
Nothing found for  c1c2c(c[nH]c2c(c(c1)Cl)C)/C=C/1\NC(=O)N(C1=O)Cc1

In [77]:
full_df_replacements_table_nodup_pdb = full_df_replacements_table_nodup_pdb.drop(["Unnamed: 0"], axis = 1).\
                                     rename({"pdb_list": "H_pdb_list",
                                             "ligand_id": "F_ligand_id",
                                             "smiles": "H_smiles"}, axis = 1)

In [119]:
try:
    full_df_replacements_table_nodup_pdb.to_pickle(data_folder / "full_df_replacements_table_nodup_pdb.pkl")
except NameError:
    full_df_replacements_table_nodup_pdb = pd.read_pickle(data_folder / "full_df_replacements_table_nodup_pdb.pkl")


In [120]:
(~pd.isna(full_df_replacements_table_nodup_pdb.H_pdb_list)).sum()

550

In [121]:
has_pdb = (~pd.isna(full_df_replacements_table_nodup_pdb.H_pdb_list))

full_df_replacements_table_nodup_pdb_no_common =\
full_df_replacements_table_nodup_pdb[has_pdb]

In [122]:
def str_to_set(s, delim = "|"):
    if pd.isna(s):
        return np.nan
    elif isinstance(s, set):
        return s
    else:
        try:
            return set(s.split(delim))
        except AttributeError:
            # print(f"Too much pdbs({s}) for ligand")
            return np.nan

In [124]:
full_df_replacements_table_nodup_pdb_no_common.H_pdb_list =\
full_df_replacements_table_nodup_pdb_no_common.H_pdb_list.apply(str_to_set)

In [125]:
full_df_replacements_table_nodup_pdb_no_common

Unnamed: 0,id,H_smiles,F_ligand_id,H_pdb_list
40,0,n1c(N)c(C(=O)c2ccccc2)sc1Nc1ccccc1,06Z,{3QTR}
54,0,c1c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N)...,09I,"{6L05, 5ZA9}"
68,0,N(c1ccc(cc1)C(=O)O)c1nc(Nc2ccccc2)ccn1,0BZ,"{3UNJ, 3UO5}"
74,0,Clc1ccccc1Nc1ccnc(Nc2ccc(C(=O)O)cc2)n1,0C6,"{3UNK, 3UO6}"
93,0,C(O[P@@](=O)(O)OP(=O)(O)O)/C=C(\C)/CCC=C(C)C,0FV,"{5XK8, 3VC2, 3KRP, 2E8X, 1W55, 1H47, 1ZCW, 2VG..."
...,...,...,...,...
7155,0,n1c2ccccc2sc1NC(=O)C,ZC3,{4UVH}
7157,0,c1c(CN)cccc1,ZDV,"{3HAT, 1UTJ, 5MNL, 1N6Y, 6T56, 5PA9, 2BZA, 2EU..."
7162,0,O=C(C)N[C@@H]1[C@@H](NC(=N)N)C=C(O[C@H]1[C@H](...,ZGE,"{4I00, 4CPZ, 3CKZ, 3TI5, 6EKU, 2CML, 6HCX, 4CP..."
7178,0,C1C(=O)C=C2CC[C@@H]3[C@@H]([C@]2(C1)C)[C@H](C[...,ZK5,"{6HGG, 6HGF, 4P6X, 6NWL, 5HGC, 2VDY, 6ITP, 2V9..."


In [130]:
all_fluorine_ligands_smiled_pdb_from_id = \
all_fluorine_ligands_smiled_pdb_from_id.rename({"smiles": "F_smiles"}, axis = 1)

In [131]:
all_fluorine_ligands_smiled_pdb_from_id

Unnamed: 0,identifier,score,path,F_smiles,pdb_from_identifier
0,001,0.022222,files/F-raw-ligands/001_ideal.mol2,c1(cc(c(c(c1)OC)OC)OC)C(F)(F)C(=O)N1CCCC[C@H]1...,
1,008,0.038462,files/F-raw-ligands/008_ideal.mol2,Fc1ccccc1C[C@@H](N)CC(=O)N1Cc2ccccc2C[C@H]1C(=O)N,
2,014,0.052632,files/F-raw-ligands/014_ideal.mol2,Fc1cc2[nH]c(n3ncc(C(=O)O)c3)nc2cc1Cl,
3,018,0.050000,files/F-raw-ligands/018_ideal.mol2,O=[N](O)c1cc(C(F)(F)F)ccc1NCc1cscc1,
4,01C,0.047619,files/F-raw-ligands/01C_ideal.mol2,N[C@@H](Cc1c[nH]c2c1cccc2)C(=O)C(F)(F)C(F)(F)F,
...,...,...,...,...,...
5061,ZZO,0.026316,files/F-raw-ligands/ZZO_ideal.mol2,Cc1ccccc1n1c(=O)c2c(C)cccc2nc1Cn1nc(c2c1ncnc2N...,
5062,ZZP,0.031250,files/F-raw-ligands/ZZP_ideal.mol2,COC1=C/C(=N\c2nc3ccccc3nc2NS(=O)(=O)c2ccc(F)cc...,
5063,ZZV,0.034483,files/F-raw-ligands/ZZV_ideal.mol2,CN(c1c2CN(Cc3ccc(F)cc3)C(=O)c2c(O)c2ncccc12)S(...,
5064,ZZW,0.040000,files/F-raw-ligands/ZZW_ideal.mol2,O=C(NO)c1cc2c3ccccc3n(Cc3ccc(F)cc3)c2cn1,


In [136]:
full_df_replacements_table_nodup_pdb_no_common

Unnamed: 0,id,H_smiles,F_ligand_id,H_pdb_list
40,0,n1c(N)c(C(=O)c2ccccc2)sc1Nc1ccccc1,06Z,{3QTR}
54,0,c1c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N)...,09I,"{6L05, 5ZA9}"
68,0,N(c1ccc(cc1)C(=O)O)c1nc(Nc2ccccc2)ccn1,0BZ,"{3UNJ, 3UO5}"
74,0,Clc1ccccc1Nc1ccnc(Nc2ccc(C(=O)O)cc2)n1,0C6,"{3UNK, 3UO6}"
93,0,C(O[P@@](=O)(O)OP(=O)(O)O)/C=C(\C)/CCC=C(C)C,0FV,"{5XK8, 3VC2, 3KRP, 2E8X, 1W55, 1H47, 1ZCW, 2VG..."
...,...,...,...,...
7155,0,n1c2ccccc2sc1NC(=O)C,ZC3,{4UVH}
7157,0,c1c(CN)cccc1,ZDV,"{3HAT, 1UTJ, 5MNL, 1N6Y, 6T56, 5PA9, 2BZA, 2EU..."
7162,0,O=C(C)N[C@@H]1[C@@H](NC(=N)N)C=C(O[C@H]1[C@H](...,ZGE,"{4I00, 4CPZ, 3CKZ, 3TI5, 6EKU, 2CML, 6HCX, 4CP..."
7178,0,C1C(=O)C=C2CC[C@@H]3[C@@H]([C@]2(C1)C)[C@H](C[...,ZK5,"{6HGG, 6HGF, 4P6X, 6NWL, 5HGC, 2VDY, 6ITP, 2V9..."


In [159]:
merged_table = full_df_replacements_table_nodup_pdb_no_common.merge(all_fluorine_ligands_smiled_pdb_from_id, 
         left_on = "F_ligand_id",
         right_on = "identifier",
         how = "left")

In [160]:
merged_table

Unnamed: 0,id,H_smiles,F_ligand_id,H_pdb_list,identifier,score,path,F_smiles,pdb_from_identifier
0,0,n1c(N)c(C(=O)c2ccccc2)sc1Nc1ccccc1,06Z,{3QTR},06Z,0.045455,files/F-raw-ligands/06Z_ideal.mol2,Fc1cc(Nc2nc(N)c(C(=O)c3ccccc3)s2)ccc1,3RJC
1,0,c1c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N)...,09I,"{6L05, 5ZA9}",09I,0.033333,files/F-raw-ligands/09I_ideal.mol2,c1(c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N...,5ZC5
2,0,N(c1ccc(cc1)C(=O)O)c1nc(Nc2ccccc2)ccn1,0BZ,"{3UNJ, 3UO5}",0BZ,0.041667,files/F-raw-ligands/0BZ_ideal.mol2,Fc1c(Nc2nc(Nc3ccc(cc3)C(=O)O)ncc2)cccc1,3UNZ
3,0,Clc1ccccc1Nc1ccnc(Nc2ccc(C(=O)O)cc2)n1,0C6,"{3UNK, 3UO6}",0C6,0.040000,files/F-raw-ligands/0C6_ideal.mol2,Clc1ccccc1Nc1c(cnc(Nc2ccc(C(=O)O)cc2)n1)F,3UOK
4,0,C(O[P@@](=O)(O)OP(=O)(O)O)/C=C(\C)/CCC=C(C)C,0FV,"{5XK8, 3VC2, 3KRP, 2E8X, 1W55, 1H47, 1ZCW, 2VG...",0FV,0.050000,files/F-raw-ligands/0FV_ideal.mol2,C(O[P@@](=O)(O)OP(=O)(O)O)/C(=C(\C)/CCC=C(C)C)/F,5NX5|3V1X|5UV1|5NX7
...,...,...,...,...,...,...,...,...,...
545,0,n1c2ccccc2sc1NC(=O)C,ZC3,{4UVH},ZC3,0.071429,files/F-raw-ligands/ZC3_ideal.mol2,n1c2cc(F)ccc2sc1NC(=O)C,5A4L
546,0,c1c(CN)cccc1,ZDV,"{3HAT, 1UTJ, 5MNL, 1N6Y, 6T56, 5PA9, 2BZA, 2EU...",ZDV,0.111111,files/F-raw-ligands/ZDV_ideal.mol2,c1c(CN)cc(F)cc1,
547,0,O=C(C)N[C@@H]1[C@@H](NC(=N)N)C=C(O[C@H]1[C@H](...,ZGE,"{4I00, 4CPZ, 3CKZ, 3TI5, 6EKU, 2CML, 6HCX, 4CP...",ZGE,0.041667,files/F-raw-ligands/ZGE_ideal.mol2,O=C(C)N[C@@H]1[C@@H](NC(=N)N)C(=C(O[C@H]1[C@H]...,3W09
548,0,C1C(=O)C=C2CC[C@@H]3[C@@H]([C@]2(C1)C)[C@H](C[...,ZK5,"{6HGG, 6HGF, 4P6X, 6NWL, 5HGC, 2VDY, 6ITP, 2V9...",ZK5,0.037037,files/F-raw-ligands/ZK5_ideal.mol2,C1C(=O)C=C2CC[C@@H]3[C@@]([C@]2(C1)C)([C@H](C[...,1GS4


0                        {3RJC}
1                        {5ZC5}
2                        {3UNZ}
3                        {3UOK}
4      {5NX7, 3V1X, 5NX5, 5UV1}
                 ...           
545                      {5A4L}
546                          {}
547                      {3W09}
548                      {1GS4}
549                      {5M8Z}
Name: F_PDB, Length: 550, dtype: object

In [179]:
merged_table_fmt = merged_table_fmt.rename({"H_pdb_list": "H_PDB"}, axis = 1)

In [168]:
merged_table_fmt = merged_table.drop(["id", "score", "path", "identifier"], axis = 1)\
                               .rename({"pdb_from_identifier": "F_PDB",
                                        "H_pdb_list": "H_PDB"}, axis = 1)\

merged_table_fmt.F_PDB = merged_table_fmt.F_PDB.apply(str_to_set) 



In [180]:
try:
    merged_table_fmt.to_pickle(data_folder / "merged_table_fmt.pkl")
except NameError:
    merged_table_fmt = pd.read_pickle(data_folder / "merged_table_fmt.pkl")

In [181]:
merged_table_fmt

Unnamed: 0,H_smiles,F_ligand_id,H_PDB,F_smiles,F_PDB
0,n1c(N)c(C(=O)c2ccccc2)sc1Nc1ccccc1,06Z,{3QTR},Fc1cc(Nc2nc(N)c(C(=O)c3ccccc3)s2)ccc1,{3RJC}
1,c1c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N)...,09I,"{6L05, 5ZA9}",c1(c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N...,{5ZC5}
2,N(c1ccc(cc1)C(=O)O)c1nc(Nc2ccccc2)ccn1,0BZ,"{3UNJ, 3UO5}",Fc1c(Nc2nc(Nc3ccc(cc3)C(=O)O)ncc2)cccc1,{3UNZ}
3,Clc1ccccc1Nc1ccnc(Nc2ccc(C(=O)O)cc2)n1,0C6,"{3UNK, 3UO6}",Clc1ccccc1Nc1c(cnc(Nc2ccc(C(=O)O)cc2)n1)F,{3UOK}
4,C(O[P@@](=O)(O)OP(=O)(O)O)/C=C(\C)/CCC=C(C)C,0FV,"{5XK8, 3VC2, 3KRP, 2E8X, 1W55, 1H47, 1ZCW, 2VG...",C(O[P@@](=O)(O)OP(=O)(O)O)/C(=C(\C)/CCC=C(C)C)/F,"{5NX7, 3V1X, 5NX5, 5UV1}"
...,...,...,...,...,...
545,n1c2ccccc2sc1NC(=O)C,ZC3,{4UVH},n1c2cc(F)ccc2sc1NC(=O)C,{5A4L}
546,c1c(CN)cccc1,ZDV,"{3HAT, 1UTJ, 5MNL, 1N6Y, 6T56, 5PA9, 2BZA, 2EU...",c1c(CN)cc(F)cc1,{}
547,O=C(C)N[C@@H]1[C@@H](NC(=N)N)C=C(O[C@H]1[C@H](...,ZGE,"{4I00, 4CPZ, 3CKZ, 3TI5, 6EKU, 2CML, 6HCX, 4CP...",O=C(C)N[C@@H]1[C@@H](NC(=N)N)C(=C(O[C@H]1[C@H]...,{3W09}
548,C1C(=O)C=C2CC[C@@H]3[C@@H]([C@]2(C1)C)[C@H](C[...,ZK5,"{6HGG, 6HGF, 4P6X, 6NWL, 5HGC, 2VDY, 6ITP, 2V9...",C1C(=O)C=C2CC[C@@H]3[C@@]([C@]2(C1)C)([C@H](C[...,{1GS4}


# Merged table operaions

In [174]:
merged_table_fmt

Unnamed: 0,H_smiles,F_ligand_id,H_pdb_list,F_smiles,F_PDB
0,n1c(N)c(C(=O)c2ccccc2)sc1Nc1ccccc1,06Z,{3QTR},Fc1cc(Nc2nc(N)c(C(=O)c3ccccc3)s2)ccc1,{3RJC}
1,c1c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N)...,09I,"{6L05, 5ZA9}",c1(c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N...,{5ZC5}
2,N(c1ccc(cc1)C(=O)O)c1nc(Nc2ccccc2)ccn1,0BZ,"{3UNJ, 3UO5}",Fc1c(Nc2nc(Nc3ccc(cc3)C(=O)O)ncc2)cccc1,{3UNZ}
3,Clc1ccccc1Nc1ccnc(Nc2ccc(C(=O)O)cc2)n1,0C6,"{3UNK, 3UO6}",Clc1ccccc1Nc1c(cnc(Nc2ccc(C(=O)O)cc2)n1)F,{3UOK}
4,C(O[P@@](=O)(O)OP(=O)(O)O)/C=C(\C)/CCC=C(C)C,0FV,"{5XK8, 3VC2, 3KRP, 2E8X, 1W55, 1H47, 1ZCW, 2VG...",C(O[P@@](=O)(O)OP(=O)(O)O)/C(=C(\C)/CCC=C(C)C)/F,"{5NX7, 3V1X, 5NX5, 5UV1}"
...,...,...,...,...,...
545,n1c2ccccc2sc1NC(=O)C,ZC3,{4UVH},n1c2cc(F)ccc2sc1NC(=O)C,{5A4L}
546,c1c(CN)cccc1,ZDV,"{3HAT, 1UTJ, 5MNL, 1N6Y, 6T56, 5PA9, 2BZA, 2EU...",c1c(CN)cc(F)cc1,{}
547,O=C(C)N[C@@H]1[C@@H](NC(=N)N)C=C(O[C@H]1[C@H](...,ZGE,"{4I00, 4CPZ, 3CKZ, 3TI5, 6EKU, 2CML, 6HCX, 4CP...",O=C(C)N[C@@H]1[C@@H](NC(=N)N)C(=C(O[C@H]1[C@H]...,{3W09}
548,C1C(=O)C=C2CC[C@@H]3[C@@H]([C@]2(C1)C)[C@H](C[...,ZK5,"{6HGG, 6HGF, 4P6X, 6NWL, 5HGC, 2VDY, 6ITP, 2V9...",C1C(=O)C=C2CC[C@@H]3[C@@]([C@]2(C1)C)([C@H](C[...,{1GS4}


## Explode H PDB lists and find activity info

In [183]:
merged_table_fmt_h_explode = merged_table_fmt.explode("H_PDB")

In [185]:
df = merged_table_fmt_h_explode.copy()

In [189]:
affinity_dfs = []
for i in df.copy().itertuples():
    
    af_df = get_activity_df_by_pdb(i.H_PDB)
    if af_df is not None:
        af_df["H_PDB"] = i.H_PDB
        affinity_dfs.append(af_df)
    
affinity_df = pd.concat(affinity_dfs)
res = df.merge(affinity_df, how = "left", on = "H_PDB")

3QTR
6L05
5ZA9
3UNJ
3UO5
3UNK
3UO6
5XK8
3VC2
3KRP
2E8X
1W55
1H47
1ZCW
2VG3
5GP0
1YQN
1W57
4TQ3
2VG0
6PGN
5B03
6YPF
3ERN
3EOR
2AMT
1UBW
2GZL
4K7T
5B00
3ZOU
3ELC
3ZMC
4F86
1H48
3FBA
3ESJ
4G8V
4QWD
5DDE
4FOY
4GIH
4GMY
4GII
10183
7CAV
7RNI
2ICK
2ZRZ
3Q1O
5XK7
3EPL
1UBY
4GP1
6M34
3KEF
1YHL
3B06
6QLG
5XK9
3Q2Q
2ZRX
5YNU
4GP2
5YNT
3QQV
4TQ4
3OYR
3P41
3PDE
1O7N
3Q8I
2B24
2P85
4EAN
1EG9
4UUJ
5XDD
1L4H
5XDC
185L
2XVJ
1UUV
3SNM
5LVW
3SLI
2JKB
2SLI
4X4A
5MNY
5MOO
3GUN
2OV4
5CIE
3MS3
1HJ9
5MNC
5MNA
7BS7
7BS0
1PPA
1AEE
6N2R
5EYO
6U16
6OD5
4M9V
5KL5
5EXH
4Y7N
4R2R
6N2S
4Y52
3UO7
4PWM
5ZAT
5KL7
4J0Y
4J0V
4J0P
4J1F
4J1H
4J0Z
3ZMG
7LAJ
5TA8
5VBP
7L9J
7L72
7BJY
4E47
5BUV
5EPU
2P2R
5ERE
5LOK
5KO5
4IQI
3SLB
5KO6
3OAH
5EFO
3NG9
4ZT8
4LD4
3IKE
3MBM
6KDJ
5U2S
5TBC
4K4H
6UJY
5TB8
6KDO
6OUN
4HKI
2G0L
5M36
5EXA
3R9S
6OS5
5EWZ
7S6S
6QL3
4IJI
7RJZ
5FXF
6OTL
5ZZS
6SNN
6AWT
4UU8
3TX2
4MF6
4UTR
4EVR
5XFI
4M51
3IEO
6RMX
4XDQ
3IT4
1DJR
7BNH
4UTZ
1NCW
6VK4
2Q0I
6LTF
3O9M
6VK8
5M35
5JOS
2AJY
3FFP
1RED
2VL2
1YAJ
6Q38
7M8R

In [191]:
union_ligands_fmt_export_h_explode_with_affinity = res.copy()

try:
    union_ligands_fmt_export_h_explode_with_affinity.to_pickle(data_folder / "union_ligands_fmt_export_h_explode_with_affinity.pkl")
except NameError:
    union_ligands_fmt_export_h_explode_with_affinity = pd.read_pickle(data_folder / "union_ligands_fmt_export_h_explode_with_affinity.pkl")

In [193]:
union_ligands_fmt_export_h_explode_with_affinity_flitered = \
    union_ligands_fmt_export_h_explode_with_affinity.dropna(subset = ["value"]).drop(["reference_sequence_identity", "symbol"], axis = 1)

In [194]:
union_ligands_fmt_export_h_explode_with_affinity_flitered

Unnamed: 0,H_smiles,F_ligand_id,H_PDB,F_smiles,F_PDB,comp_id,type,value,unit,provenance_code,link
0,n1c(N)c(C(=O)c2ccccc2)sc1Nc1ccccc1,06Z,3QTR,Fc1cc(Nc2nc(N)c(C(=O)c3ccccc3)s2)ccc1,{3RJC},X36,IC50,930.0,nM,PDBBind,http://www.pdbbind-cn.org/quickpdb.php?quickpd...
2,c1c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N)...,09I,5ZA9,c1(c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N...,{5ZC5},50I,IC50,530.0,nM,BindingDB,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...
3,c1c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N)...,09I,5ZA9,c1(c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N...,{5ZC5},50I,Ki,183.0,nM,BindingDB,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...
4,c1c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N)...,09I,5ZA9,c1(c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N...,{5ZC5},50I,Ki,183.0,nM,Binding MOAD,http://www.bindingmoad.org/pdbrecords/index/5za9
5,N(c1ccc(cc1)C(=O)O)c1nc(Nc2ccccc2)ccn1,0BZ,3UNJ,Fc1c(Nc2nc(Nc3ccc(cc3)C(=O)O)ncc2)cccc1,{3UNZ},0BX,IC50,11000.0,nM,Binding MOAD,http://www.bindingmoad.org/pdbrecords/index/3unj
...,...,...,...,...,...,...,...,...,...,...,...
50652,C1C(=O)C=C2CC[C@@H]3[C@@H]([C@]2(C1)C)[C@H](C[...,ZK5,2VDY,C1C(=O)C=C2CC[C@@H]3[C@@]([C@]2(C1)C)([C@H](C[...,{1GS4},HCY,Ki,13.0,nM,BindingDB,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...
50653,C1C(=O)C=C2CC[C@@H]3[C@@H]([C@]2(C1)C)[C@H](C[...,ZK5,6ITP,C1C(=O)C=C2CC[C@@H]3[C@@]([C@]2(C1)C)([C@H](C[...,{1GS4},HCY,Kd,22400.0,nM,Binding MOAD,http://www.bindingmoad.org/pdbrecords/index/6itp
50654,C1C(=O)C=C2CC[C@@H]3[C@@H]([C@]2(C1)C)[C@H](C[...,ZK5,2V95,C1C(=O)C=C2CC[C@@H]3[C@@]([C@]2(C1)C)([C@H](C[...,{1GS4},PDN,Kd,5.0,nM,PDBBind,http://www.pdbbind-cn.org/quickpdb.php?quickpd...
50656,C1C(=O)C=C2CC[C@@H]3[C@@H]([C@]2(C1)C)[C@H](C[...,ZK5,4C49,C1C(=O)C=C2CC[C@@H]3[C@@]([C@]2(C1)C)([C@H](C[...,{1GS4},HCY,Ki,13.0,nM,BindingDB,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...


In [221]:
h_exploded_with_affinity = union_ligands_fmt_export_h_explode_with_affinity_flitered.\
    rename({
        "comp_id": "H_ligand_id", 
        "type": "H_affinity_type",
        "value": "H_affinity_value",
        "unit": "H_affinity_unit",
        "provenance_code": "H_affinity_provenance_code",
        "link": "H_affinity_link",
        "F_pdb": "F_PDB"
    }, axis = 1)

In [223]:
try:
    h_exploded_with_affinity.to_pickle(data_folder / "h_exploded_with_affinity.pkl")
except NameError:
    h_exploded_with_affinity = pd.read_pickle(data_folder / "h_exploded_with_affinity.pkl")

## Explode F ligands and find activity for Fluorine ligands

In [198]:
hf_exploded = h_exploded_with_affinity.explode("F_PDB")

In [200]:
same_subset = list(hf_exploded.F_PDB.unique())

In [202]:
affinity_dfs_F = []
for i in same_subset: #.difference(set(failed_ligands)):
    # try:
    af_df = get_activity_df_by_pdb(i)
    # except:
    #     failed_ligands.append(i)
    #     continue
        
    if af_df is not None:
        af_df["F_PDB"] = i
        affinity_dfs_F.append(af_df)
    # time.sleep(0.1)
    
affinity_df_F = pd.concat(affinity_dfs_F)


3RJC
5ZC5
3UNZ
3UOK
5NX7
3V1X
5NX5
5UV1
4G90
4GQ4
4GJ2
4GJ3
4H4C
5IXE
3FUF
4XJU
1LGW
4J1K
4J1I
4J1E
4J1C
4J53
5C51
6UIR
6UJX
6OR7
5C52
4L2G
5FP5
4OZ2
3M00
3M02
2PKK
4UBE
1Z35
4DAN
1PK9
3CWS
3CWT
4OG6
4OBV
4QGG
4QGH
2ZDV
4OWM
5ZSX
2AS4
4JMA
4W4X
4WHR
4WHQ
4WHS
4WPD
2N8C
6CDX
6R2R
4ZAB
4LBS
4LBR
5C53
5NVX
3ROP
5DDC
5DPV
5VIH
3SHY
2QCF
2VP6
3MW7
3SGU
3G1V
3G3D
3SW6
3G3M
4BA0
4AMX
5HJR
5EK4
4OWN
3T78
1TGV
7R9F
1RXC
4PB2
3R17
5F61
5F62
5DX6
5KBY
6AFC
5TZW
7RTQ
4V26
1O35
5U8Z
5UG2
5XP5
5PHG
6CQC
5NBA
5NK4
1O2O
5VD7
5Q0V
5Q14
5VV2
5OEP
3GB9
5ORK
6IMT
5WNY
5AEG
4AMW
5JOV
3IJ8
5B1S
6B1C
2HBQ
2F4O
1CP3
3D6H
2H51
2H54
1X3Z
3UO8
1JXQ
7AEG
2HBZ
2H48
3V4O
2H9H
7C8B
3D6F
3D6M
7CUT
2H4W
2HAL
2H4Y
2HBY
4DM9
3QNW
2H6M
3UOA
3V4L
2HBR
1JDJ
6K3N
3L7C
4BGM
3KR2
6LWP
6LWM
6LWN
6LWO
6LWL
6CBH
6GFZ
3D3V
6OVU
1V1J
6QHU
5SWN
3P3I
3B12
6GXD
4BAU
6QHW
5ONL
3P2R
6QHV
6QHY
6QI0
6QI1
4B9E
6QHS
4BB0
6QHP
3R3V
6GXT
6QHQ
3KVI
6QHT
6QHX
6HRJ
3KV8
3KUW
6QHZ
6QKW
1CSR
1IF4
1IF5
1IF6
7AZE
4F9M
1CSS
3KTU
3P5R
3PCF
5RTZ
1GYY


In [215]:
hf_exploded_with_F_PDB = hf_exploded.merge(affinity_df_F, how = "left", on = "F_PDB").\
    rename({
        "comp_id": "F_ligand_id_", 
        "type": "F_affinity_type",
        "value": "F_affinity_value",
        "unit": "F_affinity_unit",
        "provenance_code": "F_affinity_provenance_code",
        "link": "F_affinity_link",
        "H_PDB_x": "H_PDB"
    }, axis = 1).drop(["H_PDB_y", "symbol"], axis = 1)

In [229]:
hf_exploded_with_F_PDB

Unnamed: 0,H_smiles,F_ligand_id,H_PDB,F_smiles,F_PDB,H_ligand_id,H_affinity_type,H_affinity_value,H_affinity_unit,H_affinity_provenance_code,H_affinity_link,F_ligand_id_,F_affinity_type,F_affinity_value,F_affinity_unit,F_affinity_provenance_code,F_affinity_link,reference_sequence_identity
0,n1c(N)c(C(=O)c2ccccc2)sc1Nc1ccccc1,06Z,3QTR,Fc1cc(Nc2nc(N)c(C(=O)c3ccccc3)s2)ccc1,3RJC,X36,IC50,930.0,nM,PDBBind,http://www.pdbbind-cn.org/quickpdb.php?quickpd...,06Z,IC50,5700.0,nM,PDBBind,http://www.pdbbind-cn.org/quickpdb.php?quickpd...,
1,c1c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N)...,09I,5ZA9,c1(c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N...,5ZC5,50I,IC50,530.0,nM,BindingDB,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,09I,Ki,88.0,nM,BindingDB,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,99.0
2,c1c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N)...,09I,5ZA9,c1(c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N...,5ZC5,50I,IC50,530.0,nM,BindingDB,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,09I,Ki,88.0,nM,Binding MOAD,http://www.bindingmoad.org/pdbrecords/index/5zc5,
3,c1c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N)...,09I,5ZA9,c1(c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N...,5ZC5,50I,IC50,530.0,nM,BindingDB,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,09I,IC50,430.0,nM,BindingDB,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,99.0
4,c1c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N)...,09I,5ZA9,c1(c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N...,5ZC5,50I,Ki,183.0,nM,BindingDB,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,09I,Ki,88.0,nM,BindingDB,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,99.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263693,C1C(=O)C=C2CC[C@@H]3[C@@H]([C@]2(C1)C)[C@H](C[...,ZK5,2VDY,C1C(=O)C=C2CC[C@@H]3[C@@]([C@]2(C1)C)([C@H](C[...,1GS4,HCY,Ki,13.0,nM,BindingDB,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,,,,,,,
263694,C1C(=O)C=C2CC[C@@H]3[C@@H]([C@]2(C1)C)[C@H](C[...,ZK5,6ITP,C1C(=O)C=C2CC[C@@H]3[C@@]([C@]2(C1)C)([C@H](C[...,1GS4,HCY,Kd,22400.0,nM,Binding MOAD,http://www.bindingmoad.org/pdbrecords/index/6itp,,,,,,,
263695,C1C(=O)C=C2CC[C@@H]3[C@@H]([C@]2(C1)C)[C@H](C[...,ZK5,2V95,C1C(=O)C=C2CC[C@@H]3[C@@]([C@]2(C1)C)([C@H](C[...,1GS4,PDN,Kd,5.0,nM,PDBBind,http://www.pdbbind-cn.org/quickpdb.php?quickpd...,,,,,,,
263696,C1C(=O)C=C2CC[C@@H]3[C@@H]([C@]2(C1)C)[C@H](C[...,ZK5,4C49,C1C(=O)C=C2CC[C@@H]3[C@@]([C@]2(C1)C)([C@H](C[...,1GS4,HCY,Ki,13.0,nM,BindingDB,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,,,,,,,


In [217]:
try:
    hf_exploded_with_F_PDB.to_pickle(data_folder / "hf_exploded_with_F_PDB.pkl")
except NameError:
    hf_exploded_with_F_PDB = pd.read_pickle(data_folder / "hf_exploded_with_F_PDB.pkl")

In [226]:
hf_exploded_with_F_PDB_filter1 =\
hf_exploded_with_F_PDB.dropna(subset = ["F_affinity_value", "H_affinity_value"])

In [227]:
hf_exploded_with_F_PDB_filter1

Unnamed: 0,H_smiles,F_ligand_id,H_PDB,F_smiles,F_PDB,H_ligand_id,H_affinity_type,H_affinity_value,H_affinity_unit,H_affinity_provenance_code,H_affinity_link,F_ligand_id_,F_affinity_type,F_affinity_value,F_affinity_unit,F_affinity_provenance_code,F_affinity_link,reference_sequence_identity
0,n1c(N)c(C(=O)c2ccccc2)sc1Nc1ccccc1,06Z,3QTR,Fc1cc(Nc2nc(N)c(C(=O)c3ccccc3)s2)ccc1,3RJC,X36,IC50,930.0,nM,PDBBind,http://www.pdbbind-cn.org/quickpdb.php?quickpd...,06Z,IC50,5700.0,nM,PDBBind,http://www.pdbbind-cn.org/quickpdb.php?quickpd...,
1,c1c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N)...,09I,5ZA9,c1(c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N...,5ZC5,50I,IC50,530.0,nM,BindingDB,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,09I,Ki,88.0,nM,BindingDB,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,99.0
2,c1c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N)...,09I,5ZA9,c1(c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N...,5ZC5,50I,IC50,530.0,nM,BindingDB,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,09I,Ki,88.0,nM,Binding MOAD,http://www.bindingmoad.org/pdbrecords/index/5zc5,
3,c1c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N)...,09I,5ZA9,c1(c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N...,5ZC5,50I,IC50,530.0,nM,BindingDB,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,09I,IC50,430.0,nM,BindingDB,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,99.0
4,c1c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N)...,09I,5ZA9,c1(c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N...,5ZC5,50I,Ki,183.0,nM,BindingDB,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,09I,Ki,88.0,nM,BindingDB,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,99.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
263323,c1(c(nc(s1)Nc1ccc(cc1)S(=O)(=O)N)N)C(=O)Nc1ccc...,Z71,3R9D,c1(c(nc(s1)Nc1ccc(cc1)S(=O)(=O)N)N)C(=O)Nc1cc(...,3R9O,X6B,IC50,71000.0,nM,PDBBind,http://www.pdbbind-cn.org/quickpdb.php?quickpd...,Z71,IC50,100000.0,nM,PDBBind,http://www.pdbbind-cn.org/quickpdb.php?quickpd...,
263324,c1cccc(F)c1C(=O)NC[B](O)(O)O,ZA5,2Y2J,Fc1cccc(F)c1C(=O)NC[B](O)(O)O,2Y2K,ZA4,IC50,16000.0,nM,PDBBind,http://www.pdbbind-cn.org/quickpdb.php?quickpd...,ZA5,IC50,6900.0,nM,PDBBind,http://www.pdbbind-cn.org/quickpdb.php?quickpd...,
263325,c1cccc(F)c1C(=O)NC[B](O)(O)O,ZA5,2Y2J,Fc1cccc(F)c1C(=O)NC[B](O)(O)O,2Y2K,ZA4,IC50,16000.0,nM,PDBBind,http://www.pdbbind-cn.org/quickpdb.php?quickpd...,ZA5,IC50,6900.0,nM,PDBBind,http://www.pdbbind-cn.org/quickpdb.php?quickpd...,
263326,Fc1ccccc1C(=O)NC[B](O)(O)O,ZA5,2Y2J,Fc1cccc(F)c1C(=O)NC[B](O)(O)O,2Y2K,ZA4,IC50,16000.0,nM,PDBBind,http://www.pdbbind-cn.org/quickpdb.php?quickpd...,ZA5,IC50,6900.0,nM,PDBBind,http://www.pdbbind-cn.org/quickpdb.php?quickpd...,


In [5]:
try:
    hf_exploded_with_F_PDB_filter1.to_pickle(data_folder / "hf_exploded_with_F_PDB_filter1.pkl")
except NameError:
    hf_exploded_with_F_PDB_filter1 = pd.read_pickle(data_folder / "hf_exploded_with_F_PDB_filter1.pkl")

## Find ligands with the same protein

In [7]:
f_entity_ids_df =  hf_exploded_with_F_PDB_filter1.copy()

f_unitprots = f_entity_ids_df.F_PDB.apply()

In [9]:
# Check that API is available
print(get_subunits_by_pdb_and_ligand_id("3IME", "BZ2"))

Nothing found for  3IME BZ2
None


In [21]:
for i in f_entity_ids_df.itertuples():
    print(i)
    break

Pandas(Index=0, H_smiles='n1c(N)c(C(=O)c2ccccc2)sc1Nc1ccccc1', F_ligand_id='06Z', H_PDB='3QTR', F_smiles='Fc1cc(Nc2nc(N)c(C(=O)c3ccccc3)s2)ccc1', F_PDB='3RJC', H_ligand_id='X36', H_affinity_type='IC50', H_affinity_value=930.0, H_affinity_unit='nM', H_affinity_provenance_code='PDBBind', H_affinity_link='http://www.pdbbind-cn.org/quickpdb.php?quickpdb=3qtr', F_ligand_id_='06Z', F_affinity_type='IC50', F_affinity_value=5700.0, F_affinity_unit='nM', F_affinity_provenance_code='PDBBind', F_affinity_link='http://www.pdbbind-cn.org/quickpdb.php?quickpdb=3rjc', reference_sequence_identity=nan)


In [22]:
subunits_ids_df_list = []
for i in f_entity_ids_df.itertuples():
    subunits = get_subunits_by_pdb_and_ligand_id(i.H_PDB, i.H_ligand_id)
    df = pd.DataFrame.from_dict(
        {"_": {"H_PDB": i.H_PDB, "H_ligand_id": i.H_ligand_id, "entity_id": subunits}},
        orient = "index").\
        explode("entity_id")
    subunits_ids_df_list.append(df)

Nothing found for  3QTR X36
Nothing found for  5ZA9 50I
Nothing found for  3UNJ 0BX
Nothing found for  3UO5 0BX
Nothing found for  3UNK 0BY
Nothing found for  3UO6 0BY
Nothing found for  4TQ3 GPP
Nothing found for  3ERN CAR
Nothing found for  3EOR CFV
Nothing found for  2AMT 1AA
Nothing found for  2GZL 2AA
Nothing found for  3ELC F01
Nothing found for  3ESJ CC7
Nothing found for  4G8V 0EY
Nothing found for  5DDE 5A0
Nothing found for  4GIH 0X5
Nothing found for  4GMY 0X5
Nothing found for  4GII 0X6
Nothing found for  185L IND
Nothing found for  5MOO WOT
Nothing found for  2OV4 AQP
Nothing found for  4J0Y 1H6
Nothing found for  4J0V 1H7
Nothing found for  4J0P 1H8
Nothing found for  4J1F 1HL
Nothing found for  4J1H 1HJ
Nothing found for  4J0Z 1H5
Nothing found for  3ZMG 6Z0
Nothing found for  5TA8 79C
Nothing found for  6KDJ 1RZ
Nothing found for  6UJY 1RZ
Nothing found for  6KDO 1RZ
Nothing found for  6OUN 1RZ
Nothing found for  4HKI FLN
Nothing found for  1LIK ADN
Nothing found for  1

In [23]:
with open("temp.pkl", "wb") as f:
    pickle.dump(subunits_ids_df_list, f)

In [24]:
with open("temp.pkl", "rb") as f:
    t = pickle.load(f)

In [34]:
subunits_ids_df = pd.concat(subunits_ids_df_list).drop_duplicates()

subunits_ids_df

Unnamed: 0,H_PDB,H_ligand_id,entity_id
_,3QTR,X36,
_,5ZA9,50I,
_,3UNJ,0BX,
_,3UO5,0BX,
_,3UNK,0BY,
...,...,...,...
_,6OAH,M2V,
_,6N7Z,KF7,
_,6N7Y,KFA,
_,3R9D,X6B,


In [36]:
hf_exploded_entity_id_H = f_entity_ids_df.\
    merge(subunits_ids_df, 
          how = "left", 
          on = ["H_PDB", "H_ligand_id"]).rename({"entity_id": "H_entity_id"}, axis = 1)
    

In [None]:
subunits_ids_df_list = []
for i in f_entity_ids_df.itertuples():
    subunits = get_subunits_by_pdb_and_ligand_id(i.H_PDB, i.H_ligand_id)
    df = pd.DataFrame.from_dict(
        {"_": {"H_PDB": i.H_PDB, "H_ligand_id": i.H_ligand_id, "entity_id": subunits}},
        orient = "index").\
        explode("entity_id")
    subunits_ids_df_list.append(df)

In [40]:
subunits_ids_df_list_F = []
unique_pdb_ligand_F = f_entity_ids_df[["F_PDB", "F_ligand_id"]].drop_duplicates()

for i in unique_pdb_ligand_F.itertuples():
    subunits = get_subunits_by_pdb_and_ligand_id(i.F_PDB, i.F_ligand_id)
    df = pd.DataFrame.from_dict(
        {"_": {"F_PDB": i.F_PDB, "F_ligand_id": i.F_ligand_id, "F_entity_id": subunits}},
        orient = "index").\
        explode("F_entity_id")
    subunits_ids_df_list_F.append(df)
    
subunits_ids_df_F = pd.concat(subunits_ids_df_list_F)

Nothing found for  3RJC 06Z
Nothing found for  5ZC5 09I
Nothing found for  3UNZ 0BZ
Nothing found for  3UOK 0C6
Nothing found for  5UV1 0FV
Nothing found for  4G90 0G0
Nothing found for  4GQ4 0RT
Nothing found for  4GJ2 0XH
Nothing found for  4GJ3 0XP
Nothing found for  5IXE 14O
Nothing found for  3FUF 14O
Nothing found for  1LGW 1AN
Nothing found for  4J0Z 1H5
Nothing found for  4J0Y 1H6
Nothing found for  4J0V 1H7
Nothing found for  4J1K 1HG
Nothing found for  4J1I 1HH
Nothing found for  4J1E 1HM
Nothing found for  4J1C 1HO
Nothing found for  4J53 1J4
Nothing found for  6UIR 1RY
Nothing found for  6UJX 1RY
Nothing found for  6OR7 1RY
Nothing found for  4L2G 1V4
Nothing found for  2ZJ0 2FA
Nothing found for  4OG6 2S9
Nothing found for  4QGG 32C
Nothing found for  4QGH 32E
Nothing found for  3LEP 388
Nothing found for  2IKI 388
Nothing found for  3LQG 388
Nothing found for  3M4H 388
Nothing found for  3LZ3 388
Nothing found for  5LIU 388
Nothing found for  2AS4 3FA
Nothing found for  3

In [41]:
hf_exploded_entity_id_F = hf_exploded_entity_id_H.\
    merge(subunits_ids_df_F, 
          how = "left", 
          on = ["F_PDB", "F_ligand_id"])

In [45]:
try:
    hf_exploded_entity_id_F.to_pickle(data_folder / "hf_exploded_entity_id_F.pkl")
except NameError:
    hf_exploded_entity_id_F = pd.read_pickle(data_folder / "hf_exploded_entity_id_F.pkl")

## Get UNIPROT from PDB and entity id

In [44]:
get_uniprot_by_pdb_id("3IME", None)

Query '3IME 1'  successed!
Query '3IME 2'  failed!
Query '3IME 3'  failed!


['P9WIL5']

In [47]:
uniprots_ids_df_list_H = []
unique_pdb_ligand_H = hf_exploded_entity_id_F[["H_PDB", "H_entity_id"]].drop_duplicates()


In [49]:

for i in unique_pdb_ligand_H.itertuples():
    uniprots = get_uniprot_by_pdb_id(i.H_PDB, i.H_entity_id)
    df = pd.DataFrame.from_dict(
        {"_": {"H_PDB": i.H_PDB, "H_entity_id": i.H_entity_id, "H_uniprots": set(uniprots)}},
        orient = "index")
    uniprots_ids_df_list_H.append(df)
    
unique_pdb_df = pd.concat(uniprots_ids_df_list_H)

hf_unique_pdb_df_H = hf_exploded_entity_id_F.\
    merge(unique_pdb_df, 
          how = "left", 
          on = ["H_PDB", "H_entity_id"])
    

Query '3QTR 1'  successed!
Query '3QTR 2'  failed!
Query '3QTR 3'  failed!
Query '5ZA9 1'  successed!
Query '5ZA9 2'  failed!
Query '5ZA9 3'  failed!
Query '3UNJ 1'  successed!
Query '3UNJ 2'  failed!
Query '3UNJ 3'  failed!
Query '3UO5 1'  successed!
Query '3UO5 2'  failed!
Query '3UO5 3'  failed!
Query '3UNK 1'  successed!
Query '3UNK 2'  failed!
Query '3UNK 3'  failed!
Query '3UO6 1'  successed!
Query '3UO6 2'  failed!
Query '3UO6 3'  failed!
Query '4TQ3 1'  successed!
Query '4TQ3 2'  failed!
Query '4TQ3 3'  failed!
Query '3ERN 1'  successed!
Query '3ERN 2'  failed!
Query '3ERN 3'  failed!
Query '3EOR 1'  successed!
Query '3EOR 2'  failed!
Query '3EOR 3'  failed!
Query '2AMT 1'  successed!
Query '2AMT 2'  failed!
Query '2AMT 3'  failed!
Query '2GZL 1'  successed!
Query '2GZL 2'  failed!
Query '2GZL 3'  failed!
Query '3ELC 1'  successed!
Query '3ELC 2'  failed!
Query '3ELC 3'  failed!
Query '3ESJ 1'  successed!
Query '3ESJ 2'  failed!
Query '3ESJ 3'  failed!
Query '4G8V 1'  successed

In [51]:
try:
    hf_unique_pdb_df_H.to_pickle(data_folder / "hf_unique_pdb_df_H.pkl")
except NameError:
    hf_unique_pdb_df_H = pd.read_pickle(data_folder / "hf_unique_pdb_df_H.pkl")

In [53]:
uniprots_ids_df_list_F = []
unique_pdb_ligand_F = hf_unique_pdb_df_H[["F_PDB", "F_entity_id"]].drop_duplicates()

for i in unique_pdb_ligand_F.itertuples():
    uniprots = get_uniprot_by_pdb_id(i.F_PDB, i.F_entity_id)
    df = pd.DataFrame.from_dict(
        {"_": {"F_PDB": i.F_PDB, "F_entity_id": i.F_entity_id, "F_uniprots": set(uniprots)}},
        orient = "index")
    uniprots_ids_df_list_F.append(df)
    
unique_pdb_df_F = pd.concat(uniprots_ids_df_list_F)

hf_unique_pdb_df_HF = hf_unique_pdb_df_H.\
    merge(unique_pdb_df_F, 
          how = "left", 
          on = ["F_PDB", "F_entity_id"])

Query '3RJC 1'  successed!
Query '3RJC 2'  failed!
Query '3RJC 3'  failed!
Query '5ZC5 1'  successed!
Query '5ZC5 2'  failed!
Query '5ZC5 3'  failed!
Query '3UNZ 1'  successed!
Query '3UNZ 2'  failed!
Query '3UNZ 3'  failed!
Query '3UOK 1'  successed!
Query '3UOK 2'  failed!
Query '3UOK 3'  failed!
Query '5UV1 1'  successed!
Query '5UV1 2'  failed!
Query '5UV1 3'  failed!
Query '4G90 1'  successed!
Query '4G90 2'  failed!
Query '4G90 3'  failed!
Query '4GQ4 1'  successed!
Query '4GQ4 2'  failed!
Query '4GQ4 3'  failed!
Query '4GJ2 1'  successed!
Query '4GJ2 2'  failed!
Query '4GJ2 3'  failed!
Query '4GJ3 1'  successed!
Query '4GJ3 2'  failed!
Query '4GJ3 3'  failed!
Query '5IXE 1'  successed!
Query '5IXE 2'  failed!
Query '5IXE 3'  failed!
Query '3FUF 1'  successed!
Query '3FUF 2'  failed!
Query '3FUF 3'  failed!
Query '1LGW 1'  successed!
Query '1LGW 2'  failed!
Query '1LGW 3'  failed!
Query '4J1K 1'  successed!
Query '4J1K 2'  failed!
Query '4J1K 3'  failed!
Query '4J1I 1'  successed

In [54]:
try:
    hf_unique_pdb_df_HF.to_pickle(data_folder / "hf_unique_pdb_df_HF.pkl")
except NameError:
    hf_unique_pdb_df_HF = pd.read_pickle(data_folder / "hf_unique_pdb_df_HF.pkl")

In [55]:
hf_unique_pdb_df_HF_filtered = hf_unique_pdb_df_HF.dropna(subset = ["H_uniprots", "F_uniprots"]).copy()

hf_unique_pdb_df_HF_filtered.loc[:, "uniprots_intersection"] = hf_unique_pdb_df_HF_filtered.apply(lambda x: x["H_uniprots"].intersection(x["F_uniprots"]), axis = 1)


In [58]:
same_proteins_df = hf_unique_pdb_df_HF_filtered[hf_unique_pdb_df_HF_filtered["uniprots_intersection"].apply(len) > 0]

In [59]:
same_proteins_df

Unnamed: 0,H_smiles,F_ligand_id,H_PDB,F_smiles,F_PDB,H_ligand_id,H_affinity_type,H_affinity_value,H_affinity_unit,H_affinity_provenance_code,...,F_affinity_value,F_affinity_unit,F_affinity_provenance_code,F_affinity_link,reference_sequence_identity,H_entity_id,F_entity_id,H_uniprots,F_uniprots,uniprots_intersection
0,n1c(N)c(C(=O)c2ccccc2)sc1Nc1ccccc1,06Z,3QTR,Fc1cc(Nc2nc(N)c(C(=O)c3ccccc3)s2)ccc1,3RJC,X36,IC50,930.0,nM,PDBBind,...,5700.0,nM,PDBBind,http://www.pdbbind-cn.org/quickpdb.php?quickpd...,,,,{P24941},{P24941},{P24941}
1,c1c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N)...,09I,5ZA9,c1(c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N...,5ZC5,50I,IC50,530.0,nM,BindingDB,...,88.0,nM,BindingDB,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,99.0,,,{P00749},{P00749},{P00749}
2,c1c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N)...,09I,5ZA9,c1(c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N...,5ZC5,50I,IC50,530.0,nM,BindingDB,...,88.0,nM,Binding MOAD,http://www.bindingmoad.org/pdbrecords/index/5zc5,,,,{P00749},{P00749},{P00749}
3,c1c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N)...,09I,5ZA9,c1(c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N...,5ZC5,50I,IC50,530.0,nM,BindingDB,...,430.0,nM,BindingDB,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,99.0,,,{P00749},{P00749},{P00749}
4,c1c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N)...,09I,5ZA9,c1(c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N...,5ZC5,50I,Ki,183.0,nM,BindingDB,...,88.0,nM,BindingDB,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,99.0,,,{P00749},{P00749},{P00749}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68953,c1(c(nc(s1)Nc1ccc(cc1)S(=O)(=O)N)N)C(=O)Nc1ccc...,Z71,3R9D,c1(c(nc(s1)Nc1ccc(cc1)S(=O)(=O)N)N)C(=O)Nc1cc(...,3R9O,X6B,IC50,71000.0,nM,PDBBind,...,100000.0,nM,PDBBind,http://www.pdbbind-cn.org/quickpdb.php?quickpd...,,,,{P24941},{P24941},{P24941}
68954,c1cccc(F)c1C(=O)NC[B](O)(O)O,ZA5,2Y2J,Fc1cccc(F)c1C(=O)NC[B](O)(O)O,2Y2K,ZA4,IC50,16000.0,nM,PDBBind,...,6900.0,nM,PDBBind,http://www.pdbbind-cn.org/quickpdb.php?quickpd...,,,,{Q7CRA4},{Q7CRA4},{Q7CRA4}
68955,c1cccc(F)c1C(=O)NC[B](O)(O)O,ZA5,2Y2J,Fc1cccc(F)c1C(=O)NC[B](O)(O)O,2Y2K,ZA4,IC50,16000.0,nM,PDBBind,...,6900.0,nM,PDBBind,http://www.pdbbind-cn.org/quickpdb.php?quickpd...,,,,{Q7CRA4},{Q7CRA4},{Q7CRA4}
68956,Fc1ccccc1C(=O)NC[B](O)(O)O,ZA5,2Y2J,Fc1cccc(F)c1C(=O)NC[B](O)(O)O,2Y2K,ZA4,IC50,16000.0,nM,PDBBind,...,6900.0,nM,PDBBind,http://www.pdbbind-cn.org/quickpdb.php?quickpd...,,,,{Q7CRA4},{Q7CRA4},{Q7CRA4}


In [61]:
try:
    same_proteins_df.to_pickle(data_folder / "same_proteins_df.pkl")
except NameError:
    same_proteins_df = pd.read_pickle(data_folder / "same_proteins_df.pkl")

## Get PUBMEDs data to filter the same studies

In [73]:
same_proteins_df["pubmed_H"] = same_proteins_df["H_PDB"].apply(pdb2pubmed)

Query '3QTR'  successed!
Query '5ZA9'  successed!
Query '3UO5'  successed!
Query '3UO6'  successed!
Query '4G8V'  successed!
Query '5DDE'  successed!
Query '4GIH'  successed!
Query '4GII'  successed!
Query '4J0Y'  successed!
Query '4J0V'  successed!
Query '4J0P'  successed!
Query '4J1F'  successed!
Query '4J1H'  successed!
Query '4J0Z'  successed!
Query '3ZMG'  successed!
Query '5TA8'  successed!
Query '6UJY'  successed!
Query '4HKI'  successed!
Query '3CE6'  successed!
Query '4OG4'  successed!
Query '4OG3'  successed!
Query '4LAU'  successed!
Query '5LIK'  successed!
Query '3NLZ'  successed!
Query '3PNG'  successed!
Query '3PNH'  successed!
Query '3LEP'  successed!
Query '2IKI'  successed!
Query '3LQG'  successed!
Query '3M4H'  successed!
Query '3LZ3'  successed!
Query '4LB3'  successed!
Query '5NVW'  successed!
Query '5DD9'  successed!
Query '5DDA'  successed!
Query '5DDB'  successed!
Query '5DDD'  successed!
Query '5DR6'  successed!
Query '5DEX'  successed!
Query '4OEX'  successed!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  same_proteins_df["pubmed_H"] = same_proteins_df["H_PDB"].apply(pdb2pubmed)


In [77]:
same_proteins_df["pubmed_F"] = same_proteins_df["F_PDB"].apply(pdb2pubmed)

Query '3RJC'  successed!
Query '5ZC5'  successed!
Query '3UNZ'  successed!
Query '3UOK'  successed!
Query '4G90'  successed!
Query '4GQ4'  successed!
Query '4GJ2'  successed!
Query '4GJ3'  successed!
Query '4J1K'  successed!
Query '4J1I'  successed!
Query '4J1E'  successed!
Query '4J1C'  successed!
Query '4J53'  successed!
Query '6UIR'  successed!
Query '6UJX'  successed!
Query '6OR7'  successed!
Query '4L2G'  successed!
Query '2ZJ0'  successed!
Query '4OG6'  successed!
Query '5LIU'  successed!
Query '3NLU'  successed!
Query '4LBS'  successed!
Query '4LBR'  successed!
Query '5NVX'  successed!
Query '5DDC'  successed!
Query '5DN3'  successed!
Query '5DPV'  successed!
Query '5VIH'  successed!
Query '3SHY'  successed!
Nothing found for  3SGU
Query '3G1V'  successed!
Nothing found for  3SW6
Query '2QCF'  successed!
Nothing found for  3MW7
Query '3G3D'  successed!
Query '3G3M'  successed!
Query '5EK4'  successed!
Query '5ETT'  successed!
Query '5ETO'  successed!
Query '4PB2'  successed!
Que

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  same_proteins_df["pubmed_F"] = same_proteins_df["F_PDB"].apply(pdb2pubmed)


In [78]:
same_proteins_df_pubmed = same_proteins_df.copy()

In [119]:
def to_int_str(e):
    try:
        return "{:08d}".format(int(e))
    except ValueError:
        return np.nan


In [122]:
same_proteins_df_pubmed["pubmed_H"] = same_proteins_df_pubmed["pubmed_H"].apply(to_int_str).astype
same_proteins_df_pubmed["pubmed_F"] = same_proteins_df_pubmed["pubmed_F"].apply(to_int_str)

In [133]:
try:
    same_proteins_df_pubmed.to_pickle(data_folder / "same_proteins_df_pubmed.pkl")
except NameError:
    same_proteins_df_pubmed = pd.read_pickle(data_folder / "same_proteins_df_pubmed.pkl")

## Filter the same study

In [134]:
def is_the_same_study(row):
    pb_h = row["pubmed_H"]
    pb_f = row["pubmed_F"]
    
    if pb_h == {''} or pb_h == {} or pb_f == {''} or pb_f == {}:
        return False
    
    if row["pubmed_H"] == row["pubmed_F"]:
        return True
    else:
        return False

In [None]:
same_proteins_df_pubmed.apply(is_the_same_study, axis = 1).sum()

1854

In [140]:
same_proteins_df_pubmed["same_study"] = same_proteins_df_pubmed.apply(is_the_same_study, axis = 1)

In [145]:
intersection_same_proteins_df_pubmed = \
    same_proteins_df_pubmed[same_proteins_df_pubmed["same_study"] == True]

In [146]:
try:
    intersection_same_proteins_df_pubmed.to_pickle(data_folder / "intersection_same_proteins_df_pubmed.pkl")
except NameError:
    intersection_same_proteins_df_pubmed = pd.read_pickle(data_folder / "intersection_same_proteins_df_pubmed.pkl")

In [159]:
set_columns = ["H_uniprots", "F_uniprots", "uniprots_intersection"]
ignore_columns = ["H_affinity_provenance_code", "F_affinity_provenance_code", "F_affinity_link", "reference_sequence_identity"] + set_columns
unique_columns =[c for c in intersection_same_proteins_df_pubmed.columns if c not in ignore_columns]
print(unique_columns)
intersection_same_proteins_df_pubmed\
    .drop(["H_uniprots", "F_uniprots", "uniprots_intersection"], axis = 1)\
    .drop_duplicates(subset=unique_columns)

['H_smiles', 'F_ligand_id', 'H_PDB', 'F_smiles', 'F_PDB', 'H_ligand_id', 'H_affinity_type', 'H_affinity_value', 'H_affinity_unit', 'H_affinity_link', 'F_ligand_id_', 'F_affinity_type', 'F_affinity_value', 'F_affinity_unit', 'H_entity_id', 'F_entity_id', 'pubmed_H', 'pubmed_F', 'same_study']


Unnamed: 0,H_smiles,F_ligand_id,H_PDB,F_smiles,F_PDB,H_ligand_id,H_affinity_type,H_affinity_value,H_affinity_unit,H_affinity_provenance_code,...,F_affinity_value,F_affinity_unit,F_affinity_provenance_code,F_affinity_link,reference_sequence_identity,H_entity_id,F_entity_id,pubmed_H,pubmed_F,same_study
0,n1c(N)c(C(=O)c2ccccc2)sc1Nc1ccccc1,06Z,3QTR,Fc1cc(Nc2nc(N)c(C(=O)c3ccccc3)s2)ccc1,3RJC,X36,IC50,930.0,nM,PDBBind,...,5700.0,nM,PDBBind,http://www.pdbbind-cn.org/quickpdb.php?quickpd...,,,,23600925,23600925,True
1,c1c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N)...,09I,5ZA9,c1(c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N...,5ZC5,50I,IC50,530.0,nM,BindingDB,...,88.0,nM,BindingDB,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,99.0,,,30130401,30130401,True
3,c1c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N)...,09I,5ZA9,c1(c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N...,5ZC5,50I,IC50,530.0,nM,BindingDB,...,430.0,nM,BindingDB,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,99.0,,,30130401,30130401,True
4,c1c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N)...,09I,5ZA9,c1(c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N...,5ZC5,50I,Ki,183.0,nM,BindingDB,...,88.0,nM,BindingDB,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,99.0,,,30130401,30130401,True
6,c1c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N)...,09I,5ZA9,c1(c2c(ccc1)oc(c1c(N3CCCCCC3)nc(N)c(C(=O)NC(=N...,5ZC5,50I,Ki,183.0,nM,BindingDB,...,430.0,nM,BindingDB,http://www.bindingdb.org/jsp/dbsearch/PrimaryS...,99.0,,,30130401,30130401,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68947,c1c(Cl)c(C)ccc1c1sc2c(c1)c(ncn2)N[C@H](P(=O)(O...,YL6,6OAH,c1c(Cl)c(C)ccc1c1sc2c(c1)c(ncn2)N[C@H](P(=O)(O...,6N83,M2V,IC50,1500.0,nM,Binding MOAD,...,540.0,nM,Binding MOAD,http://www.bindingmoad.org/pdbrecords/index/6n83,,,,31577901,31577901,True
68950,c1(c(nc(s1)Nc1ccc(cc1)S(=O)(=O)N)N)C(=O)Nc1cc(...,Z71,3R9D,c1(c(nc(s1)Nc1ccc(cc1)S(=O)(=O)N)N)C(=O)Nc1cc(...,3R9O,X6B,IC50,71000.0,nM,PDBBind,...,100000.0,nM,PDBBind,http://www.pdbbind-cn.org/quickpdb.php?quickpd...,,,,23600925,23600925,True
68952,c1(c(nc(s1)Nc1ccc(cc1)S(=O)(=O)N)N)C(=O)Nc1ccc...,Z71,3R9D,c1(c(nc(s1)Nc1ccc(cc1)S(=O)(=O)N)N)C(=O)Nc1cc(...,3R9O,X6B,IC50,71000.0,nM,PDBBind,...,100000.0,nM,PDBBind,http://www.pdbbind-cn.org/quickpdb.php?quickpd...,,,,23600925,23600925,True
68954,c1cccc(F)c1C(=O)NC[B](O)(O)O,ZA5,2Y2J,Fc1cccc(F)c1C(=O)NC[B](O)(O)O,2Y2K,ZA4,IC50,16000.0,nM,PDBBind,...,6900.0,nM,PDBBind,http://www.pdbbind-cn.org/quickpdb.php?quickpd...,,,,21732689,21732689,True


## Format final table

In [162]:
final_table = intersection_same_proteins_df_pubmed[
    ["F_ligand_id", "F_smiles", "F_PDB", 
     "H_ligand_id", "H_smiles", "H_PDB", 
     "F_affinity_type", "F_affinity_value", "F_affinity_unit", "F_affinity_link",
     "H_affinity_type", "H_affinity_value", "H_affinity_unit", "H_affinity_link",
     "pubmed_H", "pubmed_F", "same_study"]]

final_table.to_csv("final_table.csv")

In [163]:
try:
    final_table.to_pickle(data_folder / "final_table.pkl")
except NameError:
    final_table = pd.read_pickle(data_folder / "final_table.pkl")