# Processing the BindingDB database and generating its bacterial subset

The database can be downloaded from [BindingDB](https://www.bindingdb.org/rwd/bind/index.jsp). This notebook reads the `BindingDB_All_202311.tsv` export.

In [1]:
import pandas as pd
import json

# Turn off pandas' chained-assignment check for the whole notebook (the option defaults to "warn").
pd.set_option("mode.chained_assignment", None)

In [2]:
# Root of the data tree (relative path). The cells below read from its raw/ and
# mapping/ subfolders and write the result to processed/.
DATA_DIR = "../data"

In [3]:
# Load the full BindingDB dump (tab-separated). Every column is read as a string,
# and rows the parser cannot handle are dropped silently (on_bad_lines="skip").
raw_bindingdb_path = f"{DATA_DIR}/raw/BindingDB_All_202311.tsv"
binding_df = pd.read_table(
    raw_bindingdb_path,
    dtype=str,
    on_bad_lines="skip",
    low_memory=False,
)
binding_df.head(3)

Unnamed: 0,BindingDB Reactant_set_id,Ligand SMILES,Ligand InChI,Ligand InChI Key,BindingDB MonomerID,BindingDB Ligand Name,Target Name,Target Source Organism According to Curator or DataSource,Ki (nM),IC50 (nM),...,UniProt (SwissProt) Recommended Name of Target Chain.12,UniProt (SwissProt) Entry Name of Target Chain.12,UniProt (SwissProt) Primary ID of Target Chain.12,UniProt (SwissProt) Secondary ID(s) of Target Chain.12,UniProt (SwissProt) Alternative ID(s) of Target Chain.12,UniProt (TrEMBL) Submitted Name of Target Chain.12,UniProt (TrEMBL) Entry Name of Target Chain.12,UniProt (TrEMBL) Primary ID of Target Chain.12,UniProt (TrEMBL) Secondary ID(s) of Target Chain.12,UniProt (TrEMBL) Alternative ID(s) of Target Chain.12
0,1,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CCCCCC(O)=...,InChI=1S/C31H42N2O7/c34-27(35)17-9-3-11-19-32-...,XGEGDSLAQZJGCW-HHGOQMMWSA-N,608734,"6-[(4R,5S,6S,7R)-4,7-dibenzyl-3-(5-carboxypent...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.24,,...,,,,,,,,,,
1,2,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(C\C=C\c2cn...,InChI=1S/C31H34N6O3/c38-29-27(17-23-9-3-1-4-10...,UZLMEAPBHYEHAC-UNTBESQGSA-N,22,"(4R,5S,6S,7R)-4,7-dibenzyl-5,6-dihydroxy-1,3-b...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.25,,...,,,,,,,,,,
2,3,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CC2CC2)C(=...,InChI=1S/C29H34N4O3/c34-27-25(16-21-8-3-1-4-9-...,HYNYUFZPPJMPOB-UTWJFGBXSA-N,23,"(4R,5S,6S,7R)-4,7-dibenzyl-1-(cyclopropylmethy...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.41,,...,,,,,,,,,,


In [4]:
# How many distinct source organisms does the full table contain?
source_organisms = binding_df[
    "Target Source Organism According to Curator or DataSource"
]
source_organisms.nunique()

320

## Subselect bacterial organisms

In [5]:
# Load the organism -> strain-type lookup, used below both to filter the rows and
# to annotate them. The context manager guarantees the file handle is closed, and
# the explicit encoding avoids depending on the platform's default text encoding.
with open(f"{DATA_DIR}/mapping/bact_mapper.json", encoding="utf-8") as mapper_file:
    bacterial_species = json.load(mapper_file)
len(bacterial_species)

382

In [6]:
# Keep only the rows whose source organism appears in the bacterial lookup.
# NOTE(review): bacterial_species is presumably a dict keyed by organism name
# (it is passed to .map() in a later cell), so membership is tested on its keys.
organism_col = "Target Source Organism According to Curator or DataSource"
m = binding_df[organism_col].isin(bacterial_species)
bac_binding_df = binding_df.loc[m]

In [7]:
# (rows, columns) of the bacterial subset, before the column pruning in the next cell.
bac_binding_df.shape

(29372, 194)

In [8]:
# Columns retained for the bacterial subset; everything else is dropped.
cols_to_keep = [
    # ligand
    "Ligand SMILES",
    "Ligand InChI",
    "Ligand InChI Key",
    "BindingDB Ligand Name",
    # target
    "Target Name",
    "Target Source Organism According to Curator or DataSource",
    # measurements
    "Ki (nM)",
    "IC50 (nM)",
    "Kd (nM)",
    "EC50 (nM)",
    # source / cross-references
    "Curation/DataSource",
    "PMID",
    "Patent Number",
    "PubChem CID",
]
bac_binding_df = bac_binding_df.loc[:, cols_to_keep]

In [9]:
# Shorten the organism column name and tag every row with its strain type.
# rename()/assign() build a new frame instead of mutating the existing one in
# place, so this cell does not depend on the notebook-wide suppression of the
# chained-assignment warning and stays safe to re-run.
bac_binding_df = bac_binding_df.rename(
    columns={
        "Target Source Organism According to Curator or DataSource": "Organism",
    },
).assign(
    # Look each organism up in the bacterial mapping; the rows were filtered on
    # that same mapping earlier, so every organism is expected to have an entry.
    strain_type=lambda df: df["Organism"].map(bacterial_species),
)

In [10]:
# Preview the first rows to check the renamed "Organism" column and the new
# "strain_type" column.
bac_binding_df.head()

Unnamed: 0,Ligand SMILES,Ligand InChI,Ligand InChI Key,BindingDB Ligand Name,Target Name,Organism,Ki (nM),IC50 (nM),Kd (nM),EC50 (nM),Curation/DataSource,PMID,Patent Number,PubChem CID,strain_type
1977,OS(=O)(=O)ON1[C@H]2CN([C@H](CC2)C(=O)NC2CCCNC2...,InChI=1S/C12H20N4O6S/c17-11(14-8-2-1-5-13-6-8)...,OMBRYYNMYXZMPQ-VXRWAFEHSA-N,"US8487073, 19B",Beta-lactamase,Klebsiella pneumoniae,,240,,,US Patent,,US8487093,89779203,gram-negative
1982,OS(=O)(=O)ON1[C@H]2CN([C@H](CC2)C(=O)NC2CCCNC2...,InChI=1S/C12H20N4O6S/c17-11(14-8-2-1-5-13-6-8)...,OMBRYYNMYXZMPQ-VXRWAFEHSA-N,"US8487073, 19B",Beta-lactamase,Pseudomonas aeruginosa,,480,,,US Patent,,US8487093,89779203,gram-negative
2072,OS(=O)(=O)ON1[C@H]2CN([C@@H](CC2)C(=O)NC2CCNCC...,InChI=1S/C12H20N4O6S/c17-11(14-8-3-5-13-6-4-8)...,SMOBCLHAZXOKDQ-ZJUUUORDSA-N,"US8487073, 1A",Beta-lactamase,Klebsiella pneumoniae,,210,,,US Patent,,US8487093,44129647,gram-negative
2649,OS(=O)(=O)ON1[C@H]2CN([C@@H](CC2)C(=O)NC2CCNCC...,InChI=1S/C12H20N4O6S/c17-11(14-8-3-5-13-6-4-8)...,SMOBCLHAZXOKDQ-ZJUUUORDSA-N,"US8487073, 1A",Beta-lactamase,Pseudomonas aeruginosa,,465,,,US Patent,,US8487093,44129647,gram-negative
14256,COc1ccc(cc1)N1C(=S)S\C(=C/c2ccc(o2)-c2cccc(c2)...,InChI=1S/C22H14F3NO3S2/c1-28-16-7-5-15(6-8-16)...,LAEQOMVPHKEMBO-UNOMPAQXSA-N,(5Z)-3-(4-methoxyphenyl)-2-sulfanylidene-5-({5...,Lethal factor,Bacillus anthracis,,300000,,,Curated from the literature by BindingDB,15983377.0,,1983310,gram-positive


In [11]:
# Persist the annotated bacterial subset as a tab-separated file, without the
# DataFrame index.
processed_path = f"{DATA_DIR}/processed/bacterial_bindingdb.tsv"
bac_binding_df.to_csv(processed_path, index=False, sep="\t")

In [12]:
# Number of rows per strain type in the bacterial subset.
bac_binding_df["strain_type"].value_counts()

strain_type
gram-negative    12321
gram-positive    12224
acid-fast         4827
Name: count, dtype: int64

In [13]:
# Number of distinct organisms present in the bacterial subset.
bac_binding_df["Organism"].nunique()

108