# Processing BindingDB and generating its bacterial subset

The database can be found [here](https://www.bindingdb.org/rwd/bind/index.jsp).

In [1]:
import pandas as pd

In [2]:
# Relative path of the data directory: the raw BindingDB dump is read from here
# and the filtered bacterial subset is written back to it.
DATA_DIR = "../data"

In [3]:
# Load the complete BindingDB dump (tab-separated).
# - dtype=str keeps every column as text, so no value is coerced to a number.
# - on_bad_lines="skip" drops rows that cannot be parsed instead of raising.
bindingdb_path = f"{DATA_DIR}/BindingDB_All_202311.tsv"
read_options = dict(
    sep="\t",
    low_memory=False,
    on_bad_lines="skip",
    dtype=str,
)
binding_df = pd.read_csv(bindingdb_path, **read_options)
binding_df.head(3)

Unnamed: 0,BindingDB Reactant_set_id,Ligand SMILES,Ligand InChI,Ligand InChI Key,BindingDB MonomerID,BindingDB Ligand Name,Target Name,Target Source Organism According to Curator or DataSource,Ki (nM),IC50 (nM),...,UniProt (SwissProt) Recommended Name of Target Chain.12,UniProt (SwissProt) Entry Name of Target Chain.12,UniProt (SwissProt) Primary ID of Target Chain.12,UniProt (SwissProt) Secondary ID(s) of Target Chain.12,UniProt (SwissProt) Alternative ID(s) of Target Chain.12,UniProt (TrEMBL) Submitted Name of Target Chain.12,UniProt (TrEMBL) Entry Name of Target Chain.12,UniProt (TrEMBL) Primary ID of Target Chain.12,UniProt (TrEMBL) Secondary ID(s) of Target Chain.12,UniProt (TrEMBL) Alternative ID(s) of Target Chain.12
0,1,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CCCCCC(O)=...,InChI=1S/C31H42N2O7/c34-27(35)17-9-3-11-19-32-...,XGEGDSLAQZJGCW-HHGOQMMWSA-N,608734,"6-[(4R,5S,6S,7R)-4,7-dibenzyl-3-(5-carboxypent...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.24,,...,,,,,,,,,,
1,2,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(C\C=C\c2cn...,InChI=1S/C31H34N6O3/c38-29-27(17-23-9-3-1-4-10...,UZLMEAPBHYEHAC-UNTBESQGSA-N,22,"(4R,5S,6S,7R)-4,7-dibenzyl-5,6-dihydroxy-1,3-b...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.25,,...,,,,,,,,,,
2,3,O[C@@H]1[C@@H](O)[C@@H](Cc2ccccc2)N(CC2CC2)C(=...,InChI=1S/C29H34N4O3/c34-27-25(16-21-8-3-1-4-9-...,HYNYUFZPPJMPOB-UTWJFGBXSA-N,23,"(4R,5S,6S,7R)-4,7-dibenzyl-1-(cyclopropylmethy...",Dimer of Gag-Pol polyprotein [501-599],Human immunodeficiency virus 1,0.41,,...,,,,,,,,,,


In [4]:
# Count the distinct source organisms recorded in the full dump.
organism_series = binding_df[
    "Target Source Organism According to Curator or DataSource"
]
organism_series.nunique()

320

# Subselect bacterial species and strains

In [5]:
# Maps each organism name (spelled as it is matched against the
# "Target Source Organism According to Curator or DataSource" column) to a
# cell-wall staining class: "gram-negative", "gram-positive" or "acid-fast".
#
# The keys drive the organism filter applied to the BindingDB frame; the
# staining class is annotation only in this notebook.
#
# Labelling convention:
#   * every Mycobacterium entry is "acid-fast";
#   * lactic acid bacteria (Lactobacillus, Leuconostoc) are "gram-positive",
#     since they are not acid-fast organisms.
bacterial_species = {
    "Klebsiella pneumoniae": "gram-negative",
    "Pseudomonas aeruginosa": "gram-negative",
    "Bacillus anthracis": "gram-positive",
    "Staphylococcus aureus": "gram-positive",
    "Haemophilus influenzae": "gram-negative",
    "Escherichia coli": "gram-negative",
    "Streptococcus pneumoniae": "gram-positive",
    "Serratia marcescens": "gram-negative",
    "Helicobacter pylori (strain v225d)": "gram-negative",
    "Clostridium histolyticum": "gram-positive",
    "Mycobacterium tuberculosis": "acid-fast",
    "Mycobacterium avium": "acid-fast",  # mycobacteria are acid-fast
    "Staphylococcus aureus (strain NCTC 8325)": "gram-positive",
    "Bacillus cereus (strain ATCC 14579 / DSM 31)": "gram-positive",
    "Lactobacillus casei": "gram-positive",  # lactic acid bacterium, not acid-fast
    "Streptococcus pneumoniae (strain ATCC BAA-255 / R6)": "gram-positive",
    "Bacillus subtilis": "gram-positive",
    "Yersinia pestis": "gram-negative",
    "Deinococcus radiodurans": "gram-positive",
    "Clostridium botulinum": "gram-positive",
    "Aquifex aeolicus": "gram-negative",
    "Enterococcus faecalis": "gram-positive",
    "Streptococcus pyogenes": "gram-positive",
    "Bacillus pasteurii": "gram-positive",
    "Proteus vulgaris": "gram-negative",
    "Helicobacter pylori": "gram-negative",
    "Bacteroides thetaiotaomicron": "gram-negative",
    "Neisseria meningitidis": "gram-negative",
    "Francisella tularensis": "gram-negative",
    "Streptococcus pneumoniae serotype 2 (strain D39 / NCTC 7466)": "gram-positive",
    "Enterobacter cloacae": "gram-negative",
    "Yersinia enterocolitica": "gram-negative",
    "Thermus thermophilus": "gram-negative",
    "Mycobacterium tuberculosis H37Rv": "acid-fast",
    "Streptococcus pyogenes M1 GAS": "gram-positive",
    "Leuconostoc mesenteroides": "gram-positive",  # lactic acid bacterium, not acid-fast
    "Vibrio harveyi": "gram-negative",
    "Escherichia coli str. K-12 substr. MG1655": "gram-negative",
    "Helicobacter pylori (strain G27)": "gram-negative",
    "Zymomonas mobilis": "gram-negative",
    "Clostridium botulinum (strain Hall / ATCC 3502 / NCTC 13319 / Type A)": "gram-positive",
    "Acinetobacter baumannii": "gram-negative",
    "Pseudomonas putida": "gram-negative",
    "Bacillus thermoproteolyticus": "gram-positive",
    "Streptomyces coelicolor": "gram-positive",
    "Bacillus cereus": "gram-positive",
    "Bacillus lentus": "gram-positive",
    "Lactobacillus fermentum": "gram-positive",  # lactic acid bacterium, not acid-fast
    "Caldocellum saccharolyticum": "gram-positive",
    "Streptococcus pyogenes serotype M1": "gram-positive",
    "Pseudomonas paucimobilis": "gram-negative",
    "Neisseria gonorrhoeae": "gram-negative",
    "Stenotrophomonas maltophilia": "gram-negative",
    "Streptococcus pyogenes serotype M18": "gram-positive",
    "Salmonella typhimurium": "gram-negative",
    "Staphylococcus epidermidis (strain ATCC 35984 / RP62A)": "gram-positive",
    "Pseudomonas fluorescens": "gram-negative",
    "Citrobacter freundii": "gram-negative",
    "Clostridium perfringens": "gram-positive",
    "Enterococcus faecium": "gram-positive",
    "Escherichia coli O157:H7": "gram-negative",
    "Agrobacterium sp. ZY-2006e": "gram-negative",
    "Staphylococcus aureus (strain Mu50 / ATCC 700699)": "gram-positive",
    "Bacillus amyloliquefaciens": "gram-positive",
    "Providencia stuartii": "gram-negative",
    "Streptomyces caespitosus": "gram-positive",
    "Thermus aquaticus": "gram-negative",
    # NOTE(review): Rhizopus is a genus of fungi (moulds), not bacteria, so a
    # Gram class does not really apply. The key is kept so the exported row set
    # is unchanged; consider dropping it from the bacterial subset.
    "Rhizopus chinensis": "gram-negative",
    "Alicyclobacillus acidocaldarius": "gram-positive",
    "Staphylococcus aureus (strain MW2)": "gram-positive",
    "Staphylococcus aureus (strain MRSA252)": "gram-positive",
    "Flavobacterium meningosepticum": "gram-negative",
    "Streptomyces avidinii": "gram-positive",
    "Thermoanaerobacter saccharolyticum": "gram-positive",
    "Vibrio harveyi (strain ATCC BAA-1116 / BB120)": "gram-negative",
    "Brucella suis": "gram-negative",
    "Serratia fonticola": "gram-negative",
    "Bacillus sporothermodurans": "gram-positive",
    "Vibrio fischeri": "gram-negative",
    "Vibrio fischeri (strain ATCC 700601 / ES114)": "gram-negative",
    "Chromobacterium violaceum": "gram-negative",
    "Bacillus licheniformis": "gram-positive",
    "Alcaligenes sp. (strain DSM 11172)": "gram-negative",
    "Escherichia coli (strain UTI89 / UPEC)": "gram-negative",
    "Legionella pneumophila": "gram-negative",
    "Actinomadura sp. (strain R39)": "gram-positive",
    "Vibrio cholerae": "gram-negative",
    "Ureaplasma parvum": "gram-positive",
    "Aeromonas hydrophila": "gram-negative",
    "Brachyspira pilosicoli": "gram-negative",
    "Haemophilus influenzae (strain ATCC 51907 / DSM 11121 / KW20 / Rd)": "gram-negative",
    "Mycobacterium smegmatis": "acid-fast",
    "Burkholderia cenocepacia": "gram-negative",
    "Acinetobacter genomosp. 3": "gram-negative",
    "Aeromonas allosaccharophila": "gram-negative",
    "Francisella tularensis subsp. tularensis": "gram-negative",
    "Escherichia coli O6": "gram-negative",
    "Clostridium tetani": "gram-positive",
    "Enterococcus durans": "gram-positive",
    "Spirulina platensis": "gram-negative",
    "Bacillus clausii": "gram-positive",
}

In [6]:
# Keep only the measurements whose source organism is one of the names listed
# as keys of `bacterial_species` (exact string match).
organism_col = "Target Source Organism According to Curator or DataSource"
m = binding_df[organism_col].isin(list(bacterial_species))
bac_binding_df = binding_df.loc[m]

In [7]:
# Sanity check: (rows, columns) of the frame after the organism filter.
bac_binding_df.shape

(29127, 194)

In [8]:
# Columns retained in the exported subset, in output order.
cols_to_keep = [
    # ligand identifiers
    "Ligand SMILES",
    "Ligand InChI",
    "Ligand InChI Key",
    "BindingDB Ligand Name",
    # target description
    "Target Name",
    "Target Source Organism According to Curator or DataSource",
    # measurement columns (nM)
    "Ki (nM)",
    "IC50 (nM)",
    "Kd (nM)",
    "EC50 (nM)",
    # source / cross-reference columns
    "Curation/DataSource",
    "PMID",
    "Patent Number",
    "PubChem CID",
]
# Narrow the frame to exactly these columns; a missing column raises KeyError.
bac_binding_df = bac_binding_df.loc[:, cols_to_keep]

In [9]:
# Preview the first rows of the column-reduced bacterial subset.
bac_binding_df.head()

Unnamed: 0,Ligand SMILES,Ligand InChI,Ligand InChI Key,BindingDB Ligand Name,Target Name,Target Source Organism According to Curator or DataSource,Ki (nM),IC50 (nM),Kd (nM),EC50 (nM),Curation/DataSource,PMID,Patent Number,PubChem CID
1977,OS(=O)(=O)ON1[C@H]2CN([C@H](CC2)C(=O)NC2CCCNC2...,InChI=1S/C12H20N4O6S/c17-11(14-8-2-1-5-13-6-8)...,OMBRYYNMYXZMPQ-VXRWAFEHSA-N,"US8487073, 19B",Beta-lactamase,Klebsiella pneumoniae,,240,,,US Patent,,US8487093,89779203
1982,OS(=O)(=O)ON1[C@H]2CN([C@H](CC2)C(=O)NC2CCCNC2...,InChI=1S/C12H20N4O6S/c17-11(14-8-2-1-5-13-6-8)...,OMBRYYNMYXZMPQ-VXRWAFEHSA-N,"US8487073, 19B",Beta-lactamase,Pseudomonas aeruginosa,,480,,,US Patent,,US8487093,89779203
2072,OS(=O)(=O)ON1[C@H]2CN([C@@H](CC2)C(=O)NC2CCNCC...,InChI=1S/C12H20N4O6S/c17-11(14-8-3-5-13-6-4-8)...,SMOBCLHAZXOKDQ-ZJUUUORDSA-N,"US8487073, 1A",Beta-lactamase,Klebsiella pneumoniae,,210,,,US Patent,,US8487093,44129647
2649,OS(=O)(=O)ON1[C@H]2CN([C@@H](CC2)C(=O)NC2CCNCC...,InChI=1S/C12H20N4O6S/c17-11(14-8-3-5-13-6-4-8)...,SMOBCLHAZXOKDQ-ZJUUUORDSA-N,"US8487073, 1A",Beta-lactamase,Pseudomonas aeruginosa,,465,,,US Patent,,US8487093,44129647
14256,COc1ccc(cc1)N1C(=S)S\C(=C/c2ccc(o2)-c2cccc(c2)...,InChI=1S/C22H14F3NO3S2/c1-28-16-7-5-15(6-8-16)...,LAEQOMVPHKEMBO-UNOMPAQXSA-N,(5Z)-3-(4-methoxyphenyl)-2-sulfanylidene-5-({5...,Lethal factor,Bacillus anthracis,,300000,,,Curated from the literature by BindingDB,15983377.0,,1983310


In [10]:
# Write the bacterial subset as a tab-separated file, without the row index.
output_path = f"{DATA_DIR}/bacterial_bindingdb.tsv"
bac_binding_df.to_csv(output_path, sep="\t", index=False)