# Processing and generating the bacterial subset of the ChEMBL (v33) database

The database can be found [here](https://www.ebi.ac.uk/chembl/).

In [1]:
import pandas as pd
import json

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole
# session; the pandas default for this option is 'warn'.
pd.set_option("mode.chained_assignment", None)

In [2]:
# Root of the data folder, given relative to the notebook's working directory.
# The cells below read from its raw/ and mapping/ subfolders and write to processed/.
DATA_DIR = "../data"

In [3]:
# Load the raw ChEMBL v33 bioassay table (tab-separated).
# low_memory=False makes pandas parse the file in one pass instead of in
# internal chunks, which avoids mixed-dtype inference within a column.
raw_bioassay_path = f"{DATA_DIR}/raw/bioassay_chembl_33.tsv"
chembl_df = pd.read_csv(raw_bioassay_path, sep="\t", low_memory=False)

# Preview the first rows to check that the columns were parsed as expected.
chembl_df.head(3)

Unnamed: 0,pref_name,chembl_idx,inchi_key,smiles,inchi,assay_id,assay_organism,assay_tax_id,pchembl_value,assay_type,doi,doc_type,patent_id
0,,CHEMBL268556,QFSGJJCWIIUQCT-UUHCYPSGSA-P,c1cc2cc(c1)-c1cccc(c1)C[n+]1ccc(c3ccccc31)NCCC...,InChI=1S/C42H44N4/c1-2-4-6-12-26-44-40-24-28-4...,CHEMBL805641,Rattus norvegicus,10116,6.96,IC50,10.1016/S0960-894X(96)00568-9,PUBLICATION,
1,BROMOENOL LACTONE,CHEMBL6206,BYUCSFWXCMTYOI-ZRDIBKRKSA-N,O=C1O/C(=C/Br)CCC1c1cccc2ccccc12,InChI=1S/C16H13BrO2/c17-10-12-8-9-15(16(18)19-...,CHEMBL760624,Canis lupus familiaris,9615,6.85,IC50,10.1021/jm00053a012,PUBLICATION,
2,BROMOENOL LACTONE,CHEMBL6206,BYUCSFWXCMTYOI-ZRDIBKRKSA-N,O=C1O/C(=C/Br)CCC1c1cccc2ccccc12,InChI=1S/C16H13BrO2/c17-10-12-8-9-15(16(18)19-...,CHEMBL938652,Rattus norvegicus,10116,6.1,IC50,10.1021/jm800311k,PUBLICATION,


In [4]:
# Number of distinct assay organisms in the full table (NaN is not counted by default).
chembl_df["assay_organism"].nunique()

1483

In [5]:
# Load the bacterial mapper: the cells below use it both as the set of organism
# names to keep and as the lookup that turns an organism name into its
# strain-type label.
# The context manager closes the file handle deterministically (the bare
# json.load(open(...)) form left it open), and the encoding is pinned to UTF-8
# so the read does not depend on the platform's locale default.
with open(f"{DATA_DIR}/mapping/bact_mapper.json", encoding="utf-8") as mapper_file:
    bacterial_species = json.load(mapper_file)

# Number of organisms covered by the mapper.
len(bacterial_species)

382

In [6]:
# Row filter 1: the assay organism is one of the entries of the bacterial mapper
# (for a dict, isin tests membership against its keys).
m1 = chembl_df["assay_organism"].isin(bacterial_species)
# Row filter 2: the assay_type column is either IC50 or Ki.
m2 = chembl_df["assay_type"].isin(["IC50", "Ki"])

# Take an explicit copy so that bact_chembl_df is an independent frame: a later
# cell adds a column to it, and doing that on a boolean-mask slice of chembl_df
# is exactly the pattern pandas flags with SettingWithCopyWarning.
bact_chembl_df = chembl_df[m1 & m2].copy()

In [7]:
# (rows, columns) of the filtered bacterial subset.
bact_chembl_df.shape

(44543, 13)

In [8]:
# Annotate every row with the strain-type label of its organism, looked up in
# the bacterial mapper. assign() builds a new frame rather than writing a column
# into what may be a slice of chembl_df, so no chained-assignment warning is
# involved, and re-running the cell yields the same result.
bact_chembl_df = bact_chembl_df.assign(
    strain_type=bact_chembl_df["assay_organism"].map(bacterial_species)
)

In [9]:
# Preview the subset to confirm the new strain_type column is populated.
bact_chembl_df.head(3)

Unnamed: 0,pref_name,chembl_idx,inchi_key,smiles,inchi,assay_id,assay_organism,assay_tax_id,pchembl_value,assay_type,doi,doc_type,patent_id,strain_type
711,OFLOXACIN,CHEMBL4,GSDSWSVVBLHKDQ-UHFFFAOYSA-N,CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23,InChI=1S/C18H20FN3O4/c1-10-9-26-17-14-11(16(23...,CHEMBL1280244,Mycobacterium tuberculosis,1773,4.55,IC50,10.1128/aac.01380-07,PUBLICATION,,acid-fast
712,OFLOXACIN,CHEMBL4,GSDSWSVVBLHKDQ-UHFFFAOYSA-N,CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23,InChI=1S/C18H20FN3O4/c1-10-9-26-17-14-11(16(23...,CHEMBL1280245,Mycobacterium tuberculosis,1773,5.16,IC50,10.1128/aac.01380-07,PUBLICATION,,acid-fast
713,OFLOXACIN,CHEMBL4,GSDSWSVVBLHKDQ-UHFFFAOYSA-N,CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23,InChI=1S/C18H20FN3O4/c1-10-9-26-17-14-11(16(23...,CHEMBL1280246,Mycobacterium tuberculosis,1773,5.1,IC50,10.1128/aac.01380-07,PUBLICATION,,acid-fast


In [10]:
# Persist the annotated bacterial subset as a tab-separated file.
# index=False leaves the DataFrame index out of the file.
# NOTE(review): "Unnamed: 0" is a regular column of the frame (it shows up in
# the preview above), so it is still written out - confirm that is intended.
processed_path = f"{DATA_DIR}/processed/bacterial_chembl_33.tsv"
bact_chembl_df.to_csv(processed_path, sep="\t", index=False)

In [11]:
# Number of assay rows per strain-type label, most frequent first.
bact_chembl_df["strain_type"].value_counts()

strain_type
gram-negative    22415
gram-positive    12526
acid-fast         9602
Name: count, dtype: int64

In [12]:
# Number of distinct bacterial organisms that remain after filtering.
bact_chembl_df["assay_organism"].nunique()

295