# Processing and generating the bacterial subset of the DrugCentral database

The source database is available from [DrugCentral](https://drugcentral.org/).

In [1]:
import json
import os

import pandas as pd

# Turn off pandas' chained-assignment check for the whole notebook
# (the pandas default for this option is "warn").
pd.set_option("mode.chained_assignment", None)

In [2]:
# Root of the data tree used by this notebook: inputs are read from its raw/ and
# mapping/ subfolders and the result is written to processed/. The path is
# relative, so it is resolved against the kernel's working directory.
DATA_DIR: str = "../data"

In [3]:
# Structure table: contributes the SMILES / InChI / InChIKey / ID / INN / CAS_RN columns.
struct_df = pd.read_csv(f"{DATA_DIR}/raw/structures.smiles.tsv", sep="\t")

# Drug-target interaction table; unparseable lines are skipped instead of raising.
# The merge uses pandas' default inner join, so interaction rows whose STRUCT_ID
# has no matching structure ID are not carried over.
drugcentral_df = pd.read_csv(
    f"{DATA_DIR}/raw/drug.target.interaction.tsv.gz",
    sep="\t",
    compression="gzip",
    on_bad_lines="skip",
    low_memory=False,
).merge(struct_df, left_on="STRUCT_ID", right_on="ID")

drugcentral_df.head(3)

Unnamed: 0,DRUG_NAME,STRUCT_ID,TARGET_NAME,TARGET_CLASS,ACCESSION,GENE,SWISSPROT,ACT_VALUE,ACT_UNIT,ACT_TYPE,...,MOA_SOURCE_URL,ACTION_TYPE,TDL,ORGANISM,SMILES,InChI,InChIKey,ID,INN,CAS_RN
0,levobupivacaine,4,Potassium voltage-gated channel subfamily H me...,Ion channel,Q12809,KCNH2,KCNH2_HUMAN,4.89,,IC50,...,,,Tclin,Homo sapiens,CCCCN1CCCC[C@H]1C(=O)NC1=C(C)C=CC=C1C,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,LEBVLXFERQHONN-INIZCTEOSA-N,4,levobupivacaine,27262-47-1
1,levobupivacaine,4,Sodium channel protein type 1 subunit alpha,Ion channel,P35498,SCN1A,SCN1A_HUMAN,5.79,,IC50,...,,,Tclin,Homo sapiens,CCCCN1CCCC[C@H]1C(=O)NC1=C(C)C=CC=C1C,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,LEBVLXFERQHONN-INIZCTEOSA-N,4,levobupivacaine,27262-47-1
2,levobupivacaine,4,Sodium channel protein type 4 subunit alpha,Ion channel,P35499,SCN4A,SCN4A_HUMAN,,,,...,https://www.ebi.ac.uk/chembl/compound/inspect/...,BLOCKER,Tclin,Homo sapiens,CCCCN1CCCC[C@H]1C(=O)NC1=C(C)C=CC=C1C,InChI=1S/C18H28N2O/c1-4-5-12-20-13-7-6-11-16(2...,LEBVLXFERQHONN-INIZCTEOSA-N,4,levobupivacaine,27262-47-1


In [4]:
# Distinct organisms in the merged table (missing values are not counted).
drugcentral_df["ORGANISM"].nunique(dropna=True)

261

# Subset to bacterial assays

In [5]:
# Organism -> strain-type lookup: its keys are matched against the ORGANISM
# column and its values become the `strain_type` label.
# The file is opened in a `with` block so the handle is closed as soon as the
# JSON is parsed, and decoded as UTF-8 rather than with the platform default.
with open(f"{DATA_DIR}/mapping/bact_mapper.json", encoding="utf-8") as mapper_file:
    bacterial_species = json.load(mapper_file)
len(bacterial_species)

382

In [6]:
# Two row filters, combined with AND:
#   m1 - the organism appears among the keys of `bacterial_species`
#   m2 - the measurement is an IC50, Ki or MIC value
m2 = drugcentral_df["ACT_TYPE"].isin(["IC50", "Ki", "MIC"])
m1 = drugcentral_df["ORGANISM"].isin(bacterial_species)
bact_drugcentral_df = drugcentral_df.loc[m1 & m2]

In [7]:
# (rows, columns) of the bacterial subset at this point in the notebook.
bact_drugcentral_df.shape

(433, 26)

In [8]:
# Columns retained for the exported table, listed in output order.
cols_to_keep = [
    "DRUG_NAME", "STRUCT_ID",
    "TARGET_NAME", "ACCESSION",
    "ACT_VALUE", "ACT_UNIT", "ACT_TYPE", "ACT_SOURCE", "RELATION",
    "ORGANISM",
    "SMILES", "InChIKey",
]
# Label-based selection of every row and only the listed columns.
bact_drugcentral_df = bact_drugcentral_df.loc[:, cols_to_keep]

In [9]:
# Label every row with the value `bacterial_species` holds for its organism.
# `assign` builds a new frame instead of inserting a column into a frame that was
# itself produced by slicing, so this step does not depend on pandas'
# chained-assignment warning being silenced.
bact_drugcentral_df = bact_drugcentral_df.assign(
    strain_type=bact_drugcentral_df["ORGANISM"].map(bacterial_species)
)

In [10]:
# Preview the first five rows, including the new strain_type column.
bact_drugcentral_df.head(n=5)

Unnamed: 0,DRUG_NAME,STRUCT_ID,TARGET_NAME,ACCESSION,ACT_VALUE,ACT_UNIT,ACT_TYPE,ACT_SOURCE,RELATION,ORGANISM,SMILES,InChIKey,strain_type
17,aminopterin,21,Dihydrofolate reductase,P00381,8.3,,IC50,CHEMBL,=,Lactobacillus casei,NC1=NC2=NC=C(CNC3=CC=C(C=C3)C(=O)N[C@@H](CCC(O...,TVZGACDUOSZQKY-LBPRGKRZSA-N,acid-fast
18,aminopterin,21,Dihydrofolate reductase,B0BL08,7.96,,IC50,CHEMBL,=,Escherichia coli,NC1=NC2=NC=C(CNC3=CC=C(C=C3)C(=O)N[C@@H](CCC(O...,TVZGACDUOSZQKY-LBPRGKRZSA-N,gram-negative
31,fluorouracil,26,Uracil phosphoribosyltransferase,P0A8F0,4.89,,Ki,CHEMBL,=,Escherichia coli (strain K12),FC1=CNC(=O)NC1=O,GHASVSINZRGABV-UHFFFAOYSA-N,gram-negative
32,azaribine,27,Orotidine 5'-phosphate decarboxylase,O26232,4.96,,Ki,CHEMBL,=,Methanothermobacter thermautotrophicus (strain...,CC(=O)OC[C@H]1O[C@H]([C@H](OC(C)=O)[C@@H]1OC(C...,QQOBRRFOVWGIMD-OJAKKHQRSA-N,gram-positive
51,acarbose,39,Alpha-amylase,P94451,6.47,,Ki,CHEMBL,=,Geobacillus stearothermophilus,C[C@H]1O[C@H](O[C@@H]2[C@@H](CO)O[C@H](O[C@@H]...,XUFXOAAUWZOOIT-UGEKTDRHSA-N,gram-positive


In [11]:
# Write the bacterial subset as a tab-separated file, without the index column.
# The output folder is created first (no-op when it already exists) so the
# export does not fail when processed/ is missing.
os.makedirs(f"{DATA_DIR}/processed", exist_ok=True)
bact_drugcentral_df.to_csv(
    f"{DATA_DIR}/processed/bacterial_drugcentral.tsv", sep="\t", index=False
)

In [12]:
# Rows per strain type in the bacterial subset, most frequent first.
bact_drugcentral_df["strain_type"].value_counts()

strain_type
gram-negative    281
acid-fast         81
gram-positive     71
Name: count, dtype: int64

In [13]:
# Distinct organisms left in the bacterial subset (missing values are not counted).
bact_drugcentral_df["ORGANISM"].nunique(dropna=True)

68