# Processing and generating bacterial subset of Drug Central database 

The database can be found [here](https://drugcentral.org/).

In [1]:
import pandas as pd

pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
DATA_DIR = "../data"

In [3]:
drugcentral_df = pd.read_csv(
    f"{DATA_DIR}/drug.target.interaction.tsv.gz",
    sep="\t",
    low_memory=False,
    on_bad_lines="skip",
    compression="gzip",
)
drugcentral_df.head(3)

Unnamed: 0,DRUG_NAME,STRUCT_ID,TARGET_NAME,TARGET_CLASS,ACCESSION,GENE,SWISSPROT,ACT_VALUE,ACT_UNIT,ACT_TYPE,ACT_COMMENT,ACT_SOURCE,RELATION,MOA,MOA_SOURCE,ACT_SOURCE_URL,MOA_SOURCE_URL,ACTION_TYPE,TDL,ORGANISM
0,levobupivacaine,4,Potassium voltage-gated channel subfamily H me...,Ion channel,Q12809,KCNH2,KCNH2_HUMAN,4.89,,IC50,Inhibition of wild-type human ERG channel expr...,CHEMBL,=,,,,,,Tclin,Homo sapiens
1,levobupivacaine,4,Sodium channel protein type 1 subunit alpha,Ion channel,P35498,SCN1A,SCN1A_HUMAN,5.79,,IC50,,WOMBAT-PK,=,,,,,,Tclin,Homo sapiens
2,levobupivacaine,4,Sodium channel protein type 4 subunit alpha,Ion channel,P35499,SCN4A,SCN4A_HUMAN,,,,,WOMBAT-PK,,1.0,CHEMBL,,https://www.ebi.ac.uk/chembl/compound/inspect/...,BLOCKER,Tclin,Homo sapiens


In [4]:
drugcentral_df["ORGANISM"].nunique()

264

In [5]:
bacterial_species = {
    "Lactobacillus casei": "acid-fast",
    "Escherichia coli": "gram-negative",
    "Escherichia coli (strain K12)": "gram-negative",
    "Methanothermobacter thermautotrophicus (strain ATCC 29096 / DSM 1053 / JCM 10044 / NBRC 100330 / Delta H)": "gram-positive",
    "Geobacillus stearothermophilus": "gram-positive",
    "Helicobacter pylori (strain ATCC 700392 / 26695)": "gram-negative",
    "Mycobacterium tuberculosis (strain ATCC 25618 / H37Rv)": "acid-fast",
    "Pseudomonas aeruginosa (strain ATCC 15692 / DSM 22644 / CIP 104116 / JCM 14847 / LMG 12228 / 1C / PRS 101 / PAO1)": "gram-negative",
    "Hydrogenovibrio crunogenus (strain XCL-2)": "gram-negative",
    "Sulfurihydrogenibium sp. (strain YO3AOP1)": "gram-negative",
    "Helicobacter pylori": "gram-negative",
    "Bacillus thermoproteolyticus": "gram-positive",
    "Pseudomonas aeruginosa": "gram-negative",
    "Shigella dysenteriae": "gram-negative",
    "Bacillus anthracis": "gram-positive",
    "Clostridium botulinum": "gram-positive",
    "Clostridium perfringens (strain 13 / Type A)": "gram-positive",
    "Pseudomonas aeruginosa (strain ATCC 15692 / PAO1 / 1C / PRS 101 / LMG 12228)": "gram-negative",
    "Citrobacter freundii": "gram-negative",
    "Salmonella newport": "gram-negative",
    "Klebsiella pneumoniae": "gram-negative",
    "Acinetobacter baumannii": "gram-negative",
    "Staphylococcus aureus": "gram-positive",
    "Neisseria meningitidis serogroup C / serotype 2a (strain ATCC 700532 / DSM 15464 / FAM18)": "gram-negative",
    "Yersinia pestis": "gram-negative",
    "Salmonella typhi": "gram-negative",
    "Haemophilus influenzae (strain ATCC 51907 / DSM 11121 / KW20 / Rd)": "gram-negative",
    "Streptococcus pyogenes serotype M1": "gram-positive",
    "Haemophilus influenzae": "gram-negative",
    "Streptococcus pyogenes": "gram-positive",
    "Bacillus subtilis (strain 168)": "gram-positive",
    "Bacillus licheniformis": "gram-positive",
    "Streptococcus pyogenes serotype M4 (strain MGAS10750)": "gram-positive",
    "Pseudomonas putida (strain KT2440)": "gram-negative",
    "Streptomyces coelicolor (strain ATCC BAA-471 / A3(2) / M145)": "gram-positive",
    "Enterobacter cloacae": "gram-negative",
    "Citrobacter gillenii": "gram-negative",
    "Bacillus clausii": "gram-positive",
    "Serratia fonticola": "gram-negative",
    "Brachyspira pilosicoli": "gram-negative",
    "Salmonella enterica subsp. enterica serovar Westhampton": "gram-negative",
    "Pseudomonas luteola": "gram-negative",
    "Haloarcula marismortui (strain ATCC 43049 / DSM 3752 / JCM 8966 / VKM B-1809)": "gram-negative",
    "Geobacillus kaustophilus (strain HTA426)": "gram-positive",
    "Mycobacterium tuberculosis (strain CDC 1551 / Oshkosh)": "acid-fast",
    "Mycobacterium leprae (strain TN)": "acid-fast",
    "Salmonella choleraesuis (strain SC-B67)": "gram-negative",
    "Streptomyces caespitosus": "gram-positive",
    "Neisseria gonorrhoeae": "gram-negative",
    "Arthrospira platensis": "gram-negative",
    "Aliivibrio fischeri": "gram-negative",
    "Enterococcus faecalis (strain ATCC 700802 / V583)": "gram-positive",
    "Streptococcus pneumoniae serotype 4 (strain ATCC BAA-334 / TIGR4)": "gram-positive",
    "Escherichia coli O157:H7": "gram-negative",
    "Lactobacillus fermentum": "acid-fast",
    "Streptococcus pneumoniae": "gram-positive",
    "Enterococcus faecium": "gram-positive",
    "Chlamydia pneumoniae": "gram-negative",
    "Thermus thermophilus": "gram-negative",
    "Staphylococcus aureus (strain Newman)": "gram-positive",
    "Campylobacter jejuni subsp. jejuni serotype O:2": "gram-negative",
    "Staphylococcus epidermidis (strain ATCC 35984 / RP62A)": "gram-positive",
    "Mycobacterium smegmatis": "acid-fast",
    "Mycolicibacterium smegmatis": "acid-fast",
    "Clostridium perfringens": "gram-positive",
    "Mycobacterium tuberculosis": "acid-fast",
    "Porphyromonas gingivalis": "gram-negative",
    "Acinetobacter pittii": "gram-negative",
    "Aeromonas allosaccharophila": "gram-negative",
    "Bacillus cereus": "gram-positive",
    "Mycobacterium fortuitum": "acid-fast",
    "Morganella morganii": "gram-negative",
    "Burkholderia cenocepacia": "gram-negative",
    "Mycobacterium avium": "acid-fast",
    "Lactococcus lactis subsp. lactis (strain IL1403)": "gram-positive",
    "Lactococcus lactis subsp. cremoris": "gram-positive",
    "Helicobacter pylori (strain HPAG1)": "gram-negative",
    "Staphylococcus aureus (strain MRSA252)": "gram-positive",
    "Bacillus megaterium": "gram-positive",
    "Alcaligenes sp. (strain DSM 11172)": "gram-negative",
    "Peptoclostridium difficile (strain 630)": "gram-positive",
    "Bacillus sp.": "gram-positive",
    "Pseudomonas putida": "gram-negative",
    "Clostridioides difficile": "gram-positive",
    "Stenotrophomonas maltophilia": "gram-negative",
    "Klebsiella pneumoniae subsp. pneumoniae (strain ATCC 700721 / MGH 78578)": "gram-negative",
    "Escherichia coli DEC1B": "gram-negative",
    "Acinetobacter baumannii (strain ATCC 19606 / DSM 30007 / CIP 70.34 / JCM 6841 / NBRC 109757 / NCIMB 12457 / NCTC 12156 / 81)": "gram-negative",
}

In [6]:
m = drugcentral_df["ORGANISM"].isin(bacterial_species)
bact_drugcentral_df = drugcentral_df[m]

In [7]:
bact_drugcentral_df.shape

(706, 20)

In [8]:
cols_to_keep = [
    "DRUG_NAME",
    "STRUCT_ID",
    "TARGET_NAME",
    "ACCESSION",
    "ACT_VALUE",
    "ACT_UNIT",
    "ACT_TYPE",
    "ACT_SOURCE",
    "RELATION",
    "ORGANISM",
]
bact_drugcentral_df = bact_drugcentral_df[cols_to_keep]

In [9]:
bact_drugcentral_df["strain_type"] = bact_drugcentral_df["ORGANISM"].map(
    bacterial_species
)

In [10]:
bact_drugcentral_df.head()

Unnamed: 0,DRUG_NAME,STRUCT_ID,TARGET_NAME,ACCESSION,ACT_VALUE,ACT_UNIT,ACT_TYPE,ACT_SOURCE,RELATION,ORGANISM,strain_type
17,aminopterin,21,Dihydrofolate reductase,P00381,8.3,,IC50,CHEMBL,=,Lactobacillus casei,acid-fast
18,aminopterin,21,Dihydrofolate reductase,B0BL08,7.96,,IC50,CHEMBL,=,Escherichia coli,gram-negative
31,fluorouracil,26,Uracil phosphoribosyltransferase,P0A8F0,4.89,,Ki,CHEMBL,=,Escherichia coli (strain K12),gram-negative
32,azaribine,27,Orotidine 5'-phosphate decarboxylase,O26232,4.96,,Ki,CHEMBL,=,Methanothermobacter thermautotrophicus (strain...,gram-positive
51,acarbose,39,Alpha-amylase,P94451,6.47,,Ki,CHEMBL,=,Geobacillus stearothermophilus,gram-positive


In [11]:
bact_drugcentral_df.to_csv(
    f"{DATA_DIR}/bacterial_drugcentral.tsv", sep="\t", index=False
)

In [12]:
bact_drugcentral_df["strain_type"].value_counts()

strain_type
gram-negative    432
gram-positive    151
acid-fast        123
Name: count, dtype: int64