# Processing the CO-ADD database and generating its bacterial subset

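The CO-ADD database is available at [db.co-add.org](http://db.co-add.org/). This notebook reads the dose-response export `CO-ADD_DoseResponseData_r03_01-02-2020_CSV.zip`.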

In [1]:
import pandas as pd
import json

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.set_option("mode.chained_assignment", None)  # pandas default is "warn"

In [2]:
# Root of the project's data tree, relative to the notebook's working directory.
# The raw/, mapping/ and processed/ paths used in this notebook are all built from it.
DATA_DIR = "../data"

In [3]:
# Load the CO-ADD dose-response export straight from the zipped CSV.
# Malformed rows are still dropped, but `on_bad_lines="warn"` reports them
# instead of discarding them silently, so any row loss is visible in the output.
coadd_df = pd.read_csv(
    f"{DATA_DIR}/raw/CO-ADD_DoseResponseData_r03_01-02-2020_CSV.zip",
    low_memory=False,  # parse the whole file at once so column dtypes are inferred consistently
    on_bad_lines="warn",
    compression="zip",
)
coadd_df.head(3)

Unnamed: 0.1,Unnamed: 0,COADD_ID,COMPOUND_CODE,COMPOUND_NAME,SMILES,PROJECT_ID,LIBRARY_NAME,ASSAY_ID,ORGANISM,STRAIN,NASSAYS,DRVAL_TYPE,DRVAL_MEDIAN,DRVAL_UNIT,DMAX_AVE
0,0,CO-ADD:0136135,0367428:01,,[H]N1C(N([H])C(OC)C(C(OC)=O)(F)C1=O)=O,CO-ADD:PL0011,NIH/NCI (USA) - Diversity Set V,MA_007,Homo sapiens,HEK293; ATCC CRL1573,2,CC50,>10,uM,4.6
1,1,CO-ADD:0136135,0367428:01,,[H]N1C(N([H])C(OC)C(C(OC)=O)(F)C1=O)=O,CO-ADD:PL0011,NIH/NCI (USA) - Diversity Set V,GP_020,Staphylococcus aureus,ATCC 43300; MRSA,2,MIC,5,uM,97.9
2,2,CO-ADD:0136135,0367428:01,,[H]N1C(N([H])C(OC)C(C(OC)=O)(F)C1=O)=O,CO-ADD:PL0011,NIH/NCI (USA) - Diversity Set V,GN_042,Pseudomonas aeruginosa,ATCC 27853,2,MIC,>10,uM,17.5


In [4]:
# Number of distinct organisms in the full dataset (missing values are not counted).
coadd_df["ORGANISM"].nunique(dropna=True)

17

# Subset to bacterial assays

In [5]:
# Load the organism-name -> strain-type lookup used below to select and label
# bacterial rows. The context manager closes the file handle deterministically
# (a bare `open()` leaves that to the garbage collector), and the encoding is
# pinned so the read does not depend on the platform's locale default.
with open(f"{DATA_DIR}/mapping/bact_mapper.json", encoding="utf-8") as mapper_file:
    bacterial_species = json.load(mapper_file)
len(bacterial_species)

382

In [6]:
# Row filters:
#   m1 - the organism is one of the keys of the bacterial mapper
#   m2 - the dose-response value type is IC50, Ki or MIC
m1 = coadd_df["ORGANISM"].isin(list(bacterial_species))
m2 = coadd_df["DRVAL_TYPE"].isin(["IC50", "Ki", "MIC"])
# Keep only the rows that satisfy both conditions.
bact_coadd_df = coadd_df.loc[m1 & m2]

In [7]:
# Sanity check: (rows, columns) left after the organism / value-type filter.
bact_coadd_df.shape

(25290, 15)

In [8]:
# Columns carried into the processed dataset: compound identifiers and structure,
# the organism tested, and the DRVAL_* dose-response columns.
cols_to_keep = [
    "COADD_ID",
    "COMPOUND_NAME",
    "SMILES",
    "ORGANISM",
    "DRVAL_TYPE",
    "DRVAL_MEDIAN",
    "DRVAL_UNIT",
]
# `.copy()` detaches the subset from the frame it was sliced from, so the column
# added in the next cell is written to an independent DataFrame rather than
# depending on the globally silenced chained-assignment warning.
bact_coadd_df = bact_coadd_df[cols_to_keep].copy()

In [9]:
# Label every row with the value the bacterial mapper holds for its organism
# (gram-positive / gram-negative in the rows displayed below). `Series.map` with
# a dict yields NaN for any key it cannot find.
bact_coadd_df["strain_type"] = bact_coadd_df["ORGANISM"].map(bacterial_species)

In [10]:
# Preview the first five rows of the labelled bacterial subset.
bact_coadd_df.head(n=5)

Unnamed: 0,COADD_ID,COMPOUND_NAME,SMILES,ORGANISM,DRVAL_TYPE,DRVAL_MEDIAN,DRVAL_UNIT,strain_type
1,CO-ADD:0136135,,[H]N1C(N([H])C(OC)C(C(OC)=O)(F)C1=O)=O,Staphylococcus aureus,MIC,5,uM,gram-positive
2,CO-ADD:0136135,,[H]N1C(N([H])C(OC)C(C(OC)=O)(F)C1=O)=O,Pseudomonas aeruginosa,MIC,>10,uM,gram-negative
3,CO-ADD:0136135,,[H]N1C(N([H])C(OC)C(C(OC)=O)(F)C1=O)=O,Acinetobacter baumannii,MIC,>10,uM,gram-negative
4,CO-ADD:0136135,,[H]N1C(N([H])C(OC)C(C(OC)=O)(F)C1=O)=O,Klebsiella pneumoniae,MIC,>10,uM,gram-negative
5,CO-ADD:0136135,,[H]N1C(N([H])C(OC)C(C(OC)=O)(F)C1=O)=O,Escherichia coli,MIC,>10,uM,gram-negative


In [11]:
# Persist the bacterial subset as a tab-separated file, without the DataFrame index.
bact_coadd_df.to_csv(
    f"{DATA_DIR}/processed/bacterial_coadd.tsv",
    sep="\t",
    index=False,
)

In [12]:
# Rows per strain type, most frequent first (missing labels are not counted).
bact_coadd_df["strain_type"].value_counts(dropna=True)

strain_type
gram-negative    19969
gram-positive     5321
Name: count, dtype: int64

In [13]:
# Number of distinct organisms remaining in the bacterial subset.
bact_coadd_df["ORGANISM"].nunique(dropna=True)

9