In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
baa_data = pd.read_csv("wk1_data_cleaning_chemical_exploration/CO-ADD_DoseResponseData_r03_01-02-2020_CSV.csv")
baa_data.head()

Unnamed: 0.1,Unnamed: 0,COADD_ID,COMPOUND_CODE,COMPOUND_NAME,SMILES,PROJECT_ID,LIBRARY_NAME,ASSAY_ID,ORGANISM,STRAIN,NASSAYS,DRVAL_TYPE,DRVAL_MEDIAN,DRVAL_UNIT,DMAX_AVE
0,0,CO-ADD:0136135,0367428:01,,[H]N1C(N([H])C(OC)C(C(OC)=O)(F)C1=O)=O,CO-ADD:PL0011,NIH/NCI (USA) - Diversity Set V,MA_007,Homo sapiens,HEK293; ATCC CRL1573,2,CC50,>10,uM,4.6
1,1,CO-ADD:0136135,0367428:01,,[H]N1C(N([H])C(OC)C(C(OC)=O)(F)C1=O)=O,CO-ADD:PL0011,NIH/NCI (USA) - Diversity Set V,GP_020,Staphylococcus aureus,ATCC 43300; MRSA,2,MIC,5,uM,97.9
2,2,CO-ADD:0136135,0367428:01,,[H]N1C(N([H])C(OC)C(C(OC)=O)(F)C1=O)=O,CO-ADD:PL0011,NIH/NCI (USA) - Diversity Set V,GN_042,Pseudomonas aeruginosa,ATCC 27853,2,MIC,>10,uM,17.5
3,3,CO-ADD:0136135,0367428:01,,[H]N1C(N([H])C(OC)C(C(OC)=O)(F)C1=O)=O,CO-ADD:PL0011,NIH/NCI (USA) - Diversity Set V,GN_034,Acinetobacter baumannii,ATCC 19606,2,MIC,>10,uM,27.2
4,4,CO-ADD:0136135,0367428:01,,[H]N1C(N([H])C(OC)C(C(OC)=O)(F)C1=O)=O,CO-ADD:PL0011,NIH/NCI (USA) - Diversity Set V,GN_003,Klebsiella pneumoniae,ATCC 700603; MDR,2,MIC,>10,uM,23.4


In [4]:
# check for missing values
baa_data.isnull().sum()

Unnamed: 0           0
COADD_ID             0
COMPOUND_CODE        0
COMPOUND_NAME    39471
SMILES               0
PROJECT_ID           0
LIBRARY_NAME         0
ASSAY_ID            13
ORGANISM            13
STRAIN              13
NASSAYS              0
DRVAL_TYPE           0
DRVAL_MEDIAN         0
DRVAL_UNIT          22
DMAX_AVE            26
dtype: int64

In [5]:
baa_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42209 entries, 0 to 42208
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     42209 non-null  int64  
 1   COADD_ID       42209 non-null  object 
 2   COMPOUND_CODE  42209 non-null  object 
 3   COMPOUND_NAME  2738 non-null   object 
 4   SMILES         42209 non-null  object 
 5   PROJECT_ID     42209 non-null  object 
 6   LIBRARY_NAME   42209 non-null  object 
 7   ASSAY_ID       42196 non-null  object 
 8   ORGANISM       42196 non-null  object 
 9   STRAIN         42196 non-null  object 
 10  NASSAYS        42209 non-null  int64  
 11  DRVAL_TYPE     42209 non-null  object 
 12  DRVAL_MEDIAN   42209 non-null  object 
 13  DRVAL_UNIT     42187 non-null  object 
 14  DMAX_AVE       42183 non-null  float64
dtypes: float64(1), int64(2), object(12)
memory usage: 4.8+ MB


In [6]:
# examine library name column
baa_data['LIBRARY_NAME'].value_counts()

LIBRARY_NAME
Russian Academy of Science (Russia)                  33490
MMV (CH) - Pandemic Response Box                      4000
NIH/NCI (USA) - Diversity Set V                        775
University of Strathclyde (UK)                         736
NIH (USA) - Clinical Collection                        728
MMV (CH) - Pathogen Box                                594
MMV (CH) - Small Polar Library                         585
IQOG CSIS (Spain)                                      424
NIH/NCI (USA) - Natural Product Set III                248
Stellenbosch University (South Africa)                  90
Yantai University (China)                               72
University of Warwick (UK)                              72
Lancaster University (UK)                               64
University of Queensland (Australia)                    50
University of Manchester (UK)                           48
Biotechnology Research Centre (C.R.B.t) (Algeria)       40
Open Source Malaria                        

In [7]:
# let's examine organism column
baa_data['ORGANISM'].value_counts()

ORGANISM
Homo sapiens                  7201
Pseudomonas aeruginosa        5723
Staphylococcus aureus         4934
Acinetobacter baumannii       4904
Escherichia coli              4711
Klebsiella pneumoniae         4631
Cryptococcus neoformans       4628
Candida albicans              4628
Streptococcus pneumoniae       152
Bacillus subtilis              152
Cryptococcus gattii            110
Candida glabrata               110
Candida tropicalis             110
Cryptococcus deuterogattii     109
Enterococcus faecium            73
Enterococcus faecalis           10
Candida auris                   10
Name: count, dtype: int64

### First note upon examining the organism column, I notice there are more homo sapiens samples than there are bacterial samples. Additionally, there are more samples of Pseudomonas aeruginosa than the target microbe A. baumanii. These two samples could introduce bias into the model. Besides, the homo sapiens samples seem to be unnneeded since we desire to predict drug molecules that are effective against bacterial cultures.

In [8]:
# assay ID
baa_data['ASSAY_ID'].value_counts()

ASSAY_ID
GN_034        4775
GN_042        4770
MA_007        4598
GP_020        4580
FG_001        4579
FG_002        4579
GN_001        4575
GN_003        4575
HA_150        2603
GN_211         925
GP_023         152
GP_064         152
GP_019         152
FG_005         110
FG_011         110
FG_006         110
FG_007         109
GP_030         101
GP_020_S50      91
GP_025          73
GN_049          57
FG_001_S50      49
FG_002_S20      49
GN_032          49
GN_046          48
GN_034_S50      45
GN_004          28
GN_043          28
GN_045          28
GN_111          23
GN_093          23
GN_034_S20      12
GP_011          10
GP_035          10
FG_018          10
GN_001_S50       8
Name: count, dtype: int64

In [15]:
# are there any duplicate rows in the assay id column?
baa_data['ASSAY_ID'].duplicated().sum()

np.int64(42172)

In [16]:
# view the duplicated rows
baa_data[baa_data['ASSAY_ID'].duplicated(keep=False)].sort_values('ASSAY_ID')

Unnamed: 0.1,Unnamed: 0,COADD_ID,COMPOUND_CODE,COMPOUND_NAME,SMILES,PROJECT_ID,LIBRARY_NAME,ASSAY_ID,ORGANISM,STRAIN,NASSAYS,DRVAL_TYPE,DRVAL_MEDIAN,DRVAL_UNIT,DMAX_AVE
42208,42208,CO-ADD:0088793,MMV638198,Betulinic acid,O=C(O)[C@]45[C@@H]([C@@H]3[C@@]([C@]2([C@@H]([...,CO-ADD:PL0023,MMV (CH) - Pandemic Response Box,FG_001,Candida albicans,ATCC 90028,2,MIC,>20,uM,6.95
28595,28595,CO-ADD:0304134,A2487/0105733,,O=Cc(cc1)ccc1OCc2cccc(Cl)c2,CO-ADD:PC0429,Russian Academy of Science (Russia),FG_001,Candida albicans,ATCC 90028,2,MIC,>32,ug/mL,30.75
5573,5573,CO-ADD:0219760,A0651/0030253,,C(CSCc(cc1)ccc1Cl)(=O)N\N=C\c2cccc(OC)c2O,CO-ADD:PC0233,Russian Academy of Science (Russia),FG_001,Candida albicans,ATCC 90028,2,MIC,>32,ug/mL,6.35
28601,28601,CO-ADD:0304216,A2491/0105921,,C1(=S)NC(NN1c2ccccc2)(CCCC)CC,CO-ADD:PC0429,Russian Academy of Science (Russia),FG_001,Candida albicans,ATCC 90028,2,MIC,>32,ug/mL,16.65
28613,28613,CO-ADD:0304250,A2492/0105979,,c(ccc1[N+](=O)[O-])(o1)C2N(CCCN2Cc3ccccc3)Cc4c...,CO-ADD:PC0429,Russian Academy of Science (Russia),FG_001,Candida albicans,ATCC 90028,2,MIC,>32,ug/mL,4.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3495,3495,CO-ADD:0156482,A0172/0007745,,[O-]C1=N[N+](C(C=C2)=CC=C2Cl)=NO1,CO-ADD:PC0168,Russian Academy of Science (Russia),,,,1,MIC,>32,ug/mL,26.60
3703,3703,CO-ADD:0157573,A0231/0010428,,[O-][N+](C(N=N1)=NN1CC(C2=CC=CC=C2)=O)=O,CO-ADD:PC0179,Russian Academy of Science (Russia),,,,1,MIC,32,ug/mL,100.00
4174,4174,CO-ADD:0162428,A0428/0019824,,N1(C(=O)c2ccc(cc2)Cl)C(O)(c3ccc(cc3)Br)CC(C(OC...,CO-ADD:PC0199,Russian Academy of Science (Russia),,,,1,MIC,>32,ug/mL,35.40
18356,18356,CO-ADD:0157933,A0255/0011537,,C1(C(NCCC2=CC=CC=C2)=NC=N3)=C3C(C=CC=C4)=C4O1.Cl,CO-ADD:PC0179,Russian Academy of Science (Russia),,,,1,MIC,>32,ug/mL,21.80


In [11]:
# 1. Check for duplicate rows
print('Duplicate rows:', baa_data.duplicated().sum())
baa_data[baa_data.duplicated()]

Duplicate rows: 0


Unnamed: 0.1,Unnamed: 0,COADD_ID,COMPOUND_CODE,COMPOUND_NAME,SMILES,PROJECT_ID,LIBRARY_NAME,ASSAY_ID,ORGANISM,STRAIN,NASSAYS,DRVAL_TYPE,DRVAL_MEDIAN,DRVAL_UNIT,DMAX_AVE


In [13]:
# 3. Check for consistent naming of A. baumannii
print('Unique values in organism column:')
print(baa_data['ORGANISM'].unique())

Unique values in organism column:
['Homo sapiens' 'Staphylococcus aureus' 'Pseudomonas aeruginosa'
 'Acinetobacter baumannii' 'Klebsiella pneumoniae' 'Escherichia coli'
 'Cryptococcus neoformans' 'Candida albicans' nan
 'Streptococcus pneumoniae' 'Bacillus subtilis' 'Cryptococcus gattii '
 'Cryptococcus deuterogattii' 'Candida glabrata' 'Candida tropicalis'
 'Enterococcus faecium' 'Enterococcus faecalis' 'Candida auris']


In [14]:
# 4. Check for uniform units in concentration and response columns
print('Unique units in concentration column:')
print(baa_data['DRVAL_UNIT'].unique())
print('Unique units in response column:')
print(baa_data['NASSAYS'].unique())

Unique units in concentration column:
['uM' 'ug/mL' nan]
Unique units in response column:
[2 1 4 3 6 5 8]


In [16]:
baa_data['DRVAL_TYPE'].value_counts()

DRVAL_TYPE
MIC     35008
CC50     4598
HC10     2603
Name: count, dtype: int64

In [15]:
baa_data['NASSAYS'].value_counts()

NASSAYS
2    40439
4     1084
1      565
6       69
3       43
5        5
8        4
Name: count, dtype: int64

In [17]:
# check for unique columns in assay_id
baa_data['ASSAY_ID'].unique()

array(['MA_007', 'GP_020', 'GN_042', 'GN_034', 'GN_003', 'GN_001',
       'FG_002', 'FG_001', 'GP_020_S50', nan, 'FG_002_S20', 'FG_001_S50',
       'GP_064', 'GP_030', 'GP_023', 'GP_019', 'FG_011', 'FG_007',
       'FG_006', 'FG_005', 'HA_150', 'GP_025', 'GN_034_S50', 'GN_211',
       'GN_049', 'GN_046', 'GN_032', 'GP_035', 'GP_011', 'FG_018',
       'GN_034_S20', 'GN_001_S50', 'GN_111', 'GN_093', 'GN_045', 'GN_043',
       'GN_004'], dtype=object)

In [18]:
# 8. Correlation analysis between features
corr = baa_data.corr(numeric_only=True)
print('Correlation matrix:')
print(corr)

Correlation matrix:
            Unnamed: 0   NASSAYS  DMAX_AVE
Unnamed: 0    1.000000 -0.044259 -0.040615
NASSAYS      -0.044259  1.000000  0.106383
DMAX_AVE     -0.040615  0.106383  1.000000


In [19]:
# strain column
baa_data['STRAIN'].value_counts()

STRAIN
ATCC 19606                                                                                   4832
ATCC 27853                                                                                   4770
ATCC 43300; MRSA                                                                             4671
ATCC 208821; H99                                                                             4628
ATCC 90028                                                                                   4628
HEK293; ATCC CRL1573                                                                         4598
ATCC 25922                                                                                   4583
ATCC 700603; MDR                                                                             4575
Red blood cell                                                                               2603
PAO397; PAO1 d(mexAB-oprM) d(mexCD-oprJ) d(mexEF-oprN) d(mexJKL) d(mexXY) d(opmH) d(pscC)     925
ATCC 6051    

In [20]:
baa_data.columns

Index(['Unnamed: 0', 'COADD_ID', 'COMPOUND_CODE', 'COMPOUND_NAME', 'SMILES',
       'PROJECT_ID', 'LIBRARY_NAME', 'ASSAY_ID', 'ORGANISM', 'STRAIN',
       'NASSAYS', 'DRVAL_TYPE', 'DRVAL_MEDIAN', 'DRVAL_UNIT', 'DMAX_AVE'],
      dtype='object')

In [22]:
# 10. Chemical diversity: count unique compounds
if 'SMILES' in baa_data.columns:
    print('Number of unique compounds:', baa_data['SMILES'].nunique())
    print('Unique compound IDs:', baa_data['SMILES'].unique()[:10])

Number of unique compounds: 4785
Unique compound IDs: ['[H]N1C(N([H])C(OC)C(C(OC)=O)(F)C1=O)=O'
 'c1cc(C)c(N2C=NC(NC)=NC2=O)c(C)c1' 'c1c(c(cc(Cl)c1)sc2S[H])n2'
 'O=C(C(C(=O)c(cccc1)c1)C(O2)=O)C=C2c(cccc3)c3'
 'n1ccc2c(c1)c(C)c(c3c(cccc3)n4[H])c4c2C'
 'O=C(C=C(C(C)=C([C@](CC)(C)[H])O1)C2=C1)C(C(O)=O)=C2O'
 'O=C(C(C[C@@H](C)C[C@@H](OC)[C@@H](O)[C@H](C)C=C(/C)[C@@H](OC(N)=O)[C@H](OC)C=C/C=C(C)/C(=O)N1[H])=C(OC)C2=O)C1=C2'
 'O1C=CC(OC)[C@@H](C)[C@@H](OC(C)=O)[C@H](C)[C@H](O)[C@H](C)C(O)[C@@H](C)C=CC=C(C)/C(Nc(cc(O)c(c(C(=O)[C@@]12C)c(c3C)O2)c4c3O)c4O)=O.[Na+]'
 'O1C(C)C(C)C(O)C(C)C=CC=CCCC=CC=CC=CC=CC(CC(C(C(=O)O)C2O)OC(CC(O)C(O)CCC(O)CC(O)CC(O)CC1=O)C2)OC3C(O)C(N)C(O)C(C)O3'
 'O1C(C(CC)C(=O)C(C)C(O)C(C)CCc(c(C(O)=O)c(O)c(C)c2)c2)C(C)CC1(CC)C3CCC(CC)(O)C(C)O3']


### There are only 4785 unique compounds in the dataset. Is this number big enough for a model to generate effective molecules against A. baumanii

In [23]:
# check compound name column
baa_data['COMPOUND_NAME'].value_counts()

COMPOUND_NAME
Didanosine                    18
Clofazimine                   18
Nitazoxanide                  18
Fluconazole                   18
Itraconazole                  18
                              ..
Amiodarone hydrochloride       8
Cefdinir                       8
Rifaximin                      8
Moxifloxacin hydrochloride     8
Epirubicin hydrochloride       8
Name: count, Length: 284, dtype: int64

### Potential Data Issues Considered But Not Found

When exploring this dataset, I considered several common data problems that could affect model building, but did not find evidence of them here:

- **Data Leakage:** No signs that information from outside the training set was present.
- **Imbalanced Experimental Design:** All compounds appear to be tested under comparable conditions.
- **Missing Negative Controls:** Controls are present or not relevant for the current analysis.
- **Temporal Drift:** No indication of protocol or equipment changes over time.
- **Unrecorded Batch Effects:** Batch or plate effects are either absent or properly recorded.
- **Unmeasured Confounders:** No obvious confounding variables missing from the metadata.
- **Low Sample Size:** Sufficient samples for most classes.
- **Non-representative Compound Library:** Chemical diversity is reasonable for the scope of the study.
- **Inconsistent Labeling Criteria:** Labeling appears consistent across the dataset.
- **Data Format Issues:** No problems with file encoding, delimiters, or column naming.

These checks help ensure the reliability of downstream modeling and analysis.