In [1]:
import numpy as np  
import pandas as pd  
import matplotlib.pyplot as plt

Read in the web-processed data

In [2]:
# Load the three web-processed exports in one pass: per-timepoint plate readings,
# per-well kinetic parameters, and the summary table.
plates, kinetics, summary = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Data/plate_datatype.csv',
        'Data/kinetic_datatype.csv',
        'Data/summary_table.csv',
    )
)

# Kinetic data read from a separate file; it is used further down as the reference
# for plates that are not listed as having a negative-control well.
non_control_plate_reference = pd.read_csv('Data/ecoli_MG1655_kinetic_data.csv')
non_control_plate_reference


Unnamed: 0,Plate IDs,Strain ID,Strain,Metadata/Modifications,Project,Well,Plate,Media,Replicates,Compound,KEGG ID,CAS ID,Max Resp,Max Resp Rate,Time till max resp rate,AUC,Growth,MLST,Phylogroup/Genome Cluster
0,ECP395,MG1655 WT,MG1655,WT,AntibiotICA,A01,PM11C,IF10b,R1,Amikacin_1,D00865,CAS 39831-55-5,314.462794,52.693016,39.25,12935.687658,1,10,A
1,ECP395,MG1655 WT,MG1655,WT,AntibiotICA,A01,PM11C,IF10b,R2,Amikacin_1,D00865,CAS 39831-55-5,308.297937,54.726883,40.00,12649.642634,1,10,A
2,ECP395,MG1655 WT,MG1655,WT,AntibiotICA,A02,PM11C,IF10b,R1,Amikacin_2,D00865,CAS 39831-55-5,305.861257,52.126786,39.75,12462.492126,1,10,A
3,ECP395,MG1655 WT,MG1655,WT,AntibiotICA,A02,PM11C,IF10b,R2,Amikacin_2,D00865,CAS 39831-55-5,310.501395,54.530693,40.50,12610.347766,1,10,A
4,ECP395,MG1655 WT,MG1655,WT,AntibiotICA,A03,PM11C,IF10b,R1,Amikacin_3,D00865,CAS 39831-55-5,305.197711,52.774471,37.50,12345.150575,1,10,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,ECP395,MG1655 WT,MG1655,WT,AntibiotICA,H10,PM11C,IF10b,R2,Ofloxacin_2,D00453,CAS 82419-36-1,296.479993,50.768528,48.00,12034.984786,1,10,A
188,ECP395,MG1655 WT,MG1655,WT,AntibiotICA,H11,PM11C,IF10b,R1,Ofloxacin_3,D00453,CAS 82419-36-1,214.369584,12.424895,48.00,4736.745299,1,10,A
189,ECP395,MG1655 WT,MG1655,WT,AntibiotICA,H11,PM11C,IF10b,R2,Ofloxacin_3,D00453,CAS 82419-36-1,114.641974,8.243999,47.25,2705.359169,0,10,A
190,ECP395,MG1655 WT,MG1655,WT,AntibiotICA,H12,PM11C,IF10b,R1,Ofloxacin_4,D00453,CAS 82419-36-1,12.342796,0.662017,48.00,543.575644,0,10,A


In [3]:
# NOTE: despite the `wells_` prefix, these lists hold *plate* names; they are matched
# against the `Plate` column when deciding how each plate is thresholded.
wells_with_negative_controls = [
    'PM01',
    'PM02A',
    'PM03B',
    'PM04A',
    'PM05',
    'PM06',
    'PM07',
    'PM08',
]
wells_with_positive_controls = [
    'PM05',
]




Ensure that strain names are consistent, to avoid errors during replicate grouping

In [4]:
# Collect the distinct strain labels seen on each plate so that spelling variants
# of the same strain are easy to spot by eye.
strain_plate_pairs = kinetics[['Strain', 'Plate']].drop_duplicates()
plate_strains = strain_plate_pairs.groupby('Plate')['Strain'].unique()

for plate, strains in plate_strains.items():
    print(plate, ', '.join(strains), sep=': ')


PM01: BW25113, MS5762, MS8806, yfdF, yhjE, ypjC, yqeG
PM02A: argF, argH, BW25113, BW25113wt, malE, MS5762, MS8806, yfdF, yhjE, ypjC, yqeG
PM05: BW25113wt, hisJ
PM09: BW25113, BW25113WT, dedA, yahO, ycaC, ydfR, ygaM, ygaY, yiaG, yjjU, yjjV, ynfN, yodC
PM10: BW25113, BW25113WT, dedA, yahO, ycaC, ydfR, ydgD, ygaM, yiaG, yjjU, yjjV, ynfN, yodC
PM11C: appY, BW25113, gnsB, wbbK, yahO, ycaC, ydcD, ygaM, yiaG
PM12B: appY, BW25113, gnsB, wbbK, ydcD, ydjM, yebF, yebG


Rename the strains below and re-annotate the replicate IDs of the merged strains (comment out if not needed). This is applied to both the kinetics and the plate datatypes.

In [5]:
# Fold the wild-type spelling variants into a single strain label.
kinetics['Strain'] = kinetics['Strain'].replace(
    {'BW25113wt': 'BW25113', 'BW25113WT': 'BW25113'}
)

# Sort first so the replicate numbering below is stable and predictable.
kinetics = kinetics.sort_values(['Strain', 'Media', 'Well', 'Plate', 'PlateIDs'])

# Renumber replicates as R1, R2, ... within each (Strain, Media, Well, Plate) group;
# merged strains would otherwise carry clashing replicate IDs.
replicate_number = (
    kinetics.groupby(['Strain', 'Media', 'Well', 'Plate'], dropna=False).cumcount() + 1
)
kinetics['Replicates'] = 'R' + replicate_number.astype(str)

kinetics = kinetics.reset_index(drop=True)
kinetics

Unnamed: 0,PlateIDs,Strain,Specie,Well,Plate,Media,Replicates,Compound,Description,KEGG ID,CAS ID,Max Resp,Max Resp Rate,Time till max resp rate,AUC,Growth,Control Well Growth_x,Control Well Growth_y
0,ECP0,BW25113,E. coli,A01,PM05,IF0a-NaSuccinate-Fe Citrate,R1,Negative Control,"Nutritional supplement, Negative control",,CAS Negative Control,261.753811,18.563138,29.25,9175.345418,1,1,1
1,ECP1,BW25113,E. coli,A01,PM05,IF0a-NaSuccinate-Fe Citrate,R2,Negative Control,"Nutritional supplement, Negative control",,CAS Negative Control,237.861545,15.101668,26.00,8396.822747,1,1,1
2,ECP0,BW25113,E. coli,A02,PM05,IF0a-NaSuccinate-Fe Citrate,R1,Positive Control,"Nutritional supplement, Positive control",,CAS Luria Broth,264.675626,16.492538,39.50,8660.186453,1,1,1
3,ECP1,BW25113,E. coli,A02,PM05,IF0a-NaSuccinate-Fe Citrate,R2,Positive Control,"Nutritional supplement, Positive control",,CAS Luria Broth,280.122149,17.406779,45.50,9235.361102,1,1,1
4,ECP0,BW25113,E. coli,A03,PM05,IF0a-NaSuccinate-Fe Citrate,R1,L-Alanine,Nutritional supplement,C00041,CAS 56-41-7,267.919578,14.981677,34.25,8812.777956,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12091,ECP124,yqeG,E. coli,H11,PM02A,M9 No Carbon,R2,"2,3-Butanedione","C-Source, alcohol",C00741,CAS 431-03-8,39.233041,3.253104,48.00,1237.025130,0,0,0
12092,ECP123,yqeG,E.Coli,H12,PM01,M9 No Carbon,R1,2-Aminoethanol,"C-Source, alcohol",C00189,CAS 2002-24-6,65.290281,9.003425,41.75,2844.773308,0,0,0
12093,ECP125,yqeG,E.Coli,H12,PM01,M9 No Carbon,R2,2-Aminoethanol,"C-Source, alcohol",C00189,CAS 2002-24-6,62.152873,7.807276,48.00,2621.327148,0,0,0
12094,ECP122,yqeG,E. coli,H12,PM02A,M9 No Carbon,R1,3-Hydroxy-2-butanone,"C-Source, alcohol",C00466,CAS 513-86-0,94.041746,12.522454,46.25,2903.816716,0,0,0


In [6]:
# Same clean-up as for the kinetics table, written as one chain:
#   1. collapse the wild-type spelling variants into 'BW25113',
#   2. sort so replicate numbering is deterministic,
#   3. renumber replicates R1, R2, ... within each (Strain, Media, Well, Plate),
#   4. rebuild a clean 0..n-1 index.
plates = (
    plates
    .assign(
        Strain=lambda df: df['Strain'].replace(
            {'BW25113wt': 'BW25113', 'BW25113WT': 'BW25113'}
        )
    )
    .sort_values(['Strain', 'Media', 'Well', 'Plate', 'PlateIDs'])
    .assign(
        Replicates=lambda df: 'R' + (
            df.groupby(['Strain', 'Media', 'Well', 'Plate'], dropna=False).cumcount() + 1
        ).astype(str)
    )
    .reset_index(drop=True)
)
plates

Unnamed: 0,PlateIDs,Strain,Plate,Well,Media,Replicates,Compound,Description,KEGG ID,CAS ID,...,46.0hrs,46.25hrs,46.5hrs,46.75hrs,47.0hrs,47.25hrs,47.5hrs,47.75hrs,48.0hrs,Control Well Growth
0,ECP0,BW25113,PM05,A01,IF0a-NaSuccinate-Fe Citrate,R1,Negative Control,"Nutritional supplement, Negative control",,CAS Negative Control,...,259.8905,258.9458,261.5874,261.0262,259.5861,258.4217,259.3940,261.6190,258.5275,1
1,ECP1,BW25113,PM05,A01,IF0a-NaSuccinate-Fe Citrate,R2,Negative Control,"Nutritional supplement, Negative control",,CAS Negative Control,...,229.1980,231.4736,230.5658,230.8308,228.2374,229.3019,229.9319,230.2345,230.3918,1
2,ECP0,BW25113,PM05,A02,IF0a-NaSuccinate-Fe Citrate,R1,Positive Control,"Nutritional supplement, Positive control",,CAS Luria Broth,...,263.4907,263.8907,264.2415,262.3236,263.7173,265.1358,264.9956,263.3537,263.6501,1
3,ECP1,BW25113,PM05,A02,IF0a-NaSuccinate-Fe Citrate,R2,Positive Control,"Nutritional supplement, Positive control",,CAS Luria Broth,...,278.2465,280.7276,279.6475,278.5150,282.2908,281.5971,278.9417,279.3536,278.2820,1
4,ECP0,BW25113,PM05,A03,IF0a-NaSuccinate-Fe Citrate,R1,L-Alanine,Nutritional supplement,C00041,CAS 56-41-7,...,267.7734,267.3104,268.2467,268.2766,265.5294,266.5340,270.1043,267.6949,267.3128,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12091,ECP124,yqeG,PM02A,H11,M9 No Carbon,R2,"2,3-Butanedione","C-Source, alcohol",C00741,CAS 431-03-8,...,36.5544,38.2169,39.9751,38.8064,36.9787,36.0019,37.7878,35.7819,38.4929,0
12092,ECP123,yqeG,PM01,H12,M9 No Carbon,R1,2-Aminoethanol,"C-Source, alcohol",C00189,CAS 2002-24-6,...,69.4646,69.4969,65.9572,61.8436,63.3428,61.2615,64.6685,64.0035,64.9462,0
12093,ECP125,yqeG,PM01,H12,M9 No Carbon,R2,2-Aminoethanol,"C-Source, alcohol",C00189,CAS 2002-24-6,...,63.8375,60.1078,60.9342,62.6236,61.5034,63.5197,60.9612,62.4699,60.8619,0
12094,ECP122,yqeG,PM02A,H12,M9 No Carbon,R1,3-Hydroxy-2-butanone,"C-Source, alcohol",C00466,CAS 513-86-0,...,93.3293,93.9702,92.5352,95.7805,94.1607,92.9203,96.8387,90.7808,95.0467,0


In [7]:
# Remove the growth-call columns that came with the imported tables
# (presumably superseded by the calls computed later in this notebook — confirm).
kinetics = kinetics.drop(
    ['Growth', 'Control Well Growth_x', 'Control Well Growth_y'],
    axis='columns',
)
plates = plates.drop('Control Well Growth', axis='columns')

In [8]:
def make_growth_df_MAD(kinetic_df, k=3.0, ridge_frac=0.05, fold_min=None, delta_min=None, control=True,
                       reference_df=None, reference_k=2.5):
    """
    Call growth ("Activity") for every (Strain, Well, Media, Plate, Compound) combination.

    Replicate rows of `kinetic_df` are averaged, a per-row AUC threshold is derived from
    control data, and a well is called active when its mean AUC exceeds that threshold
    (and passes the optional fold-change / absolute-difference gates).

    Parameters
    ----------
    kinetic_df : DataFrame with at least the columns Strain, Well, Media, Plate, Compound,
                 Description, KEGG ID, CAS ID, Max Resp, Max Resp Rate and AUC.
    k:           robust 'k-sigma' multiplier (e.g., 3.0); only used when control=True
    ridge_frac:  small noise floor as a fraction of the median control AUC (e.g., 0.05 = 5%);
                 only used when control=True
    fold_min:    optional fold-change gate (e.g., 1.5) -> requires (AUC / ctrl_median) >= fold_min
    delta_min:   optional absolute-difference gate (e.g., 100) -> requires (AUC - ctrl_median) >= delta_min
    control:     True  -> threshold from the A01 wells found in `kinetic_df`
                          (per-strain median + k * pooled robust sigma);
                 False -> threshold from the non-growing (Growth == 0) wells of a reference
                          table (median + reference_k * standard deviation).
    reference_df: reference table with 'Growth' and 'AUC' columns, used when control=False.
                 Defaults to the notebook-level `non_control_plate_reference`.
    reference_k: standard-deviation multiplier used when control=False (default 2.5).

    Returns
    -------
    DataFrame with the averaged kinetics plus ctrl_median, ctrl_std, Threshold, Activity
    and one copy each of the compound metadata columns Description, KEGG ID and CAS ID.

    Raises
    ------
    ValueError if control=True and `kinetic_df` has no A01 wells, or if control=False and
    the reference table has no Growth == 0 rows.
    """

    # 1) Aggregate replicate kinetics to get per-(Strain, Well, ...) means
    agg_cols = ["Max Resp", "Max Resp Rate", "AUC"]
    meta_cols = ["Strain", "Well", "Media", "Plate", "Compound"]
    growth = kinetic_df.groupby(meta_cols, as_index=False)[agg_cols].mean()

    if control:
        # 2) Pull all control wells (A01) across strains
        ctrl = kinetic_df.loc[kinetic_df["Well"] == "A01", ["Strain", "AUC"]].copy()

        if ctrl.empty:
            raise ValueError("No control wells (A01) found in `kinetic_df`.")

        # 3) Per-strain control median (keeps per-strain baseline differences)
        ctrl_auc_by_strain = ctrl.groupby("Strain")["AUC"]
        ctrl_med_per_strain = ctrl_auc_by_strain.median().rename("ctrl_median")
        ctrl_std_per_strain = ctrl_auc_by_strain.std().rename("ctrl_std")

        # 4) Pooled spread: residuals of every control well around its own strain median.
        #    1.4826 scales the MAD so it estimates a standard deviation for normal data.
        tmp = ctrl.merge(ctrl_med_per_strain, on="Strain", how="left")
        resid = (tmp["AUC"] - tmp["ctrl_median"]).to_numpy()
        MAD_pooled = 1.4826 * np.median(np.abs(resid))

        # The ridge keeps sigma from collapsing to ~0 when control replicates nearly coincide.
        ridge = ridge_frac * np.median(ctrl["AUC"])
        sigma = max(MAD_pooled, ridge)

        # Strains without an A01 well get NaN here and are therefore never called active.
        growth = growth.merge(ctrl_med_per_strain, on="Strain", how="left")
        growth = growth.merge(ctrl_std_per_strain, on="Strain", how="left")
        growth["Threshold"] = growth["ctrl_median"] + k * sigma

    else:
        if reference_df is None:
            # Backward-compatible default: the reference table loaded at notebook level.
            reference_df = non_control_plate_reference

        # Baseline = AUC distribution of the reference wells that did not grow.
        no_growth_auc = reference_df.loc[reference_df["Growth"] == 0, "AUC"]
        if no_growth_auc.empty:
            raise ValueError("No non-growing (Growth == 0) wells found in the reference data.")

        growth["ctrl_median"] = no_growth_auc.median()
        growth["ctrl_std"] = no_growth_auc.std()
        growth["Threshold"] = growth["ctrl_median"] + reference_k * growth["ctrl_std"]

    # 5) Activity call: above threshold, plus the optional gates when they are supplied
    activity = growth["AUC"] > growth["Threshold"]
    if fold_min is not None:
        activity = activity & ((growth["AUC"] / growth["ctrl_median"]) >= fold_min)
    if delta_min is not None:
        activity = activity & ((growth["AUC"] - growth["ctrl_median"]) >= delta_min)
    growth["Activity"] = activity.astype(bool)

    # 6) Attach compound metadata taken from the same frame that was aggregated.
    #    Each column is selected once so the result has no duplicate 'Description'.
    compound_info = (
        kinetic_df[["Compound", "Description", "KEGG ID", "CAS ID"]]
        .drop_duplicates()
        .reset_index(drop=True)
    )
    growth = growth.merge(compound_info, on="Compound")

    return growth

In [9]:
# Call growth plate by plate. Plates listed in `wells_with_negative_controls` are
# thresholded against their own control wells (control=True); every other plate
# falls back to the external reference (control=False).
growth_frames = []
for plate in kinetics['Plate'].unique():
    print(f"Processing {plate} ...")
    growth_frames.append(
        make_growth_df_MAD(
            kinetics[kinetics['Plate'] == plate],
            control=plate in wells_with_negative_controls,
        )
    )

# Concatenate once at the end: growing a DataFrame inside the loop re-copies all
# earlier rows on every iteration, and seeding it with an empty frame relies on
# concat behaviour that recent pandas versions deprecate.
temp = pd.concat(growth_frames) if growth_frames else pd.DataFrame()

Processing PM05 ...
Processing PM09 ...
Processing PM10 ...
Processing PM11C ...
Processing PM12B ...
Processing PM01 ...
Processing PM02A ...


In [11]:
# Write the growth calls and the cleaned kinetic / plate tables next to the inputs,
# without the pandas index column.
for frame, destination in (
    (temp, 'Data/growth_calls_new.csv'),
    (kinetics, 'Data/kinetic_datatype_new.csv'),
    (plates, 'Data/plate_datatype_new.csv'),
):
    frame.to_csv(destination, index=False)