In [2]:
import numpy as np  
import pandas as pd  
import matplotlib.pyplot as plt

Read in the web-processed data (plate, kinetic, and summary tables).

In [3]:
# Load the three exported tables in one pass; the tuple order fixes which
# file ends up bound to which name (plates, kinetics, summary).
plates, kinetics, summary = map(
    pd.read_csv,
    (
        'Data/plate_datatype.csv',
        'Data/kinetic_datatype.csv',
        'Data/summary_table.csv',
    ),
)



In [4]:
# NOTE(review): despite the "wells_" prefix, these values look like plate
# identifiers (PMxx) rather than well positions — confirm before relying on the name.
wells_with_negative_controls = [
    'PM01',
    'PM02A',
    'PM03B',
    'PM04A',
    'PM05',
    'PM06',
    'PM07',
    'PM08',
]
wells_with_positive_controls = [
    'PM05',
]



Ensure strain names are consistent to avoid errors during replicate grouping

In [5]:
# Build a Plate -> array-of-strain-names lookup from the distinct
# (Strain, Plate) pairs present in the kinetic table.
plate_strains = (
    kinetics[['Strain', 'Plate']]
    .drop_duplicates()
    .groupby('Plate')['Strain']
    .unique()
)

# One line per plate so spelling variants of the same strain are easy to spot.
for plate, strains in plate_strains.items():
    print(f"{plate}: {', '.join(strains)}")


PM01: BW25113, MS5762, MS8806, yfdF, yhjE, ypjC, yqeG
PM02A: argF, argH, BW25113, BW25113wt, malE, MS5762, MS8806, yfdF, yhjE, ypjC, yqeG
PM05: BW25113wt, hisJ
PM09: BW25113, BW25113WT, dedA, yahO, ycaC, ydfR, ygaM, ygaY, yiaG, yjjU, yjjV, ynfN, yodC
PM10: BW25113, BW25113WT, dedA, yahO, ycaC, ydfR, ydgD, ygaM, yiaG, yjjU, yjjV, ynfN, yodC
PM11C: appY, BW25113, gnsB, wbbK, yahO, ycaC, ydcD, ygaM, yiaG
PM12B: appY, BW25113, gnsB, wbbK, ydcD, ydjM, yebF, yebG


Rename strains below and re-annotate replicate IDs for merged strains (comment out if not needed)

In [7]:
# Collapse the wild-type spelling variants onto a single strain label so their
# rows land in the same replicate groups below.
kinetics['Strain'] = kinetics['Strain'].replace(['BW25113wt', 'BW25113WT'], 'BW25113')

# Fix the row order first so the replicate counter is assigned predictably.
# NOTE(review): if PlateIDs is stored as text it sorts lexicographically
# (e.g. 'ECP10' before 'ECP2') — confirm that ordering is acceptable.
kinetics = kinetics.sort_values(by=['Strain', 'Media', 'Well', 'Plate', 'PlateIDs'])

# Label replicates R1, R2, ... by running position inside each
# (Strain, Media, Well, Plate) group; dropna=False keeps missing keys as a group.
kinetics['Replicates'] = 'R' + (
    kinetics.groupby(['Strain', 'Media', 'Well', 'Plate'], dropna=False).cumcount() + 1
).astype(str)
kinetics

Unnamed: 0,PlateIDs,Strain,Specie,Well,Plate,Media,Replicates,Compound,Description,KEGG ID,CAS ID,Max Resp,Max Resp Rate,Time till max resp rate,AUC,Growth,Control Well Growth_x,Control Well Growth_y
0,ECP0,BW25113,E. coli,A01,PM05,IF0a-NaSuccinate-Fe Citrate,R1,Negative Control,"Nutritional supplement, Negative control",,CAS Negative Control,261.753811,18.563138,29.25,9175.345418,1,1,1
96,ECP1,BW25113,E. coli,A01,PM05,IF0a-NaSuccinate-Fe Citrate,R2,Negative Control,"Nutritional supplement, Negative control",,CAS Negative Control,237.861545,15.101668,26.00,8396.822747,1,1,1
1,ECP0,BW25113,E. coli,A02,PM05,IF0a-NaSuccinate-Fe Citrate,R1,Positive Control,"Nutritional supplement, Positive control",,CAS Luria Broth,264.675626,16.492538,39.50,8660.186453,1,1,1
97,ECP1,BW25113,E. coli,A02,PM05,IF0a-NaSuccinate-Fe Citrate,R2,Positive Control,"Nutritional supplement, Positive control",,CAS Luria Broth,280.122149,17.406779,45.50,9235.361102,1,1,1
2,ECP0,BW25113,E. coli,A03,PM05,IF0a-NaSuccinate-Fe Citrate,R1,L-Alanine,Nutritional supplement,C00041,CAS 56-41-7,267.919578,14.981677,34.25,8812.777956,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11998,ECP124,yqeG,E. coli,H11,PM02A,M9 No Carbon,R2,"2,3-Butanedione","C-Source, alcohol",C00741,CAS 431-03-8,39.233041,3.253104,48.00,1237.025130,0,0,0
11903,ECP123,yqeG,E.Coli,H12,PM01,M9 No Carbon,R1,2-Aminoethanol,"C-Source, alcohol",C00189,CAS 2002-24-6,65.290281,9.003425,41.75,2844.773308,0,0,0
12095,ECP125,yqeG,E.Coli,H12,PM01,M9 No Carbon,R2,2-Aminoethanol,"C-Source, alcohol",C00189,CAS 2002-24-6,62.152873,7.807276,48.00,2621.327148,0,0,0
11807,ECP122,yqeG,E. coli,H12,PM02A,M9 No Carbon,R1,3-Hydroxy-2-butanone,"C-Source, alcohol",C00466,CAS 513-86-0,94.041746,12.522454,46.25,2903.816716,0,0,0


In [None]:
def make_growth_df_MAD(kinetic_df, k=3.0, ridge_frac=0.05, fold_min=None, delta_min=None):
    """
    Call growth ("Activity") for each condition by comparing its replicate-mean
    AUC against a robust threshold derived from the control wells (well A01).

    The threshold for a row is ``ctrl_median + k * sigma`` where ``ctrl_median``
    is the median control AUC of the row's strain and ``sigma`` is a single
    pooled value: the larger of the scaled MAD of control residuals (each
    control AUC minus its own strain's median) and a noise floor.

    Parameters
    ----------
    kinetic_df : pandas.DataFrame
        Must contain the columns "Strain", "Well", "Plate", "Media",
        "Compound", "KEGG ID", "CAS ID", "Description", "Max Resp",
        "Max Resp Rate" and "AUC".
    k : float
        Robust 'k-sigma' multiplier (e.g., 3.0).
    ridge_frac : float
        Small noise floor as a fraction of the median control AUC
        (e.g., 0.05 = 5%).
    fold_min : float or None
        Optional fold-change gate (e.g., 1.5): additionally requires
        (AUC / ctrl_median) >= fold_min. Ignored when None.
    delta_min : float or None
        Optional absolute-difference gate (e.g., 100): additionally requires
        (AUC - ctrl_median) >= delta_min. Ignored when None.

    Returns
    -------
    pandas.DataFrame
        One row per unique combination of the metadata columns, holding the
        replicate means of "Max Resp", "Max Resp Rate" and "AUC" plus the
        columns "ctrl_median", "ctrl_std", "Threshold" and a boolean
        "Activity". Rows whose strain has no A01 control get NaN for the
        control columns and the threshold, and therefore Activity == False.

    Raises
    ------
    ValueError
        If `kinetic_df` contains no rows with Well == "A01".
    """

    # 1) Aggregate replicate kinetics to get per-(Strain, Well, ...) means.
    #    dropna=False is required: the default drops every row with a missing
    #    key (e.g. a blank "KEGG ID"), silently removing those conditions.
    agg_cols = ["Max Resp", "Max Resp Rate", "AUC"]
    meta_cols = ["Strain", "Well", "Plate", "Media", "Compound", "KEGG ID", "CAS ID", "Description"]
    growth = kinetic_df.groupby(meta_cols, as_index=False, dropna=False)[agg_cols].mean()

    # 2) Pull all control wells (A01) across strains
    ctrl = kinetic_df.loc[kinetic_df["Well"] == "A01", ["Strain", "AUC"]].copy()
    if ctrl.empty:
        raise ValueError("No control wells (A01) found in `kinetic_df`.")

    # 3) Per-strain control median (keeps per-strain baseline differences)
    ctrl_by_strain = ctrl.groupby("Strain")["AUC"]
    ctrl_med_per_strain = ctrl_by_strain.median().rename("ctrl_median")
    ctrl_std_per_strain = ctrl_by_strain.std().rename("ctrl_std")

    # 4) Pooled spread of the controls around their own strain's median.
    #    1.4826 rescales the MAD to be comparable to a standard deviation for
    #    normally distributed data. The nan-aware medians keep one missing
    #    control AUC from turning sigma (and every threshold) into NaN.
    tmp = ctrl.merge(ctrl_med_per_strain, on="Strain", how="left")
    resid = (tmp["AUC"] - tmp["ctrl_median"]).to_numpy()
    MAD_pooled = 1.4826 * np.nanmedian(np.abs(resid))
    ridge = ridge_frac * np.nanmedian(ctrl["AUC"].to_numpy())
    sigma = max(MAD_pooled, ridge)

    # 5) Attach the per-strain baseline and derive the per-row threshold.
    growth = growth.merge(ctrl_med_per_strain, on="Strain", how="left")
    growth = growth.merge(ctrl_std_per_strain, on="Strain", how="left")
    growth["Threshold"] = growth["ctrl_median"] + k * sigma

    # 6) Base call, then the optional gates. Comparisons against NaN evaluate
    #    to False, so rows without a usable baseline are never called active.
    activity = growth["AUC"] > growth["Threshold"]
    if fold_min is not None:
        activity = activity & ((growth["AUC"] / growth["ctrl_median"]) >= fold_min)
    if delta_min is not None:
        activity = activity & ((growth["AUC"] - growth["ctrl_median"]) >= delta_min)

    growth["Activity"] = activity.astype(bool)
    return growth