In [None]:
## DEPENDENCIES
import numpy as np
import pandas as pd
from pathlib import Path
# import drc_timepoint_composite_score as hamza


## FUNCTIONS
# Valid file check
def read_csv_file(file_path):
    """Read a CSV file into a DataFrame after validating the path.

    Parameters
    ----------
    file_path : str or os.PathLike
        Location of the CSV file. Plain strings are accepted and converted
        to ``pathlib.Path`` before validation.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents, as produced by ``pd.read_csv``.

    Raises
    ------
    FileNotFoundError
        If nothing exists at ``file_path``.
    ValueError
        If the file suffix is not ``.csv`` (compared case-insensitively).
    """
    # Coerce first so the .exists()/.suffix checks also work for plain strings
    file_path = Path(file_path)

    if not file_path.exists():
        raise FileNotFoundError(f"File '{file_path}' does not exist.")

    if file_path.suffix.lower() != '.csv':
        raise ValueError("Input can be only CSV file")
    return pd.read_csv(file_path)



# Standardize OD values if requested
def standardize_od(df, od_field, method):
    """Rescale the values of one column using z-score or min-max scaling.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to rescale. It is not modified.
    od_field : str
        Name of the column to rescale.
    method : {'zscore', 'minmax'}
        ``'zscore'`` subtracts the column mean and divides by the column
        standard deviation; ``'minmax'`` subtracts the column minimum and
        divides by the column range (max - min).

    Returns
    -------
    pandas.Series
        The rescaled column, aligned to ``df``'s index. When the column has
        no spread (zero range, or zero/undefined standard deviation) the
        centered values are returned as floats instead of dividing by zero.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'zscore'`` nor ``'minmax'``.
    """
    values = df[od_field]

    if method == 'zscore':
        centered = values - values.mean()
        spread = values.std()
    elif method == 'minmax':
        lowest = values.min()
        centered = values - lowest
        spread = values.max() - lowest
    else:
        raise ValueError("Invalid method for standardization.")

    # A constant column would give 0/0 = NaN for every row; return the
    # centered values (all zeros where defined) instead.
    if pd.isna(spread) or spread == 0:
        return centered.astype(float)
    return centered / spread



# Best timepoint using mean difference
def timepoint_mean_difference(df, group_fields, dose_field, od_field, time_field, time_threshold=3, standardize=False, method='minmax'):
    """Pick, for each group, the time point with the largest jump in mean OD.

    The mean of ``od_field`` is computed for every (group, dose, time)
    combination. Within each (group, dose) series the absolute difference
    between successive time points is taken, and for each group the row with
    the largest such difference at or after ``time_threshold`` is selected.

    Parameters
    ----------
    df : pandas.DataFrame
        Input measurements. The caller's frame is never modified.
    group_fields : list of str or str
        Column(s) identifying a group; one row is returned per group.
    dose_field : str
        Column holding the dose level.
    od_field : str
        Column holding the OD reading.
    time_field : str
        Column holding the time of the reading.
    time_threshold : numeric, default 3
        Rows with ``time_field`` below this value are not eligible.
    standardize : bool, default False
        If True, ``od_field`` is rescaled with ``standardize_od`` first.
    method : str, default 'minmax'
        Scaling method forwarded to ``standardize_od``.

    Returns
    -------
    pandas.DataFrame
        Columns ``group_fields + [time_field]``, one row per group. Groups
        with no usable successive difference at or after the threshold
        (for example a single time point) are omitted.
    """
    group_fields = [group_fields] if isinstance(group_fields, str) else list(group_fields)
    series_keys = group_fields + [dose_field]
    result_columns = group_fields + [time_field]

    # Standardize OD values if requested, on a copy so the caller's frame
    # (often reused by later notebook cells) is not overwritten.
    if standardize:
        df = df.copy()
        df[od_field] = standardize_od(df, od_field, method)

    # Mean OD for each group, dose, and time point, ordered so that diff()
    # compares successive time points within one (group, dose) series.
    mean_df = (
        df.groupby(series_keys + [time_field])[od_field]
        .mean()
        .reset_index()
        .rename(columns={od_field: 'mean_raw_od'})
        .sort_values(by=series_keys + [time_field])
    )
    mean_df['diff_mean_raw_od'] = mean_df.groupby(series_keys)['mean_raw_od'].diff().abs()

    # Apply the minimum time threshold after differencing, so the first
    # eligible time point is still compared against its predecessor.
    eligible = mean_df[mean_df[time_field] >= time_threshold]

    # The first time point of every series has no predecessor (NaN diff).
    # Dropping NaNs keeps idxmax from returning NaN / raising for groups
    # where every difference is undefined.
    candidates = eligible.dropna(subset=['diff_mean_raw_od'])
    if candidates.empty:
        return candidates[result_columns]

    # Row label of the largest difference within each group
    best_rows = candidates.groupby(group_fields)['diff_mean_raw_od'].idxmax()
    # Only the group keys and the selected time point are returned
    return candidates.loc[best_rows, result_columns]



# Best timepoint using SNR
def timepoint_snr(df, condition_fields, dose_field, od_field, time_field, time_threshold=0, standardize=False, method='minmax'):
    """Pick, for each condition group, the time point with the highest SNR.

    For every time point inside a group:

    * signal = mean absolute deviation of all OD readings from the mean OD
      of the control dose at that time point;
    * noise  = mean, across dose levels, of the within-dose standard
      deviation of OD;
    * SNR    = signal / noise.

    The control dose is taken to be the smallest value of ``dose_field`` in
    the whole input frame (an assumption of this analysis).

    Parameters
    ----------
    df : pandas.DataFrame
        Input measurements. The caller's frame is never modified.
    condition_fields : list of str or str
        Column(s) identifying a group; one row is returned per group.
    dose_field : str
        Column holding the dose level.
    od_field : str
        Column holding the OD reading.
    time_field : str
        Column holding the time of the reading.
    time_threshold : numeric, default 0
        Rows with ``time_field`` below this value are ignored.
    standardize : bool, default False
        If True, ``od_field`` is rescaled with ``standardize_od`` first.
    method : str, default 'minmax'
        Scaling method forwarded to ``standardize_od``.

    Returns
    -------
    pandas.DataFrame
        Columns ``condition_fields + ['Best_Timepoint', 'Best_SNR']``.
        An SNR that cannot be computed (zero or undefined noise, or no
        control readings at that time point) is reported as 0.
    """
    condition_fields = [condition_fields] if isinstance(condition_fields, str) else list(condition_fields)
    control_dose = df[dose_field].min()  # assuming the lowest dose as control

    # Standardize OD values if requested, on a copy so the caller's frame
    # (often reused by later notebook cells) is not overwritten.
    if standardize:
        df = df.copy()
        df[od_field] = standardize_od(df, od_field, method)

    # Filter out rows based on the minimum time threshold
    df = df[df[time_field] >= time_threshold]

    best_timepoints = []
    for condition_values, condition_df in df.groupby(condition_fields):
        # Some pandas versions yield a scalar key for a one-column grouping;
        # normalise so the unpacking below never splits a string into chars.
        if not isinstance(condition_values, tuple):
            condition_values = (condition_values,)

        snr_by_time = []
        for time, time_df in condition_df.groupby(time_field):
            od_values = time_df[od_field]
            # Baseline: mean OD of the control dose at this time point
            mean_control_od = od_values[time_df[dose_field] == control_dose].mean()
            # Signal: mean absolute deviation from the control baseline.
            # Kept as a local Series rather than a new column on the slice.
            signal = (od_values - mean_control_od).abs().mean()
            # Noise: within-dose standard deviation, averaged over doses
            noise = time_df.groupby(dose_field)[od_field].std().mean()

            # `noise > 0` is False for both zero and NaN noise
            snr = signal / noise if noise > 0 else 0.0
            # NaN never compares greater, which would make max() below
            # depend on iteration order; treat undefined SNR as 0.
            if pd.isna(snr):
                snr = 0.0
            snr_by_time.append((time, snr))

        # Select the time point with the highest SNR for this condition group
        best_time, best_snr = max(snr_by_time, key=lambda pair: pair[1])
        best_timepoints.append((*condition_values, best_time, best_snr))

    # Convert results to a DataFrame for easy viewing
    return pd.DataFrame(best_timepoints, columns=condition_fields + ['Best_Timepoint', 'Best_SNR'])

### Read Data

In [2]:
# Configuration for the SF time-course dataset
# Column roles used by the timepoint-selection functions
GROUP_FIELDS = ['Condition', 'Ratio', 'Plate']
DOSE_FIELD = 'XMIC'
OD_FIELD = 'Raw_od'
TIME_FIELD = 'hour'
# Earliest time value passed as `time_threshold` to the selection functions
THRESHOLD = 3

# Load the raw measurements
FILE_PATH_RAW = 'data/timepoint_sf.csv'
file_path = Path(FILE_PATH_RAW)
df = read_csv_file(file_path)

# timepoint for vallo (should be between 9.5 and 10.5)
# FILE_PATH_RAW = 'data/timepoint_vallo.csv'
# file_path = Path(FILE_PATH_RAW)
# df = read_csv_file(file_path)
# GROUP_FIELDS = ['Species']
# DOSE_FIELD = 'uM'
# OD_FIELD = 'RawOD'
# TIME_FIELD = 'Time_h'
# THRESHOLD = 3

### Test 1: Largest absolute change in mean OD between successive timepoints, per group

In [3]:
# Per group: the time point with the largest successive change in mean OD,
# computed on min-max scaled OD values.
timepoint_using_mean_diff = timepoint_mean_difference(
    df,
    GROUP_FIELDS,
    DOSE_FIELD,
    OD_FIELD,
    TIME_FIELD,
    time_threshold=THRESHOLD,
    standardize=True,
    method='minmax',
)

timepoint_using_mean_diff

Unnamed: 0,Condition,Ratio,Plate,hour
16,20MSynComm,20,1,15.99556
501,20MSynComm,20,2,10.99417
800,20MSynComm,20,3,15.99556
1285,20MSynComm,20,4,10.99389
1584,20MSynComm+ SF,10+10,1,15.99556
2082,20MSynComm+ SF,10+10,2,23.9975
2372,20MSynComm+ SF,2+18,1,19.99639
2758,20MSynComm+ SF,2+18,2,13.995
3152,20MSynComm+ SFP,10+10,3,15.99556
3542,20MSynComm+ SFP,10+10,4,13.995


### Test 2: Maximum signal-to-noise ratio (SNR)

In [4]:
# Per group: the time point with the highest signal-to-noise ratio,
# computed on min-max scaled OD values.
timepoint_using_snr = timepoint_snr(
    df,
    GROUP_FIELDS,
    DOSE_FIELD,
    OD_FIELD,
    TIME_FIELD,
    time_threshold=THRESHOLD,
    standardize=True,
    method='minmax',
)

timepoint_using_snr

Unnamed: 0,Condition,Ratio,Plate,Best_Timepoint,Best_SNR
0,20MSynComm,20,1,37.00056,2.869877
1,20MSynComm,20,2,38.00083,2.504541
2,20MSynComm,20,3,16.99556,2.550221
3,20MSynComm,20,4,43.00222,3.314093
4,20MSynComm+ SF,10+10,1,5.99278,10.443768
5,20MSynComm+ SF,10+10,2,28.99861,4.330901
6,20MSynComm+ SF,2+18,1,4.9925,11.680565
7,20MSynComm+ SF,2+18,2,4.9925,6.085689
8,20MSynComm+ SFP,10+10,3,6.99306,9.191035
9,20MSynComm+ SFP,10+10,4,6.99306,9.818398
