In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

import statsmodels.formula.api as smf
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, matthews_corrcoef
from kneed import KneeLocator

from matplotlib.lines import Line2D
from matplotlib.patches import Patch

In [61]:
file_path = 'database_2025-04-25.csv'  
df = pd.read_csv(file_path, low_memory=False)

In [62]:
phase2_plates = ['30v1','30v2','30v3','31v1','31v2','31v3','32v1','32v2','32v3','33v1','33v2','33v3']
phase2_df= df[df['plate'].isin(phase2_plates)]

In [63]:
# drop last data point
y2_cols = [f'y2_{i}' for i in range(1, 91)] 
def drop_last_valid(row):
    valid = row[y2_cols].last_valid_index()
    # if pd.notna(row[valid]):
    if valid is not None and pd.notna(row[valid]):
        row[valid] = np.nan
    return row

phase2_df1 = phase2_df.apply(drop_last_valid, axis=1)

In [64]:
phase2_df1.to_csv('phase2_df2.csv',index=False)

## Run quantile normalization

In [5]:
from scipy import interpolate
from scipy.stats import rankdata

def normalize_quantiles(A, ties=True):
    A = np.asarray(A, dtype=np.float64)
    n_rows, n_cols = A.shape
    if n_cols == 1:
        return A.copy()

    i = np.linspace(0, 1, n_rows)
    S = np.full((n_rows, n_cols), np.nan)
    nobs = np.zeros(n_cols, dtype=int)
    sort_idx = []

    for j in range(n_cols):
        col = A[:, j]
        not_nan = ~np.isnan(col)
        x = col[not_nan]
        nobs[j] = len(x)
        sort_order = np.argsort(x)
        sorted_x = x[sort_order]

        if nobs[j] < n_rows:
            f = interpolate.interp1d(np.linspace(0, 1, nobs[j]), sorted_x,
                                     bounds_error=False, fill_value="extrapolate")
            S[:, j] = f(i)
        else:
            S[:, j] = sorted_x

        sort_idx.append(np.argsort(np.argsort(col[not_nan])))

    m = np.nanmean(S, axis=1)
    A_out = np.full_like(A, np.nan)

    for j in range(n_cols):
        col = A[:, j]
        not_nan = ~np.isnan(col)

        if ties:
            r = rankdata(col[not_nan], method='average')
            quant_pos = (r - 1) / (nobs[j] - 1)
            f = interpolate.interp1d(i, m, bounds_error=False, fill_value="extrapolate")
            A_out[not_nan, j] = f(quant_pos)
        else:
            ranks = sort_idx[j]
            A_out[not_nan, j] = m[ranks.astype(int)]

    return A_out

In [6]:

plates = ['30v1', '30v2', '30v3']
df_30v =phase2_df[phase2_df['plate'].isin(plates)]

group_counts = (
    df_30v.groupby(['plate', 'light_regime', 'mutant_ID', 'mutated_genes'])
    .size()
    .reset_index(name='count')
)

# Step 3: For each plate and light_regime, count how many mutants had 1, 2, ... rows
summary = (
    group_counts.groupby(['light_regime','plate', 'count'])
    .size()
    .reset_index(name='n_mutants')
)

# Optional: Sort for easier reading
summary = summary.sort_values(by=['light_regime','plate', 'count'])

# Show result
summary

Unnamed: 0,light_regime,plate,count,n_mutants
0,10min-10min,30v1,1,375
1,10min-10min,30v1,7,1
2,10min-10min,30v2,1,375
3,10min-10min,30v2,7,1
4,10min-10min,30v3,1,375
5,10min-10min,30v3,7,1
6,1min-1min,30v1,3,375
7,1min-1min,30v1,21,1
8,1min-1min,30v2,1,375
9,1min-1min,30v2,7,1


## 30 plate 20h_ML


In [7]:
def quantile_normalize_light_regime(df, light_regime, plates, y2_cols, tie_handling=True):
    """
    Quantile-normalize all y2_cols across selected plates within a given light regime.
    
    Parameters:
    - df: pandas DataFrame, full dataset
    - light_regime: str, target light regime (e.g. '20h_ML')
    - plates: list of str, target plate names (e.g. ['30v1', '30v2', '30v3'])
    - y2_cols: list of str, column names like ['y2_1', ..., 'y2_44']
    - tie_handling: bool, passed to normalize_quantiles (default=True)
    
    Returns:
    - df_normalized: pandas DataFrame with normalized y2_cols
    """
    # Filter data
    subset_df = df[(df['light_regime'] == light_regime) & (df['plate'].isin(plates))].copy()
    df_normalized = subset_df.copy()

    for timepoint in y2_cols:
        position_values = []
        valid_plate_indices = {}

        for plate in plates:
            plate_df = subset_df[subset_df['plate'] == plate].copy()

            wt_rows = plate_df[plate_df['mutant_ID'] == 'WT'].copy()
            non_wt_rows = plate_df[plate_df['mutant_ID'] != 'WT'].copy()

            wt_rows = wt_rows.sort_values(['mutant_ID', 'mutated_genes', 'well_id'])
            non_wt_rows = non_wt_rows.sort_values(['mutant_ID', 'mutated_genes'])

            sorted_df = pd.concat([wt_rows, non_wt_rows], axis=0)
            values = sorted_df[timepoint].values
            index = sorted_df.index.values

            position_values.append(values)
            valid_plate_indices[plate] = index

        # Validate shape
        lengths = [len(v) for v in position_values]
        if len(set(lengths)) != 1:
            raise ValueError(f"Length mismatch at {timepoint}: {lengths}")

        matrix = np.column_stack(position_values)
        normalized_matrix = normalize_quantiles(matrix, ties=tie_handling)

        # Write back
        for col_idx, plate in enumerate(plates):
            indices = valid_plate_indices[plate]
            df_normalized.loc[indices, timepoint] = normalized_matrix[:, col_idx]

    return df_normalized


In [8]:
# Define inputs
plates = ['30v1', '30v2', '30v3']
y2_cols = [f'y2_{i}' for i in range(1, 45)]

# Run normalization
phase2_30_20h_ML_normalized = quantile_normalize_light_regime(
    df=phase2_df1,
    light_regime='20h_ML',
    plates=plates,
    y2_cols=y2_cols
)

# View a few columns
phase2_30_20h_ML_normalized[['plate', 'mutant_ID', 'mutated_genes', 'well_id'] + y2_cols]

Unnamed: 0,plate,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,y2_6,...,y2_35,y2_36,y2_37,y2_38,y2_39,y2_40,y2_41,y2_42,y2_43,y2_44
57833,30v1,LMJ.RY0402.052440,Cre01.g028650,A02,0.317473,0.345383,0.342457,0.341718,0.321135,0.381001,...,0.348216,0.382330,0.344126,0.346239,0.365021,0.371518,0.353667,0.341384,0.371579,0.338046
57834,30v1,LMJ.RY0402.055420,Cre01.g045150,A03,,,,,,,...,,,,,,,,,,
57835,30v1,LMJ.RY0402.047311,"Cre08.g375800,Cre01.g049900,Cre01.g049900 & Cr...",A04,0.289352,0.290864,0.238669,0.327479,0.283163,0.340558,...,0.284329,0.292717,0.224750,0.285272,0.277097,0.255732,0.277211,0.283263,0.264593,0.293461
57836,30v1,LMJ.RY0402.054897,Cre02.g095115,A05,0.462545,0.477473,0.461819,0.462798,0.478395,0.501161,...,0.465958,0.494608,0.477707,0.484583,0.489060,0.473700,0.464559,0.460349,0.465634,0.450707
57837,30v1,LMJ.RY0402.054597,Cre03.g153400,A06,0.427820,0.427159,0.490440,0.432836,0.444741,0.463786,...,0.417580,0.415090,0.368422,0.391992,0.402364,0.389320,0.373587,0.382895,0.378580,0.384347
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63190,30v3,LMJ.RY0402.240458,"Cre13.g588405 & Cre13.g588453,Cre02.g106650",P20,0.403500,0.449339,0.396441,0.439710,0.430444,0.429042,...,0.392719,0.391175,0.396436,0.409427,0.385991,0.416788,0.405656,0.407436,0.421082,0.424165
63191,30v3,LMJ.RY0402.076841,Cre04.g216050,P21,0.389773,0.377288,0.449997,0.419598,0.396417,0.399080,...,0.430129,0.414840,0.441616,0.418253,0.397925,0.433820,0.400865,0.376559,0.436669,0.378360
63192,30v3,LMJ.RY0402.039259,Cre01.g023050,P22,0.298555,0.312649,0.372683,0.381547,0.364668,0.353471,...,0.340208,0.350328,0.342966,0.358541,0.318328,0.331368,0.330159,0.290430,0.366164,0.361275
63193,30v3,LMJ.RY0402.229006,"Cre13.g583250,Cre07.g341900",P23,0.376339,0.335370,0.363388,0.397211,0.372747,0.385569,...,0.356111,0.374606,0.344247,0.315802,0.353075,0.334224,0.361650,0.364837,0.334779,0.327738


In [10]:
plates = ['30v1', '30v2', '30v3']
phase2_30_20h_ML= phase2_df1[(phase2_df1['light_regime'] == '20h_ML') & (phase2_df1['plate'].isin(plates))].copy()
phase2_30_20h_ML[['plate', 'mutant_ID', 'mutated_genes', 'well_id'] + y2_cols]

Unnamed: 0,plate,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,y2_6,...,y2_35,y2_36,y2_37,y2_38,y2_39,y2_40,y2_41,y2_42,y2_43,y2_44
57833,30v1,LMJ.RY0402.052440,Cre01.g028650,A02,0.313075,0.340892,0.335842,0.338194,0.306467,0.368952,...,0.322108,0.350214,0.317106,0.318287,0.337013,0.340123,0.315887,0.311126,0.342863,0.312872
57834,30v1,LMJ.RY0402.055420,Cre01.g045150,A03,,,,,,,...,,,,,,,,,,
57835,30v1,LMJ.RY0402.047311,"Cre08.g375800,Cre01.g049900,Cre01.g049900 & Cr...",A04,0.280045,0.288696,0.247353,0.325361,0.278562,0.326701,...,0.254442,0.259391,0.205432,0.259399,0.246406,0.227078,0.237442,0.251058,0.239258,0.271839
57836,30v1,LMJ.RY0402.054897,Cre02.g095115,A05,0.463137,0.476212,0.456988,0.458731,0.466273,0.497010,...,0.435644,0.463652,0.449026,0.460196,0.461487,0.443504,0.436026,0.430273,0.436668,0.426859
57837,30v1,LMJ.RY0402.054597,Cre03.g153400,A06,0.417027,0.423763,0.484107,0.430076,0.434725,0.455363,...,0.392095,0.381440,0.343718,0.363159,0.370159,0.363914,0.336446,0.351251,0.349093,0.358236
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63190,30v3,LMJ.RY0402.240458,"Cre13.g588405 & Cre13.g588453,Cre02.g106650",P20,0.417077,0.461926,0.410899,0.456567,0.455360,0.446217,...,0.424602,0.423757,0.426434,0.448112,0.417867,0.451464,0.438934,0.446466,0.457926,0.453793
63191,30v3,LMJ.RY0402.076841,Cre04.g216050,P21,0.402689,0.384884,0.467014,0.435917,0.423330,0.412871,...,0.459968,0.444697,0.469751,0.454732,0.429216,0.466842,0.434265,0.413817,0.471117,0.407011
63192,30v3,LMJ.RY0402.039259,Cre01.g023050,P22,0.310397,0.312508,0.388726,0.395121,0.385682,0.366395,...,0.368584,0.378563,0.369550,0.392513,0.348063,0.361941,0.361083,0.327812,0.401803,0.387513
63193,30v3,LMJ.RY0402.229006,"Cre13.g583250,Cre07.g341900",P23,0.387525,0.339922,0.376708,0.408916,0.396215,0.400359,...,0.382681,0.403808,0.370099,0.345508,0.384129,0.364662,0.391218,0.401115,0.368984,0.353466


### 30 plate 20h_HL

In [11]:
# Define inputs
plates = ['30v1', '30v2', '30v3']
y2_cols = [f'y2_{i}' for i in range(1, 45)]

# Run normalization
phase2_30_20h_HL_normalized = quantile_normalize_light_regime(
    df=phase2_df1,
    light_regime='20h_ML',
    plates=plates,
    y2_cols=y2_cols
)

# View a few columns
phase2_30_20h_HL_normalized[['plate', 'mutant_ID', 'mutated_genes', 'well_id'] + y2_cols]

Unnamed: 0,plate,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,y2_6,...,y2_35,y2_36,y2_37,y2_38,y2_39,y2_40,y2_41,y2_42,y2_43,y2_44
57833,30v1,LMJ.RY0402.052440,Cre01.g028650,A02,0.317473,0.345383,0.342457,0.341718,0.321135,0.381001,...,0.348216,0.382330,0.344126,0.346239,0.365021,0.371518,0.353667,0.341384,0.371579,0.338046
57834,30v1,LMJ.RY0402.055420,Cre01.g045150,A03,,,,,,,...,,,,,,,,,,
57835,30v1,LMJ.RY0402.047311,"Cre08.g375800,Cre01.g049900,Cre01.g049900 & Cr...",A04,0.289352,0.290864,0.238669,0.327479,0.283163,0.340558,...,0.284329,0.292717,0.224750,0.285272,0.277097,0.255732,0.277211,0.283263,0.264593,0.293461
57836,30v1,LMJ.RY0402.054897,Cre02.g095115,A05,0.462545,0.477473,0.461819,0.462798,0.478395,0.501161,...,0.465958,0.494608,0.477707,0.484583,0.489060,0.473700,0.464559,0.460349,0.465634,0.450707
57837,30v1,LMJ.RY0402.054597,Cre03.g153400,A06,0.427820,0.427159,0.490440,0.432836,0.444741,0.463786,...,0.417580,0.415090,0.368422,0.391992,0.402364,0.389320,0.373587,0.382895,0.378580,0.384347
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63190,30v3,LMJ.RY0402.240458,"Cre13.g588405 & Cre13.g588453,Cre02.g106650",P20,0.403500,0.449339,0.396441,0.439710,0.430444,0.429042,...,0.392719,0.391175,0.396436,0.409427,0.385991,0.416788,0.405656,0.407436,0.421082,0.424165
63191,30v3,LMJ.RY0402.076841,Cre04.g216050,P21,0.389773,0.377288,0.449997,0.419598,0.396417,0.399080,...,0.430129,0.414840,0.441616,0.418253,0.397925,0.433820,0.400865,0.376559,0.436669,0.378360
63192,30v3,LMJ.RY0402.039259,Cre01.g023050,P22,0.298555,0.312649,0.372683,0.381547,0.364668,0.353471,...,0.340208,0.350328,0.342966,0.358541,0.318328,0.331368,0.330159,0.290430,0.366164,0.361275
63193,30v3,LMJ.RY0402.229006,"Cre13.g583250,Cre07.g341900",P23,0.376339,0.335370,0.363388,0.397211,0.372747,0.385569,...,0.356111,0.374606,0.344247,0.315802,0.353075,0.334224,0.361650,0.364837,0.334779,0.327738


In [22]:
plates = ['30v1', '30v2', '30v3']
y2_cols = [f'y2_{i}' for i in range(1, 45)]

# Filter the data
phase2_30_20h_HL = phase2_df1[
    (phase2_df1['light_regime'] == '20h_HL') &
    (phase2_df1['plate'].isin(plates))
].copy()

# Copy to write normalized data
phase2_30_20h_HL_normalized = phase2_30_20h_HL.copy()

# Loop over each y2 column (timepoint)
for timepoint in y2_cols:
    position_values = []
    valid_plate_indices = []

    # Loop through (plate, start_date) technical replicates


    for (plate, start_date), group in phase2_30_20h_HL.groupby(['plate', 'start_date']):


        subset = group.copy()

        # Separate WT and non-WT rows
        wt_rows = subset[subset['mutant_ID'] == 'WT'].copy()
        non_wt_rows = subset[subset['mutant_ID'] != 'WT'].copy()

        # Sort for reproducibility
        wt_rows = wt_rows.sort_values(['mutant_ID', 'mutated_genes', 'well_id', 'start_date'])
        non_wt_rows = non_wt_rows.sort_values(['mutant_ID', 'mutated_genes', 'start_date'])


        # Combine sorted rows
        subset_sorted = pd.concat([wt_rows, non_wt_rows], axis=0)

        # Extract values and index
        values = subset_sorted[timepoint].values
        index = subset_sorted.index.values

        position_values.append(values)
        valid_plate_indices.append(index)

    # Skip timepoint if mismatch or empty
    lengths = [len(v) for v in position_values]
    if len(set(lengths)) != 1 or 0 in lengths:
        print(f"⚠️ Skipping {timepoint} due to mismatch or empty data: lengths = {lengths}")
        continue

    # Quantile normalize
    matrix = np.column_stack(position_values)
    normalized_matrix = normalize_quantiles(matrix, ties=True)

    # Write back
    for col_idx, index in enumerate(valid_plate_indices):
        phase2_30_20h_HL_normalized.loc[index, timepoint] = normalized_matrix[:, col_idx]

# Optional preview
phase2_30_20h_HL_normalized[['plate', 'start_date', 'mutant_ID', 'mutated_genes', 'well_id'] + y2_cols]


Unnamed: 0,plate,start_date,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,...,y2_35,y2_36,y2_37,y2_38,y2_39,y2_40,y2_41,y2_42,y2_43,y2_44
56684,30v1,2024-06-09,LMJ.RY0402.052440,Cre01.g028650,A02,0.225776,0.257271,0.249799,0.250054,0.230483,...,0.149696,0.157190,0.125299,0.156633,0.143969,0.178927,0.110484,0.146086,0.156005,0.148991
56685,30v1,2024-06-09,LMJ.RY0402.055420,Cre01.g045150,A03,,,,,,...,,,,,,,,,,
56686,30v1,2024-06-09,LMJ.RY0402.047311,"Cre08.g375800,Cre01.g049900,Cre01.g049900 & Cr...",A04,0.204386,0.166536,0.229096,0.187597,0.226768,...,0.139017,0.146093,0.128650,0.137684,0.074288,0.105305,0.141218,0.123816,0.145473,0.134924
56687,30v1,2024-06-09,LMJ.RY0402.054897,Cre02.g095115,A05,,0.311199,0.273021,,0.255820,...,0.177065,0.159212,0.143823,0.168711,0.163326,0.168832,0.151072,0.172369,0.182630,0.166293
56688,30v1,2024-06-09,LMJ.RY0402.054597,Cre03.g153400,A06,0.275487,0.237599,0.206021,0.242812,0.229522,...,0.132054,0.136678,0.146031,0.133247,0.154159,0.132031,0.117435,0.087312,0.108456,0.107747
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62807,30v3,2024-06-26,LMJ.RY0402.240458,"Cre13.g588405 & Cre13.g588453,Cre02.g106650",P20,0.229679,0.228573,0.256740,0.236101,0.191666,...,0.169951,0.140892,0.149778,0.189469,0.167850,0.137299,0.142635,0.161091,0.135874,0.158747
62808,30v3,2024-06-26,LMJ.RY0402.076841,Cre04.g216050,P21,0.264628,0.235709,0.211074,0.238113,0.227502,...,0.172412,0.134199,0.150768,0.164933,0.137863,0.082837,0.158579,0.169777,0.165056,0.146625
62809,30v3,2024-06-26,LMJ.RY0402.039259,Cre01.g023050,P22,0.173254,0.182697,0.159220,0.176717,0.145089,...,0.096158,0.150238,0.134026,0.124555,0.139025,0.116727,0.124511,0.170502,0.154372,0.134200
62810,30v3,2024-06-26,LMJ.RY0402.229006,"Cre13.g583250,Cre07.g341900",P23,0.214273,0.212980,0.160776,0.132887,0.224773,...,0.152222,0.106717,0.145893,0.163053,0.122998,0.090451,0.182984,0.153049,0.153189,0.121018


In [20]:
import numpy as np
import pandas as pd

# Example normalize_quantiles function if not defined
def normalize_quantiles(matrix, ties=True):
    """
    Normalize the columns of a matrix to have the same distribution.
    """
    ranked = np.argsort(np.argsort(matrix, axis=0), axis=0)
    sorted_matrix = np.sort(matrix, axis=0)
    mean_ranks = np.mean(sorted_matrix, axis=1)

    normalized = np.zeros_like(matrix)
    for i in range(matrix.shape[1]):
        normalized[:, i] = mean_ranks[ranked[:, i]]
    return normalized

# Plates and timepoints
plates = ['30v1', '30v2', '30v3']
y2_cols = [f'y2_{i}' for i in range(1, 45)]

# Filter the data
phase2_30_20h_HL = phase2_df1[
    (phase2_df1['light_regime'] == '20h_HL') &
    (phase2_df1['plate'].isin(plates))
].copy()

# Copy to write normalized data
phase2_30_20h_HL_normalized = phase2_30_20h_HL.copy()

# Dictionary to store mutant ID matrices
mutant_id_matrices = {}

# Loop over each y2 column (timepoint)
for timepoint in y2_cols:
    position_values = []
    valid_plate_indices = []
    mutant_ids_per_column = []

    # Loop through (plate, start_date) technical replicates
    for (plate, start_date), group in phase2_30_20h_HL.groupby(['plate', 'start_date']):
        subset = group.copy()

        # Separate WT and non-WT rows
        wt_rows = subset[subset['mutant_ID'] == 'WT'].copy()
        non_wt_rows = subset[subset['mutant_ID'] != 'WT'].copy()

        # Sort for reproducibility
        wt_rows = wt_rows.sort_values(['mutant_ID', 'mutated_genes', 'well_id', 'start_date'])
        non_wt_rows = non_wt_rows.sort_values(['mutant_ID', 'mutated_genes', 'start_date'])

        # Combine sorted rows
        subset_sorted = pd.concat([wt_rows, non_wt_rows], axis=0)

        # Extract values, index, and mutant_IDs
        values = subset_sorted[timepoint].values
        index = subset_sorted.index.values
        mutant_ids = subset_sorted['mutant_ID'].values

        if len(values) == 0:
            continue

        position_values.append(values)
        valid_plate_indices.append(index)
        mutant_ids_per_column.append(mutant_ids)

    # Skip timepoint if mismatch or empty
    lengths = [len(v) for v in position_values]
    if len(set(lengths)) != 1 or 0 in lengths:
        print(f"⚠️ Skipping {timepoint} due to mismatch or empty data: lengths = {lengths}")
        continue

    # Quantile normalize
    matrix = np.column_stack(position_values)
    normalized_matrix = normalize_quantiles(matrix, ties=True)

    # Store mutant_ID matrix
    mutant_id_matrix = np.column_stack(mutant_ids_per_column)
    mutant_id_matrices[timepoint] = pd.DataFrame(mutant_id_matrix)

    # Write normalized values back
    for col_idx, index in enumerate(valid_plate_indices):
        phase2_30_20h_HL_normalized.loc[index, timepoint] = normalized_matrix[:, col_idx]

# Optional: preview
print("Mutant ID matrix for y2_5:")
print(mutant_id_matrices['y2_5'].head())

# And for normalized values
print(phase2_30_20h_HL_normalized[['plate', 'start_date', 'mutant_ID', 'mutated_genes', 'well_id'] + y2_cols].head())


Mutant ID matrix for y2_5:
    0   1   2   3
0  WT  WT  WT  WT
1  WT  WT  WT  WT
2  WT  WT  WT  WT
3  WT  WT  WT  WT
4  WT  WT  WT  WT
      plate  start_date          mutant_ID  \
56684  30v1  2024-06-09  LMJ.RY0402.052440   
56685  30v1  2024-06-09  LMJ.RY0402.055420   
56686  30v1  2024-06-09  LMJ.RY0402.047311   
56687  30v1  2024-06-09  LMJ.RY0402.054897   
56688  30v1  2024-06-09  LMJ.RY0402.054597   

                                           mutated_genes well_id      y2_1  \
56684                                      Cre01.g028650     A02  0.225776   
56685                                      Cre01.g045150     A03       NaN   
56686  Cre08.g375800,Cre01.g049900,Cre01.g049900 & Cr...     A04  0.204386   
56687                                      Cre02.g095115     A05       NaN   
56688                                      Cre03.g153400     A06  0.275487   

           y2_2      y2_3      y2_4      y2_5  ...     y2_35     y2_36  \
56684  0.257271  0.249799  0.250054  0.230483

In [13]:
plates = ['30v1', '30v2', '30v3']
phase2_30_20h_HL= phase2_df1[(phase2_df1['light_regime'] == '20h_HL') & (phase2_df1['plate'].isin(plates))].copy()
phase2_30_20h_HL[['plate', 'mutant_ID', 'mutated_genes', 'well_id'] + y2_cols]

Unnamed: 0,plate,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,y2_6,...,y2_35,y2_36,y2_37,y2_38,y2_39,y2_40,y2_41,y2_42,y2_43,y2_44
56684,30v1,LMJ.RY0402.052440,Cre01.g028650,A02,0.216069,0.226566,0.219704,0.226053,0.203732,0.206989,...,0.135800,0.145019,0.121900,0.142633,0.133393,0.165752,0.103543,0.136308,0.138788,0.137637
56685,30v1,LMJ.RY0402.055420,Cre01.g045150,A03,,,,,,,...,,,,,,,,,,
56686,30v1,LMJ.RY0402.047311,"Cre08.g375800,Cre01.g049900,Cre01.g049900 & Cr...",A04,0.197926,0.145801,0.202518,0.175168,0.200352,0.176079,...,0.129036,0.136701,0.124636,0.126411,0.072175,0.095150,0.129910,0.117267,0.130215,0.127302
56687,30v1,LMJ.RY0402.054897,Cre02.g095115,A05,0.293028,0.259729,0.235626,0.268108,0.221802,0.242372,...,0.158637,0.146434,0.139955,0.151930,0.147556,0.155923,0.139480,0.161199,0.161822,0.154330
56688,30v1,LMJ.RY0402.054597,Cre03.g153400,A06,0.255417,0.209508,0.181639,0.222756,0.203067,0.200664,...,0.124133,0.126743,0.141157,0.121754,0.142472,0.123336,0.109190,0.081225,0.096619,0.103661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62807,30v3,LMJ.RY0402.240458,"Cre13.g588405 & Cre13.g588453,Cre02.g106650",P20,0.235882,0.226034,0.247627,0.218881,0.178483,0.231215,...,0.135895,0.102763,0.102441,0.155127,0.135944,0.099079,0.108183,0.131564,0.097591,0.124664
62808,30v3,LMJ.RY0402.076841,Cre04.g216050,P21,0.274974,0.234371,0.201711,0.221186,0.216220,0.202973,...,0.139467,0.094458,0.104079,0.124304,0.102106,0.043886,0.124835,0.141565,0.126980,0.110277
62809,30v3,LMJ.RY0402.039259,Cre01.g023050,P22,0.176214,0.186103,0.148495,0.156291,0.131041,0.142431,...,0.060515,0.111167,0.087260,0.082330,0.103342,0.078698,0.086278,0.142120,0.118378,0.096692
62810,30v3,LMJ.RY0402.229006,"Cre13.g583250,Cre07.g341900",P23,0.218064,0.211960,0.149888,0.107821,0.214074,0.224083,...,0.116498,0.065071,0.098037,0.122088,0.090205,0.051065,0.149591,0.119651,0.116663,0.084761


## 30 plate 2h-2h¶

In [24]:
# Define inputs
plates = ['30v1', '30v2', '30v3']
y2_cols = [f'y2_{i}' for i in range(1, 49)]

# Run normalization
phase2_30_2h_2h_normalized = quantile_normalize_light_regime(
    df=phase2_df1,
    light_regime='2h-2h',
    plates=plates,
    y2_cols=y2_cols
)

# View a few columns
phase2_30_2h_2h_normalized[['plate', 'mutant_ID', 'mutated_genes', 'well_id'] + y2_cols[:10]]

Unnamed: 0,plate,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,y2_6,y2_7,y2_8,y2_9,y2_10
58216,30v1,LMJ.RY0402.052440,Cre01.g028650,A02,0.153923,0.125966,0.179593,0.126986,0.192877,0.218741,0.141356,0.222047,0.523404,0.570614
58217,30v1,LMJ.RY0402.055420,Cre01.g045150,A03,,,,,,,,,,
58218,30v1,LMJ.RY0402.047311,"Cre08.g375800,Cre01.g049900,Cre01.g049900 & Cr...",A04,0.173734,0.188289,0.187506,0.172232,0.119337,0.103212,0.189212,0.126723,0.534451,0.524216
58219,30v1,LMJ.RY0402.054897,Cre02.g095115,A05,0.279289,0.250536,0.267207,0.235298,0.236394,0.254684,0.276383,0.253110,0.632008,0.653696
58220,30v1,LMJ.RY0402.054597,Cre03.g153400,A06,0.225808,0.211319,0.248452,0.167633,0.252082,0.211906,0.274065,0.198093,0.638364,0.674933
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63573,30v3,LMJ.RY0402.240458,"Cre13.g588405 & Cre13.g588453,Cre02.g106650",P20,0.189865,0.191438,0.185688,0.212769,0.220831,0.198224,0.193850,0.210555,0.570168,0.594130
63574,30v3,LMJ.RY0402.076841,Cre04.g216050,P21,0.227457,0.184950,0.249927,0.137225,0.162363,0.144991,0.282182,0.278076,0.594983,0.657199
63575,30v3,LMJ.RY0402.039259,Cre01.g023050,P22,0.162737,0.140461,0.148397,0.188602,0.195473,0.193934,0.199294,0.193098,0.478202,0.505536
63576,30v3,LMJ.RY0402.229006,"Cre13.g583250,Cre07.g341900",P23,0.181963,0.179786,0.147826,0.082155,0.199090,0.247504,0.208898,0.161288,0.536603,0.537923


In [23]:
phase2_30_2h_2h= phase2_df1[(phase2_df1['light_regime'] == '2h-2h') & (phase2_df1['plate'].isin(plates))].copy()
phase2_30_2h_2h[['plate', 'mutant_ID', 'mutated_genes', 'well_id'] + y2_cols[:10]]

Unnamed: 0,plate,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,y2_6,y2_7,y2_8,y2_9,y2_10
58216,30v1,LMJ.RY0402.052440,Cre01.g028650,A02,0.149536,0.118418,0.172688,0.119780,0.190143,0.214951,0.130722,0.210320,0.509822,0.557434
58217,30v1,LMJ.RY0402.055420,Cre01.g045150,A03,,,,,,,,,,
58218,30v1,LMJ.RY0402.047311,"Cre08.g375800,Cre01.g049900,Cre01.g049900 & Cr...",A04,0.166283,0.186892,0.181170,0.168928,0.106746,0.089804,0.179855,0.119110,0.520025,0.510441
58219,30v1,LMJ.RY0402.054897,Cre02.g095115,A05,0.254300,0.241386,0.253756,0.234268,0.232971,0.250657,0.262042,0.242339,0.614033,0.637819
58220,30v1,LMJ.RY0402.054597,Cre03.g153400,A06,0.221033,0.204123,0.240292,0.164685,0.249126,0.208331,0.258897,0.189821,0.619235,0.662281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63573,30v3,LMJ.RY0402.240458,"Cre13.g588405 & Cre13.g588453,Cre02.g106650",P20,0.191597,0.186985,0.188094,0.213490,0.232895,0.209429,0.199349,0.225181,0.583076,0.605526
63574,30v3,LMJ.RY0402.076841,Cre04.g216050,P21,0.236258,0.180472,0.261213,0.135198,0.166985,0.150131,0.310765,0.299801,0.610539,0.673820
63575,30v3,LMJ.RY0402.039259,Cre01.g023050,P22,0.164571,0.133661,0.147957,0.185705,0.204953,0.205004,0.205662,0.205731,0.484071,0.519651
63576,30v3,LMJ.RY0402.229006,"Cre13.g583250,Cre07.g341900",P23,0.184557,0.174594,0.147825,0.083603,0.208384,0.264728,0.216916,0.171816,0.544571,0.543251


## plate 30 1min-1min

In [25]:
plates = ['30v1', '30v2', '30v3']
y2_cols = [f'y2_{i}' for i in range(1, 89)]

# Filter the data
phase2_30_1min_1min = phase2_df1[
    (phase2_df1['light_regime'] == '1min-1min') &
    (phase2_df1['plate'].isin(plates))
].copy()

# Copy to write normalized data
phase2_30_1min_1min_normalized = phase2_30_1min_1min.copy()

# Loop over each y2 column (timepoint)
for timepoint in y2_cols:
    position_values = []
    valid_plate_indices = []

    # Loop through (plate, start_date) technical replicates
    for (plate, start_date), group in phase2_30_1min_1min.groupby(['plate', 'start_date']):
        subset = group.copy()

        # Separate WT and non-WT rows
        wt_rows = subset[subset['mutant_ID'] == 'WT'].copy()
        non_wt_rows = subset[subset['mutant_ID'] != 'WT'].copy()

        # Sort for reproducibility
        wt_rows = wt_rows.sort_values(['mutant_ID', 'mutated_genes', 'well_id', 'start_date'])
        non_wt_rows = non_wt_rows.sort_values(['mutant_ID', 'mutated_genes', 'start_date'])

        # Combine sorted rows
        subset_sorted = pd.concat([wt_rows, non_wt_rows], axis=0)

        # Extract values and index
        values = subset_sorted[timepoint].values
        index = subset_sorted.index.values

        position_values.append(values)
        valid_plate_indices.append(index)

    # Skip timepoint if mismatch or empty
    lengths = [len(v) for v in position_values]
    if len(set(lengths)) != 1 or 0 in lengths:
        print(f"⚠️ Skipping {timepoint} due to mismatch or empty data: lengths = {lengths}")
        continue

    # Quantile normalize
    matrix = np.column_stack(position_values)
    normalized_matrix = normalize_quantiles(matrix, ties=True)

    # Write back
    for col_idx, index in enumerate(valid_plate_indices):
        phase2_30_1min_1min_normalized.loc[index, timepoint] = normalized_matrix[:, col_idx]

# Optional preview
phase2_30_1min_1min_normalized[['plate', 'start_date', 'mutant_ID', 'mutated_genes', 'well_id'] + y2_cols]

Unnamed: 0,plate,start_date,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,...,y2_79,y2_80,y2_81,y2_82,y2_83,y2_84,y2_85,y2_86,y2_87,y2_88
56301,30v1,2024-06-08,LMJ.RY0402.052440,Cre01.g028650,A02,0.173605,0.535414,0.217840,0.510784,0.146091,...,0.175604,0.507088,0.171992,0.509004,0.203782,0.499883,0.158108,0.471754,0.178791,0.493525
56302,30v1,2024-06-08,LMJ.RY0402.055420,Cre01.g045150,A03,,,,,,...,,,,,,,,,,
56303,30v1,2024-06-08,LMJ.RY0402.047311,"Cre08.g375800,Cre01.g049900,Cre01.g049900 & Cr...",A04,0.118358,0.475538,0.134833,0.480871,0.148046,...,0.121655,0.492628,0.136536,0.479833,0.142521,0.466156,0.134019,0.483466,0.142242,0.487804
56304,30v1,2024-06-08,LMJ.RY0402.054897,Cre02.g095115,A05,0.342905,,0.324267,,0.264964,...,0.332678,,0.280989,,0.267263,,0.268034,,0.262451,
56305,30v1,2024-06-08,LMJ.RY0402.054597,Cre03.g153400,A06,0.257731,0.613999,0.257146,0.572812,0.221723,...,0.211028,0.533103,0.210070,0.541133,0.206227,0.535935,0.210226,0.560942,0.211909,0.542488
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73914,30v1,2024-07-27,LMJ.RY0402.101554,Cre08.g374450,P20,0.113567,0.651231,0.124400,0.623706,0.164563,...,0.168851,0.582009,0.185768,0.577777,0.213009,0.578593,0.182311,0.590451,0.185230,0.591191
73915,30v1,2024-07-27,LMJ.RY0402.105853,Cre08.g382620,P21,0.065143,0.630422,0.115359,0.610969,0.106037,...,0.188299,0.591828,0.170818,0.584460,0.178811,0.588588,0.054303,0.584850,0.139990,0.585393
73916,30v1,2024-07-27,LMJ.RY0402.144998,Cre08.g382620,P22,0.383768,0.652478,0.337082,0.614257,0.322426,...,0.327063,0.608221,0.302285,0.616128,0.313931,0.602513,0.315610,0.606697,0.295260,0.601747
73917,30v1,2024-07-27,LMJ.RY0402.042144,Cre09.g387400,P23,0.377855,0.624485,0.366128,,0.358844,...,0.300075,0.608784,0.305732,0.548463,0.319815,,0.309297,,0.285155,0.605831


In [26]:
y2_cols = [f'y2_{i}' for i in range(1, 92)]
phase2_30_1min_1min= phase2_df1[(phase2_df1['light_regime'] == '1min-1min') & (phase2_df1['plate'].isin(plates))].copy()
phase2_30_1min_1min[['plate', 'mutant_ID', 'mutated_genes', 'well_id'] + y2_cols]

Unnamed: 0,plate,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,y2_6,...,y2_82,y2_83,y2_84,y2_85,y2_86,y2_87,y2_88,y2_89,y2_90,y2_91
56301,30v1,LMJ.RY0402.052440,Cre01.g028650,A02,0.195066,0.525049,0.227124,0.498801,0.166499,0.496727,...,0.491067,0.205501,0.488170,0.165160,0.454686,0.185617,0.475774,,,
56302,30v1,LMJ.RY0402.055420,Cre01.g045150,A03,,,,,,,...,,,,,,,,,,
56303,30v1,LMJ.RY0402.047311,"Cre08.g375800,Cre01.g049900,Cre01.g049900 & Cr...",A04,0.135520,0.457937,0.147918,0.468682,0.167921,0.481376,...,0.458865,0.149867,0.454159,0.145884,0.464637,0.144864,0.468520,,,
56304,30v1,LMJ.RY0402.054897,Cre02.g095115,A05,0.339916,0.644972,0.314028,0.601064,0.276733,0.584802,...,0.586885,0.265814,0.583848,0.261327,0.588227,0.255930,0.577061,,,
56305,30v1,LMJ.RY0402.054597,Cre03.g153400,A06,0.272385,0.599256,0.263786,0.559597,0.237700,0.554318,...,0.523960,0.207730,0.524490,0.214196,0.544969,0.214929,0.526317,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73914,30v1,LMJ.RY0402.101554,Cre08.g374450,P20,0.098739,0.694234,0.127614,0.676051,0.159901,0.702140,...,0.600258,0.217761,0.601376,0.180092,0.635392,0.181877,0.634866,,,
73915,30v1,LMJ.RY0402.105853,Cre08.g382620,P21,-0.002663,0.646938,0.123557,0.646740,0.099176,0.705124,...,0.618905,0.178863,0.629358,0.031576,0.614252,0.139390,0.620357,,,
73916,30v1,LMJ.RY0402.144998,Cre08.g382620,P22,0.480808,0.696475,0.411659,0.653837,0.374212,0.706855,...,0.695978,0.415903,0.664024,0.439042,0.683531,0.369671,0.673935,,,
73917,30v1,LMJ.RY0402.042144,Cre09.g387400,P23,0.463091,0.635114,0.489145,0.728981,0.513699,0.772260,...,0.559082,0.442246,0.734101,0.417610,0.734155,0.334846,0.681812,,,


In [36]:
phase2_30_1min_1min[(phase2_30_1min_1min['plate']=='30v1')&(phase2_30_1min_1min['mutant_ID']=='LMJ.RY0402.236577')]

Unnamed: 0,plate,measurement,start_date,light_regime,dark_threshold,light_threshold,num_frames,i,j,fv_fm,...,measurement_time_173,measurement_time_174,measurement_time_175,measurement_time_176,measurement_time_177,well_id,mutant_ID,feature,mutated_genes,num_mutations
56552,30v1,M1,2024-06-08,1min-1min,18.779654,25.444115,180,10,12,0.616165,...,,,,,,K13,LMJ.RY0402.236577,intron,Cre08.g378800,1.0
57318,30v1,M3,2024-06-10,1min-1min,15.956943,22.289219,180,10,12,0.650724,...,,,,,,K13,LMJ.RY0402.236577,intron,Cre08.g378800,1.0
73787,30v1,M1,2024-07-27,1min-1min,15.812491,22.504938,180,10,12,0.650923,...,,,,,,K13,LMJ.RY0402.236577,intron,Cre08.g378800,1.0


### 30 plate 30s-30s

In [39]:
# Define inputs
plates = ['30v1', '30v2', '30v3']
y2_cols = [f'y2_{i}' for i in range(1, 89)]

# Run normalization
phase2_30_30s_30s_normalized = quantile_normalize_light_regime(
    df=phase2_df1,
    light_regime='30s-30s',
    plates=plates,
    y2_cols=y2_cols
)

# View a few columns
phase2_30_30s_30s_normalized[['plate', 'mutant_ID', 'mutated_genes', 'well_id'] + y2_cols]

Unnamed: 0,plate,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,y2_6,...,y2_79,y2_80,y2_81,y2_82,y2_83,y2_84,y2_85,y2_86,y2_87,y2_88
58599,30v1,LMJ.RY0402.052440,Cre01.g028650,A02,0.206102,0.526917,0.203809,0.527712,0.176738,0.495360,...,0.194877,0.486622,0.216016,0.471062,0.199129,0.484056,0.187751,0.446081,0.170786,0.492250
58600,30v1,LMJ.RY0402.055420,Cre01.g045150,A03,,,,,,,...,,,,,,,,,,
58601,30v1,LMJ.RY0402.047311,"Cre08.g375800,Cre01.g049900,Cre01.g049900 & Cr...",A04,0.145174,0.486800,0.143455,0.476263,0.135190,0.433102,...,0.136573,0.431004,0.138311,0.426272,0.147675,0.449765,0.131877,0.397023,0.124362,0.417104
58602,30v1,LMJ.RY0402.054897,Cre02.g095115,A05,0.359618,0.627519,0.358295,,0.361145,0.586864,...,0.296966,,,,0.338746,0.590809,0.315479,0.579588,0.297656,
58603,30v1,LMJ.RY0402.054597,Cre03.g153400,A06,0.293370,0.572217,0.257843,0.546916,0.270961,0.539758,...,0.242871,0.522642,0.244163,0.556034,0.274172,0.562989,0.143978,0.528266,0.225927,0.502698
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64339,30v3,LMJ.RY0402.240458,"Cre13.g588405 & Cre13.g588453,Cre02.g106650",P20,0.287663,0.554196,0.244605,0.525086,0.249610,0.526613,...,0.211641,0.504490,0.257592,0.490326,0.256127,0.533414,0.227199,0.518164,0.242211,0.522656
64340,30v3,LMJ.RY0402.076841,Cre04.g216050,P21,0.294314,0.632862,0.301971,0.586833,0.295074,0.576877,...,0.213351,0.549670,0.223111,0.527934,0.247313,0.521344,0.287609,0.495517,0.243703,0.528434
64341,30v3,LMJ.RY0402.039259,Cre01.g023050,P22,0.154224,0.490022,0.163536,0.479765,0.204404,0.476291,...,0.132039,0.454655,0.136531,0.478718,0.145679,0.424429,0.120622,0.435576,0.117776,0.428733
64342,30v3,LMJ.RY0402.229006,"Cre13.g583250,Cre07.g341900",P23,0.206404,0.511160,0.180622,0.487703,0.157094,0.478783,...,0.172589,0.460127,0.149613,0.465635,0.155954,0.443105,0.106556,0.469964,0.154646,0.458043


In [41]:
y2_cols = [f'y2_{i}' for i in range(1, 89)]
phase2_30_30s_30s= phase2_df1[(phase2_df1['light_regime'] == '30s-30s') & (phase2_df1['plate'].isin(plates))].copy()
phase2_30_30s_30s[['plate', 'mutant_ID', 'mutated_genes', 'well_id'] + y2_cols]

Unnamed: 0,plate,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,y2_6,...,y2_79,y2_80,y2_81,y2_82,y2_83,y2_84,y2_85,y2_86,y2_87,y2_88
58599,30v1,LMJ.RY0402.052440,Cre01.g028650,A02,0.201109,0.523365,0.198026,0.518878,0.166203,0.494774,...,0.194090,0.480022,0.215834,0.463492,0.194069,0.480103,0.179748,0.441193,0.170187,0.489277
58600,30v1,LMJ.RY0402.055420,Cre01.g045150,A03,,,,,,,...,,,,,,,,,,
58601,30v1,LMJ.RY0402.047311,"Cre08.g375800,Cre01.g049900,Cre01.g049900 & Cr...",A04,0.145757,0.480692,0.143619,0.467743,0.122912,0.434925,...,0.134837,0.422737,0.134068,0.422195,0.141833,0.448544,0.124233,0.390867,0.122021,0.418009
58602,30v1,LMJ.RY0402.054897,Cre02.g095115,A05,0.352869,0.623883,0.342290,0.605841,0.337922,0.579782,...,0.296446,0.578736,0.326278,0.586609,0.309941,0.579822,0.296848,0.568432,0.294651,0.600092
58603,30v1,LMJ.RY0402.054597,Cre03.g153400,A06,0.285635,0.568454,0.246520,0.535804,0.257991,0.533970,...,0.243769,0.517153,0.244464,0.545068,0.267870,0.559977,0.132139,0.523227,0.219190,0.499256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64339,30v3,LMJ.RY0402.240458,"Cre13.g588405 & Cre13.g588453,Cre02.g106650",P20,0.294753,0.552027,0.252073,0.530737,0.258017,0.529070,...,0.200344,0.505852,0.254529,0.495648,0.252602,0.534084,0.233167,0.519228,0.242505,0.524302
64340,30v3,LMJ.RY0402.076841,Cre04.g216050,P21,0.299497,0.646726,0.321889,0.598598,0.309467,0.585958,...,0.202405,0.555167,0.220268,0.540168,0.241901,0.518153,0.297078,0.496441,0.245715,0.530916
64341,30v3,LMJ.RY0402.039259,Cre01.g023050,P22,0.149009,0.485368,0.166897,0.481895,0.207456,0.479357,...,0.121257,0.455364,0.135012,0.486695,0.141413,0.416674,0.124697,0.434587,0.114873,0.427897
64342,30v3,LMJ.RY0402.229006,"Cre13.g583250,Cre07.g341900",P23,0.206929,0.502859,0.181319,0.490658,0.163335,0.481519,...,0.157379,0.461765,0.146174,0.473483,0.149835,0.436923,0.112899,0.467589,0.146110,0.456189


### plate 5min-5min

In [42]:
plates = ['30v1', '30v2', '30v3']
y2_cols = [f'y2_{i}' for i in range(1, 89)]

# Filter the data
phase2_30_5min_5min = phase2_df1[
    (phase2_df1['light_regime'] == '5min-5min') &
    (phase2_df1['plate'].isin(plates))
].copy()

# Copy to write normalized data
phase2_30_5min_5min_normalized = phase2_30_5min_5min.copy()

# Loop over each y2 column (timepoint)
for timepoint in y2_cols:
    position_values = []
    valid_plate_indices = []

    # Loop through (plate, start_date) technical replicates
    for (plate, start_date), group in phase2_30_5min_5min.groupby(['plate', 'start_date']):
        subset = group.copy()

        # Separate WT and non-WT rows
        wt_rows = subset[subset['mutant_ID'] == 'WT'].copy()
        non_wt_rows = subset[subset['mutant_ID'] != 'WT'].copy()

        # Sort for reproducibility
        wt_rows = wt_rows.sort_values(['mutant_ID', 'mutated_genes', 'well_id', 'start_date'])
        non_wt_rows = non_wt_rows.sort_values(['mutant_ID', 'mutated_genes', 'start_date'])

        # Combine sorted rows
        subset_sorted = pd.concat([wt_rows, non_wt_rows], axis=0)

        # Extract values and index
        values = subset_sorted[timepoint].values
        index = subset_sorted.index.values

        position_values.append(values)
        valid_plate_indices.append(index)

    # Skip timepoint if mismatch or empty
    lengths = [len(v) for v in position_values]
    if len(set(lengths)) != 1 or 0 in lengths:
        print(f"⚠️ Skipping {timepoint} due to mismatch or empty data: lengths = {lengths}")
        continue

    # Quantile normalize
    matrix = np.column_stack(position_values)
    normalized_matrix = normalize_quantiles(matrix, ties=True)

    # Write back
    for col_idx, index in enumerate(valid_plate_indices):
        phase2_30_5min_5min_normalized.loc[index, timepoint] = normalized_matrix[:, col_idx]

# Optional preview
phase2_30_5min_5min_normalized[['plate', 'start_date', 'mutant_ID', 'mutated_genes', 'well_id'] + y2_cols]

Unnamed: 0,plate,start_date,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,...,y2_79,y2_80,y2_81,y2_82,y2_83,y2_84,y2_85,y2_86,y2_87,y2_88
61663,30v2,2024-06-24,LMJ.RY0402.051134,Cre01.g017200,A02,,,,,,...,,,,,,,,,,
61664,30v2,2024-06-24,LMJ.RY0402.189784,Cre01.g001800,A03,0.202128,0.634203,0.208485,0.620993,0.200474,...,0.174353,0.594291,0.151486,0.586118,0.188526,0.587124,0.171388,0.595291,0.150853,0.590029
61665,30v2,2024-06-24,LMJ.RY0402.229006,"Cre13.g583250,Cre07.g341900",A04,0.137899,0.551175,0.105919,0.576792,0.144838,...,0.127298,0.534458,0.100380,0.545186,0.112285,0.506863,0.100899,0.528620,0.093168,0.538607
61666,30v2,2024-06-24,LMJ.RY0402.164384,Cre09.g392500,A05,0.203338,0.610854,0.161276,0.624035,0.188615,...,0.113233,0.582882,0.124864,0.572349,0.166080,0.593069,0.139684,0.588152,0.124413,0.581604
61667,30v2,2024-06-24,LMJ.RY0402.163529,Cre01.g025400,A06,0.168383,0.571231,0.155124,0.565745,0.150660,...,0.158837,0.553316,0.120717,0.540584,0.118769,0.544471,0.111499,0.542863,0.117031,0.554226
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105320,30v3,2024-11-26,LMJ.RY0402.240458,"Cre13.g588405 & Cre13.g588453,Cre02.g106650",P20,0.250412,0.602965,0.231662,0.623698,0.237301,...,0.184049,0.617419,0.129567,0.583276,0.131066,0.595250,0.197193,0.589841,0.194618,0.581961
105321,30v3,2024-11-26,LMJ.RY0402.076841,Cre04.g216050,P21,0.197677,0.678635,0.157015,0.653489,0.118802,...,0.144085,,0.132259,0.614149,0.089222,,0.124108,0.597276,0.104114,0.634886
105322,30v3,2024-11-26,LMJ.RY0402.039259,Cre01.g023050,P22,0.252406,0.622583,,0.643556,0.213036,...,,0.601178,,0.616975,0.190150,0.590484,0.155855,,,
105323,30v3,2024-11-26,LMJ.RY0402.229006,"Cre13.g583250,Cre07.g341900",P23,0.156617,0.649192,0.172365,0.600120,0.200474,...,0.175179,0.588606,0.139168,0.573408,0.188526,0.627182,0.174999,0.573799,0.149828,0.625372


In [43]:
plates = ['30v1', '30v2','30v3']
y2_cols = [f'y2_{i}' for i in range(1, 90)]
phase2_30_5min_5min= phase2_df1[(phase2_df1['light_regime'] == '5min-5min') & (phase2_df1['plate'].isin(plates))].copy()
phase2_30_5min_5min[['plate', 'mutant_ID', 'mutated_genes', 'well_id'] + y2_cols]

Unnamed: 0,plate,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,y2_6,...,y2_80,y2_81,y2_82,y2_83,y2_84,y2_85,y2_86,y2_87,y2_88,y2_89
61663,30v2,LMJ.RY0402.051134,Cre01.g017200,A02,,,,,,,...,,,,,,,,,,
61664,30v2,LMJ.RY0402.189784,Cre01.g001800,A03,0.219473,0.628291,0.218822,0.611663,0.204733,0.583965,...,0.586365,0.177225,0.576962,0.222916,0.576951,0.200629,0.591031,0.179134,0.585339,
61665,30v2,LMJ.RY0402.229006,"Cre13.g583250,Cre07.g341900",A04,0.130885,0.527061,0.092299,0.551516,0.140813,0.508574,...,0.505203,0.116432,0.522082,0.125390,0.472637,0.112021,0.497965,0.109269,0.514465,
61666,30v2,LMJ.RY0402.164384,Cre09.g392500,A05,0.220939,0.597080,0.164246,0.616587,0.191265,0.579720,...,0.569768,0.144817,0.558535,0.192215,0.585822,0.158917,0.580598,0.143732,0.573412,
61667,30v2,LMJ.RY0402.163529,Cre01.g025400,A06,0.174104,0.548586,0.154213,0.538536,0.146623,0.509359,...,0.528984,0.140201,0.517013,0.133615,0.520793,0.125136,0.517957,0.138121,0.534496,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105320,30v3,LMJ.RY0402.240458,"Cre13.g588405 & Cre13.g588453,Cre02.g106650",P20,0.213974,0.617186,0.211949,0.625069,0.214286,0.620723,...,0.606949,0.103445,0.584366,0.104968,0.592277,0.149208,0.588985,0.146213,0.581995,
105321,30v3,LMJ.RY0402.076841,Cre04.g216050,P21,0.184204,0.654939,0.159237,0.641793,0.130801,0.646940,...,0.626152,0.105128,0.601893,0.073075,0.627606,0.101255,0.593638,0.077005,0.608593,
105322,30v3,LMJ.RY0402.039259,Cre01.g023050,P22,0.215561,0.629708,0.267859,0.637591,0.201473,0.615533,...,0.595975,0.164078,0.603115,0.144397,0.589288,0.122607,0.616973,0.176602,0.611387,
105323,30v3,LMJ.RY0402.229006,"Cre13.g583250,Cre07.g341900",P23,0.156063,0.641679,0.170547,0.611881,0.193122,0.625269,...,0.590842,0.110772,0.579660,0.143992,0.609949,0.134931,0.579865,0.113994,0.607451,


## 30 plate 1min-5min

In [44]:
plates = ['30v1', '30v2', '30v3']
y2_cols = [f'y2_{i}' for i in range(1, 89)]

# Filter the data
phase2_30_1min_5min = phase2_df1[
    (phase2_df1['light_regime'] == '1min-5min') &
    (phase2_df1['plate'].isin(plates))
].copy()

# Copy to write normalized data
phase2_30_1min_5min_normalized = phase2_30_1min_5min.copy()

# Loop over each y2 column (timepoint)
for timepoint in y2_cols:
    position_values = []
    valid_plate_indices = []

    # Loop through (plate, start_date) technical replicates
    for (plate, start_date), group in phase2_30_1min_5min.groupby(['plate', 'start_date']):
        subset = group.copy()

        # Separate WT and non-WT rows
        wt_rows = subset[subset['mutant_ID'] == 'WT'].copy()
        non_wt_rows = subset[subset['mutant_ID'] != 'WT'].copy()

        # Sort for reproducibility
        wt_rows = wt_rows.sort_values(['mutant_ID', 'mutated_genes', 'well_id', 'start_date'])
        non_wt_rows = non_wt_rows.sort_values(['mutant_ID', 'mutated_genes', 'start_date'])

        # Combine sorted rows
        subset_sorted = pd.concat([wt_rows, non_wt_rows], axis=0)

        # Extract values and index
        values = subset_sorted[timepoint].values
        index = subset_sorted.index.values

        position_values.append(values)
        valid_plate_indices.append(index)

    # Skip timepoint if mismatch or empty
    lengths = [len(v) for v in position_values]
    if len(set(lengths)) != 1 or 0 in lengths:
        print(f"⚠️ Skipping {timepoint} due to mismatch or empty data: lengths = {lengths}")
        continue

    # Quantile normalize
    matrix = np.column_stack(position_values)
    normalized_matrix = normalize_quantiles(matrix, ties=True)

    # Write back
    for col_idx, index in enumerate(valid_plate_indices):
        phase2_30_1min_5min_normalized.loc[index, timepoint] = normalized_matrix[:, col_idx]

# Optional preview
phase2_30_1min_5min_normalized[['plate', 'start_date', 'mutant_ID', 'mutated_genes', 'well_id'] + y2_cols]

Unnamed: 0,plate,start_date,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,...,y2_79,y2_80,y2_81,y2_82,y2_83,y2_84,y2_85,y2_86,y2_87,y2_88
61280,30v2,2024-06-23,LMJ.RY0402.051134,Cre01.g017200,A02,,,,,,...,,,,,,,,,,
61281,30v2,2024-06-23,LMJ.RY0402.189784,Cre01.g001800,A03,0.242697,0.663739,0.193924,0.659478,0.199937,...,0.132532,0.641843,0.162041,0.633769,0.133863,0.629317,0.152372,0.621799,0.134939,0.647246
61282,30v2,2024-06-23,LMJ.RY0402.229006,"Cre13.g583250,Cre07.g341900",A04,0.161902,0.592355,0.128095,0.606214,0.130902,...,0.093363,0.568909,0.097464,0.588722,0.091224,0.565415,0.099631,0.576590,0.071319,0.556958
61283,30v2,2024-06-23,LMJ.RY0402.164384,Cre09.g392500,A05,0.232877,0.637154,0.184829,0.630291,0.184519,...,0.138707,0.633044,0.152089,0.627965,0.133183,0.633640,0.128293,0.647010,0.112384,0.630177
61284,30v2,2024-06-23,LMJ.RY0402.163529,Cre01.g025400,A06,0.165970,0.599096,0.156011,0.577160,0.143322,...,0.086692,0.583975,0.082000,0.586734,0.053404,0.606319,0.102923,0.577382,0.116563,0.604383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104554,30v3,2024-11-25,LMJ.RY0402.240458,"Cre13.g588405 & Cre13.g588453,Cre02.g106650",P20,,,,0.644752,0.225640,...,0.191302,0.623611,0.185336,0.672037,0.163143,0.641123,0.212109,0.573273,,0.653657
104555,30v3,2024-11-25,LMJ.RY0402.076841,Cre04.g216050,P21,0.222815,,0.200937,0.626511,0.247620,...,0.143295,0.636815,0.094762,0.586516,0.154225,0.618560,0.171763,0.619178,0.166919,0.625322
104556,30v3,2024-11-25,LMJ.RY0402.039259,Cre01.g023050,P22,,0.674534,0.186118,0.637297,,...,0.111942,0.606290,0.161409,0.589164,,0.598113,,0.573984,0.159639,0.618897
104557,30v3,2024-11-25,LMJ.RY0402.229006,"Cre13.g583250,Cre07.g341900",P23,0.196578,0.647262,0.235051,0.617472,0.266822,...,0.173175,0.645985,0.206825,0.611175,0.197334,0.633640,0.162574,0.617913,0.161102,0.601719


In [45]:
plates = ['30v1', '30v2','30v3']
y2_cols = [f'y2_{i}' for i in range(1, 90)]
phase2_30_1min_5min= phase2_df1[(phase2_df1['light_regime'] == '1min-5min') & (phase2_df1['plate'].isin(plates))].copy()
phase2_30_1min_5min[['plate', 'mutant_ID', 'mutated_genes', 'well_id'] + y2_cols]

Unnamed: 0,plate,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,y2_6,...,y2_80,y2_81,y2_82,y2_83,y2_84,y2_85,y2_86,y2_87,y2_88,y2_89
61280,30v2,LMJ.RY0402.051134,Cre01.g017200,A02,,,,,,,...,,,,,,,,,,
61281,30v2,LMJ.RY0402.189784,Cre01.g001800,A03,0.286091,0.662745,0.224805,0.656407,0.225504,0.657713,...,0.628206,0.214087,0.616775,0.178378,0.614558,0.201466,0.604706,0.176365,0.638405,
61282,30v2,LMJ.RY0402.229006,"Cre13.g583250,Cre07.g341900",A04,0.184899,0.566349,0.144924,0.586311,0.136270,0.550509,...,0.531786,0.123692,0.557967,0.123397,0.528383,0.133271,0.544872,0.094898,0.519099,
61283,30v2,LMJ.RY0402.164384,Cre09.g392500,A05,0.274217,0.624538,0.210922,0.616918,0.206208,0.594279,...,0.617104,0.197029,0.610256,0.177327,0.619713,0.168734,0.640524,0.144387,0.614812,
61284,30v2,LMJ.RY0402.163529,Cre01.g025400,A06,0.189783,0.573342,0.177367,0.548159,0.152093,0.557057,...,0.549045,0.104035,0.555692,0.079085,0.583846,0.136997,0.545379,0.149141,0.577694,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104554,30v3,LMJ.RY0402.240458,"Cre13.g588405 & Cre13.g588453,Cre02.g106650",P20,0.225850,0.681489,0.208708,0.645814,0.177832,0.666755,...,0.639287,0.119976,0.658600,0.105542,0.643631,0.135338,0.600658,0.136674,0.652463,
104555,30v3,LMJ.RY0402.076841,Cre04.g216050,P21,0.176487,0.680846,0.163138,0.636851,0.185159,0.658357,...,0.644552,0.062796,0.610025,0.100059,0.631326,0.109919,0.632120,0.104908,0.637103,
104556,30v3,LMJ.RY0402.039259,Cre01.g023050,P22,0.244520,0.661683,0.155292,0.642827,0.226093,0.592570,...,0.630576,0.104884,0.612015,0.149946,0.620017,0.172030,0.601625,0.100780,0.633501,
104557,30v3,LMJ.RY0402.229006,"Cre13.g583250,Cre07.g341900",P23,0.161106,0.649937,0.182192,0.631177,0.192298,0.630728,...,0.648242,0.125680,0.627198,0.126636,0.640167,0.105850,0.631427,0.101309,0.623098,


In [46]:
phase2_30_quantile1= pd.concat([
    phase2_30_20h_ML_normalized,
    phase2_30_20h_HL_normalized,
    phase2_30_2h_2h_normalized,
    phase2_30_1min_1min_normalized,
    phase2_30_30s_30s_normalized,
    phase2_30_5min_5min_normalized,
    phase2_30_1min_5min_normalized
], ignore_index=True)

In [65]:
phase2_30_quantile1.to_csv('phase2_30_quantile1.csv',index=False)

In [47]:
phase2_30_quantile1.shape

(9958, 726)

In [56]:
plates = ['30v1', '30v2','30v3']
data=phase2_df1[(phase2_df1['plate'].isin(plates))&(phase2_df1['light_regime']!='10min-10min')]
data.shape

(9958, 726)