In [47]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

import statsmodels.formula.api as smf
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.linear_model import LinearRegression, LogisticRegression

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, matthews_corrcoef

from matplotlib.lines import Line2D
from matplotlib.patches import Patch

In [3]:
# Raw measurement export; every later frame in this notebook derives from `df`.
file_path = 'database_2025-05-18.csv'
df = pd.read_csv(filepath_or_buffer=file_path, low_memory=False)

In [4]:
# Plates 30-33, three versions (v1-v3) each, in the same order as the literal list.
phase2_plates = [f'{plate_no}v{version}' for plate_no in range(30, 34) for version in range(1, 4)]
# Keep only the rows of `df` whose plate is one of those twelve.
phase2_df = df.loc[df['plate'].isin(phase2_plates)]

In [5]:
# drop last data point
# NOTE(review): only y2_1..y2_90 are scanned here, while a later cell selects
# columns up to y2_91 -- confirm no row holds data beyond y2_90.
y2_cols = [f'y2_{i}' for i in range(1, 91)]
def drop_last_valid(row):
    """Return `row` with its last non-missing y2 value replaced by NaN.

    Parameters
    ----------
    row : pandas.Series
        One record; must contain every label listed in the module-level
        `y2_cols`.

    Returns
    -------
    pandas.Series
        A copy of `row` with the last valid y2 entry set to NaN, or `row`
        itself (untouched) when all y2 entries are missing.
    """
    last_label = row[y2_cols].last_valid_index()
    # last_valid_index() returns None when every y2 entry is missing;
    # otherwise the label it returns is guaranteed to hold a non-NA value.
    if last_label is None:
        return row
    # Work on a copy: functions handed to DataFrame.apply must not mutate
    # the object they receive.
    trimmed = row.copy()
    trimmed[last_label] = np.nan
    return trimmed

# Apply drop_last_valid to every row (axis=1 passes each row as a Series) and
# collect the returned rows into a new frame; `phase2_df` itself is not reassigned.
phase2_df1 = phase2_df.apply(drop_last_valid, axis=1)

In [41]:
# Persist the trimmed frame (row index omitted) so it can be reloaded from disk.
phase2_df1.to_csv(path_or_buf='phase2_df(null).csv', index=False)

In [48]:
# Reload the trimmed frame from the CSV written above; this replaces whatever
# `phase2_df1` currently holds in the kernel.
phase2_df1 = pd.read_csv(filepath_or_buffer='phase2_df(null).csv', low_memory=False)

In [8]:
# Which plates have 10min-10min rows that still carry a value at y2_87?
phase2_df1.loc[
    phase2_df1['light_regime'].eq('10min-10min') & phase2_df1['y2_87'].notna(),
    'plate',
].unique()

array(['30v2', '30v3', '32v1', '31v1'], dtype=object)

In [None]:
# Discard 10min-10min rows that have a value at y2_87: a row is kept when it
# belongs to another regime or when its y2_87 is missing.
phase2_df1 = phase2_df1.loc[
    phase2_df1['light_regime'].ne('10min-10min') | phase2_df1['y2_87'].isnull()
]

In [49]:
from scipy import interpolate
from scipy.stats import rankdata

def normalize_quantiles(A, ties=True):
    """Quantile-normalize the columns of a 2-D array, tolerating NaNs.

    Every column is mapped onto one shared reference distribution: the
    row-wise mean of the sorted columns. A column with missing values is
    first stretched onto the full quantile grid by linear interpolation, so
    it contributes to (and is mapped across) the whole reference range.

    Parameters
    ----------
    A : array-like, shape (n_rows, n_cols)
        Values to normalize; NaN marks a missing entry.
    ties : bool, default True
        If True, tied values within a column receive their average rank and
        therefore the same normalized value. If False, ordinal ranks (double
        argsort) are used, so ties are broken by sort order.

    Returns
    -------
    numpy.ndarray
        float64 array with the shape of `A`; positions that were NaN in `A`
        stay NaN. A single-column input is returned as an unchanged copy.

    Raises
    ------
    ValueError
        If `A` is not two-dimensional.
    """
    A = np.asarray(A, dtype=np.float64)
    if A.ndim != 2:
        raise ValueError(f"normalize_quantiles expects a 2-D array, got {A.ndim}-D")
    n_rows, n_cols = A.shape
    if n_cols == 1:
        return A.copy()

    grid = np.linspace(0, 1, n_rows)  # quantile positions of the reference
    observed = ~np.isnan(A)
    nobs = observed.sum(axis=0)

    # S[:, j] holds column j sorted and, if it has gaps, resampled onto `grid`.
    S = np.full((n_rows, n_cols), np.nan)
    for j in range(n_cols):
        sorted_x = np.sort(A[observed[:, j], j])
        if nobs[j] == n_rows:
            S[:, j] = sorted_x
        elif nobs[j] >= 2:
            S[:, j] = np.interp(grid, np.linspace(0, 1, nobs[j]), sorted_x)
        # Columns with 0 or 1 observations define no distribution and are
        # left out of the reference (their S column stays NaN).

    A_out = np.full_like(A, np.nan)
    if np.isnan(S).all():
        # No column could contribute, so there is nothing to normalize to.
        return A_out

    m = np.nanmean(S, axis=1)  # reference distribution

    for j in range(n_cols):
        if nobs[j] == 0:
            continue
        rows = observed[:, j]
        x = A[rows, j]

        if nobs[j] == 1:
            # A lone value has no rank spread; place it at the middle of the
            # reference distribution instead of dividing by zero.
            A_out[rows, j] = np.interp(0.5, grid, m)
        elif ties:
            quant_pos = (rankdata(x, method='average') - 1) / (nobs[j] - 1)
            A_out[rows, j] = np.interp(quant_pos, grid, m)
        else:
            ranks = np.argsort(np.argsort(x))
            if nobs[j] == n_rows:
                A_out[rows, j] = m[ranks]
            else:
                # With missing values the ranks only run to nobs-1; rescale
                # them to [0, 1] so the largest value still maps to the top
                # of the reference rather than to m[nobs-1].
                A_out[rows, j] = np.interp(ranks / (nobs[j] - 1), grid, m)

    return A_out

In [11]:

plates = ['30v1', '30v2', '30v3']
df_30v = phase2_df1.loc[phase2_df1['plate'].isin(plates)]

# Number of rows for each (plate, light regime, mutant, mutated genes) combination.
group_counts = df_30v.groupby(
    ['plate', 'light_regime', 'mutant_ID', 'mutated_genes']
).size().reset_index(name='count')

# How many combinations share each row count, per light regime and plate.
summary = (
    group_counts
    .groupby(['light_regime', 'plate', 'count'])
    .size()
    .reset_index(name='n_mutants')
    .sort_values(by=['light_regime', 'plate', 'count'])
)

summary

Unnamed: 0,light_regime,plate,count,n_mutants
0,10min-10min,30v1,1,375
1,10min-10min,30v1,7,1
2,10min-10min,30v2,1,375
3,10min-10min,30v2,7,1
4,10min-10min,30v3,2,375
5,10min-10min,30v3,14,1
6,1min-1min,30v1,2,375
7,1min-1min,30v1,14,1
8,1min-1min,30v2,1,375
9,1min-1min,30v2,7,1


## 30 plate 20h_ML

In [51]:
def quantile_normalize_light_regime(df, light_regime, plates, y2_cols, tie_handling=True):
    """Quantile-normalize y2 columns across plates for one light regime.

    For every column in `y2_cols`, the values of each plate form one column
    of a matrix that is passed to `normalize_quantiles`; the result is
    written back to the rows it came from.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide 'light_regime', 'plate', 'mutant_ID', 'mutated_genes',
        'well_id' and every column named in `y2_cols`.
    light_regime : str
        Value of 'light_regime' to select.
    plates : list
        Plate names; each becomes one matrix column, in this order.
    y2_cols : list of str
        Columns to normalize, one at a time.
    tie_handling : bool, default True
        Forwarded to `normalize_quantiles` as `ties`.

    Returns
    -------
    pandas.DataFrame
        Copy of the selected rows with `y2_cols` replaced by normalized
        values; `df` is not modified.

    Raises
    ------
    ValueError
        If the plates do not all have the same number of selected rows.
    """
    # Filter data
    subset_df = df[(df['light_regime'] == light_regime) & (df['plate'].isin(plates))].copy()
    df_normalized = subset_df.copy()

    # The row order of each plate (WT rows first, then the rest) is the same
    # for every timepoint, so resolve it once instead of once per y2 column.
    wt_sort_keys = ['mutant_ID', 'mutated_genes', 'well_id']
    non_wt_sort_keys = ['mutant_ID', 'mutated_genes']
    valid_plate_indices = {}
    for plate in plates:
        plate_keys = subset_df.loc[subset_df['plate'] == plate, wt_sort_keys]
        is_wt = plate_keys['mutant_ID'] == 'WT'
        wt_index = plate_keys[is_wt].sort_values(wt_sort_keys).index
        non_wt_index = plate_keys[~is_wt].sort_values(non_wt_sort_keys).index
        valid_plate_indices[plate] = wt_index.append(non_wt_index).values

    lengths = [len(valid_plate_indices[plate]) for plate in plates]

    for timepoint in y2_cols:
        # Validate shape
        if len(set(lengths)) != 1:
            raise ValueError(f"Length mismatch at {timepoint}: {lengths}")

        raw_column = subset_df[timepoint]
        matrix = np.column_stack(
            [raw_column.loc[valid_plate_indices[plate]].values for plate in plates]
        )
        normalized_matrix = normalize_quantiles(matrix, ties=tie_handling)

        # Write back
        for col_idx, plate in enumerate(plates):
            indices = valid_plate_indices[plate]
            df_normalized.loc[indices, timepoint] = normalized_matrix[:, col_idx]

    return df_normalized

In [52]:
# Plate-30 replicates and the y2 columns (y2_1 .. y2_44) to normalize
plates = ['30v1', '30v2', '30v3']
y2_cols = ['y2_' + str(i) for i in range(1, 45)]

# Quantile-normalize the 20h_ML rows across the three plates
phase2_30_20h_ML_normalized = quantile_normalize_light_regime(
    phase2_df1, '20h_ML', plates, y2_cols
)

# Preview identifiers together with the normalized columns
phase2_30_20h_ML_normalized[['plate', 'mutant_ID', 'mutated_genes', 'well_id', *y2_cols]]

Unnamed: 0,plate,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,y2_6,...,y2_35,y2_36,y2_37,y2_38,y2_39,y2_40,y2_41,y2_42,y2_43,y2_44
766,30v3,LMJ.RY0402.047723,"Cre03.g183150 & Cre03.g183200,Cre03.g183150,Cr...",A02,0.501621,0.511582,0.539199,0.504744,0.520319,0.512838,...,0.467841,0.460427,0.471032,0.476462,0.464536,0.481096,0.484785,0.477921,0.470457,0.454926
767,30v3,LMJ.RY0402.064780,Cre03.g192450,A03,0.288100,0.369688,0.342921,0.342009,0.349547,0.370690,...,0.321162,0.340087,0.326739,0.329589,0.348223,0.348566,0.341849,0.350612,0.306536,0.326008
768,30v3,LMJ.RY0402.193315,Cre01.g025400,A04,0.343212,0.370821,0.407145,0.358222,0.388296,0.382673,...,0.366632,0.356636,0.384685,0.348266,0.375795,0.352887,0.373183,0.359884,0.363412,0.361793
769,30v3,LMJ.RY0402.110865,Cre01.g039550,A05,0.273719,0.355010,0.348160,0.366782,0.335236,0.373379,...,0.190483,0.177119,0.228444,0.221199,0.257200,0.206154,0.228532,0.156460,0.243133,0.215428
770,30v3,LMJ.RY0402.074645,Cre01.g001800,A06,0.284005,0.306666,0.267974,0.327902,0.322236,0.320471,...,0.291170,0.280813,0.266605,0.258826,0.308519,0.299280,0.296051,0.266746,0.270987,0.296442
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29486,30v2,LMJ.RY0402.228465,Cre09.g398200,P20,0.406222,0.396546,0.404007,0.426827,0.403000,0.403897,...,0.398597,0.390503,0.393111,0.368958,0.406871,0.343467,0.379763,0.364258,0.387625,0.386956
29487,30v2,LMJ.RY0402.193684,Cre09.g387400,P21,0.229250,0.284112,0.282021,0.344759,0.315784,0.321395,...,0.283393,0.289281,0.287165,0.259373,0.254195,0.269965,0.278637,0.275296,0.274288,0.278159
29488,30v2,LMJ.RY0402.147464,Cre04.g218500,P22,0.365476,0.391506,0.389953,0.399592,0.385205,0.426212,...,0.391131,0.355068,0.373477,0.375720,0.392757,0.372144,0.415299,0.378878,0.390379,0.384888
29489,30v2,LMJ.RY0402.232801,Cre08.g385650,P23,0.299187,0.388771,0.288998,0.355913,0.341803,0.322943,...,0.319482,0.297543,0.308269,0.289078,0.304865,0.278218,0.307729,0.294965,0.300620,0.281964


In [10]:
# Un-normalized 20h_ML rows of the plate-30 replicates, for comparison.
# Relies on `y2_cols` already being defined in the kernel.
plates = ['30v1', '30v2', '30v3']
phase2_30_20h_ML = phase2_df1.loc[
    phase2_df1['light_regime'].eq('20h_ML') & phase2_df1['plate'].isin(plates)
].copy()
phase2_30_20h_ML[['plate', 'mutant_ID', 'mutated_genes', 'well_id', *y2_cols]]

Unnamed: 0,plate,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,y2_6,...,y2_35,y2_36,y2_37,y2_38,y2_39,y2_40,y2_41,y2_42,y2_43,y2_44
3064,30v3,LMJ.RY0402.047723,"Cre03.g183150 & Cre03.g183200,Cre03.g183150,Cr...",A02,0.515425,0.530922,0.556353,0.526765,0.542859,0.531956,...,0.504937,0.489176,0.502789,0.505343,0.498442,0.522396,0.521141,0.518100,0.510652,0.489583
3065,30v3,LMJ.RY0402.064780,Cre03.g192450,A03,0.297425,0.376459,0.356766,0.352587,0.373655,0.388188,...,0.348878,0.368588,0.353014,0.360544,0.378921,0.382440,0.372428,0.385754,0.333692,0.351102
3066,30v3,LMJ.RY0402.193315,Cre01.g025400,A04,0.355337,0.378657,0.422959,0.373672,0.414600,0.399107,...,0.394081,0.384850,0.413957,0.380081,0.408224,0.385875,0.403889,0.395332,0.399688,0.387975
3067,30v3,LMJ.RY0402.110865,Cre01.g039550,A05,0.280328,0.360491,0.360546,0.383162,0.359604,0.391119,...,0.213224,0.214187,0.249366,0.256097,0.281226,0.239622,0.258191,0.188697,0.267244,0.226832
3068,30v3,LMJ.RY0402.074645,Cre01.g001800,A06,0.292572,0.308605,0.275840,0.340822,0.351931,0.331894,...,0.318408,0.305931,0.291088,0.288082,0.336617,0.330172,0.329137,0.298541,0.299378,0.320418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90957,30v2,LMJ.RY0402.228465,Cre09.g398200,P20,0.398817,0.390735,0.396958,0.414085,0.391509,0.398642,...,0.396879,0.391382,0.390296,0.362349,0.405501,0.339020,0.384579,0.361717,0.382819,0.383467
90958,30v2,LMJ.RY0402.193684,Cre09.g387400,P21,0.205550,0.284820,0.278447,0.338621,0.297640,0.329212,...,0.285165,0.297610,0.287748,0.260881,0.260667,0.268727,0.286313,0.274494,0.268994,0.279427
90959,30v2,LMJ.RY0402.147464,Cre04.g218500,P22,0.358390,0.385916,0.383348,0.388827,0.373952,0.421918,...,0.387967,0.358730,0.369320,0.369663,0.393329,0.367008,0.415791,0.375757,0.386002,0.380681
90960,30v2,LMJ.RY0402.232801,Cre08.g385650,P23,0.296152,0.382801,0.281817,0.347726,0.333800,0.329268,...,0.319117,0.303551,0.311048,0.287504,0.303663,0.275597,0.312752,0.290159,0.296608,0.282701


## 30 plate 20h_HL

In [53]:
plates = ['30v1', '30v2', '30v3']
y2_cols = [f'y2_{i}' for i in range(1, 45)]

# Filter the data
phase2_30_20h_HL = phase2_df1[
    (phase2_df1['light_regime'] == '20h_HL') &
    (phase2_df1['plate'].isin(plates))
].copy()

# Copy to write normalized data
phase2_30_20h_HL_normalized = phase2_30_20h_HL.copy()

# Each (plate, start_date) group supplies one column of the normalization
# matrix. The row order of a group (WT rows first, then the rest) is the same
# for every timepoint, so it is resolved once here instead of once per y2 column.
wt_sort_keys = ['mutant_ID', 'mutated_genes', 'well_id', 'start_date']
non_wt_sort_keys = ['mutant_ID', 'mutated_genes', 'start_date']
valid_plate_indices = []
for (plate, start_date), group in phase2_30_20h_HL.groupby(['plate', 'start_date']):
    group_keys = group[wt_sort_keys]
    is_wt = group_keys['mutant_ID'] == 'WT'
    wt_index = group_keys[is_wt].sort_values(wt_sort_keys).index
    non_wt_index = group_keys[~is_wt].sort_values(non_wt_sort_keys).index
    valid_plate_indices.append(wt_index.append(non_wt_index).values)

# Group sizes are timepoint-independent; a mismatch or an empty group makes
# every timepoint unusable.
lengths = [len(index) for index in valid_plate_indices]
lengths_usable = len(set(lengths)) == 1 and 0 not in lengths

# Loop over each y2 column (timepoint)
for timepoint in y2_cols:
    # Skip timepoint if mismatch or empty
    if not lengths_usable:
        print(f"⚠️ Skipping {timepoint} due to mismatch or empty data: lengths = {lengths}")
        continue

    # Quantile normalize
    raw_column = phase2_30_20h_HL[timepoint]
    matrix = np.column_stack([raw_column.loc[index].values for index in valid_plate_indices])
    normalized_matrix = normalize_quantiles(matrix, ties=True)

    # Write back
    for col_idx, index in enumerate(valid_plate_indices):
        phase2_30_20h_HL_normalized.loc[index, timepoint] = normalized_matrix[:, col_idx]

# Optional preview
phase2_30_20h_HL_normalized[['plate', 'start_date', 'mutant_ID', 'mutated_genes', 'well_id'] + y2_cols]

Unnamed: 0,plate,start_date,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,...,y2_35,y2_36,y2_37,y2_38,y2_39,y2_40,y2_41,y2_42,y2_43,y2_44
6511,30v3,2024-06-26,LMJ.RY0402.047723,"Cre03.g183150 & Cre03.g183200,Cre03.g183150,Cr...",A02,0.295548,0.257968,0.320485,0.301384,0.240298,...,0.132482,0.164627,0.160285,0.187703,0.181534,0.171787,0.142228,0.170101,0.161757,0.125960
6512,30v3,2024-06-26,LMJ.RY0402.064780,Cre03.g192450,A03,0.194330,0.184814,0.109381,0.215168,0.167976,...,0.103549,0.134823,0.115659,0.183872,0.164975,0.135408,0.118771,0.121452,0.082812,0.118588
6513,30v3,2024-06-26,LMJ.RY0402.193315,Cre01.g025400,A04,0.189035,0.190679,0.195262,0.233932,0.171271,...,0.136703,0.136705,0.120952,0.130105,0.108168,0.135144,0.125014,0.122069,0.147503,0.150658
6514,30v3,2024-06-26,LMJ.RY0402.110865,Cre01.g039550,A05,0.181366,0.166998,0.139838,0.136055,0.175094,...,0.027029,-0.035618,-0.027036,-0.025182,0.024088,0.008741,-0.003124,0.002466,0.011074,0.004697
6515,30v3,2024-06-26,LMJ.RY0402.074645,Cre01.g001800,A06,0.168071,0.107102,0.147199,0.116262,0.124575,...,0.066795,0.094091,0.127764,0.090485,0.118583,0.096524,0.052049,0.106440,0.070797,0.109159
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34082,30v2,2024-06-18,LMJ.RY0402.228465,Cre09.g398200,P20,0.252806,0.225432,0.140007,0.210454,0.184944,...,0.156308,0.179087,0.148834,0.111914,0.125330,0.129880,0.106504,0.185090,0.146281,0.149609
34083,30v2,2024-06-18,LMJ.RY0402.193684,Cre09.g387400,P21,0.228722,0.170192,0.204711,0.157169,0.207882,...,0.100523,0.110715,0.110510,0.084994,0.103060,0.068911,0.046408,0.051109,0.089803,0.105300
34084,30v2,2024-06-18,LMJ.RY0402.147464,Cre04.g218500,P22,0.216961,0.184309,0.229365,0.123812,0.070690,...,0.143573,0.103326,0.149071,0.137506,0.176319,0.068368,0.144751,0.130519,0.134983,0.193495
34085,30v2,2024-06-18,LMJ.RY0402.232801,Cre08.g385650,P23,0.109067,0.171941,0.215154,0.243162,0.140779,...,0.125781,0.070009,0.108660,0.018767,0.117300,0.171685,0.083300,0.075319,0.107913,0.118675


In [11]:
# Un-normalized 20h_HL rows of the plate-30 replicates, for comparison.
# Relies on `y2_cols` already being defined in the kernel.
plates = ['30v1', '30v2', '30v3']
phase2_30_20h_HL = phase2_df1.loc[
    phase2_df1['light_regime'].eq('20h_HL') & phase2_df1['plate'].isin(plates)
].copy()
phase2_30_20h_HL[['plate', 'mutant_ID', 'mutated_genes', 'well_id', *y2_cols]]

Unnamed: 0,plate,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,y2_6,...,y2_35,y2_36,y2_37,y2_38,y2_39,y2_40,y2_41,y2_42,y2_43,y2_44
20299,30v3,LMJ.RY0402.047723,"Cre03.g183150 & Cre03.g183200,Cre03.g183150,Cr...",A02,0.300009,0.254210,0.319156,0.291770,0.226091,0.290621,...,0.097309,0.122476,0.111993,0.143006,0.141476,0.131043,0.103595,0.134687,0.121471,0.087130
20300,30v3,LMJ.RY0402.064780,Cre03.g192450,A03,0.196110,0.186600,0.095905,0.190578,0.153390,0.161948,...,0.067129,0.093507,0.070874,0.138110,0.127992,0.094304,0.080477,0.083554,0.043804,0.081004
20301,30v3,LMJ.RY0402.193315,Cre01.g025400,A04,0.191026,0.192399,0.180462,0.212443,0.155750,0.166929,...,0.099985,0.095761,0.075837,0.085135,0.073505,0.093926,0.085824,0.083784,0.106359,0.112459
20302,30v3,LMJ.RY0402.110865,Cre01.g039550,A05,0.182405,0.168592,0.129502,0.114303,0.160754,0.109891,...,-0.022661,-0.097819,-0.064125,-0.068158,-0.031970,-0.036996,-0.046929,-0.033266,-0.041887,-0.017343
20303,30v3,LMJ.RY0402.074645,Cre01.g001800,A06,0.169757,0.106682,0.136710,0.092987,0.111253,0.120809,...,0.026276,0.051640,0.079967,0.047842,0.083004,0.056502,0.018033,0.067847,0.032576,0.072037
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104314,30v2,LMJ.RY0402.228465,Cre09.g398200,P20,0.265454,0.250580,0.163968,0.234110,0.211761,0.299394,...,0.176257,0.190908,0.172653,0.131489,0.138055,0.141860,0.120758,0.196419,0.168623,0.171068
104315,30v2,LMJ.RY0402.193684,Cre09.g387400,P21,0.240877,0.186823,0.234656,0.177506,0.236541,0.193998,...,0.118118,0.123830,0.132518,0.106696,0.117284,0.081512,0.058062,0.057121,0.110994,0.121999
104316,30v2,LMJ.RY0402.147464,Cre04.g218500,P22,0.230947,0.203313,0.261677,0.139226,0.099217,0.183440,...,0.160740,0.117114,0.172770,0.162878,0.189833,0.081109,0.159113,0.143299,0.158922,0.212536
104317,30v2,LMJ.RY0402.232801,Cre08.g385650,P23,0.110070,0.189555,0.245011,0.267707,0.163794,0.140098,...,0.142283,0.077475,0.128622,0.055200,0.130188,0.185548,0.100173,0.086116,0.130023,0.136890


In [90]:
# All 20h_HL rows recorded for plate 30v1
phase2_30_20h_HL.loc[phase2_30_20h_HL['plate'].eq('30v1')]

Unnamed: 0,plate,measurement,start_date,light_regime,dark_threshold,light_threshold,num_frames,i,j,fv_fm,...,measurement_time_173,measurement_time_174,measurement_time_175,measurement_time_176,measurement_time_177,well_id,mutant_ID,feature,mutated_genes,num_mutations
7660,30v1,M4,2024-06-11,20h_HL,15.467099,20.377613,92,0,1,0.612191,...,,,,,,A02,LMJ.RY0402.052440,intron,Cre01.g028650,1.0
7661,30v1,M4,2024-06-11,20h_HL,15.467099,20.377613,92,0,2,,...,,,,,,A03,LMJ.RY0402.055420,CDS/intron,Cre01.g045150,1.0
7662,30v1,M4,2024-06-11,20h_HL,15.467099,20.377613,92,0,3,0.548124,...,,,,,,A04,LMJ.RY0402.047311,intergenic,"Cre08.g375800,Cre01.g049900,Cre01.g049900 & Cr...",3.0
7663,30v1,M4,2024-06-11,20h_HL,15.467099,20.377613,92,0,4,0.693065,...,,,,,,A05,LMJ.RY0402.054897,intron,Cre02.g095115,1.0
7664,30v1,M4,2024-06-11,20h_HL,15.467099,20.377613,92,0,5,0.687707,...,,,,,,A06,LMJ.RY0402.054597,intron,Cre03.g153400,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16464,30v1,M2,2024-06-09,20h_HL,20.743397,25.152641,92,15,19,0.645732,...,,,,,,P20,LMJ.RY0402.101554,intron,Cre08.g374450,1.0
16465,30v1,M2,2024-06-09,20h_HL,20.743397,25.152641,92,15,20,0.581821,...,,,,,,P21,LMJ.RY0402.105853,intron,Cre08.g382620,1.0
16466,30v1,M2,2024-06-09,20h_HL,20.743397,25.152641,92,15,21,0.649203,...,,,,,,P22,LMJ.RY0402.144998,intron,Cre08.g382620,1.0
16467,30v1,M2,2024-06-09,20h_HL,20.743397,25.152641,92,15,22,0.651053,...,,,,,,P23,LMJ.RY0402.042144,CDS,Cre09.g387400,1.0


## 30 plate 2h-2h

In [54]:
# Plate-30 replicates and the y2 columns (y2_1 .. y2_48) to normalize
plates = ['30v1', '30v2', '30v3']
y2_cols = ['y2_' + str(i) for i in range(1, 49)]

# Quantile-normalize the 2h-2h rows across the three plates
phase2_30_2h_2h_normalized = quantile_normalize_light_regime(
    phase2_df1, '2h-2h', plates, y2_cols
)

# Preview identifiers together with the first ten normalized columns
phase2_30_2h_2h_normalized[['plate', 'mutant_ID', 'mutated_genes', 'well_id', *y2_cols[:10]]]

Unnamed: 0,plate,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,y2_6,y2_7,y2_8,y2_9,y2_10
383,30v1,LMJ.RY0402.052440,Cre01.g028650,A02,0.153134,0.125631,0.179038,0.127128,0.192583,0.217352,0.141008,0.220985,0.523430,0.569918
384,30v1,LMJ.RY0402.055420,Cre01.g045150,A03,,,,,,,,,,
385,30v1,LMJ.RY0402.047311,"Cre08.g375800,Cre01.g049900,Cre01.g049900 & Cr...",A04,0.172781,0.187679,0.186683,0.171537,0.119117,0.102889,0.188624,0.125848,0.534260,0.523752
386,30v1,LMJ.RY0402.054897,Cre02.g095115,A05,0.270873,0.242891,0.261639,0.232936,0.235228,0.249579,0.270519,0.252196,0.627803,0.651289
387,30v1,LMJ.RY0402.054597,Cre03.g153400,A06,0.223737,0.209742,0.244030,0.167038,0.248156,0.211332,0.267948,0.197394,0.632807,0.670509
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28720,30v2,LMJ.RY0402.228465,Cre09.g398200,P20,0.172325,0.195285,0.242361,0.209777,0.278083,0.179763,0.227053,0.233198,0.534385,0.603975
28721,30v2,LMJ.RY0402.193684,Cre09.g387400,P21,0.130382,0.066188,0.110621,0.119814,0.149866,0.113688,0.147377,0.169033,0.466541,0.521912
28722,30v2,LMJ.RY0402.147464,Cre04.g218500,P22,0.161551,0.221729,0.214723,0.188050,0.231342,0.196601,0.247418,0.267044,0.590970,0.597039
28723,30v2,LMJ.RY0402.232801,Cre08.g385650,P23,0.142958,0.145327,0.163319,0.129046,0.149235,0.147139,0.167291,0.198629,0.535234,0.516281


In [15]:
# Un-normalized 2h-2h rows of the plate-30 replicates, for comparison.
# Relies on `plates` and `y2_cols` already being defined in the kernel.
phase2_30_2h_2h = phase2_df1.loc[
    phase2_df1['light_regime'].eq('2h-2h') & phase2_df1['plate'].isin(plates)
].copy()
phase2_30_2h_2h[['plate', 'mutant_ID', 'mutated_genes', 'well_id', *y2_cols[:10]]]

Unnamed: 0,plate,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,y2_6,y2_7,y2_8,y2_9,y2_10
1149,30v1,LMJ.RY0402.052440,Cre01.g028650,A02,0.149536,0.118418,0.172688,0.119780,0.190143,0.214951,0.130722,0.210320,0.509822,0.557434
1150,30v1,LMJ.RY0402.055420,Cre01.g045150,A03,,,,,,,,,,
1151,30v1,LMJ.RY0402.047311,"Cre08.g375800,Cre01.g049900,Cre01.g049900 & Cr...",A04,0.166283,0.186892,0.181170,0.168928,0.106746,0.089804,0.179855,0.119110,0.520025,0.510441
1152,30v1,LMJ.RY0402.054897,Cre02.g095115,A05,0.254300,0.241386,0.253756,0.234268,0.232971,0.250657,0.262042,0.242339,0.614033,0.637819
1153,30v1,LMJ.RY0402.054597,Cre03.g153400,A06,0.221033,0.204123,0.240292,0.164685,0.249126,0.208331,0.258897,0.189821,0.619235,0.662281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86840,30v2,LMJ.RY0402.228465,Cre09.g398200,P20,0.178483,0.203732,0.244510,0.215436,0.264020,0.171053,0.226065,0.228592,0.540857,0.605745
86841,30v2,LMJ.RY0402.193684,Cre09.g387400,P21,0.137755,0.080398,0.113951,0.130149,0.146465,0.108329,0.151975,0.168080,0.481357,0.528750
86842,30v2,LMJ.RY0402.147464,Cre04.g218500,P22,0.167121,0.228295,0.217918,0.195899,0.222489,0.189124,0.246569,0.262519,0.595913,0.600962
86843,30v2,LMJ.RY0402.232801,Cre08.g385650,P23,0.148319,0.155801,0.169047,0.136979,0.145749,0.143564,0.171515,0.194463,0.542462,0.525771


## 30 plate 1min-1min

In [55]:
plates = ['30v1', '30v2', '30v3']
y2_cols = [f'y2_{i}' for i in range(1, 89)]

# Filter the data
phase2_30_1min_1min = phase2_df1[
    (phase2_df1['light_regime'] == '1min-1min') &
    (phase2_df1['plate'].isin(plates))
].copy()

# Copy to write normalized data
phase2_30_1min_1min_normalized = phase2_30_1min_1min.copy()

# Each (plate, start_date) group supplies one column of the normalization
# matrix. The row order of a group (WT rows first, then the rest) is the same
# for every timepoint, so it is resolved once here instead of once per y2 column.
wt_sort_keys = ['mutant_ID', 'mutated_genes', 'well_id', 'start_date']
non_wt_sort_keys = ['mutant_ID', 'mutated_genes', 'start_date']
valid_plate_indices = []
for (plate, start_date), group in phase2_30_1min_1min.groupby(['plate', 'start_date']):
    group_keys = group[wt_sort_keys]
    is_wt = group_keys['mutant_ID'] == 'WT'
    wt_index = group_keys[is_wt].sort_values(wt_sort_keys).index
    non_wt_index = group_keys[~is_wt].sort_values(non_wt_sort_keys).index
    valid_plate_indices.append(wt_index.append(non_wt_index).values)

# Group sizes are timepoint-independent; a mismatch or an empty group makes
# every timepoint unusable.
lengths = [len(index) for index in valid_plate_indices]
lengths_usable = len(set(lengths)) == 1 and 0 not in lengths

# Loop over each y2 column (timepoint)
for timepoint in y2_cols:
    # Skip timepoint if mismatch or empty
    if not lengths_usable:
        print(f"⚠️ Skipping {timepoint} due to mismatch or empty data: lengths = {lengths}")
        continue

    # Quantile normalize
    raw_column = phase2_30_1min_1min[timepoint]
    matrix = np.column_stack([raw_column.loc[index].values for index in valid_plate_indices])
    normalized_matrix = normalize_quantiles(matrix, ties=True)

    # Write back
    for col_idx, index in enumerate(valid_plate_indices):
        phase2_30_1min_1min_normalized.loc[index, timepoint] = normalized_matrix[:, col_idx]

# Optional preview
phase2_30_1min_1min_normalized[['plate', 'start_date', 'mutant_ID', 'mutated_genes', 'well_id'] + y2_cols]

Unnamed: 0,plate,start_date,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,...,y2_79,y2_80,y2_81,y2_82,y2_83,y2_84,y2_85,y2_86,y2_87,y2_88
0,30v1,2024-06-10,LMJ.RY0402.052440,Cre01.g028650,A02,0.170995,0.541553,0.175519,0.521771,0.153619,...,0.164825,0.463091,0.190968,0.502740,0.150794,0.509139,0.165931,0.507277,0.208195,0.495624
1,30v1,2024-06-10,LMJ.RY0402.055420,Cre01.g045150,A03,,,,,,...,,,,,,,,,,
2,30v1,2024-06-10,LMJ.RY0402.047311,"Cre08.g375800,Cre01.g049900,Cre01.g049900 & Cr...",A04,0.094046,0.499596,0.129022,0.479129,0.125084,...,0.136848,0.480398,0.135347,0.459845,0.069767,0.467347,0.072266,0.473910,0.143341,0.456176
3,30v1,2024-06-10,LMJ.RY0402.054897,Cre02.g095115,A05,0.307551,0.637087,0.287824,0.598286,0.298902,...,0.244681,0.563910,0.244618,0.598894,0.238170,0.581094,0.246664,0.589086,0.238006,0.572135
4,30v1,2024-06-10,LMJ.RY0402.054597,Cre03.g153400,A06,0.291403,0.619408,0.251669,0.576694,0.256319,...,0.187906,0.528458,0.175647,0.541533,0.191274,0.534114,0.203178,0.536270,0.212234,0.533388
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30635,30v3,2024-06-25,LMJ.RY0402.240458,"Cre13.g588405 & Cre13.g588453,Cre02.g106650",P20,0.231092,0.573771,0.239890,0.518722,0.223960,...,0.209133,0.509333,0.208341,0.508523,0.178104,0.543036,0.183633,0.519819,0.179880,0.544117
30636,30v3,2024-06-25,LMJ.RY0402.076841,Cre04.g216050,P21,0.314714,0.632705,0.268413,0.601286,0.259490,...,0.211216,0.557839,0.167119,0.552078,0.188021,0.530378,0.184149,0.528375,0.152021,0.505952
30637,30v3,2024-06-25,LMJ.RY0402.039259,Cre01.g023050,P22,0.186815,0.515658,0.162955,0.506892,0.155099,...,0.162930,0.504355,0.160769,0.488578,0.166161,0.484440,0.092957,0.441681,0.116268,0.505252
30638,30v3,2024-06-25,LMJ.RY0402.229006,"Cre13.g583250,Cre07.g341900",P23,0.171547,0.584237,0.178309,0.525358,0.099984,...,0.152324,0.480321,0.150580,0.513564,0.101447,0.511939,0.143820,0.487725,0.110310,0.504022


In [17]:
# Un-normalized 1min-1min rows of the plate-30 replicates, shown with y2_1 .. y2_91.
# Relies on `plates` already being defined in the kernel.
y2_cols = ['y2_' + str(i) for i in range(1, 92)]
phase2_30_1min_1min = phase2_df1.loc[
    phase2_df1['light_regime'].eq('1min-1min') & phase2_df1['plate'].isin(plates)
].copy()
phase2_30_1min_1min[['plate', 'mutant_ID', 'mutated_genes', 'well_id', *y2_cols]]

Unnamed: 0,plate,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,y2_6,...,y2_82,y2_83,y2_84,y2_85,y2_86,y2_87,y2_88,y2_89,y2_90,y2_91
0,30v1,LMJ.RY0402.052440,Cre01.g028650,A02,0.178463,0.550118,0.169154,0.536918,0.157446,0.497102,...,0.514274,0.154958,0.519095,0.165955,0.514616,0.209323,0.503449,,,
1,30v1,LMJ.RY0402.055420,Cre01.g045150,A03,,,,,,,...,,,,,,,,,,
2,30v1,LMJ.RY0402.047311,"Cre08.g375800,Cre01.g049900,Cre01.g049900 & Cr...",A04,0.085688,0.502814,0.120873,0.490352,0.126170,0.462083,...,0.467418,0.073874,0.476327,0.066746,0.476632,0.144827,0.458694,,,
3,30v1,LMJ.RY0402.054897,Cre02.g095115,A05,0.315781,0.649059,0.289019,0.612956,0.306358,0.605436,...,0.608368,0.243948,0.593104,0.247075,0.594532,0.237937,0.580276,,,
4,30v1,LMJ.RY0402.054597,Cre03.g153400,A06,0.296205,0.630581,0.248592,0.589196,0.258415,0.582097,...,0.550243,0.196497,0.543061,0.203080,0.542638,0.212207,0.543212,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94787,30v3,LMJ.RY0402.240458,"Cre13.g588405 & Cre13.g588453,Cre02.g106650",P20,0.213346,0.566172,0.229570,0.516654,0.208745,0.520764,...,0.507074,0.157094,0.539906,0.165638,0.519407,0.162186,0.546528,,,
94788,30v3,LMJ.RY0402.076841,Cre04.g216050,P21,0.302912,0.630209,0.260860,0.603774,0.253227,0.599196,...,0.549287,0.172317,0.526123,0.165680,0.528998,0.132935,0.510404,,,
94789,30v3,LMJ.RY0402.039259,Cre01.g023050,P22,0.163494,0.506909,0.154467,0.505000,0.137106,0.510749,...,0.487017,0.146014,0.480897,0.088540,0.441224,0.102654,0.509137,,,
94790,30v3,LMJ.RY0402.229006,"Cre13.g583250,Cre07.g341900",P23,0.147403,0.578180,0.169607,0.523358,0.079297,0.476844,...,0.511670,0.082184,0.507070,0.125521,0.488418,0.099019,0.507466,,,


In [89]:
# All 1min-1min rows recorded for plate 30v1
phase2_30_1min_1min.loc[phase2_30_1min_1min['plate'].eq('30v1')]

Unnamed: 0,plate,measurement,start_date,light_regime,dark_threshold,light_threshold,num_frames,i,j,fv_fm,...,measurement_time_173,measurement_time_174,measurement_time_175,measurement_time_176,measurement_time_177,well_id,mutant_ID,feature,mutated_genes,num_mutations
0,30v1,M3,2024-06-10,1min-1min,15.956943,22.289219,180,0,1,0.628277,...,,,,,,A02,LMJ.RY0402.052440,intron,Cre01.g028650,1.0
1,30v1,M3,2024-06-10,1min-1min,15.956943,22.289219,180,0,2,,...,,,,,,A03,LMJ.RY0402.055420,CDS/intron,Cre01.g045150,1.0
2,30v1,M3,2024-06-10,1min-1min,15.956943,22.289219,180,0,3,0.555580,...,,,,,,A04,LMJ.RY0402.047311,intergenic,"Cre08.g375800,Cre01.g049900,Cre01.g049900 & Cr...",3.0
3,30v1,M3,2024-06-10,1min-1min,15.956943,22.289219,180,0,4,0.707976,...,,,,,,A05,LMJ.RY0402.054897,intron,Cre02.g095115,1.0
4,30v1,M3,2024-06-10,1min-1min,15.956943,22.289219,180,0,5,0.704442,...,,,,,,A06,LMJ.RY0402.054597,intron,Cre03.g153400,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27571,30v1,M1,2024-06-08,1min-1min,18.779654,25.444115,180,15,19,0.644086,...,,,,,,P20,LMJ.RY0402.101554,intron,Cre08.g374450,1.0
27572,30v1,M1,2024-06-08,1min-1min,18.779654,25.444115,180,15,20,0.584386,...,,,,,,P21,LMJ.RY0402.105853,intron,Cre08.g382620,1.0
27573,30v1,M1,2024-06-08,1min-1min,18.779654,25.444115,180,15,21,0.661434,...,,,,,,P22,LMJ.RY0402.144998,intron,Cre08.g382620,1.0
27574,30v1,M1,2024-06-08,1min-1min,18.779654,25.444115,180,15,22,0.679345,...,,,,,,P23,LMJ.RY0402.042144,CDS,Cre09.g387400,1.0


## 30 plate 30s-30s

In [56]:
# Plate-30 replicates and the y2 columns (y2_1 .. y2_88) to normalize
plates = ['30v1', '30v2', '30v3']
y2_cols = ['y2_' + str(i) for i in range(1, 89)]

# Quantile-normalize the 30s-30s rows across the three plates
phase2_30_30s_30s_normalized = quantile_normalize_light_regime(
    phase2_df1, '30s-30s', plates, y2_cols
)

# Preview identifiers together with the normalized columns
phase2_30_30s_30s_normalized[['plate', 'mutant_ID', 'mutated_genes', 'well_id', *y2_cols]]

Unnamed: 0,plate,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,y2_6,...,y2_79,y2_80,y2_81,y2_82,y2_83,y2_84,y2_85,y2_86,y2_87,y2_88
4979,30v3,LMJ.RY0402.047723,"Cre03.g183150 & Cre03.g183200,Cre03.g183150,Cr...",A02,0.416491,0.654779,0.394700,0.620050,0.372290,0.629198,...,0.349863,0.606316,0.341759,0.624399,0.330567,0.615882,0.331206,0.603058,0.335140,0.599969
4980,30v3,LMJ.RY0402.064780,Cre03.g192450,A03,0.169608,0.482472,0.120251,0.466689,0.192762,0.443359,...,0.192610,0.448432,0.115402,0.446781,0.158050,0.478455,0.129405,0.402962,0.110777,0.442793
4981,30v3,LMJ.RY0402.193315,Cre01.g025400,A04,0.216168,0.517010,0.244197,0.501330,0.205733,0.471114,...,0.215390,0.468651,0.166238,0.471810,0.222013,0.489254,0.227226,0.464397,0.194506,0.503556
4982,30v3,LMJ.RY0402.110865,Cre01.g039550,A05,0.195544,0.506539,0.087807,0.439734,0.167217,0.414087,...,0.100805,0.375366,0.095230,0.438258,0.125629,0.396679,0.195285,0.303771,0.093810,0.301022
4983,30v3,LMJ.RY0402.074645,Cre01.g001800,A06,0.180508,0.490304,0.208157,0.471744,0.202619,0.500271,...,0.170554,0.439956,0.163567,0.459428,0.167186,0.443945,0.143889,0.446441,0.149244,0.437585
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35614,30v2,LMJ.RY0402.228465,Cre09.g398200,P20,0.225952,0.533890,0.210250,0.524605,0.210005,0.500293,...,0.204687,0.503976,0.217607,0.482031,0.222405,0.501824,0.170806,0.513977,0.195133,0.511573
35615,30v2,LMJ.RY0402.193684,Cre09.g387400,P21,0.167337,0.469671,0.103365,0.454321,0.137871,0.442306,...,0.115856,0.480289,0.119928,0.426716,0.146083,0.406587,0.097185,0.452620,0.089637,0.472719
35616,30v2,LMJ.RY0402.147464,Cre04.g218500,P22,0.224340,0.510764,0.214947,0.530102,0.228504,0.502073,...,0.169347,0.527369,0.186961,0.529567,0.180182,0.468873,0.214492,0.470738,0.203660,0.495052
35617,30v2,LMJ.RY0402.232801,Cre08.g385650,P23,0.111666,0.447366,0.141203,0.421017,0.133389,0.461206,...,0.123070,0.454384,0.139295,0.447663,0.122077,0.417074,0.103702,0.447744,0.130159,0.413440


In [19]:
# Un-normalized 30s-30s rows of the plate-30 replicates, for comparison.
# Relies on `plates` already being defined in the kernel.
y2_cols = ['y2_' + str(i) for i in range(1, 89)]
phase2_30_30s_30s = phase2_df1.loc[
    phase2_df1['light_regime'].eq('30s-30s') & phase2_df1['plate'].isin(plates)
].copy()
phase2_30_30s_30s[['plate', 'mutant_ID', 'mutated_genes', 'well_id', *y2_cols]]

Unnamed: 0,plate,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,y2_6,...,y2_79,y2_80,y2_81,y2_82,y2_83,y2_84,y2_85,y2_86,y2_87,y2_88
17618,30v3,LMJ.RY0402.047723,"Cre03.g183150 & Cre03.g183200,Cre03.g183150,Cr...",A02,0.420936,0.655267,0.390790,0.621872,0.378465,0.639171,...,0.346719,0.606152,0.338125,0.631475,0.327414,0.612326,0.335618,0.606576,0.331829,0.604739
17619,30v3,LMJ.RY0402.064780,Cre03.g192450,A03,0.165749,0.479098,0.121231,0.468119,0.195359,0.445142,...,0.178492,0.450141,0.121185,0.450676,0.150316,0.474586,0.130775,0.406174,0.108494,0.438719
17620,30v3,LMJ.RY0402.193315,Cre01.g025400,A04,0.213003,0.509001,0.249679,0.504075,0.207678,0.473582,...,0.202284,0.468827,0.158824,0.478704,0.214800,0.483906,0.230056,0.461946,0.188345,0.503438
17621,30v3,LMJ.RY0402.110865,Cre01.g039550,A05,0.192051,0.497267,0.095375,0.445238,0.168908,0.408339,...,0.080697,0.365304,0.093557,0.444062,0.117235,0.388450,0.195765,0.302987,0.089800,0.312022
17622,30v3,LMJ.RY0402.074645,Cre01.g001800,A06,0.176638,0.485322,0.213517,0.474869,0.205212,0.500576,...,0.154120,0.440459,0.157786,0.465356,0.158112,0.437007,0.148491,0.444521,0.141617,0.433360
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108527,30v2,LMJ.RY0402.228465,Cre09.g398200,P20,0.230111,0.542900,0.210873,0.527458,0.210943,0.500793,...,0.217784,0.510020,0.222708,0.482240,0.231461,0.511648,0.177852,0.517925,0.201273,0.513579
108528,30v2,LMJ.RY0402.193684,Cre09.g387400,P21,0.171363,0.480203,0.098401,0.461616,0.142675,0.442858,...,0.125780,0.485039,0.119794,0.423354,0.157176,0.418609,0.079760,0.459043,0.090184,0.474501
108529,30v2,LMJ.RY0402.147464,Cre04.g218500,P22,0.227096,0.523295,0.217629,0.533985,0.231998,0.502400,...,0.186435,0.528623,0.190963,0.528804,0.192836,0.477209,0.219786,0.477641,0.210296,0.498598
108530,30v2,LMJ.RY0402.232801,Cre08.g385650,P23,0.107310,0.458658,0.140347,0.422454,0.139969,0.461889,...,0.132857,0.461449,0.144200,0.449290,0.129569,0.430041,0.096104,0.454487,0.137321,0.413907


### plate 30 5min-5min

In [57]:
plates = ['30v1', '30v2', '30v3']
y2_cols = [f'y2_{i}' for i in range(1, 89)]

# Rows of the selected plates measured under the 5min-5min light regime.
phase2_30_5min_5min = phase2_df1[
    (phase2_df1['light_regime'] == '5min-5min') &
    (phase2_df1['plate'].isin(plates))
].copy()

# Normalized values are written into a separate copy so the raw frame above
# stays unchanged.
phase2_30_5min_5min_normalized = phase2_30_5min_5min.copy()

# Each (plate, start_date) group is treated as one technical replicate, i.e.
# one column of the matrix handed to normalize_quantiles. The grouping and the
# row order inside each group do not depend on the timepoint, so they are
# built once here instead of once per y2 column.
replicate_frames = []
for _, group in phase2_30_5min_5min.groupby(['plate', 'start_date']):
    # WT rows first, then every other row; each part sorted for reproducibility.
    wt_rows = group[group['mutant_ID'] == 'WT'].sort_values(
        ['mutant_ID', 'mutated_genes', 'well_id', 'start_date']
    )
    non_wt_rows = group[group['mutant_ID'] != 'WT'].sort_values(
        ['mutant_ID', 'mutated_genes', 'start_date']
    )
    replicate_frames.append(pd.concat([wt_rows, non_wt_rows], axis=0))

# Index labels of every replicate, in the same row order as its matrix column.
valid_plate_indices = [frame.index.values for frame in replicate_frames]

# Columns can only be stacked when every replicate has the same, non-zero
# number of rows. The check is identical for every timepoint.
lengths = [len(index) for index in valid_plate_indices]
replicates_aligned = len(set(lengths)) == 1 and 0 not in lengths

# Loop over each y2 column (timepoint)
for timepoint in y2_cols:
    # Skip timepoint if mismatch or empty; the raw values then remain in the
    # normalized copy for this column.
    if not replicates_aligned:
        print(f"⚠️ Skipping {timepoint} due to mismatch or empty data: lengths = {lengths}")
        continue

    # Quantile normalize: one matrix column per (plate, start_date) replicate.
    matrix = np.column_stack([frame[timepoint].values for frame in replicate_frames])
    normalized_matrix = normalize_quantiles(matrix, ties=True)

    # Write back each column to the rows it came from.
    for col_idx, index in enumerate(valid_plate_indices):
        phase2_30_5min_5min_normalized.loc[index, timepoint] = normalized_matrix[:, col_idx]

# Optional preview
phase2_30_5min_5min_normalized[['plate', 'start_date', 'mutant_ID', 'mutated_genes', 'well_id'] + y2_cols]

Unnamed: 0,plate,start_date,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,...,y2_79,y2_80,y2_81,y2_82,y2_83,y2_84,y2_85,y2_86,y2_87,y2_88
4596,30v2,2024-06-24,LMJ.RY0402.051134,Cre01.g017200,A02,,,,,,...,,,,,,,,,,
4597,30v2,2024-06-24,LMJ.RY0402.189784,Cre01.g001800,A03,0.206795,0.632784,0.211117,0.618747,0.201653,...,0.184500,0.592875,0.161459,0.585253,0.201263,0.585394,0.183005,0.594642,0.161717,0.589206
4598,30v2,2024-06-24,LMJ.RY0402.229006,"Cre13.g583250,Cre07.g341900",A04,0.136062,0.542970,0.102491,0.569401,0.143227,...,0.134305,0.527161,0.106718,0.539800,0.119346,0.498636,0.107175,0.521061,0.101404,0.532594
4599,30v2,2024-06-24,LMJ.RY0402.164384,Cre09.g392500,A05,0.207903,0.606450,0.160406,0.622530,0.189462,...,0.119143,0.580501,0.133405,0.569298,0.175294,0.592682,0.147571,0.587108,0.133349,0.580791
4600,30v2,2024-06-24,LMJ.RY0402.163529,Cre01.g025400,A06,0.169020,0.563917,0.153565,0.557442,0.149198,...,0.167449,0.547931,0.128900,0.534819,0.126159,0.538456,0.118567,0.536836,0.126109,0.549749
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26805,30v3,2024-07-02,LMJ.RY0402.240458,"Cre13.g588405 & Cre13.g588453,Cre02.g106650",P20,0.203722,0.625042,0.178390,0.599685,0.225939,...,0.152790,0.567970,0.148647,0.557882,0.124674,0.568633,0.199854,0.582393,0.182025,0.588700
26806,30v3,2024-07-02,LMJ.RY0402.076841,Cre04.g216050,P21,0.312553,0.679588,0.227444,0.628834,0.230852,...,0.163039,0.598488,0.186196,0.568261,0.188345,0.599567,0.211385,0.605112,0.189673,0.613761
26807,30v3,2024-07-02,LMJ.RY0402.039259,Cre01.g023050,P22,0.200187,0.566901,0.146975,0.591551,0.157081,...,0.084053,0.539432,0.121804,0.531660,0.176020,0.498495,0.166459,0.558547,0.121233,0.519235
26808,30v3,2024-07-02,LMJ.RY0402.229006,"Cre13.g583250,Cre07.g341900",P23,0.165972,0.600435,0.126747,0.557458,0.187052,...,0.160167,0.540211,0.149738,0.515903,0.148721,0.548869,0.115800,0.527552,0.116835,0.508092


In [21]:
# Plate-30 replicates and measurement columns y2_1 ... y2_89 for the raw preview.
plates = ['30v1', '30v2', '30v3']
y2_cols = ['y2_' + str(i) for i in range(1, 90)]

# Raw (un-normalized) rows for the 5min-5min light regime on those plates.
phase2_30_5min_5min = phase2_df1.loc[
    lambda frame: frame['light_regime'].eq('5min-5min') & frame['plate'].isin(plates)
].copy()

# Preview: identifying columns followed by the y2 columns.
phase2_30_5min_5min[['plate', 'mutant_ID', 'mutated_genes', 'well_id', *y2_cols]]

Unnamed: 0,plate,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,y2_6,...,y2_80,y2_81,y2_82,y2_83,y2_84,y2_85,y2_86,y2_87,y2_88,y2_89
17235,30v2,LMJ.RY0402.051134,Cre01.g017200,A02,,,,,,,...,,,,,,,,,,
17236,30v2,LMJ.RY0402.189784,Cre01.g001800,A03,0.219473,0.628291,0.218822,0.611663,0.204733,0.583965,...,0.586365,0.177225,0.576962,0.222916,0.576951,0.200629,0.591031,0.179134,0.585339,
17237,30v2,LMJ.RY0402.229006,"Cre13.g583250,Cre07.g341900",A04,0.130885,0.527061,0.092299,0.551516,0.140813,0.508574,...,0.505203,0.116432,0.522082,0.125390,0.472637,0.112021,0.497965,0.109269,0.514465,
17238,30v2,LMJ.RY0402.164384,Cre09.g392500,A05,0.220939,0.597080,0.164246,0.616587,0.191265,0.579720,...,0.569768,0.144817,0.558535,0.192215,0.585822,0.158917,0.580598,0.143732,0.573412,
17239,30v2,LMJ.RY0402.163529,Cre01.g025400,A06,0.174104,0.548586,0.154213,0.538536,0.146623,0.509359,...,0.528984,0.140201,0.517013,0.133615,0.520793,0.125136,0.517957,0.138121,0.534496,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78031,30v3,LMJ.RY0402.240458,"Cre13.g588405 & Cre13.g588453,Cre02.g106650",P20,0.211026,0.623484,0.176846,0.596590,0.240249,0.586657,...,0.569617,0.174096,0.556962,0.147512,0.569460,0.231456,0.585813,0.214179,0.595698,
78032,30v3,LMJ.RY0402.076841,Cre04.g216050,P21,0.350024,0.694314,0.233283,0.632761,0.245761,0.594828,...,0.606984,0.215115,0.569717,0.216061,0.608424,0.249463,0.615915,0.222838,0.626783,
78033,30v3,LMJ.RY0402.039259,Cre01.g023050,P22,0.206932,0.551717,0.140325,0.586087,0.155777,0.512961,...,0.536210,0.142799,0.527856,0.203202,0.491374,0.196445,0.558228,0.146439,0.513352,
78034,30v3,LMJ.RY0402.229006,"Cre13.g583250,Cre07.g341900",P23,0.166386,0.592675,0.119699,0.544469,0.189972,0.559001,...,0.536428,0.175485,0.509919,0.175819,0.546593,0.138075,0.521628,0.142347,0.503064,


In [88]:
# Show only the 5min-5min rows that belong to plate 30v3.
phase2_30_5min_5min.loc[phase2_30_5min_5min['plate'].eq('30v3')]

Unnamed: 0,plate,measurement,start_date,light_regime,dark_threshold,light_threshold,num_frames,i,j,fv_fm,...,measurement_time_173,measurement_time_174,measurement_time_175,measurement_time_176,measurement_time_177,well_id,mutant_ID,feature,mutated_genes,num_mutations
5745,30v3,M8,2024-11-26,5min-5min,15.874453,21.956830,180,0,1,0.670419,...,,,,,,A02,LMJ.RY0402.047723,CDS,"Cre03.g183150 & Cre03.g183200,Cre03.g183150,Cr...",3.0
5746,30v3,M8,2024-11-26,5min-5min,15.874453,21.956830,180,0,2,0.654829,...,,,,,,A03,LMJ.RY0402.064780,CDS,Cre03.g192450,1.0
5747,30v3,M8,2024-11-26,5min-5min,15.874453,21.956830,180,0,3,0.651561,...,,,,,,A04,LMJ.RY0402.193315,CDS,Cre01.g025400,1.0
5748,30v3,M8,2024-11-26,5min-5min,15.874453,21.956830,180,0,4,0.660693,...,,,,,,A05,LMJ.RY0402.110865,CDS,Cre01.g039550,1.0
5749,30v3,M8,2024-11-26,5min-5min,15.874453,21.956830,180,0,5,0.562017,...,,,,,,A06,LMJ.RY0402.074645,CDS,Cre01.g001800,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26805,30v3,M8,2024-07-02,5min-5min,15.432362,21.392689,180,15,19,0.640500,...,,,,,,P20,LMJ.RY0402.240458,CDS,"Cre13.g588405 & Cre13.g588453,Cre02.g106650",2.0
26806,30v3,M8,2024-07-02,5min-5min,15.432362,21.392689,180,15,20,0.743787,...,,,,,,P21,LMJ.RY0402.076841,CDS,Cre04.g216050,1.0
26807,30v3,M8,2024-07-02,5min-5min,15.432362,21.392689,180,15,21,0.595715,...,,,,,,P22,LMJ.RY0402.039259,intron,Cre01.g023050,1.0
26808,30v3,M8,2024-07-02,5min-5min,15.432362,21.392689,180,15,22,0.590430,...,,,,,,P23,LMJ.RY0402.229006,MULTIPLE_SPLICE_VARIANTS,"Cre13.g583250,Cre07.g341900",2.0


### plate 30 1min-5min

In [69]:
plates = ['30v1', '30v2', '30v3']
y2_cols = [f'y2_{i}' for i in range(1, 89)]

# Rows of the selected plates measured under the 1min-5min light regime.
phase2_30_1min_5min = phase2_df1[
    (phase2_df1['light_regime'] == '1min-5min') &
    (phase2_df1['plate'].isin(plates))
].copy()

# Normalized values are written into a separate copy so the raw frame above
# stays unchanged.
phase2_30_1min_5min_normalized = phase2_30_1min_5min.copy()

# Each (plate, start_date) group is treated as one technical replicate, i.e.
# one column of the matrix handed to normalize_quantiles. The grouping and the
# row order inside each group do not depend on the timepoint, so they are
# built once here instead of once per y2 column.
replicate_frames = []
for _, group in phase2_30_1min_5min.groupby(['plate', 'start_date']):
    # WT rows first, then every other row; each part sorted for reproducibility.
    wt_rows = group[group['mutant_ID'] == 'WT'].sort_values(
        ['mutant_ID', 'mutated_genes', 'well_id', 'start_date']
    )
    non_wt_rows = group[group['mutant_ID'] != 'WT'].sort_values(
        ['mutant_ID', 'mutated_genes', 'start_date']
    )
    replicate_frames.append(pd.concat([wt_rows, non_wt_rows], axis=0))

# Index labels of every replicate, in the same row order as its matrix column.
valid_plate_indices = [frame.index.values for frame in replicate_frames]

# Columns can only be stacked when every replicate has the same, non-zero
# number of rows. The check is identical for every timepoint.
lengths = [len(index) for index in valid_plate_indices]
replicates_aligned = len(set(lengths)) == 1 and 0 not in lengths

# Loop over each y2 column (timepoint)
for timepoint in y2_cols:
    # Skip timepoint if mismatch or empty; the raw values then remain in the
    # normalized copy for this column.
    if not replicates_aligned:
        print(f"⚠️ Skipping {timepoint} due to mismatch or empty data: lengths = {lengths}")
        continue

    # Quantile normalize: one matrix column per (plate, start_date) replicate.
    matrix = np.column_stack([frame[timepoint].values for frame in replicate_frames])
    normalized_matrix = normalize_quantiles(matrix, ties=True)

    # Write back each column to the rows it came from.
    for col_idx, index in enumerate(valid_plate_indices):
        phase2_30_1min_5min_normalized.loc[index, timepoint] = normalized_matrix[:, col_idx]

# Optional preview
phase2_30_1min_5min_normalized[['plate', 'start_date', 'mutant_ID', 'mutated_genes', 'well_id'] + y2_cols]

Unnamed: 0,plate,start_date,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,...,y2_79,y2_80,y2_81,y2_82,y2_83,y2_84,y2_85,y2_86,y2_87,y2_88
3064,30v3,2024-11-25,LMJ.RY0402.047723,"Cre03.g183150 & Cre03.g183200,Cre03.g183150,Cr...",A02,0.164759,0.600722,0.156608,0.584367,0.152006,...,0.109737,0.649321,0.111968,0.610480,0.101826,0.640917,0.104486,0.647251,0.109788,0.613496
3065,30v3,2024-11-25,LMJ.RY0402.064780,Cre03.g192450,A03,0.211444,0.556980,0.103332,0.532145,0.154494,...,0.010619,0.567573,0.070483,0.554666,0.065674,0.591907,0.050860,0.560681,0.079933,0.544015
3066,30v3,2024-11-25,LMJ.RY0402.193315,Cre01.g025400,A04,0.121983,0.576616,0.118266,0.563803,0.079769,...,0.081069,0.577471,0.101928,0.591881,0.095825,0.616279,0.095473,0.609886,0.127237,0.587363
3067,30v3,2024-11-25,LMJ.RY0402.110865,Cre01.g039550,A05,0.022181,0.593765,0.142034,0.553442,0.355298,...,0.250012,0.560961,0.234302,0.665528,0.195970,0.579217,0.195678,0.516689,-0.006460,0.570388
3068,30v3,2024-11-25,LMJ.RY0402.074645,Cre01.g001800,A06,0.074474,0.506077,0.056489,0.514810,0.120026,...,0.055215,0.551218,0.065996,0.555470,0.075081,0.573212,0.058804,0.575835,0.094272,0.559254
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24507,30v3,2024-07-01,LMJ.RY0402.240458,"Cre13.g588405 & Cre13.g588453,Cre02.g106650",P20,0.211415,0.621136,0.190416,0.622247,0.197263,...,0.159466,0.609330,0.150705,0.585849,0.123681,0.619553,0.169960,0.619207,0.113297,0.610220
24508,30v3,2024-07-01,LMJ.RY0402.076841,Cre04.g216050,P21,0.285460,0.690189,0.297025,0.673218,0.259498,...,0.204971,0.662870,0.218228,0.660722,0.226754,0.647469,0.189990,0.671762,0.187954,0.651868
24509,30v3,2024-07-01,LMJ.RY0402.039259,Cre01.g023050,P22,0.194441,0.578380,0.134913,0.535807,0.137740,...,0.098091,0.540404,0.137054,0.580483,0.047235,0.560109,0.095875,0.582552,0.125358,0.563059
24510,30v3,2024-07-01,LMJ.RY0402.229006,"Cre13.g583250,Cre07.g341900",P23,0.137096,0.548837,0.132032,0.551157,0.138638,...,0.094305,0.568079,0.109716,0.558691,0.076748,0.553310,0.078569,0.578723,0.105073,0.569532


In [22]:
# Plate-30 replicates and measurement columns y2_1 ... y2_89 for the raw preview.
plates = ['30v1', '30v2', '30v3']
y2_cols = ['y2_' + str(i) for i in range(1, 90)]

# Raw (un-normalized) rows for the 1min-5min light regime on those plates.
phase2_30_1min_5min = phase2_df1.loc[
    lambda frame: frame['light_regime'].eq('1min-5min') & frame['plate'].isin(plates)
].copy()

# Preview: identifying columns followed by the y2 columns.
phase2_30_1min_5min[['plate', 'mutant_ID', 'mutated_genes', 'well_id', *y2_cols]]

Unnamed: 0,plate,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,y2_6,...,y2_80,y2_81,y2_82,y2_83,y2_84,y2_85,y2_86,y2_87,y2_88,y2_89
9192,30v3,LMJ.RY0402.047723,"Cre03.g183150 & Cre03.g183200,Cre03.g183150,Cr...",A02,0.134198,0.623612,0.128673,0.614192,0.128401,0.605740,...,0.649936,0.067378,0.628977,0.058125,0.644873,0.057920,0.646288,0.062913,0.633515,
9193,30v3,LMJ.RY0402.064780,Cre03.g192450,A03,0.162950,0.585423,0.090642,0.575251,0.130910,0.588775,...,0.609838,0.036939,0.595476,0.033474,0.620850,0.022955,0.597748,0.039964,0.588157,
9194,30v3,LMJ.RY0402.193315,Cre01.g025400,A04,0.096096,0.603724,0.100223,0.598859,0.073401,0.599621,...,0.617745,0.060711,0.618480,0.053481,0.632503,0.051265,0.630298,0.073337,0.619739,
9195,30v3,LMJ.RY0402.110865,Cre01.g039550,A05,0.017582,0.619679,0.117271,0.589252,0.234715,0.617238,...,0.605281,0.133751,0.656860,0.111983,0.613214,0.114168,0.557855,-0.005171,0.607637,
9196,30v3,LMJ.RY0402.074645,Cre01.g001800,A06,0.064374,0.540312,0.051770,0.553874,0.106032,0.568289,...,0.598267,0.033898,0.595769,0.039875,0.608773,0.025570,0.610210,0.051663,0.598796,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69988,30v3,LMJ.RY0402.240458,"Cre13.g588405 & Cre13.g588453,Cre02.g106650",P20,0.238887,0.620026,0.213520,0.619026,0.224229,0.596597,...,0.600591,0.186340,0.579579,0.149757,0.615516,0.206679,0.615873,0.144708,0.602650,
69989,30v3,LMJ.RY0402.076841,Cre04.g216050,P21,0.336369,0.706353,0.348162,0.682320,0.306916,0.681959,...,0.670874,0.262729,0.671410,0.273464,0.651510,0.230360,0.682602,0.234393,0.654341,
69990,30v3,LMJ.RY0402.039259,Cre01.g023050,P22,0.218165,0.570643,0.144805,0.517675,0.156838,0.541569,...,0.520785,0.172756,0.572236,0.062958,0.544277,0.118947,0.570871,0.159174,0.546271,
69991,30v3,LMJ.RY0402.229006,"Cre13.g583250,Cre07.g341900",P23,0.151661,0.536455,0.142453,0.536213,0.157875,0.527584,...,0.551027,0.137557,0.545530,0.094017,0.538343,0.101658,0.566913,0.135147,0.554233,


## plate 30 10min-10min

In [59]:
# Keep every row except the 10min-10min run of plate 30v3 that started on
# 2024-06-29.
phase2_df2 = phase2_df1.loc[
    lambda frame: ~(
        frame['light_regime'].eq('10min-10min')
        & frame['plate'].eq('30v3')
        & frame['start_date'].eq('2024-06-29')
    )
]


In [60]:
# Inputs for the 10min-10min regime: two plates and columns y2_1 ... y2_84.
plates = ['30v1', '30v3']
y2_cols = ['y2_' + str(i) for i in range(1, 85)]

# Normalize via the helper defined elsewhere in the notebook.
# NOTE(review): presumably the same per-timepoint quantile normalization as the
# inline loops used for the other regimes — confirm against its definition.
phase2_30_10min_10min_normalized = quantile_normalize_light_regime(
    df=phase2_df2, light_regime='10min-10min', plates=plates, y2_cols=y2_cols
)

# Preview: identifying columns followed by the y2 columns.
phase2_30_10min_10min_normalized[['plate', 'mutant_ID', 'mutated_genes', 'well_id', *y2_cols]]

Unnamed: 0,plate,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,y2_6,...,y2_75,y2_76,y2_77,y2_78,y2_79,y2_80,y2_81,y2_82,y2_83,y2_84
13788,30v1,LMJ.RY0402.052440,Cre01.g028650,A02,0.196704,0.595254,0.216432,0.585942,0.181391,0.578494,...,0.108690,0.586747,0.587187,0.146691,0.158733,0.606647,0.594400,0.121620,0.160179,0.591514
13789,30v1,LMJ.RY0402.055420,Cre01.g045150,A03,,,,,,,...,,,,,,,,,,
13790,30v1,LMJ.RY0402.047311,"Cre08.g375800,Cre01.g049900,Cre01.g049900 & Cr...",A04,0.178339,0.537808,0.174254,0.550486,0.154003,0.550272,...,0.071201,0.527689,0.554856,0.099644,0.070603,0.543485,0.528501,0.084765,0.064154,0.537355
13791,30v1,LMJ.RY0402.054897,Cre02.g095115,A05,0.188015,0.622587,0.211352,0.613474,0.206530,0.621237,...,0.114474,0.612686,0.601934,0.113141,0.087547,0.604570,0.615333,0.131183,0.088312,0.607213
13792,30v1,LMJ.RY0402.054597,Cre03.g153400,A06,0.155354,0.615819,0.141431,0.612813,0.179831,0.628406,...,0.092080,0.612982,0.597212,0.091332,0.082599,0.603931,0.605616,0.095114,0.093501,0.611567
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16081,30v3,LMJ.RY0402.240458,"Cre13.g588405 & Cre13.g588453,Cre02.g106650",P20,0.225760,0.604011,0.216353,0.550454,0.240252,0.571375,...,0.122560,0.623717,0.597192,0.122955,0.145859,0.623917,0.629666,0.122228,0.188156,0.644165
16082,30v3,LMJ.RY0402.076841,Cre04.g216050,P21,0.190002,0.625832,0.274314,0.668964,0.176594,0.680467,...,0.119011,0.651120,0.622130,0.174955,0.160540,0.617565,0.626166,0.163949,0.130612,0.610947
16083,30v3,LMJ.RY0402.039259,Cre01.g023050,P22,0.210545,0.617911,0.319655,0.685403,0.144355,0.611414,...,0.243976,0.670047,0.454888,0.199510,0.238329,0.615494,0.643521,0.081724,0.153661,0.589639
16084,30v3,LMJ.RY0402.229006,"Cre13.g583250,Cre07.g341900",P23,0.254101,0.621515,0.188994,0.603186,0.227006,0.636412,...,0.186782,0.619635,0.643662,0.241820,0.254540,0.590258,0.621169,0.171496,0.195580,0.632706


In [61]:
# Plates kept for 10min-10min and measurement columns y2_1 ... y2_89 for the raw preview.
plates = ['30v1', '30v3']
y2_cols = ['y2_' + str(i) for i in range(1, 90)]

# Raw (un-normalized) 10min-10min rows taken from the filtered frame phase2_df2.
phase2_30_10min_10min = phase2_df2.loc[
    lambda frame: frame['light_regime'].eq('10min-10min') & frame['plate'].isin(plates)
].copy()

# Preview: identifying columns followed by the y2 columns.
phase2_30_10min_10min[['plate', 'mutant_ID', 'mutated_genes', 'well_id', *y2_cols]]

Unnamed: 0,plate,mutant_ID,mutated_genes,well_id,y2_1,y2_2,y2_3,y2_4,y2_5,y2_6,...,y2_80,y2_81,y2_82,y2_83,y2_84,y2_85,y2_86,y2_87,y2_88,y2_89
13788,30v1,LMJ.RY0402.052440,Cre01.g028650,A02,0.200088,0.592155,0.221741,0.586242,0.189824,0.575675,...,0.601497,0.588759,0.121622,0.157859,0.586911,,,,,
13789,30v1,LMJ.RY0402.055420,Cre01.g045150,A03,,,,,,,...,,,,,,,,,,
13790,30v1,LMJ.RY0402.047311,"Cre08.g375800,Cre01.g049900,Cre01.g049900 & Cr...",A04,0.181050,0.536231,0.178142,0.546889,0.159278,0.545123,...,0.535548,0.524239,0.086963,0.064511,0.531658,,,,,
13791,30v1,LMJ.RY0402.054897,Cre02.g095115,A05,0.189667,0.620641,0.215870,0.615362,0.216840,0.620192,...,0.599653,0.611431,0.131126,0.088074,0.604549,,,,,
13792,30v1,LMJ.RY0402.054597,Cre03.g153400,A06,0.157513,0.612671,0.134711,0.614145,0.187434,0.627726,...,0.599292,0.601178,0.097280,0.093613,0.608837,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16081,30v3,LMJ.RY0402.240458,"Cre13.g588405 & Cre13.g588453,Cre02.g106650",P20,0.220878,0.606923,0.211064,0.554235,0.231264,0.574287,...,0.627550,0.633688,0.122493,0.193354,0.645023,,,,,
16082,30v3,LMJ.RY0402.076841,Cre04.g216050,P21,0.188056,0.627401,0.272056,0.679252,0.169180,0.690957,...,0.622177,0.629981,0.163360,0.129348,0.613879,,,,,
16083,30v3,LMJ.RY0402.039259,Cre01.g023050,P22,0.206335,0.620589,0.318205,0.707585,0.138220,0.613540,...,0.620827,0.643272,0.081567,0.153395,0.594352,,,,,
16084,30v3,LMJ.RY0402.229006,"Cre13.g583250,Cre07.g341900",P23,0.253073,0.623464,0.184892,0.601890,0.217352,0.637417,...,0.595598,0.623978,0.170390,0.198906,0.634442,,,,,


In [62]:
# Stack the eight per-regime normalized tables row-wise into one frame and
# replace their original index labels with a fresh 0..n-1 index.
phase2_30_quantile1 = pd.concat(
    objs=[
        phase2_30_20h_ML_normalized,
        phase2_30_20h_HL_normalized,
        phase2_30_2h_2h_normalized,
        phase2_30_10min_10min_normalized,
        phase2_30_1min_1min_normalized,
        phase2_30_30s_30s_normalized,
        phase2_30_5min_5min_normalized,
        phase2_30_1min_5min_normalized,
    ],
    axis=0,
    ignore_index=True,
)

In [64]:
# Save the combined normalized table to CSV without the index column.
phase2_30_quantile1.to_csv(
    path_or_buf='phase2_30_qn(5.18).csv',
    index=False,
)

In [63]:
# (rows, columns) of the combined normalized table.
phase2_30_quantile1.shape

(9575, 726)

In [39]:
# Row count of the filtered raw data for all three plate-30 replicates.
# The 10min-10min regime is missing for plate 30v2 and for part of 30v3.
plates = ['30v1', '30v2', '30v3']
data = phase2_df2.loc[phase2_df2['plate'].isin(plates)]
data.shape

(9958, 726)