Step 5 - ANOVA processing

This code extracts the relevant information from the ANOVA and Tukey post-hoc results concerning the significance of the differences between modified TF and control strain, overexpressed TF and knocked-out TF, overexpressed TF and overexpressed with protein TF, and knocked-out TF and overexpressed with protein TF. It aligns the statistical insights with the growth data to form a new statistics dataframe, to be used in the next step for preparing the heatmaps.

In [2]:
from datetime import datetime
import glob
import pandas as pd
import os

In [3]:
# Directory holding the Tukey post-hoc workbooks, named "<date>_Tukey_<...>.xlsx".
IMPORT_PATH = os.path.join(os.getcwd(), "output_data")

# Candidate workbooks. Two safeguards on top of the plain glob:
#   * names starting with "~$" are skipped - Excel creates such owner/lock
#     files while a workbook is open, and their prefix would both win the
#     max() below and be unreadable as a workbook;
#   * the list is sorted, because glob returns paths in arbitrary order and
#     the row order of the final table follows the file order.
all_files = sorted(
    f for f in glob.glob(os.path.join(IMPORT_PATH, '*_Tukey_*.xlsx'))
    if not os.path.basename(f).startswith('~$')
)
if not all_files:
    # Fail with an actionable message instead of max() raising on an empty list.
    raise FileNotFoundError(
        f"No '*_Tukey_*.xlsx' workbooks found in {IMPORT_PATH}"
    )

# The date stamp is everything before the first underscore of the file name.
dates = [os.path.basename(f).split('_')[0] for f in all_files]

# String comparison: this is the chronologically latest run only if the
# stamp sorts lexicographically (e.g. YYYY-MM-DD or YYYYMMDD) - TODO confirm
# against the step that writes these workbooks.
latest_date = max(dates)

# Keep only the workbooks of the latest run; filtering the list already in
# hand selects the same names as re-globbing f'{latest_date}_Tukey_*.xlsx'.
latest_prefix = f'{latest_date}_Tukey_'
latest_files = [
    f for f in all_files if os.path.basename(f).startswith(latest_prefix)
]


In [4]:
# Transcription-factor labels whose comparisons are extracted from the Tukey
# tables; each is matched against the part of a group name that precedes the
# first underscore.
list_TFs = [
    'Dal81',
    'Hap1',
    'Mhy1',
    'Msn4',
    'Msn4b',
    'Msn4m',
    'Msn4w',
    'TF009',
    'TF011',
    'TF036',
    'Yas1',
]

In [5]:
# For every Tukey workbook of the latest run, collect the rows that compare a
# studied TF variant with the plain control strain ("control_control"), in
# either orientation (TF in group1 or TF in group2).
results = []

for workbook in latest_files:
    tukey_df = pd.read_excel(workbook)

    # "<TF>_<modification>" -> two columns. Only the first underscore splits,
    # so "Dal81_OE_prot" yields TF "Dal81" and modification "OE_prot".
    for side in ('group1', 'group2'):
        parts = tukey_df[side].str.split('_', expand=True, n=1)
        tukey_df[[f'{side}_TF', f'{side}_modification']] = parts

    # Only the adjusted p-value is carried forward from the Tukey statistics.
    tukey_df = tukey_df.drop(columns=['meandiff', 'lower', 'upper', 'reject'])

    # Per side: does that side hold the plain control? Both name parts must be
    # "control", so groups such as "control_prot_OE_prot" do not qualify.
    # These masks do not depend on the TF, hence they are built once per file.
    control_on = {
        side: (tukey_df[f'{side}_TF'] == 'control')
        & (tukey_df[f'{side}_modification'] == 'control')
        for side in ('group1', 'group2')
    }

    for tf in list_TFs:
        # First the rows with the TF in group1 (control in group2), then the
        # mirrored orientation.
        for studied, other in (('group1', 'group2'), ('group2', 'group1')):
            hits = tukey_df[(tukey_df[f'{studied}_TF'] == tf) & control_on[other]]
            results.append(
                hits.assign(
                    # Full variant label, e.g. "Dal81_KO".
                    studied_TF=f'{tf}_' + hits[f'{studied}_modification'],
                    # Adjusted p-value of the variant-vs-control comparison.
                    studied_control=hits['p-adj'],
                )
            )

# One table across all workbooks and TFs.
final_df = pd.concat(results, ignore_index=True)

# Row key "<TF>_<modification>_<condition>_<time>"; relies on 'condition' and
# 'time' columns being present in the workbooks.
time_label = final_df['time'].astype(str)
final_df['variant_column_modif'] = (
    final_df['studied_TF'] + '_' + final_df['condition'] + '_' + time_label
)

In [6]:
# Save the combined table as "<YYYY-MM-DD>_ANOVA_processing_vs_control.xlsx"
# in the "output_data" folder of the current working directory.
OUTPUT_PATH = os.path.join(os.getcwd(), "output_data")

# Today's date (local clock) stamps the output file name.
current_date = f"{datetime.now():%Y-%m-%d}"

output_filename = current_date + "_ANOVA_processing_vs_control.xlsx"
output_path = os.path.join(OUTPUT_PATH, output_filename)

# index=False: the positional row index is not written to the sheet.
final_df.to_excel(output_path, index=False)

In [None]:
# Disabled draft: the whole cell is a bare string literal, so none of it runs.
# It extends the TF-vs-control extraction by looking up, for every matched
# row, the p-adj of the same TF's comparison with its 'OE', 'KO' and 'OE_prot'
# variants (columns studied_OE / studied_KO / studied_OEprot; None when no
# such comparison is found).
# NOTE(review): unlike the active extraction cell, the control masks here test
# only the TF part ('control') and not the modification, so groups such as
# "control_prot_OE_prot" are matched as well.
# NOTE(review): the draft writes to a hard-coded file name in the working
# directory; the recorded output of this cell ends in a PermissionError for
# that file.
'''results = []

for file in latest_files:
    tukey_df = pd.read_excel(file)

    # Split group names into TF and modification
    tukey_df[['group1_TF', 'group1_modification']] = tukey_df['group1'].str.split('_', expand=True, n=1)
    tukey_df[['group2_TF', 'group2_modification']] = tukey_df['group2'].str.split('_', expand=True, n=1)

    tukey_df.drop(columns=['meandiff', 'lower', 'upper', 'reject'], inplace=True)

    for tf in list_TFs:
        # 1. Extract TF vs CONTROL comparisons
        mask1 = (tukey_df['group1_TF'] == tf) & (tukey_df['group2_TF'] == 'control')
        mask2 = (tukey_df['group2_TF'] == tf) & (tukey_df['group1_TF'] == 'control')

        matched1 = tukey_df[mask1].copy()
        matched1['studied_TF'] = tf
        matched1['modification'] = matched1['group1_modification']
        matched1['studied_control'] = matched1['p-adj']

        matched2 = tukey_df[mask2].copy()
        matched2['studied_TF'] = tf
        matched2['modification'] = matched2['group2_modification']
        matched2['studied_control'] = matched2['p-adj']

        matched = pd.concat([matched1, matched2], ignore_index=True)

        # 2. Now for each row in matched, look in full tukey_df for p-values of comparisons with other variants
        for i, row in matched.iterrows():
            mod = row['modification']  # The modification already compared with control

            def get_pval(mod1, mod2):
                return tukey_df[
                    ((tukey_df['group1_TF'] == tf) & (tukey_df['group2_TF'] == tf) &
                     (tukey_df['group1_modification'] == mod1) & (tukey_df['group2_modification'] == mod2)) |
                    ((tukey_df['group1_TF'] == tf) & (tukey_df['group2_TF'] == tf) &
                     (tukey_df['group1_modification'] == mod2) & (tukey_df['group2_modification'] == mod1))
                ]['p-adj'].values

            p_oe = get_pval(mod, 'OE')
            p_ko = get_pval(mod, 'KO')
            p_oeprot = get_pval(mod, 'OE_prot')

            matched.at[i, 'studied_OE'] = p_oe[0] if len(p_oe) > 0 else None
            matched.at[i, 'studied_KO'] = p_ko[0] if len(p_ko) > 0 else None
            matched.at[i, 'studied_OEprot'] = p_oeprot[0] if len(p_oeprot) > 0 else None

        results.append(matched)

# Combine all TF vs control + variant comparisons
final_df = pd.concat(results, ignore_index=True)
print(final_df)
final_df.to_excel('20250515_p-value-test.xlsx')'''

  final_df = pd.concat(results, ignore_index=True)


            group1                group2   p-adj condition  time group1_TF  \
0         Dal81_KO       control_control  0.0000     test2     1     Dal81   
1         Dal81_KO  control_prot_OE_prot  0.0000     test2     1     Dal81   
2         Dal81_OE       control_control  0.0000     test2     1     Dal81   
3         Dal81_OE  control_prot_OE_prot  0.0451     test2     1     Dal81   
4    Dal81_OE_prot       control_control  0.0000     test2     1     Dal81   
..             ...                   ...     ...       ...   ...       ...   
463        Yas1_KO  control_prot_OE_prot  0.0000     test3     2      Yas1   
464        Yas1_OE       control_control  0.0000     test3     2      Yas1   
465        Yas1_OE  control_prot_OE_prot  0.0000     test3     2      Yas1   
466   Yas1_OE_prot       control_control  0.0000     test3     2      Yas1   
467   Yas1_OE_prot  control_prot_OE_prot  0.0000     test3     2      Yas1   

    group1_modification group2_TF group2_modification studied_T

PermissionError: [Errno 13] Permission denied: '20250515_p-value-test.xlsx'

In [None]:
'''In the dataframe for one time point and condition (one Excel file):
 for a TF in either group (1 or 2) with a specific modification, search for its corresponding TF with a different modification and for the control;
 append the p-adj value to a dataframe that has a TF_modification column; second, third, fourth and fifth columns with the OE, KO, OEprot and control p-values (should be redundant);
 and a last column with the condition_time_TF_modification name as an index to merge with the growth data.'''