In [1]:
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import tqdm

# import anova and tukeyhsd
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [2]:
def anova_function(features_df: pd.DataFrame, Metdata_column: str) -> pd.DataFrame:
    """
    Run a one-way ANOVA and a Tukey HSD post-hoc test for every feature.

    For each feature column an OLS model ``feature ~ C(group)`` is fitted, a
    type-II ANOVA table is computed from that model, and pairwise Tukey HSD
    comparisons between the groups are run. The Tukey HSD table of each
    feature is tagged with the feature name and with the ANOVA p-value of the
    grouping factor, and all per-feature tables are stacked into one dataframe.

    Parameters
    ----------
    features_df : pd.DataFrame
        The dataframe containing the features with only one metadata column.
        Every column other than ``Metdata_column`` is treated as a feature.
    Metdata_column : str
        The name of the metadata column to be used for the anova test.
        (The spelling of the parameter name is kept as-is so that existing
        keyword callers keep working.)

    Returns
    -------
    pd.DataFrame
        One row per feature and compared group pair: the columns of the
        Tukey HSD summary table followed by ``feature`` and
        ``anova_p_value``. An empty dataframe is returned when there are no
        feature columns.
    """

    # every column except the grouping column is a feature, so the grouping
    # column does not have to be the last column of the dataframe
    feature_columns = [
        column for column in features_df.columns if column != Metdata_column
    ]

    # collect the per-feature tables and concatenate once at the end instead
    # of re-copying a growing dataframe on every iteration
    per_feature_results = []

    # loop through each feature
    for feature in tqdm.tqdm(feature_columns):
        # Q("...") quotes the column names so that names which are not valid
        # Python identifiers can still be used inside the formula
        model = ols(
            f'Q("{feature}") ~ C(Q("{Metdata_column}"))', data=features_df
        ).fit()
        # create an anova table
        anova_table = sm.stats.anova_lm(model, typ=2)
        # the first row of the table belongs to the grouping factor; the table
        # is indexed by term name, so the row is selected by position
        anova_p_value = anova_table["PR(>F)"].iloc[0]

        # create a tukeyhsd table
        tukeyhsd = pairwise_tukeyhsd(features_df[feature], features_df[Metdata_column])
        # the first row of the summary data is the header; split it off so it
        # never ends up as a data row and pandas infers dtypes from values only
        header, *rows = tukeyhsd.summary().data
        feature_result = pd.DataFrame(rows, columns=header)
        feature_result["feature"] = feature
        feature_result["anova_p_value"] = anova_p_value
        per_feature_results.append(feature_result)

    # pd.concat raises on an empty list, so handle "no features" explicitly
    if not per_feature_results:
        return pd.DataFrame()
    return pd.concat(per_feature_results, axis=0, ignore_index=True)

In [3]:
# path to the normalized, feature-selected table (parquet file)
file_path = pathlib.Path(
    "../../data/5.converted_data/normalized_feature_selected_output.parquet"
)
# read the table and preview its first rows
df = pd.read_parquet(file_path)
df.head()

Unnamed: 0,Metadata_ImageNumber,Metadata_Image_FileName_OP,Metadata_ObjectNumber,Metadata_ConvertImageToObjects_Number_Object_Number,Metadata_ConvertImageToObjects_AreaShape_BoundingBoxArea,Metadata_ConvertImageToObjects_AreaShape_BoundingBoxMaximum_X,Metadata_ConvertImageToObjects_AreaShape_BoundingBoxMaximum_Y,Metadata_ConvertImageToObjects_AreaShape_BoundingBoxMinimum_X,Metadata_ConvertImageToObjects_AreaShape_BoundingBoxMinimum_Y,Metadata_ConvertImageToObjects_Location_CenterMassIntensity_X_OP,...,Texture_SumEntropy_OP_3_02_256,Texture_SumEntropy_OP_3_03_256,Texture_SumVariance_OP_3_00_256,Texture_SumVariance_OP_3_01_256,Texture_SumVariance_OP_3_02_256,Texture_SumVariance_OP_3_03_256,Texture_Variance_OP_3_00_256,Texture_Variance_OP_3_01_256,Texture_Variance_OP_3_02_256,Texture_Variance_OP_3_03_256
0,1,MAX_high_10_L.tiff,1,1,38250.0,269.0,182.0,44.0,12.0,131.546149,...,0.841475,0.834574,1.149186,1.1522,1.078905,1.144848,1.040009,1.02574,1.045492,1.027617
1,2,MAX_high_10_R.tiff,1,1,34170.0,208.0,245.0,38.0,44.0,106.962058,...,0.45248,0.436138,0.112178,0.131982,0.108602,0.1257,0.075717,0.071207,0.076477,0.067922
2,3,MAX_high_11_L.tiff,1,1,41736.0,250.0,267.0,62.0,45.0,131.359827,...,0.494813,0.490844,-0.01614,0.001041,-0.021997,-0.020992,-0.033934,-0.042635,-0.034983,-0.038763
3,4,MAX_high_11_R.tiff,1,1,43616.0,212.0,272.0,24.0,40.0,101.069901,...,0.878729,0.865617,1.120905,1.140604,1.074529,1.117941,1.031585,1.015172,1.03515,1.019188
4,5,MAX_high_12_L.tiff,2,2,25894.0,283.0,155.0,69.0,34.0,164.579054,...,0.476768,0.482674,0.177886,0.147933,0.103313,0.156457,0.111765,0.107846,0.117614,0.108859


In [4]:
# combine the genotype, identity, and side columns into composite group labels
df["Metadata_genotype_side"] = df["Metadata_genotype"] + "_" + df["Metadata_side"]
df["Metadata_genotype_identity_side"] = (
    df["Metadata_genotype"] + "_" + df["Metadata_identity"] + "_" + df["Metadata_side"]
)
# boolean mask over the columns: True where the column name contains "Metadata"
metadata = df.columns.str.contains("Metadata")
# keep only the metadata columns
metadata_df = df.loc[:, metadata]
# the remaining (non-metadata) columns are treated as the features
features_df = df.loc[:, ~metadata]

## Anova for genotype only

In [5]:
# feature columns plus the genotype label as the single grouping column
genotype_df = features_df.assign(Metadata_genotype=metadata_df["Metadata_genotype"])
anova_results = anova_function(genotype_df, "Metadata_genotype")

# write the ANOVA / Tukey HSD table to the analysis results directory,
# creating that directory first when it is missing
out_dir = pathlib.Path("../../data/6.analysis_results/")
out_dir.mkdir(parents=True, exist_ok=True)
anova_results_path = out_dir / "anova_results_genotype.parquet"
anova_results.to_parquet(anova_results_path)

100%|██████████| 244/244 [00:25<00:00,  9.51it/s]


## Anova for genotype and side

In [6]:
# feature columns plus the combined genotype/side label as the single
# grouping column
genotype_df = features_df.assign(
    Metadata_genotype_side=metadata_df["Metadata_genotype_side"]
)
anova_results = anova_function(genotype_df, "Metadata_genotype_side")

# write the ANOVA / Tukey HSD table to the analysis results directory,
# creating that directory first when it is missing
out_dir = pathlib.Path("../../data/6.analysis_results/")
out_dir.mkdir(parents=True, exist_ok=True)
anova_results_path = out_dir / "anova_results_genotype_side.parquet"
anova_results.to_parquet(anova_results_path)

100%|██████████| 244/244 [00:45<00:00,  5.37it/s]


## Anova for genotype, side, and identity

In [7]:
# feature columns plus the combined genotype/identity/side label as the
# single grouping column
genotype_df = features_df.assign(
    Metadata_genotype_identity_side=metadata_df["Metadata_genotype_identity_side"]
)
anova_results = anova_function(genotype_df, "Metadata_genotype_identity_side")

# write the ANOVA / Tukey HSD table to the analysis results directory,
# creating that directory first when it is missing
out_dir = pathlib.Path("../../data/6.analysis_results/")
out_dir.mkdir(parents=True, exist_ok=True)
anova_results_path = out_dir / "anova_results_genotype_side_identity.parquet"
anova_results.to_parquet(anova_results_path)

100%|██████████| 244/244 [01:21<00:00,  3.00it/s]
