In [1]:
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import tqdm
from scipy.stats import levene

# import anova and tukeyhsd
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multitest import multipletests

In [2]:
def anova_function(features_df: pd.DataFrame, Metdata_column: str) -> pd.DataFrame:
    """
    Run a one-way ANOVA and a Tukey HSD post-hoc test for every feature.

    For each feature column an OLS model ``feature ~ C(Metdata_column)`` is
    fitted, a type-II ANOVA table is computed from it, and the pairwise
    Tukey HSD comparisons between the groups of ``Metdata_column`` are
    collected.

    Parameters
    ----------
    features_df : pd.DataFrame
        The dataframe containing the features with only one metadata column
        (the one named by ``Metdata_column``). Every other column is treated
        as a feature, wherever the metadata column is positioned.
    Metdata_column : str
        The name of the metadata column to be used as the grouping factor.

    Returns
    -------
    pd.DataFrame
        One row per (feature, group pair) holding the columns of the Tukey HSD
        summary table plus ``feature`` and ``anova_p_value``. An empty
        dataframe is returned when there is no feature column to test.
    """
    # select the features by name rather than assuming the metadata column is last
    feature_columns = [
        column for column in features_df.columns if column != Metdata_column
    ]
    if not feature_columns:
        # nothing to test; mirror the empty frame the loop would have produced
        return pd.DataFrame()

    # collect one frame per feature and concatenate once at the end
    # (growing a DataFrame with concat inside the loop is quadratic)
    per_feature_results = []

    for feature in tqdm.tqdm(feature_columns):
        # fit the one-way model and derive the type-II ANOVA table
        model = ols(f"{feature} ~ C({Metdata_column})", data=features_df).fit()
        anova_table = sm.stats.anova_lm(model, typ=2)
        # the first row of the table is read as the grouping term's p-value;
        # .iloc is required because the table is indexed by term labels
        anova_p_value = anova_table["PR(>F)"].iloc[0]

        # pairwise post-hoc comparisons between the groups
        tukeyhsd = pairwise_tukeyhsd(features_df[feature], features_df[Metdata_column])
        # summary() exposes the results table publicly; its first row is the header.
        # Building the frame from the data rows only keeps numeric/bool dtypes
        # instead of forcing every column to object.
        header, *rows = tukeyhsd.summary().data
        tmp = pd.DataFrame(rows, columns=header)
        tmp["feature"] = feature
        tmp["anova_p_value"] = anova_p_value
        per_feature_results.append(tmp)

    return pd.concat(per_feature_results, axis=0, ignore_index=True)

In [3]:
# relative path to the mean-aggregated data parquet file
file_path = pathlib.Path("../../data/5.converted_data/mean_aggregated_data.parquet")
# read the parquet file into a dataframe
df = pd.read_parquet(file_path)
# preview the first rows
df.head()

Unnamed: 0,Metadata_genotype,Metadata_replicate,Metadata_side,AreaShape_Area,AreaShape_CentralMoment_0_0,AreaShape_CentralMoment_0_1,AreaShape_CentralMoment_0_2,AreaShape_CentralMoment_0_3,AreaShape_CentralMoment_1_0,AreaShape_CentralMoment_1_1,...,Texture_SumEntropy_OP_3_02_256,Texture_SumEntropy_OP_3_03_256,Texture_SumVariance_OP_3_00_256,Texture_SumVariance_OP_3_01_256,Texture_SumVariance_OP_3_02_256,Texture_SumVariance_OP_3_03_256,Texture_Variance_OP_3_00_256,Texture_Variance_OP_3_01_256,Texture_Variance_OP_3_02_256,Texture_Variance_OP_3_03_256
0,high,1,L,-0.178797,-0.178797,0.003558,-0.386796,0.280118,0.037998,0.434046,...,-0.627762,-0.64499,-0.903354,-0.890939,-0.896995,-0.90287,-0.896145,-0.898448,-0.89888,-0.896609
1,high,1,R,1.371763,1.371763,0.538195,0.288322,0.501618,4.163833,-1.36192,...,0.419551,0.406918,-0.188358,-0.159114,-0.189615,-0.186308,-0.218981,-0.225754,-0.219073,-0.225134
2,high,10,L,1.706234,1.706234,2.652373,3.280425,-1.992966,0.18463,-1.404376,...,0.841475,0.834574,1.149186,1.1522,1.078905,1.144848,1.040009,1.02574,1.045492,1.027617
3,high,10,R,0.771674,0.771674,-1.332747,0.164304,0.244371,0.17376,-1.885144,...,0.45248,0.436138,0.112178,0.131982,0.108602,0.1257,0.075717,0.071207,0.076477,0.067922
4,high,11,L,2.180858,2.180858,2.142686,1.838038,-0.077515,0.00865,0.876597,...,0.494813,0.490844,-0.01614,0.001041,-0.021997,-0.020992,-0.033934,-0.042635,-0.034983,-0.038763


In [4]:
# boolean mask over the columns: True where the column name contains "Metadata"
metadata = df.columns.str.contains("Metadata")
# the feature columns are everything the mask does not select
features_df = df.loc[:, ~metadata]
# the metadata columns are exactly the ones the mask selects
metadata_df = df.loc[:, metadata]

## ANOVA for genotype only

In [5]:
# attach the genotype label to a fresh copy of the features (features_df itself is untouched)
anova_input_df = features_df.assign(
    Metadata_genotype=metadata_df["Metadata_genotype"]
)
# run the per-feature anova + tukeyhsd with genotype as the grouping column
anova_output_df = anova_function(anova_input_df, "Metadata_genotype")
print(anova_output_df.shape)
anova_output_df.head()

  0%|                                                              | 0/244 [00:00<?, ?it/s]

  0%|▏                                                     | 1/244 [00:00<00:30,  8.10it/s]

  1%|▍                                                     | 2/244 [00:00<00:28,  8.41it/s]

  1%|▋                                                     | 3/244 [00:00<00:28,  8.58it/s]

  2%|▉                                                     | 4/244 [00:00<00:28,  8.55it/s]

  2%|█                                                     | 5/244 [00:00<00:27,  8.59it/s]

  2%|█▎                                                    | 6/244 [00:00<00:27,  8.80it/s]

  3%|█▌                                                    | 7/244 [00:00<00:27,  8.75it/s]

  3%|█▊                                                    | 8/244 [00:00<00:26,  8.78it/s]

  4%|█▉                                                    | 9/244 [00:01<00:26,  8.72it/s]

  4%|██▏                                                  | 10/244 [00:01<00:26,  8.67it/s]

  5%|██▍                                                  | 11/244 [00:01<00:26,  8.70it/s]

  5%|██▌                                                  | 12/244 [00:01<00:26,  8.66it/s]

  5%|██▊                                                  | 13/244 [00:01<00:26,  8.64it/s]

  6%|███                                                  | 14/244 [00:01<00:26,  8.63it/s]

  6%|███▎                                                 | 15/244 [00:01<00:26,  8.66it/s]

  7%|███▍                                                 | 16/244 [00:01<00:26,  8.68it/s]

  7%|███▋                                                 | 17/244 [00:01<00:26,  8.67it/s]

  7%|███▉                                                 | 18/244 [00:02<00:26,  8.68it/s]

  8%|████▏                                                | 19/244 [00:02<00:25,  8.71it/s]

  8%|████▎                                                | 20/244 [00:02<00:25,  8.72it/s]

  9%|████▌                                                | 21/244 [00:02<00:25,  8.75it/s]

  9%|████▊                                                | 22/244 [00:02<00:25,  8.75it/s]

  9%|████▉                                                | 23/244 [00:02<00:25,  8.72it/s]

 10%|█████▏                                               | 24/244 [00:02<00:25,  8.75it/s]

 10%|█████▍                                               | 25/244 [00:02<00:25,  8.74it/s]

 11%|█████▋                                               | 26/244 [00:02<00:24,  8.78it/s]

 11%|█████▊                                               | 27/244 [00:03<00:24,  8.74it/s]

 11%|██████                                               | 28/244 [00:03<00:24,  8.77it/s]

 12%|██████▎                                              | 29/244 [00:03<00:24,  8.74it/s]

 12%|██████▌                                              | 30/244 [00:03<00:24,  8.69it/s]

 13%|██████▋                                              | 31/244 [00:03<00:24,  8.73it/s]

 13%|██████▉                                              | 32/244 [00:03<00:24,  8.75it/s]

 14%|███████▏                                             | 33/244 [00:03<00:23,  8.80it/s]

 14%|███████▍                                             | 34/244 [00:03<00:23,  8.81it/s]

 14%|███████▌                                             | 35/244 [00:04<00:23,  8.85it/s]

 15%|███████▊                                             | 36/244 [00:04<00:23,  8.74it/s]

 15%|████████                                             | 37/244 [00:04<00:23,  8.76it/s]

 16%|████████▎                                            | 38/244 [00:04<00:23,  8.77it/s]

 16%|████████▍                                            | 39/244 [00:04<00:23,  8.86it/s]

 16%|████████▋                                            | 40/244 [00:04<00:22,  8.92it/s]

 17%|████████▉                                            | 41/244 [00:04<00:22,  8.86it/s]

 17%|█████████                                            | 42/244 [00:04<00:22,  8.84it/s]

 18%|█████████▎                                           | 43/244 [00:04<00:22,  8.84it/s]

 18%|█████████▌                                           | 44/244 [00:05<00:22,  8.82it/s]

 18%|█████████▊                                           | 45/244 [00:05<00:22,  8.77it/s]

 19%|█████████▉                                           | 46/244 [00:05<00:22,  8.77it/s]

 19%|██████████▏                                          | 47/244 [00:05<00:22,  8.84it/s]

 20%|██████████▍                                          | 48/244 [00:05<00:22,  8.82it/s]

 20%|██████████▋                                          | 49/244 [00:05<00:22,  8.72it/s]

 20%|██████████▊                                          | 50/244 [00:05<00:22,  8.76it/s]

 21%|███████████                                          | 51/244 [00:05<00:22,  8.76it/s]

 21%|███████████▎                                         | 52/244 [00:05<00:21,  8.77it/s]

 22%|███████████▌                                         | 53/244 [00:06<00:21,  8.76it/s]

 22%|███████████▋                                         | 54/244 [00:06<00:21,  8.78it/s]

 23%|███████████▉                                         | 55/244 [00:06<00:21,  8.80it/s]

 23%|████████████▏                                        | 56/244 [00:06<00:21,  8.88it/s]

 23%|████████████▍                                        | 57/244 [00:06<00:21,  8.88it/s]

 24%|████████████▌                                        | 58/244 [00:06<00:20,  8.87it/s]

 24%|████████████▊                                        | 59/244 [00:06<00:20,  8.85it/s]

 25%|█████████████                                        | 60/244 [00:06<00:20,  8.83it/s]

 25%|█████████████▎                                       | 61/244 [00:06<00:20,  8.83it/s]

 25%|█████████████▍                                       | 62/244 [00:07<00:20,  8.80it/s]

 26%|█████████████▋                                       | 63/244 [00:07<00:20,  8.81it/s]

 26%|█████████████▉                                       | 64/244 [00:07<00:20,  8.81it/s]

 27%|██████████████                                       | 65/244 [00:07<00:20,  8.82it/s]

 27%|██████████████▎                                      | 66/244 [00:07<00:20,  8.83it/s]

 27%|██████████████▌                                      | 67/244 [00:07<00:20,  8.84it/s]

 28%|██████████████▊                                      | 68/244 [00:07<00:19,  8.83it/s]

 28%|██████████████▉                                      | 69/244 [00:07<00:19,  8.86it/s]

 29%|███████████████▏                                     | 70/244 [00:07<00:19,  8.88it/s]

 29%|███████████████▍                                     | 71/244 [00:08<00:19,  8.88it/s]

 30%|███████████████▋                                     | 72/244 [00:08<00:19,  8.85it/s]

 30%|███████████████▊                                     | 73/244 [00:08<00:19,  8.84it/s]

 30%|████████████████                                     | 74/244 [00:08<00:19,  8.71it/s]

 31%|████████████████▎                                    | 75/244 [00:08<00:19,  8.73it/s]

 31%|████████████████▌                                    | 76/244 [00:08<00:19,  8.77it/s]

 32%|████████████████▋                                    | 77/244 [00:08<00:19,  8.75it/s]

 32%|████████████████▉                                    | 78/244 [00:08<00:18,  8.81it/s]

 32%|█████████████████▏                                   | 79/244 [00:09<00:18,  8.85it/s]

 33%|█████████████████▍                                   | 80/244 [00:09<00:18,  8.81it/s]

 33%|█████████████████▌                                   | 81/244 [00:09<00:18,  8.85it/s]

 34%|█████████████████▊                                   | 82/244 [00:09<00:18,  8.85it/s]

 34%|██████████████████                                   | 83/244 [00:09<00:18,  8.93it/s]

 34%|██████████████████▏                                  | 84/244 [00:09<00:17,  8.92it/s]

 35%|██████████████████▍                                  | 85/244 [00:09<00:17,  8.89it/s]

 35%|██████████████████▋                                  | 86/244 [00:09<00:17,  8.90it/s]

 36%|██████████████████▉                                  | 87/244 [00:09<00:17,  8.87it/s]

 36%|███████████████████                                  | 88/244 [00:10<00:17,  8.82it/s]

 36%|███████████████████▎                                 | 89/244 [00:10<00:17,  8.84it/s]

 37%|███████████████████▌                                 | 90/244 [00:10<00:17,  8.85it/s]

 37%|███████████████████▊                                 | 91/244 [00:10<00:17,  8.80it/s]

 38%|███████████████████▉                                 | 92/244 [00:10<00:17,  8.78it/s]

 38%|████████████████████▏                                | 93/244 [00:10<00:17,  8.78it/s]

 39%|████████████████████▍                                | 94/244 [00:10<00:17,  8.78it/s]

 39%|████████████████████▋                                | 95/244 [00:10<00:16,  8.83it/s]

 39%|████████████████████▊                                | 96/244 [00:10<00:16,  8.87it/s]

 40%|█████████████████████                                | 97/244 [00:11<00:16,  8.90it/s]

 40%|█████████████████████▎                               | 98/244 [00:11<00:16,  8.91it/s]

 41%|█████████████████████▌                               | 99/244 [00:11<00:16,  8.89it/s]

 41%|█████████████████████▎                              | 100/244 [00:11<00:16,  8.89it/s]

 41%|█████████████████████▌                              | 101/244 [00:11<00:16,  8.88it/s]

 42%|█████████████████████▋                              | 102/244 [00:11<00:15,  8.91it/s]

 42%|█████████████████████▉                              | 103/244 [00:11<00:15,  8.89it/s]

 43%|██████████████████████▏                             | 104/244 [00:11<00:15,  8.89it/s]

 43%|██████████████████████▍                             | 105/244 [00:11<00:15,  8.85it/s]

 43%|██████████████████████▌                             | 106/244 [00:12<00:15,  8.89it/s]

 44%|██████████████████████▊                             | 107/244 [00:12<00:15,  8.86it/s]

 44%|███████████████████████                             | 108/244 [00:12<00:15,  8.82it/s]

 45%|███████████████████████▏                            | 109/244 [00:12<00:15,  8.80it/s]

 45%|███████████████████████▍                            | 110/244 [00:12<00:15,  8.81it/s]

 45%|███████████████████████▋                            | 111/244 [00:12<00:15,  8.86it/s]

 46%|███████████████████████▊                            | 112/244 [00:12<00:14,  8.99it/s]

 46%|████████████████████████                            | 113/244 [00:12<00:14,  9.04it/s]

 47%|████████████████████████▎                           | 114/244 [00:12<00:14,  9.11it/s]

 47%|████████████████████████▌                           | 115/244 [00:13<00:14,  9.13it/s]

 48%|████████████████████████▋                           | 116/244 [00:13<00:14,  9.13it/s]

 48%|████████████████████████▉                           | 117/244 [00:13<00:13,  9.18it/s]

 48%|█████████████████████████▏                          | 118/244 [00:13<00:13,  9.20it/s]

 49%|█████████████████████████▎                          | 119/244 [00:13<00:13,  9.17it/s]

 49%|█████████████████████████▌                          | 120/244 [00:13<00:13,  9.18it/s]

 50%|█████████████████████████▊                          | 121/244 [00:13<00:13,  9.20it/s]

 50%|██████████████████████████                          | 122/244 [00:13<00:13,  9.21it/s]

 50%|██████████████████████████▏                         | 123/244 [00:13<00:13,  9.20it/s]

 51%|██████████████████████████▍                         | 124/244 [00:14<00:13,  9.20it/s]

 51%|██████████████████████████▋                         | 125/244 [00:14<00:12,  9.18it/s]

 52%|██████████████████████████▊                         | 126/244 [00:14<00:12,  9.18it/s]

 52%|███████████████████████████                         | 127/244 [00:14<00:12,  9.17it/s]

 52%|███████████████████████████▎                        | 128/244 [00:14<00:12,  9.18it/s]

 53%|███████████████████████████▍                        | 129/244 [00:14<00:12,  9.22it/s]

 53%|███████████████████████████▋                        | 130/244 [00:14<00:12,  9.24it/s]

 54%|███████████████████████████▉                        | 131/244 [00:14<00:12,  9.28it/s]

 54%|████████████████████████████▏                       | 132/244 [00:14<00:11,  9.35it/s]

 55%|████████████████████████████▎                       | 133/244 [00:15<00:12,  9.16it/s]

 55%|████████████████████████████▌                       | 134/244 [00:15<00:12,  8.97it/s]

 55%|████████████████████████████▊                       | 135/244 [00:15<00:12,  8.88it/s]

 56%|████████████████████████████▉                       | 136/244 [00:15<00:12,  8.91it/s]

 56%|█████████████████████████████▏                      | 137/244 [00:15<00:12,  8.88it/s]

 57%|█████████████████████████████▍                      | 138/244 [00:15<00:11,  8.89it/s]

 57%|█████████████████████████████▌                      | 139/244 [00:15<00:11,  8.87it/s]

 57%|█████████████████████████████▊                      | 140/244 [00:15<00:11,  8.90it/s]

 58%|██████████████████████████████                      | 141/244 [00:15<00:11,  8.88it/s]

 58%|██████████████████████████████▎                     | 142/244 [00:16<00:11,  8.83it/s]

 59%|██████████████████████████████▍                     | 143/244 [00:16<00:11,  8.79it/s]

 59%|██████████████████████████████▋                     | 144/244 [00:16<00:11,  8.77it/s]

 59%|██████████████████████████████▉                     | 145/244 [00:16<00:11,  8.80it/s]

 60%|███████████████████████████████                     | 146/244 [00:16<00:11,  8.77it/s]

 60%|███████████████████████████████▎                    | 147/244 [00:16<00:11,  8.78it/s]

 61%|███████████████████████████████▌                    | 148/244 [00:16<00:10,  8.77it/s]

 61%|███████████████████████████████▊                    | 149/244 [00:16<00:10,  8.78it/s]

 61%|███████████████████████████████▉                    | 150/244 [00:16<00:10,  8.79it/s]

 62%|████████████████████████████████▏                   | 151/244 [00:17<00:10,  8.75it/s]

 62%|████████████████████████████████▍                   | 152/244 [00:17<00:10,  8.73it/s]

 63%|████████████████████████████████▌                   | 153/244 [00:17<00:10,  8.71it/s]

 63%|████████████████████████████████▊                   | 154/244 [00:17<00:10,  8.72it/s]

 64%|█████████████████████████████████                   | 155/244 [00:17<00:10,  8.72it/s]

 64%|█████████████████████████████████▏                  | 156/244 [00:17<00:10,  8.78it/s]

 64%|█████████████████████████████████▍                  | 157/244 [00:17<00:09,  8.77it/s]

 65%|█████████████████████████████████▋                  | 158/244 [00:17<00:09,  8.73it/s]

 65%|█████████████████████████████████▉                  | 159/244 [00:17<00:09,  8.72it/s]

 66%|██████████████████████████████████                  | 160/244 [00:18<00:09,  8.73it/s]

 66%|██████████████████████████████████▎                 | 161/244 [00:18<00:09,  8.71it/s]

 66%|██████████████████████████████████▌                 | 162/244 [00:18<00:09,  8.71it/s]

 67%|██████████████████████████████████▋                 | 163/244 [00:18<00:09,  8.72it/s]

 67%|██████████████████████████████████▉                 | 164/244 [00:18<00:09,  8.73it/s]

 68%|███████████████████████████████████▏                | 165/244 [00:18<00:09,  8.70it/s]

 68%|███████████████████████████████████▍                | 166/244 [00:18<00:08,  8.70it/s]

 68%|███████████████████████████████████▌                | 167/244 [00:18<00:08,  8.68it/s]

 69%|███████████████████████████████████▊                | 168/244 [00:19<00:08,  8.76it/s]

 69%|████████████████████████████████████                | 169/244 [00:19<00:08,  8.89it/s]

 70%|████████████████████████████████████▏               | 170/244 [00:19<00:08,  8.91it/s]

 70%|████████████████████████████████████▍               | 171/244 [00:19<00:08,  8.95it/s]

 70%|████████████████████████████████████▋               | 172/244 [00:19<00:08,  8.84it/s]

 71%|████████████████████████████████████▊               | 173/244 [00:19<00:08,  8.70it/s]

 71%|█████████████████████████████████████               | 174/244 [00:19<00:07,  8.76it/s]

 72%|█████████████████████████████████████▎              | 175/244 [00:19<00:07,  8.80it/s]

 72%|█████████████████████████████████████▌              | 176/244 [00:19<00:07,  8.85it/s]

 73%|█████████████████████████████████████▋              | 177/244 [00:20<00:07,  8.90it/s]

 73%|█████████████████████████████████████▉              | 178/244 [00:20<00:07,  8.89it/s]

 73%|██████████████████████████████████████▏             | 179/244 [00:20<00:07,  8.92it/s]

 74%|██████████████████████████████████████▎             | 180/244 [00:20<00:07,  8.96it/s]

 74%|██████████████████████████████████████▌             | 181/244 [00:20<00:06,  9.02it/s]

 75%|██████████████████████████████████████▊             | 182/244 [00:20<00:06,  8.97it/s]

 75%|███████████████████████████████████████             | 183/244 [00:20<00:06,  8.95it/s]

 75%|███████████████████████████████████████▏            | 184/244 [00:20<00:06,  8.98it/s]

 76%|███████████████████████████████████████▍            | 185/244 [00:20<00:06,  8.95it/s]

 76%|███████████████████████████████████████▋            | 186/244 [00:21<00:06,  8.89it/s]

 77%|███████████████████████████████████████▊            | 187/244 [00:21<00:06,  8.94it/s]

 77%|████████████████████████████████████████            | 188/244 [00:21<00:06,  9.00it/s]

 77%|████████████████████████████████████████▎           | 189/244 [00:21<00:06,  8.95it/s]

 78%|████████████████████████████████████████▍           | 190/244 [00:21<00:06,  8.92it/s]

 78%|████████████████████████████████████████▋           | 191/244 [00:21<00:05,  8.92it/s]

 79%|████████████████████████████████████████▉           | 192/244 [00:21<00:05,  8.93it/s]

 79%|█████████████████████████████████████████▏          | 193/244 [00:21<00:05,  8.90it/s]

 80%|█████████████████████████████████████████▎          | 194/244 [00:21<00:05,  8.85it/s]

 80%|█████████████████████████████████████████▌          | 195/244 [00:22<00:05,  8.82it/s]

 80%|█████████████████████████████████████████▊          | 196/244 [00:22<00:05,  8.77it/s]

 81%|█████████████████████████████████████████▉          | 197/244 [00:22<00:05,  8.78it/s]

 81%|██████████████████████████████████████████▏         | 198/244 [00:22<00:05,  8.78it/s]

 82%|██████████████████████████████████████████▍         | 199/244 [00:22<00:05,  8.78it/s]

 82%|██████████████████████████████████████████▌         | 200/244 [00:22<00:04,  8.82it/s]

 82%|██████████████████████████████████████████▊         | 201/244 [00:22<00:04,  8.80it/s]

 83%|███████████████████████████████████████████         | 202/244 [00:22<00:04,  8.81it/s]

 83%|███████████████████████████████████████████▎        | 203/244 [00:22<00:04,  8.79it/s]

 84%|███████████████████████████████████████████▍        | 204/244 [00:23<00:04,  8.77it/s]

 84%|███████████████████████████████████████████▋        | 205/244 [00:23<00:04,  8.75it/s]

 84%|███████████████████████████████████████████▉        | 206/244 [00:23<00:04,  8.74it/s]

 85%|████████████████████████████████████████████        | 207/244 [00:23<00:04,  8.73it/s]

 85%|████████████████████████████████████████████▎       | 208/244 [00:23<00:04,  8.73it/s]

 86%|████████████████████████████████████████████▌       | 209/244 [00:23<00:04,  8.72it/s]

 86%|████████████████████████████████████████████▊       | 210/244 [00:23<00:03,  8.70it/s]

 86%|████████████████████████████████████████████▉       | 211/244 [00:23<00:03,  8.72it/s]

 87%|█████████████████████████████████████████████▏      | 212/244 [00:23<00:03,  8.73it/s]

 87%|█████████████████████████████████████████████▍      | 213/244 [00:24<00:03,  8.73it/s]

 88%|█████████████████████████████████████████████▌      | 214/244 [00:24<00:03,  8.75it/s]

 88%|█████████████████████████████████████████████▊      | 215/244 [00:24<00:03,  8.76it/s]

 89%|██████████████████████████████████████████████      | 216/244 [00:24<00:03,  8.74it/s]

 89%|██████████████████████████████████████████████▏     | 217/244 [00:24<00:03,  8.78it/s]

 89%|██████████████████████████████████████████████▍     | 218/244 [00:24<00:02,  8.82it/s]

 90%|██████████████████████████████████████████████▋     | 219/244 [00:24<00:02,  8.83it/s]

 90%|██████████████████████████████████████████████▉     | 220/244 [00:24<00:02,  8.85it/s]

 91%|███████████████████████████████████████████████     | 221/244 [00:25<00:02,  8.83it/s]

 91%|███████████████████████████████████████████████▎    | 222/244 [00:25<00:02,  8.77it/s]

 91%|███████████████████████████████████████████████▌    | 223/244 [00:25<00:02,  8.74it/s]

 92%|███████████████████████████████████████████████▋    | 224/244 [00:25<00:02,  8.69it/s]

 92%|███████████████████████████████████████████████▉    | 225/244 [00:25<00:02,  8.43it/s]

 93%|████████████████████████████████████████████████▏   | 226/244 [00:25<00:02,  8.16it/s]

 93%|████████████████████████████████████████████████▍   | 227/244 [00:25<00:02,  8.32it/s]

 93%|████████████████████████████████████████████████▌   | 228/244 [00:25<00:01,  8.45it/s]

 94%|████████████████████████████████████████████████▊   | 229/244 [00:25<00:01,  8.51it/s]

 94%|█████████████████████████████████████████████████   | 230/244 [00:26<00:01,  8.57it/s]

 95%|█████████████████████████████████████████████████▏  | 231/244 [00:26<00:01,  8.59it/s]

 95%|█████████████████████████████████████████████████▍  | 232/244 [00:26<00:01,  8.61it/s]

 95%|█████████████████████████████████████████████████▋  | 233/244 [00:26<00:01,  8.65it/s]

 96%|█████████████████████████████████████████████████▊  | 234/244 [00:26<00:01,  8.70it/s]

 96%|██████████████████████████████████████████████████  | 235/244 [00:26<00:01,  8.72it/s]

 97%|██████████████████████████████████████████████████▎ | 236/244 [00:26<00:00,  8.72it/s]

 97%|██████████████████████████████████████████████████▌ | 237/244 [00:26<00:00,  8.71it/s]

 98%|██████████████████████████████████████████████████▋ | 238/244 [00:26<00:00,  8.71it/s]

 98%|██████████████████████████████████████████████████▉ | 239/244 [00:27<00:00,  8.69it/s]

 98%|███████████████████████████████████████████████████▏| 240/244 [00:27<00:00,  8.68it/s]

 99%|███████████████████████████████████████████████████▎| 241/244 [00:27<00:00,  8.66it/s]

 99%|███████████████████████████████████████████████████▌| 242/244 [00:27<00:00,  8.66it/s]

100%|███████████████████████████████████████████████████▊| 243/244 [00:27<00:00,  8.72it/s]

100%|████████████████████████████████████████████████████| 244/244 [00:27<00:00,  8.82it/s]

100%|████████████████████████████████████████████████████| 244/244 [00:27<00:00,  8.82it/s]

(732, 9)





Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject,feature,anova_p_value
0,high,unsel,-1.5096,0.0,-1.8942,-1.125,True,AreaShape_Area,1.0895449999999999e-20
1,high,wt,-2.0158,0.0,-2.3969,-1.6347,True,AreaShape_Area,1.0895449999999999e-20
2,unsel,wt,-0.5062,0.0066,-0.8908,-0.1216,True,AreaShape_Area,1.0895449999999999e-20
3,high,unsel,-1.5096,0.0,-1.8942,-1.125,True,AreaShape_CentralMoment_0_0,1.0895449999999999e-20
4,high,wt,-2.0158,0.0,-2.3969,-1.6347,True,AreaShape_CentralMoment_0_0,1.0895449999999999e-20


In [6]:
# save the results
output_file = pathlib.Path(
    "../../data/6.analysis_results/mean_aggregated_anova_results.parquet"
)
# create the output directory (and any missing parents) if it does not exist yet
output_file.parent.mkdir(exist_ok=True, parents=True)
# write the anova/tukeyhsd results as parquet
anova_output_df.to_parquet(output_file)

## Levene's test for homogeneity of variance across all groups

In [7]:
# one sub-frame per genotype, unpacked in the order high / unsel / wt
high_df, unsel_df, wt_df = (
    df[df["Metadata_genotype"] == genotype] for genotype in ("high", "unsel", "wt")
)

# column-oriented accumulator: one list per output column
levene_test_results = {
    column_name: []
    for column_name in ("feature", "levene_statistic", "levene_p_value")
}

for feature in tqdm.tqdm(features_df.columns):
    # Levene's test across the three genotype groups for this feature
    levene_results = levene(wt_df[feature], unsel_df[feature], high_df[feature])
    levene_test_results["feature"].append(feature)
    levene_test_results["levene_statistic"].append(levene_results.statistic)
    levene_test_results["levene_p_value"].append(levene_results.pvalue)

# one row per feature
levene_test_results_df = pd.DataFrame.from_dict(levene_test_results)
levene_test_results_df

  0%|                                                              | 0/244 [00:00<?, ?it/s]

100%|██████████████████████████████████████████████████| 244/244 [00:00<00:00, 3890.27it/s]




Unnamed: 0,feature,levene_statistic,levene_p_value
0,AreaShape_Area,4.808285,0.010668
1,AreaShape_CentralMoment_0_0,4.808285,0.010668
2,AreaShape_CentralMoment_0_1,9.124611,0.000269
3,AreaShape_CentralMoment_0_2,9.782865,0.000158
4,AreaShape_CentralMoment_0_3,3.424782,0.037402
...,...,...,...
239,Texture_SumVariance_OP_3_03_256,7.546945,0.000994
240,Texture_Variance_OP_3_00_256,8.691200,0.000384
241,Texture_Variance_OP_3_01_256,8.906770,0.000322
242,Texture_Variance_OP_3_02_256,8.563326,0.000427


## Calculate Levene's test statistic for the equality of variances
## Pairwise

In [8]:
# split the df into three genotypes
high_df = df[df["Metadata_genotype"] == "high"]
unsel_df = df[df["Metadata_genotype"] == "unsel"]
wt_df = df[df["Metadata_genotype"] == "wt"]

# the pairwise comparisons, keyed by the label stored in the "group" column.
# NOTE: "unsel_vs_wt" lists its samples wt-first; the order is left as-is so the
# results are unchanged (Levene's test does not depend on the order of its samples).
group_dict = {
    "high_vs_unsel": [high_df, unsel_df],
    "high_vs_wt": [high_df, wt_df],
    "unsel_vs_wt": [wt_df, unsel_df],
}


# column-oriented accumulator: every list receives one entry per (feature, comparison)
levene_test_results = {
    "feature": [],
    "levene_statistic": [],
    "levene_p_value": [],
    "group": [],
    "holm_bonferroni_p_value": [],
}

for feature in tqdm.tqdm(features_df.columns):
    # raw p-values of this feature's pairwise comparisons, in group_dict order
    levene_p_values = []
    for group_comparison, (first_group_df, second_group_df) in group_dict.items():
        # calculate the levene test for each feature
        levene_results = levene(first_group_df[feature], second_group_df[feature])
        levene_test_results["feature"].append(feature)
        levene_test_results["levene_statistic"].append(levene_results.statistic)
        levene_test_results["levene_p_value"].append(levene_results.pvalue)
        levene_test_results["group"].append(group_comparison)
        levene_p_values.append(levene_results.pvalue)
    # run holm-bonferroni correction on the p-values for each feature
    # (the correction is applied within a feature, across its pairwise
    # comparisons only; element [1] of the result holds the corrected p-values)
    levene_holm_bonferroni_p_values = multipletests(levene_p_values, method="holm")[1]
    # corrected values come back in input order, so they line up with the rows
    # appended above; extend() avoids building a throwaway list of None
    levene_test_results["holm_bonferroni_p_value"].extend(
        levene_holm_bonferroni_p_values
    )

  0%|                                                              | 0/244 [00:00<?, ?it/s]

  1%|▋                                                     | 3/244 [00:00<00:09, 26.50it/s]

  2%|█▎                                                    | 6/244 [00:00<00:09, 24.30it/s]

  4%|█▉                                                    | 9/244 [00:00<00:09, 23.83it/s]

  5%|██▌                                                  | 12/244 [00:00<00:09, 24.98it/s]

  6%|███▎                                                 | 15/244 [00:00<00:08, 25.89it/s]

  7%|███▉                                                 | 18/244 [00:00<00:08, 26.73it/s]

  9%|████▌                                                | 21/244 [00:00<00:08, 25.59it/s]

 10%|█████▏                                               | 24/244 [00:00<00:08, 25.51it/s]

 11%|█████▊                                               | 27/244 [00:01<00:08, 26.26it/s]

 12%|██████▌                                              | 30/244 [00:01<00:08, 26.43it/s]

 14%|███████▏                                             | 33/244 [00:01<00:07, 26.58it/s]

 15%|███████▊                                             | 36/244 [00:01<00:07, 26.33it/s]

 16%|████████▍                                            | 39/244 [00:01<00:07, 25.89it/s]

 17%|█████████                                            | 42/244 [00:01<00:07, 26.02it/s]

 18%|█████████▊                                           | 45/244 [00:01<00:07, 26.21it/s]

 20%|██████████▍                                          | 48/244 [00:01<00:07, 25.79it/s]

 21%|███████████                                          | 51/244 [00:01<00:07, 26.21it/s]

 22%|███████████▋                                         | 54/244 [00:02<00:07, 26.25it/s]

 23%|████████████▍                                        | 57/244 [00:02<00:07, 26.41it/s]

 25%|█████████████▎                                       | 61/244 [00:02<00:06, 27.87it/s]

 27%|██████████████                                       | 65/244 [00:02<00:06, 28.89it/s]

 28%|██████████████▊                                      | 68/244 [00:02<00:06, 28.88it/s]

 30%|███████████████▋                                     | 72/244 [00:02<00:05, 29.76it/s]

 31%|████████████████▌                                    | 76/244 [00:02<00:05, 30.40it/s]

 33%|█████████████████▍                                   | 80/244 [00:02<00:05, 30.58it/s]

 34%|██████████████████▏                                  | 84/244 [00:03<00:05, 29.23it/s]

 36%|██████████████████▉                                  | 87/244 [00:03<00:05, 27.86it/s]

 37%|███████████████████▌                                 | 90/244 [00:03<00:05, 26.47it/s]

 38%|████████████████████▏                                | 93/244 [00:03<00:05, 26.92it/s]

 39%|████████████████████▊                                | 96/244 [00:03<00:05, 27.32it/s]

 41%|█████████████████████▌                               | 99/244 [00:03<00:05, 26.15it/s]

 42%|█████████████████████▋                              | 102/244 [00:03<00:05, 26.31it/s]

 43%|██████████████████████▍                             | 105/244 [00:03<00:05, 26.95it/s]

 44%|███████████████████████                             | 108/244 [00:03<00:04, 27.55it/s]

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7a505a2662b0>>
Traceback (most recent call last):
  File "/home/lippincm/miniforge3/envs/op_cell_processing_env/lib/python3.8/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames


    def _clean_thread_parent_frames(
KeyboardInterrupt: 


 46%|███████████████████████▊                            | 112/244 [00:04<00:04, 27.92it/s]

 47%|████████████████████████▌                           | 115/244 [00:04<00:04, 27.50it/s]

 49%|█████████████████████████▎                          | 119/244 [00:04<00:04, 28.87it/s]

In [None]:
# Assemble the per-feature Levene results into a table in a single pass:
# make sure the p-value column is numeric (float), then order the rows by
# feature and, within each feature, by group comparison.
levene_test_results_df = (
    pd.DataFrame(levene_test_results)
    .astype({"levene_p_value": float})
    .sort_values(by=["feature", "group"], ascending=True)
)
levene_test_results_df

In [None]:
# Write the per-feature Levene results into the analysis-results directory,
# creating that directory (and any missing parents) if it is not there yet.
out_dir = pathlib.Path("../../data/6.analysis_results/")
out_dir.mkdir(exist_ok=True, parents=True)
levene_test_results_path = out_dir / "mean_aggregated_levene_test_results.csv"
# NOTE(review): the DataFrame index is written as the first CSV column here,
# whereas the later to_csv calls in this notebook pass index=False -- confirm
# which layout downstream readers expect.
levene_test_results_df.to_csv(path_or_buf=levene_test_results_path)

### Calculate the Levene test statistic on the aggregated data for each feature type, compared between genotypes

In [None]:
# Load the mean-aggregated profiles. resolve(strict=True) raises
# FileNotFoundError immediately if the parquet file does not exist.
data_path = pathlib.Path(
    "../../data/5.converted_data/mean_aggregated_data.parquet"
).resolve(strict=True)
data = pd.read_parquet(data_path)

# Drop all metadata except for the genotype label
features_df = data.drop(columns=data.filter(like="Metadata").columns)
features_df["Metadata_genotype"] = data["Metadata_genotype"]

# Reshape to long format: one row per (profile, feature) pair
features_long_df = features_df.melt(
    id_vars="Metadata_genotype", var_name="feature", value_name="value"
)

# Split each feature name on "_" into its named parts.
feature_name_parts = [
    "feature_group",
    "measurement",
    "bone",
    "parameter1",
    "parameter2",
    "parameter3",
]
# A plain expand=True split only fits these six columns when the longest name
# has exactly six parts; otherwise the assignment raises ValueError. Capping
# the number of splits keeps any extra pieces inside the last part, and the
# reindex pads missing trailing parts, so the result always has six columns
# (and is unchanged whenever the names do have six parts).
features_long_df[feature_name_parts] = (
    features_long_df["feature"]
    .str.split("_", n=len(feature_name_parts) - 1, expand=True)
    .reindex(columns=range(len(feature_name_parts)))
)

# Replace the short genotype codes with their display names
features_long_df["Metadata_genotype"] = features_long_df["Metadata_genotype"].replace(
    {"high": "High-Severity", "unsel": "Mid-Severity", "wt": "Wild Type"}
)
features_long_df.head()

In [None]:
# Levene test of equal variance between genotypes, run separately for each
# feature group on the pooled long-format values of that group.

# One long-format frame per genotype
high_df = features_long_df[features_long_df["Metadata_genotype"] == "High-Severity"]
unsel_df = features_long_df[features_long_df["Metadata_genotype"] == "Mid-Severity"]
wt_df = features_long_df[features_long_df["Metadata_genotype"] == "Wild Type"]
genotype_frames = {"high": high_df, "unsel": unsel_df, "wt": wt_df}

# Feature group -> short tag used inside the comparison labels
feature_group_tags = {
    "AreaShape": "area",
    "Intensity": "intensity",
    "Neighbors": "neighbors",
    "RadialDistribution": "radial",
    "Granularity": "granularity",
}

# Each entry: (genotypes as named in the label, genotypes in the order their
# samples are handed to the test). The unsel-vs-wt comparison passes the wild
# type sample first; that ordering is kept as it was.
genotype_comparisons = [
    (("high", "unsel"), ("high", "unsel")),
    (("high", "wt"), ("high", "wt")),
    (("unsel", "wt"), ("wt", "unsel")),
]

# group_dict[feature_group][comparison_label] -> [first_sample_df, second_sample_df]
group_dict = {}
for feature_group, tag in feature_group_tags.items():
    # rows of this feature group, split by genotype
    per_genotype = {
        genotype: frame[frame["feature_group"] == feature_group]
        for genotype, frame in genotype_frames.items()
    }
    group_dict[feature_group] = {
        f"{named[0]}_{tag}_v_{named[1]}_{tag}": [per_genotype[g] for g in passed]
        for named, passed in genotype_comparisons
    }

# levene test for each feature group
levene_test_results = {
    "feature_group": [],
    "levene_statistic": [],
    "levene_p_value": [],
    "group": [],
}

for feature_group, group_comparisons in tqdm.tqdm(group_dict.items()):
    for group, (first_sample, second_sample) in group_comparisons.items():
        levene_results = levene(first_sample["value"], second_sample["value"])

        levene_test_results["feature_group"].append(feature_group)
        levene_test_results["levene_statistic"].append(levene_results.statistic)
        # p-values are stored as returned by the test; no multiple-comparison
        # correction is applied in this cell
        levene_test_results["levene_p_value"].append(levene_results.pvalue)
        levene_test_results["group"].append(group)

levene_test_results_df = pd.DataFrame(levene_test_results)
levene_test_results_df.head()

In [None]:
# Persist the feature-group Levene results as CSV (without the index column),
# creating the output directory and any missing parents on demand.
out_dir = pathlib.Path("../../data/6.analysis_results/")
out_dir.mkdir(exist_ok=True, parents=True)
levene_test_results_path = (
    out_dir / "mean_aggregated_levene_test_results_feature_types.csv"
)
levene_test_results_df.to_csv(path_or_buf=levene_test_results_path, index=False)

In [None]:
# Rebuild the long-format table from the loaded profiles: feature columns
# only, with the genotype label carried along as the single identifier.
metadata_columns = data.filter(like="Metadata").columns
features_df = data.drop(columns=metadata_columns).assign(
    Metadata_genotype=data["Metadata_genotype"]
)
features_long_df = pd.melt(
    features_df,
    id_vars="Metadata_genotype",
    var_name="feature",
    value_name="value",
)

# Variance of every feature within every genotype, flattened back into
# ordinary columns with the aggregated column named "variance".
all_features_var = (
    features_long_df.groupby(["Metadata_genotype", "feature"])
    .var()
    .reset_index()
    .rename(columns={"value": "variance"})
)

# Save the per-feature variance table (no index column) and preview it
var_each_feature_path = out_dir / "mean_aggregated_variance_results_each_feature.csv"
all_features_var.to_csv(var_each_feature_path, index=False)
all_features_var.head()

In [None]:
# Variance of each feature within each genotype; the aggregated "value"
# column is renamed to "variance" so later cells can refer to it by that name.
var_df = (
    features_long_df.groupby(by=["Metadata_genotype", "feature"])
    .var()
    .reset_index()
    .rename(columns={"value": "variance"})
)

In [None]:
# Tag each variance row with its feature group, i.e. the text before the
# first "_" in the feature name. Only this leading part is kept in the saved
# table, so it is extracted directly instead of splitting the name into a
# fixed set of six columns (that assignment raises ValueError whenever the
# longest feature name does not have exactly six parts).
var_df["feature_group"] = var_df["feature"].str.split("_", n=1).str[0]

# Replace the short genotype codes with their display names
var_df["Metadata_genotype"] = var_df["Metadata_genotype"].replace(
    {"high": "High-Severity", "unsel": "Mid-Severity", "wt": "Wild Type"}
)

# The full feature name is not part of the saved table
var_df = var_df.drop(columns=["feature"])

# save the variance results (no index column)
var_path = out_dir / "mean_aggregated_variance_results_feature_types.csv"
var_df.to_csv(var_path, index=False)

In [None]:
# Summarise the variances within each genotype / feature-group pair:
# mean, standard deviation, max, min and number of values.
variance_summary = (
    var_df.groupby(["Metadata_genotype", "feature_group"])
    .agg(["mean", "std", "max", "min", "count"])
    .reset_index()
)
# Collapse the two-level column labels into single "<column>_<statistic>" names
variance_summary.columns = [
    "_".join(levels).strip() for levels in variance_summary.columns
]
# The two grouping keys have an empty second level and therefore end up with
# a trailing "_"; restore their original names.
var_df = variance_summary.rename(
    columns={
        "Metadata_genotype_": "Metadata_genotype",
        "feature_group_": "feature_group",
    }
)
var_df

In [None]:
# Write the per-genotype, per-feature-group variance statistics to CSV
# (no index column) in the analysis-results directory.
var_path = out_dir / "mean_aggregated_variance_results_feature_types_stats.csv"
var_df.to_csv(path_or_buf=var_path, index=False)