In [1]:
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import tqdm
from scipy.stats import levene

# import anova and tukeyhsd
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [2]:
def anova_function(features_df: pd.DataFrame, Metdata_column: str) -> pd.DataFrame:
    """
    Run a one-way ANOVA and a Tukey HSD post-hoc test for every feature column.

    For each feature, an OLS model ``feature ~ C(Metdata_column)`` is fitted, the
    type-II ANOVA p-value of the grouping term is extracted, and the pairwise
    Tukey HSD comparisons between the groups are collected.

    Parameters
    ----------
    features_df : pd.DataFrame
        The dataframe containing the features with only one metadata column.
        Every column other than ``Metdata_column`` is treated as a feature, so
        the metadata column does not have to be the last one.
        Feature names are interpolated directly into the model formula, so they
        must be usable as formula terms.
    Metdata_column : str
        The name of the metadata column to be used for the anova test

    Returns
    -------
    pd.DataFrame
        One row per (feature, group pair). The columns are those of the Tukey
        HSD summary table (as reported by statsmodels) plus ``feature`` and
        ``anova_p_value``. An empty dataframe is returned when there are no
        feature columns.
    """
    # every column except the grouping column is a feature to test
    feature_columns = [
        column for column in features_df.columns if column != Metdata_column
    ]

    # collect one result frame per feature and concatenate once at the end
    # (concatenating inside the loop re-copies all previous rows every iteration)
    per_feature_results = []

    for feature in tqdm.tqdm(feature_columns):
        # fit the one-way model and build the type-II ANOVA table
        model = ols(f"{feature} ~ C({Metdata_column})", data=features_df).fit()
        anova_table = sm.stats.anova_lm(model, typ=2)
        # the first row of the table is the grouping term; the index is labelled
        # by term name, so the row has to be selected by position with .iloc
        anova_p_value = anova_table["PR(>F)"].iloc[0]

        # pairwise post-hoc comparisons between the groups
        tukeyhsd = pairwise_tukeyhsd(features_df[feature], features_df[Metdata_column])
        # the summary table data is a list of rows whose first row is the header
        header, *rows = tukeyhsd.summary().data
        feature_results = pd.DataFrame(rows, columns=header)
        feature_results["feature"] = feature
        feature_results["anova_p_value"] = anova_p_value
        per_feature_results.append(feature_results)

    if not per_feature_results:
        # pd.concat raises on an empty list; keep the original empty-frame result
        return pd.DataFrame()
    return pd.concat(per_feature_results, axis=0, ignore_index=True)

In [3]:
# path to the aggregated data parquet file (relative to the notebook's working directory)
file_path = pathlib.Path("../../data/5.converted_data/custom_aggregated_data.parquet")
# read the whole table; it holds both the Metadata_* columns and the feature columns
df = pd.read_parquet(file_path)
# preview the first rows
df.head()

Unnamed: 0,Metadata_genotype,Metadata_replicate,Metadata_side,AreaShape_Area,AreaShape_CentralMoment_0_0,AreaShape_CentralMoment_0_1,AreaShape_CentralMoment_0_2,AreaShape_CentralMoment_0_3,AreaShape_CentralMoment_1_0,AreaShape_CentralMoment_1_1,...,Texture_SumEntropy_OP_3_02_256,Texture_SumEntropy_OP_3_03_256,Texture_SumVariance_OP_3_00_256,Texture_SumVariance_OP_3_01_256,Texture_SumVariance_OP_3_02_256,Texture_SumVariance_OP_3_03_256,Texture_Variance_OP_3_00_256,Texture_Variance_OP_3_01_256,Texture_Variance_OP_3_02_256,Texture_Variance_OP_3_03_256
0,high,1,L,-0.178797,-0.178797,0.003558,-0.386796,0.280118,0.037998,0.434046,...,-0.627762,-0.64499,-0.903354,-0.890939,-0.896995,-0.90287,-0.896145,-0.898448,-0.89888,-0.896609
1,high,1,R,1.371763,1.371763,0.538195,0.288322,0.501618,4.163833,-1.36192,...,0.419551,0.406918,-0.188358,-0.159114,-0.189615,-0.186308,-0.218981,-0.225754,-0.219073,-0.225134
2,high,10,L,1.706234,1.706234,2.652373,3.280425,-1.992966,0.18463,-1.404376,...,0.841475,0.834574,1.149186,1.1522,1.078905,1.144848,1.040009,1.02574,1.045492,1.027617
3,high,10,R,0.771674,0.771674,-1.332747,0.164304,0.244371,0.17376,-1.885144,...,0.45248,0.436138,0.112178,0.131982,0.108602,0.1257,0.075717,0.071207,0.076477,0.067922
4,high,11,L,2.180858,2.180858,2.142686,1.838038,-0.077515,0.00865,0.876597,...,0.494813,0.490844,-0.01614,0.001041,-0.021997,-0.020992,-0.033934,-0.042635,-0.034983,-0.038763


In [4]:
# boolean mask over the columns: True where the column name contains "Metadata"
metadata = df.columns.str.contains("Metadata")
# metadata table: only the columns flagged by the mask
metadata_df = df.loc[:, metadata]
# feature table: every column not flagged by the mask
features_df = df.loc[:, ~metadata]

## ANOVA for genotype only

In [5]:
# append the genotype label to a copy of the feature table so that the ANOVA
# input holds the features plus exactly one metadata column
anova_input_df = features_df.assign(
    Metadata_genotype=metadata_df["Metadata_genotype"]
)
# one-way ANOVA + Tukey HSD per feature, grouped by genotype
anova_output_df = anova_function(anova_input_df, "Metadata_genotype")
print(anova_output_df.shape)
anova_output_df.head()

  0%|                                                             | 0/244 [00:00<?, ?it/s]

  0%|▏                                                    | 1/244 [00:00<00:29,  8.16it/s]

  1%|▍                                                    | 2/244 [00:00<00:29,  8.33it/s]

  1%|▋                                                    | 3/244 [00:00<00:28,  8.45it/s]

  2%|▊                                                    | 4/244 [00:00<00:28,  8.44it/s]

  2%|█                                                    | 5/244 [00:00<00:28,  8.49it/s]

  2%|█▎                                                   | 6/244 [00:00<00:27,  8.64it/s]

  3%|█▌                                                   | 7/244 [00:00<00:27,  8.53it/s]

  3%|█▋                                                   | 8/244 [00:00<00:27,  8.54it/s]

  4%|█▉                                                   | 9/244 [00:01<00:27,  8.50it/s]

  4%|██▏                                                 | 10/244 [00:01<00:27,  8.51it/s]

  5%|██▎                                                 | 11/244 [00:01<00:27,  8.50it/s]

  5%|██▌                                                 | 12/244 [00:01<00:27,  8.49it/s]

  5%|██▊                                                 | 13/244 [00:01<00:27,  8.53it/s]

  6%|██▉                                                 | 14/244 [00:01<00:26,  8.54it/s]

  6%|███▏                                                | 15/244 [00:01<00:26,  8.55it/s]

  7%|███▍                                                | 16/244 [00:01<00:26,  8.50it/s]

  7%|███▌                                                | 17/244 [00:01<00:26,  8.52it/s]

  7%|███▊                                                | 18/244 [00:02<00:26,  8.51it/s]

  8%|████                                                | 19/244 [00:02<00:26,  8.51it/s]

  8%|████▎                                               | 20/244 [00:02<00:26,  8.51it/s]

  9%|████▍                                               | 21/244 [00:02<00:26,  8.52it/s]

  9%|████▋                                               | 22/244 [00:02<00:26,  8.53it/s]

  9%|████▉                                               | 23/244 [00:02<00:25,  8.52it/s]

 10%|█████                                               | 24/244 [00:02<00:25,  8.56it/s]

 10%|█████▎                                              | 25/244 [00:02<00:25,  8.54it/s]

 11%|█████▌                                              | 26/244 [00:03<00:25,  8.54it/s]

 11%|█████▊                                              | 27/244 [00:03<00:25,  8.54it/s]

 11%|█████▉                                              | 28/244 [00:03<00:25,  8.45it/s]

 12%|██████▏                                             | 29/244 [00:03<00:25,  8.37it/s]

 12%|██████▍                                             | 30/244 [00:03<00:25,  8.31it/s]

 13%|██████▌                                             | 31/244 [00:03<00:25,  8.28it/s]

 13%|██████▊                                             | 32/244 [00:03<00:25,  8.23it/s]

 14%|███████                                             | 33/244 [00:03<00:25,  8.19it/s]

 14%|███████▏                                            | 34/244 [00:04<00:25,  8.16it/s]

 14%|███████▍                                            | 35/244 [00:04<00:26,  8.02it/s]

 15%|███████▋                                            | 36/244 [00:04<00:25,  8.05it/s]

 15%|███████▉                                            | 37/244 [00:04<00:25,  8.12it/s]

 16%|████████                                            | 38/244 [00:04<00:25,  8.18it/s]

 16%|████████▎                                           | 39/244 [00:04<00:24,  8.28it/s]

 16%|████████▌                                           | 40/244 [00:04<00:24,  8.35it/s]

 17%|████████▋                                           | 41/244 [00:04<00:24,  8.32it/s]

 17%|████████▉                                           | 42/244 [00:04<00:24,  8.36it/s]

 18%|█████████▏                                          | 43/244 [00:05<00:23,  8.42it/s]

 18%|█████████▍                                          | 44/244 [00:05<00:23,  8.41it/s]

 18%|█████████▌                                          | 45/244 [00:05<00:23,  8.43it/s]

 19%|█████████▊                                          | 46/244 [00:05<00:23,  8.44it/s]

 19%|██████████                                          | 47/244 [00:05<00:23,  8.53it/s]

 20%|██████████▏                                         | 48/244 [00:05<00:23,  8.46it/s]

 20%|██████████▍                                         | 49/244 [00:05<00:23,  8.27it/s]

 20%|██████████▋                                         | 50/244 [00:05<00:23,  8.31it/s]

 21%|██████████▊                                         | 51/244 [00:06<00:22,  8.39it/s]

 21%|███████████                                         | 52/244 [00:06<00:22,  8.43it/s]

 22%|███████████▎                                        | 53/244 [00:06<00:22,  8.51it/s]

 22%|███████████▌                                        | 54/244 [00:06<00:22,  8.55it/s]

 23%|███████████▋                                        | 55/244 [00:06<00:22,  8.58it/s]

 23%|███████████▉                                        | 56/244 [00:06<00:21,  8.68it/s]

 23%|████████████▏                                       | 57/244 [00:06<00:21,  8.67it/s]

 24%|████████████▎                                       | 58/244 [00:06<00:21,  8.66it/s]

 24%|████████████▌                                       | 59/244 [00:06<00:21,  8.62it/s]

 25%|████████████▊                                       | 60/244 [00:07<00:21,  8.59it/s]

 25%|█████████████                                       | 61/244 [00:07<00:21,  8.56it/s]

 25%|█████████████▏                                      | 62/244 [00:07<00:21,  8.56it/s]

 26%|█████████████▍                                      | 63/244 [00:07<00:21,  8.55it/s]

 26%|█████████████▋                                      | 64/244 [00:07<00:21,  8.55it/s]

 27%|█████████████▊                                      | 65/244 [00:07<00:21,  8.23it/s]

 27%|██████████████                                      | 66/244 [00:07<00:21,  8.15it/s]

 27%|██████████████▎                                     | 67/244 [00:07<00:21,  8.23it/s]

 28%|██████████████▍                                     | 68/244 [00:08<00:21,  8.30it/s]

 28%|██████████████▋                                     | 69/244 [00:08<00:20,  8.38it/s]

 29%|██████████████▉                                     | 70/244 [00:08<00:23,  7.46it/s]

 29%|███████████████▏                                    | 71/244 [00:08<00:25,  6.84it/s]

 30%|███████████████▎                                    | 72/244 [00:08<00:26,  6.46it/s]

 30%|███████████████▌                                    | 73/244 [00:08<00:27,  6.19it/s]

 30%|███████████████▊                                    | 74/244 [00:09<00:28,  6.03it/s]

 31%|███████████████▉                                    | 75/244 [00:09<00:28,  5.91it/s]

 31%|████████████████▏                                   | 76/244 [00:09<00:31,  5.26it/s]

 32%|████████████████▍                                   | 77/244 [00:09<00:31,  5.24it/s]

 32%|████████████████▌                                   | 78/244 [00:09<00:33,  5.00it/s]

 32%|████████████████▊                                   | 79/244 [00:10<00:29,  5.68it/s]

 33%|█████████████████                                   | 80/244 [00:10<00:26,  6.23it/s]

 33%|█████████████████▎                                  | 81/244 [00:10<00:24,  6.63it/s]

 34%|█████████████████▍                                  | 82/244 [00:10<00:25,  6.31it/s]

 34%|█████████████████▋                                  | 83/244 [00:10<00:26,  6.15it/s]

 34%|█████████████████▉                                  | 84/244 [00:10<00:26,  6.00it/s]

 35%|██████████████████                                  | 85/244 [00:10<00:26,  5.94it/s]

 35%|██████████████████▎                                 | 86/244 [00:11<00:26,  5.88it/s]

 36%|██████████████████▌                                 | 87/244 [00:11<00:26,  5.82it/s]

 36%|██████████████████▊                                 | 88/244 [00:11<00:26,  5.80it/s]

 36%|██████████████████▉                                 | 89/244 [00:11<00:26,  5.80it/s]

 37%|███████████████████▏                                | 90/244 [00:11<00:26,  5.81it/s]

 37%|███████████████████▍                                | 91/244 [00:11<00:25,  6.08it/s]

 38%|███████████████████▌                                | 92/244 [00:12<00:22,  6.61it/s]

 38%|███████████████████▊                                | 93/244 [00:12<00:21,  7.06it/s]

 39%|████████████████████                                | 94/244 [00:12<00:21,  6.84it/s]

 39%|████████████████████▏                               | 95/244 [00:12<00:22,  6.50it/s]

 39%|████████████████████▍                               | 96/244 [00:12<00:23,  6.27it/s]

 40%|████████████████████▋                               | 97/244 [00:12<00:24,  6.12it/s]

 40%|████████████████████▉                               | 98/244 [00:13<00:24,  5.99it/s]

 41%|█████████████████████                               | 99/244 [00:13<00:25,  5.59it/s]

 41%|████████████████████▉                              | 100/244 [00:13<00:33,  4.32it/s]

 41%|█████████████████████                              | 101/244 [00:13<00:32,  4.36it/s]

 42%|█████████████████████▎                             | 102/244 [00:13<00:28,  4.94it/s]

 42%|█████████████████████▌                             | 103/244 [00:14<00:30,  4.59it/s]

 43%|█████████████████████▋                             | 104/244 [00:14<00:34,  4.05it/s]

 43%|█████████████████████▉                             | 105/244 [00:14<00:33,  4.12it/s]

 43%|██████████████████████▏                            | 106/244 [00:14<00:30,  4.52it/s]

 44%|██████████████████████▎                            | 107/244 [00:15<00:28,  4.82it/s]

 44%|██████████████████████▌                            | 108/244 [00:15<00:26,  5.05it/s]

 45%|██████████████████████▊                            | 109/244 [00:15<00:29,  4.55it/s]

 45%|██████████████████████▉                            | 110/244 [00:15<00:26,  5.06it/s]

 45%|███████████████████████▏                           | 111/244 [00:15<00:23,  5.74it/s]

 46%|███████████████████████▍                           | 112/244 [00:15<00:20,  6.35it/s]

 46%|███████████████████████▌                           | 113/244 [00:16<00:19,  6.85it/s]

 47%|███████████████████████▊                           | 114/244 [00:16<00:21,  5.97it/s]

 47%|████████████████████████                           | 115/244 [00:16<00:28,  4.53it/s]

 48%|████████████████████████▏                          | 116/244 [00:16<00:29,  4.40it/s]

 48%|████████████████████████▍                          | 117/244 [00:17<00:28,  4.48it/s]

 48%|████████████████████████▋                          | 118/244 [00:17<00:26,  4.80it/s]

 49%|████████████████████████▊                          | 119/244 [00:17<00:25,  4.97it/s]

 49%|█████████████████████████                          | 120/244 [00:17<00:23,  5.19it/s]

 50%|█████████████████████████▎                         | 121/244 [00:17<00:22,  5.38it/s]

 50%|█████████████████████████▌                         | 122/244 [00:17<00:22,  5.50it/s]

 50%|█████████████████████████▋                         | 123/244 [00:18<00:21,  5.62it/s]

 51%|█████████████████████████▉                         | 124/244 [00:18<00:19,  6.26it/s]

 51%|██████████████████████████▏                        | 125/244 [00:18<00:17,  6.78it/s]

 52%|██████████████████████████▎                        | 126/244 [00:18<00:19,  6.15it/s]

 52%|██████████████████████████▌                        | 127/244 [00:18<00:21,  5.55it/s]

 52%|██████████████████████████▊                        | 128/244 [00:19<00:24,  4.75it/s]

 53%|██████████████████████████▉                        | 129/244 [00:19<00:25,  4.55it/s]

 53%|███████████████████████████▏                       | 130/244 [00:19<00:25,  4.41it/s]

 54%|███████████████████████████▍                       | 131/244 [00:19<00:25,  4.43it/s]

 54%|███████████████████████████▌                       | 132/244 [00:19<00:23,  4.82it/s]

 55%|███████████████████████████▊                       | 133/244 [00:20<00:21,  5.05it/s]

 55%|████████████████████████████                       | 134/244 [00:20<00:20,  5.25it/s]

 55%|████████████████████████████▏                      | 135/244 [00:20<00:20,  5.43it/s]

 56%|████████████████████████████▍                      | 136/244 [00:20<00:17,  6.09it/s]

 56%|████████████████████████████▋                      | 137/244 [00:20<00:16,  6.63it/s]

 57%|████████████████████████████▊                      | 138/244 [00:20<00:15,  6.70it/s]

 57%|█████████████████████████████                      | 139/244 [00:21<00:17,  5.87it/s]

 57%|█████████████████████████████▎                     | 140/244 [00:21<00:17,  5.85it/s]

 58%|█████████████████████████████▍                     | 141/244 [00:21<00:17,  5.82it/s]

 58%|█████████████████████████████▋                     | 142/244 [00:21<00:17,  5.80it/s]

 59%|█████████████████████████████▉                     | 143/244 [00:21<00:17,  5.78it/s]

 59%|██████████████████████████████                     | 144/244 [00:21<00:17,  5.76it/s]

 59%|██████████████████████████████▎                    | 145/244 [00:22<00:17,  5.77it/s]

 60%|██████████████████████████████▌                    | 146/244 [00:22<00:17,  5.76it/s]

 60%|██████████████████████████████▋                    | 147/244 [00:22<00:16,  5.79it/s]

 61%|██████████████████████████████▉                    | 148/244 [00:22<00:16,  5.80it/s]

 61%|███████████████████████████████▏                   | 149/244 [00:22<00:16,  5.85it/s]

 61%|███████████████████████████████▎                   | 150/244 [00:22<00:15,  6.22it/s]

 62%|███████████████████████████████▌                   | 151/244 [00:23<00:15,  6.05it/s]

 62%|███████████████████████████████▊                   | 152/244 [00:23<00:15,  5.95it/s]

 63%|███████████████████████████████▉                   | 153/244 [00:23<00:15,  5.89it/s]

 63%|████████████████████████████████▏                  | 154/244 [00:23<00:15,  5.81it/s]

 64%|████████████████████████████████▍                  | 155/244 [00:23<00:15,  5.81it/s]

 64%|████████████████████████████████▌                  | 156/244 [00:23<00:15,  5.81it/s]

 64%|████████████████████████████████▊                  | 157/244 [00:24<00:14,  5.81it/s]

 65%|█████████████████████████████████                  | 158/244 [00:24<00:14,  5.80it/s]

 65%|█████████████████████████████████▏                 | 159/244 [00:24<00:14,  5.79it/s]

 66%|█████████████████████████████████▍                 | 160/244 [00:24<00:14,  5.78it/s]

 66%|█████████████████████████████████▋                 | 161/244 [00:24<00:14,  5.72it/s]

 66%|█████████████████████████████████▊                 | 162/244 [00:25<00:14,  5.83it/s]

 67%|██████████████████████████████████                 | 163/244 [00:25<00:12,  6.43it/s]

 67%|██████████████████████████████████▎                | 164/244 [00:25<00:11,  6.95it/s]

 68%|██████████████████████████████████▍                | 165/244 [00:25<00:11,  6.82it/s]

 68%|██████████████████████████████████▋                | 166/244 [00:25<00:12,  6.45it/s]

 68%|██████████████████████████████████▉                | 167/244 [00:25<00:12,  6.22it/s]

 69%|███████████████████████████████████                | 168/244 [00:25<00:13,  5.58it/s]

 69%|███████████████████████████████████▎               | 169/244 [00:26<00:14,  5.31it/s]

 70%|███████████████████████████████████▌               | 170/244 [00:26<00:13,  5.47it/s]

 70%|███████████████████████████████████▋               | 171/244 [00:26<00:13,  5.59it/s]

 70%|███████████████████████████████████▉               | 172/244 [00:26<00:12,  5.76it/s]

 71%|████████████████████████████████████▏              | 173/244 [00:26<00:11,  6.38it/s]

 71%|████████████████████████████████████▎              | 174/244 [00:26<00:10,  6.95it/s]

 72%|████████████████████████████████████▌              | 175/244 [00:27<00:10,  6.77it/s]

 72%|████████████████████████████████████▊              | 176/244 [00:27<00:10,  6.49it/s]

 73%|████████████████████████████████████▉              | 177/244 [00:27<00:10,  6.29it/s]

 73%|█████████████████████████████████████▏             | 178/244 [00:27<00:10,  6.12it/s]

 73%|█████████████████████████████████████▍             | 179/244 [00:27<00:10,  6.03it/s]

 74%|█████████████████████████████████████▌             | 180/244 [00:27<00:10,  5.99it/s]

 74%|█████████████████████████████████████▊             | 181/244 [00:28<00:11,  5.39it/s]

 75%|██████████████████████████████████████             | 182/244 [00:28<00:11,  5.56it/s]

 75%|██████████████████████████████████████▎            | 183/244 [00:28<00:09,  6.21it/s]

 75%|██████████████████████████████████████▍            | 184/244 [00:28<00:08,  6.80it/s]

 76%|██████████████████████████████████████▋            | 185/244 [00:28<00:08,  6.72it/s]

 76%|██████████████████████████████████████▉            | 186/244 [00:28<00:09,  6.39it/s]

 77%|███████████████████████████████████████            | 187/244 [00:29<00:09,  6.15it/s]

 77%|███████████████████████████████████████▎           | 188/244 [00:29<00:09,  6.08it/s]

 77%|███████████████████████████████████████▌           | 189/244 [00:29<00:09,  5.96it/s]

 78%|███████████████████████████████████████▋           | 190/244 [00:29<00:09,  5.89it/s]

 78%|███████████████████████████████████████▉           | 191/244 [00:29<00:09,  5.87it/s]

 79%|████████████████████████████████████████▏          | 192/244 [00:29<00:08,  5.86it/s]

 79%|████████████████████████████████████████▎          | 193/244 [00:30<00:08,  6.11it/s]

 80%|████████████████████████████████████████▌          | 194/244 [00:30<00:08,  5.84it/s]

 80%|████████████████████████████████████████▊          | 195/244 [00:30<00:08,  5.81it/s]

 80%|████████████████████████████████████████▉          | 196/244 [00:30<00:08,  5.76it/s]

 81%|█████████████████████████████████████████▏         | 197/244 [00:30<00:08,  5.74it/s]

 81%|█████████████████████████████████████████▍         | 198/244 [00:30<00:08,  5.74it/s]

 82%|█████████████████████████████████████████▌         | 199/244 [00:31<00:07,  5.72it/s]

 82%|█████████████████████████████████████████▊         | 200/244 [00:31<00:07,  5.74it/s]

 82%|██████████████████████████████████████████         | 201/244 [00:31<00:07,  5.78it/s]

 83%|██████████████████████████████████████████▏        | 202/244 [00:31<00:06,  6.20it/s]

 83%|██████████████████████████████████████████▍        | 203/244 [00:31<00:06,  6.74it/s]

 84%|██████████████████████████████████████████▋        | 204/244 [00:31<00:05,  7.19it/s]

 84%|██████████████████████████████████████████▊        | 205/244 [00:31<00:05,  7.51it/s]

 84%|███████████████████████████████████████████        | 206/244 [00:32<00:05,  6.90it/s]

 85%|███████████████████████████████████████████▎       | 207/244 [00:32<00:05,  6.51it/s]

 85%|███████████████████████████████████████████▍       | 208/244 [00:32<00:05,  6.19it/s]

 86%|███████████████████████████████████████████▋       | 209/244 [00:32<00:05,  6.04it/s]

 86%|███████████████████████████████████████████▉       | 210/244 [00:32<00:06,  5.18it/s]

 86%|████████████████████████████████████████████       | 211/244 [00:33<00:06,  5.25it/s]

 87%|████████████████████████████████████████████▎      | 212/244 [00:33<00:05,  5.38it/s]

 87%|████████████████████████████████████████████▌      | 213/244 [00:33<00:05,  5.49it/s]

 88%|████████████████████████████████████████████▋      | 214/244 [00:33<00:05,  5.97it/s]

 88%|████████████████████████████████████████████▉      | 215/244 [00:33<00:04,  6.56it/s]

 89%|█████████████████████████████████████████████▏     | 216/244 [00:33<00:03,  7.03it/s]

 89%|█████████████████████████████████████████████▎     | 217/244 [00:33<00:04,  6.73it/s]

 89%|█████████████████████████████████████████████▌     | 218/244 [00:34<00:04,  6.44it/s]

 90%|█████████████████████████████████████████████▊     | 219/244 [00:34<00:04,  6.24it/s]

 90%|█████████████████████████████████████████████▉     | 220/244 [00:34<00:03,  6.11it/s]

 91%|██████████████████████████████████████████████▏    | 221/244 [00:34<00:03,  6.01it/s]

 91%|██████████████████████████████████████████████▍    | 222/244 [00:34<00:03,  5.90it/s]

 91%|██████████████████████████████████████████████▌    | 223/244 [00:35<00:03,  5.27it/s]

 92%|██████████████████████████████████████████████▊    | 224/244 [00:35<00:04,  4.44it/s]

 92%|███████████████████████████████████████████████    | 225/244 [00:35<00:04,  4.65it/s]

 93%|███████████████████████████████████████████████▏   | 226/244 [00:35<00:03,  5.39it/s]

 93%|███████████████████████████████████████████████▍   | 227/244 [00:35<00:02,  6.07it/s]

 93%|███████████████████████████████████████████████▋   | 228/244 [00:35<00:02,  6.44it/s]

 94%|███████████████████████████████████████████████▊   | 229/244 [00:36<00:02,  6.22it/s]

 94%|████████████████████████████████████████████████   | 230/244 [00:36<00:02,  6.05it/s]

 95%|████████████████████████████████████████████████▎  | 231/244 [00:36<00:02,  5.97it/s]

 95%|████████████████████████████████████████████████▍  | 232/244 [00:36<00:02,  5.87it/s]

 95%|████████████████████████████████████████████████▋  | 233/244 [00:36<00:01,  5.84it/s]

 96%|████████████████████████████████████████████████▉  | 234/244 [00:37<00:01,  5.83it/s]

 96%|█████████████████████████████████████████████████  | 235/244 [00:37<00:01,  5.34it/s]

 97%|█████████████████████████████████████████████████▎ | 236/244 [00:37<00:01,  4.97it/s]

 97%|█████████████████████████████████████████████████▌ | 237/244 [00:37<00:01,  5.25it/s]

 98%|█████████████████████████████████████████████████▋ | 238/244 [00:37<00:01,  5.43it/s]

 98%|█████████████████████████████████████████████████▉ | 239/244 [00:37<00:00,  5.50it/s]

 98%|██████████████████████████████████████████████████▏| 240/244 [00:38<00:00,  5.51it/s]

 99%|██████████████████████████████████████████████████▎| 241/244 [00:38<00:00,  5.51it/s]

 99%|██████████████████████████████████████████████████▌| 242/244 [00:38<00:00,  5.54it/s]

100%|██████████████████████████████████████████████████▊| 243/244 [00:38<00:00,  5.58it/s]

100%|███████████████████████████████████████████████████| 244/244 [00:38<00:00,  5.49it/s]

100%|███████████████████████████████████████████████████| 244/244 [00:38<00:00,  6.27it/s]




(732, 9)


Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject,feature,anova_p_value
0,high,unsel,-1.7236,0.0,-2.2266,-1.2205,True,AreaShape_Area,8.16302e-20
1,high,wt,-2.5954,0.0,-3.0938,-2.097,True,AreaShape_Area,8.16302e-20
2,unsel,wt,-0.8718,0.0003,-1.3749,-0.3688,True,AreaShape_Area,8.16302e-20
3,high,unsel,-1.5096,0.0,-1.8942,-1.125,True,AreaShape_CentralMoment_0_0,1.0895449999999999e-20
4,high,wt,-2.0158,0.0,-2.3969,-1.6347,True,AreaShape_CentralMoment_0_0,1.0895449999999999e-20


In [6]:
# save the ANOVA / Tukey HSD results as a parquet file
output_file = pathlib.Path(
    "../../data/6.analysis_results/custom_aggregated_anova_results.parquet"
)
# create the results directory (and any missing parents) if it does not exist yet
output_file.parent.mkdir(exist_ok=True, parents=True)
anova_output_df.to_parquet(output_file)

## Levene's test for homogeneity of variance

In [7]:
# one sub-table per genotype
high_df = df.loc[df["Metadata_genotype"] == "high"]
unsel_df = df.loc[df["Metadata_genotype"] == "unsel"]
wt_df = df.loc[df["Metadata_genotype"] == "wt"]

# column-oriented accumulator: one entry per feature in each list
levene_test_results = {"feature": [], "levene_statistic": [], "levene_p_value": []}
for feature in tqdm.tqdm(features_df.columns):
    # Levene's test across the three genotypes for this feature
    levene_results = levene(
        *(genotype_df[feature] for genotype_df in (wt_df, unsel_df, high_df))
    )
    levene_test_results["feature"].append(feature)
    levene_test_results["levene_statistic"].append(levene_results.statistic)
    levene_test_results["levene_p_value"].append(levene_results.pvalue)

levene_test_results_df = pd.DataFrame.from_dict(levene_test_results)
levene_test_results_df

  0%|                                                             | 0/244 [00:00<?, ?it/s]

 65%|███████████████████████████████▉                 | 159/244 [00:00<00:00, 1589.18it/s]

100%|█████████████████████████████████████████████████| 244/244 [00:00<00:00, 1806.77it/s]




Unnamed: 0,feature,levene_statistic,levene_p_value
0,AreaShape_Area,2.664096,7.583990e-02
1,AreaShape_CentralMoment_0_0,4.808285,1.066760e-02
2,AreaShape_CentralMoment_0_1,5.060027,8.525926e-03
3,AreaShape_CentralMoment_0_2,9.782865,1.581992e-04
4,AreaShape_CentralMoment_0_3,6.034346,3.623435e-03
...,...,...,...
239,Texture_SumVariance_OP_3_03_256,7.546945,9.942484e-04
240,Texture_Variance_OP_3_00_256,22.209099,2.129331e-08
241,Texture_Variance_OP_3_01_256,8.906770,3.218403e-04
242,Texture_Variance_OP_3_02_256,22.013897,2.414564e-08


## Calculate Levene's test statistic for the equality of variances (pairwise and across all genotypes)

In [8]:
# one sub-table per genotype
high_df = df.loc[df["Metadata_genotype"] == "high"]
unsel_df = df.loc[df["Metadata_genotype"] == "unsel"]
wt_df = df.loc[df["Metadata_genotype"] == "wt"]

# comparisons to run: every pair of genotypes, plus all three together
group_dict = {
    "high_vs_unsel": [high_df, unsel_df],
    "high_vs_wt": [high_df, wt_df],
    "unsel_vs_wt": [wt_df, unsel_df],
    "all": [high_df, unsel_df, wt_df],
}

# column-oriented accumulator: one entry per (comparison, feature)
levene_test_results = {
    "feature": [],
    "levene_statistic": [],
    "levene_p_value": [],
    "group": [],
}
for group, genotype_dfs in tqdm.tqdm(group_dict.items()):
    for feature in features_df.columns:
        # pairwise comparisons hold two tables and "all" holds three;
        # unpacking passes however many samples the comparison defines
        levene_results = levene(
            *(genotype_df[feature] for genotype_df in genotype_dfs)
        )
        levene_test_results["feature"].append(feature)
        levene_test_results["levene_statistic"].append(levene_results.statistic)
        levene_test_results["levene_p_value"].append(levene_results.pvalue)
        levene_test_results["group"].append(group)

# make the p-value column float, then sort from the largest p-value to the smallest
levene_test_results_df = (
    pd.DataFrame(levene_test_results)
    .astype({"levene_p_value": float})
    .sort_values("levene_p_value", ascending=False)
)
levene_test_results_df

  0%|                                                               | 0/4 [00:00<?, ?it/s]

  W = numer / denom


 50%|███████████████████████████▌                           | 2/4 [00:00<00:00, 19.09it/s]

100%|███████████████████████████████████████████████████████| 4/4 [00:00<00:00, 19.68it/s]




Unnamed: 0,feature,levene_statistic,levene_p_value,group
584,AreaShape_Zernike_9_7,0.000006,9.980628e-01,unsel_vs_wt
437,Texture_AngularSecondMoment_OP_3_01_256,0.000013,9.971029e-01,high_vs_wt
618,RadialDistribution_RadialCV_OP_1of4,0.000150,9.902816e-01,unsel_vs_wt
58,AreaShape_SpatialMoment_0_2,0.000379,9.845444e-01,high_vs_unsel
916,RadialDistribution_ZernikePhase_OP_8_4,0.017349,9.828045e-01,all
...,...,...,...,...
886,RadialDistribution_ZernikeMagnitude_OP_8_0,31.690623,7.309801e-11,all
888,RadialDistribution_ZernikeMagnitude_OP_8_4,32.285839,5.251345e-11,all
266,AreaShape_HuMoment_2,67.925962,4.025587e-11,high_vs_wt
872,RadialDistribution_ZernikeMagnitude_OP_4_0,42.876346,2.213991e-13,all


In [9]:
# Write the per-feature Levene test results to disk as a CSV
out_dir = pathlib.Path("../../data/6.analysis_results/")
# make sure the output folder (and any missing parent folders) is present
out_dir.mkdir(exist_ok=True, parents=True)
levene_test_results_path = out_dir / "custom_aggregated_levene_test_results.csv"
# the DataFrame index is written as well (to_csv default)
levene_test_results_df.to_csv(levene_test_results_path)

### Calculate Levene's test statistic for the aggregated data across feature types and genotypes

In [10]:
# Load the aggregated profiles; strict=True raises immediately when the
# parquet file does not exist
data_path = pathlib.Path(
    "../../data/5.converted_data/custom_aggregated_data.parquet"
).resolve(strict=True)
data = pd.read_parquet(data_path)

# Keep every non-metadata column plus the genotype label
features_df = data.drop(columns=data.filter(like="Metadata").columns)
features_df["Metadata_genotype"] = data["Metadata_genotype"]

# Reshape to long format: one row per original row and feature column
features_long_df = features_df.melt(
    id_vars="Metadata_genotype", var_name="feature", value_name="value"
)

# Split each feature name on "_" into its named components.
# `n` caps the number of pieces at the number of target columns and `reindex`
# pads missing trailing columns, so the assignment below no longer requires
# the longest feature name to have exactly six parts.
feature_name_parts = [
    "feature_group",
    "measurement",
    "bone",
    "parameter1",
    "parameter2",
    "parameter3",
]
split_feature_names = (
    features_long_df["feature"]
    .str.split("_", n=len(feature_name_parts) - 1, expand=True)
    .reindex(columns=range(len(feature_name_parts)))
)
features_long_df[feature_name_parts] = split_feature_names

# Replace the short genotype codes with their display names
features_long_df["Metadata_genotype"] = features_long_df["Metadata_genotype"].replace(
    {"high": "High-Severity", "unsel": "Mid-Severity", "wt": "Wild Type"}
)
features_long_df.head()

Unnamed: 0,Metadata_genotype,feature,value,feature_group,measurement,bone,parameter1,parameter2,parameter3
0,High-Severity,AreaShape_Area,-0.178797,AreaShape,Area,,,,
1,High-Severity,AreaShape_Area,1.371763,AreaShape,Area,,,,
2,High-Severity,AreaShape_Area,1.706234,AreaShape,Area,,,,
3,High-Severity,AreaShape_Area,0.771674,AreaShape,Area,,,,
4,High-Severity,AreaShape_Area,2.180858,AreaShape,Area,,,,


In [11]:
# Levene's test on the pooled feature values of each feature group, run for
# every pair of genotypes.

# One long-format frame per genotype (display names set when the long table
# was built)
high_df = features_long_df[features_long_df["Metadata_genotype"] == "High-Severity"]
unsel_df = features_long_df[features_long_df["Metadata_genotype"] == "Mid-Severity"]
wt_df = features_long_df[features_long_df["Metadata_genotype"] == "Wild Type"]

# feature group -> short tag used inside the comparison labels
# NOTE(review): the variance summary later in this notebook also reports a
# "Texture" feature group, which is not tested here - confirm whether that
# omission is intentional before adding it (it would add rows to the saved CSV).
feature_group_tags = {
    "AreaShape": "area",
    "Intensity": "intensity",
    "Neighbors": "neighbors",
    "RadialDistribution": "radial",
    "Granularity": "granularity",
}

# feature group -> {comparison label: [first frame, second frame]}
group_dict = {}
for feature_group, tag in feature_group_tags.items():
    high_rows = high_df[high_df["feature_group"] == feature_group]
    unsel_rows = unsel_df[unsel_df["feature_group"] == feature_group]
    wt_rows = wt_df[wt_df["feature_group"] == feature_group]
    group_dict[feature_group] = {
        f"high_{tag}_v_unsel_{tag}": [high_rows, unsel_rows],
        f"high_{tag}_v_wt_{tag}": [high_rows, wt_rows],
        # wild type is passed first here, matching the original argument order
        f"unsel_{tag}_v_wt_{tag}": [wt_rows, unsel_rows],
    }

# column-wise accumulator for the results table
levene_test_results = {
    "feature_group": [],
    "levene_statistic": [],
    "levene_p_value": [],
    "group": [],
}

for feature_group in tqdm.tqdm(group_dict.keys()):
    for group, (first_df, second_df) in group_dict[feature_group].items():
        levene_results = levene(first_df["value"], second_df["value"])
        levene_test_results["feature_group"].append(feature_group)
        levene_test_results["levene_statistic"].append(levene_results.statistic)
        levene_test_results["levene_p_value"].append(levene_results.pvalue)
        levene_test_results["group"].append(group)

levene_test_results_df = pd.DataFrame(levene_test_results)
levene_test_results_df.head()

  0%|                                                               | 0/5 [00:00<?, ?it/s]

100%|██████████████████████████████████████████████████████| 5/5 [00:00<00:00, 737.94it/s]




Unnamed: 0,feature_group,levene_statistic,levene_p_value,group
0,AreaShape,47.175905,7.22209e-12,high_area_v_unsel_area
1,AreaShape,113.242704,3.4325559999999997e-26,high_area_v_wt_area
2,AreaShape,7.530054,0.006087899,unsel_area_v_wt_area
3,Intensity,0.9311,0.3348608,high_intensity_v_unsel_intensity
4,Intensity,59.572829,3.35157e-14,high_intensity_v_wt_intensity


In [12]:
# Write the per-feature-group Levene test results to disk as a CSV
out_dir = pathlib.Path("../../data/6.analysis_results/")
# make sure the output folder (and any missing parent folders) is present
out_dir.mkdir(exist_ok=True, parents=True)
levene_test_results_path = (
    out_dir / "custom_aggregated_levene_test_results_feature_types.csv"
)
# the row index is left out of the file
levene_test_results_df.to_csv(levene_test_results_path, index=False)

In [13]:
# Strip every Metadata column, then put the genotype label back as the last column
metadata_columns = data.filter(like="Metadata").columns
features_df = data.drop(columns=metadata_columns).assign(
    Metadata_genotype=data["Metadata_genotype"]
)

# Long format: one row per original row and feature column
features_long_df = pd.melt(
    features_df,
    id_vars="Metadata_genotype",
    var_name="feature",
    value_name="value",
)
features_long_df

Unnamed: 0,Metadata_genotype,feature,value
0,high,AreaShape_Area,-0.178797
1,high,AreaShape_Area,1.371763
2,high,AreaShape_Area,1.706234
3,high,AreaShape_Area,0.771674
4,high,AreaShape_Area,2.180858
...,...,...,...
20247,wt,Texture_Variance_OP_3_03_256,1.434540
20248,wt,Texture_Variance_OP_3_03_256,1.956497
20249,wt,Texture_Variance_OP_3_03_256,1.223845
20250,wt,Texture_Variance_OP_3_03_256,-1.017697


In [14]:
# Variance of the values of every feature within every genotype; the
# resulting "value" column is renamed to "variance"
var_df = (
    features_long_df.groupby(["Metadata_genotype", "feature"])
    .var()
    .reset_index()
    .rename(columns={"value": "variance"})
)

In [15]:
# Tag every per-feature variance with its feature group, i.e. the text before
# the first "_" of the feature name. Taking only the first piece avoids
# creating the remaining name components just to drop them again, and does
# not depend on how many "_"-separated parts the feature names have.
var_df["feature_group"] = var_df["feature"].str.split("_", n=1).str[0]

# Replace the short genotype codes with their display names
var_df["Metadata_genotype"] = var_df["Metadata_genotype"].replace(
    {"high": "High-Severity", "unsel": "Mid-Severity", "wt": "Wild Type"}
)

# Keep only the genotype, the variance and the feature group
var_df = var_df.drop(columns=["feature"])

# save the variance results
var_path = out_dir / "custom_aggregated_variance_results_feature_types.csv"
var_df.to_csv(var_path, index=False)

In [16]:
# Summarise the per-feature variances for every genotype / feature group pair
summary_statistics = ["mean", "std", "max", "min", "count"]
var_df = (
    var_df.groupby(["Metadata_genotype", "feature_group"])
    .agg(summary_statistics)
    .reset_index()
)
# Collapse the two-level column labels into single strings joined by "_"
flat_column_names = []
for column_levels in var_df.columns.values:
    flat_column_names.append("_".join(column_levels).strip())
var_df.columns = flat_column_names
# The two grouping columns come out with a trailing "_"; restore their names
var_df = var_df.rename(
    columns={
        "Metadata_genotype_": "Metadata_genotype",
        "feature_group_": "feature_group",
    }
)
var_df

Unnamed: 0,Metadata_genotype,feature_group,variance_mean,variance_std,variance_max,variance_min,variance_count
0,High-Severity,AreaShape,0.774588,0.66997,3.855982,5e-06,98
1,High-Severity,Granularity,0.27819,0.357954,0.981199,0.015287,6
2,High-Severity,Intensity,0.529629,0.281565,1.157014,0.129043,15
3,High-Severity,Neighbors,0.730238,0.237898,0.953424,0.479949,3
4,High-Severity,RadialDistribution,0.670404,0.430295,1.751219,0.0,70
5,High-Severity,Texture,0.300315,0.206077,0.719217,0.032302,52
6,Mid-Severity,AreaShape,1.112337,0.783353,3.934014,0.165658,98
7,Mid-Severity,Granularity,0.792799,0.681926,1.982012,0.191056,6
8,Mid-Severity,Intensity,0.699825,0.633322,2.091309,0.076053,15
9,Mid-Severity,Neighbors,1.909325,1.552501,3.634385,0.624438,3


In [17]:
# Write the variance summary statistics to disk as a CSV (row index left out)
var_path = out_dir / "custom_aggregated_variance_results_feature_types_stats.csv"
var_df.to_csv(var_path, index=False)