In [1]:
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import tqdm
from scipy.stats import levene

# import anova and tukeyhsd
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [2]:
def anova_function(features_df: pd.DataFrame, Metdata_column: str) -> pd.DataFrame:
    """
    Run a one-way ANOVA and a Tukey HSD post-hoc test for every feature column.

    Every column of ``features_df`` other than ``Metdata_column`` is treated as
    a feature. For each feature an OLS model ``feature ~ C(Metdata_column)`` is
    fitted, a type-2 ANOVA table is computed from it, and pairwise Tukey HSD
    comparisons between the groups of ``Metdata_column`` are run.

    Parameters
    ----------
    features_df : pd.DataFrame
        The dataframe containing the features with only one metadata column.
        The metadata column may sit at any position. Feature names are
        interpolated into a model formula as-is, so they must be valid
        formula identifiers.
    Metdata_column : str
        The name of the metadata column to be used as the grouping factor
        for the anova and tukeyhsd tests

    Returns
    -------
    pd.DataFrame
        One row per pairwise group comparison per feature: the columns of the
        Tukey HSD summary table plus ``feature`` and ``anova_p_value`` (the
        p-value of the grouping term, repeated on every row of that feature).
        An empty dataframe is returned when there are no feature columns.

    Raises
    ------
    KeyError
        If ``Metdata_column`` is not a column of ``features_df``.
    """
    if Metdata_column not in features_df.columns:
        raise KeyError(f"{Metdata_column!r} is not a column of features_df")

    # select features by name rather than by position, so the result does not
    # depend on the metadata column being the last one
    feature_columns = [
        column for column in features_df.columns if column != Metdata_column
    ]
    if not feature_columns:
        return pd.DataFrame()

    # collect the per-feature tables and concatenate once at the end;
    # concatenating inside the loop re-copies all earlier rows every iteration
    per_feature_results = []
    for feature in tqdm.tqdm(feature_columns):
        # fit the one-way model and build its type-2 anova table
        model = ols(f"{feature} ~ C({Metdata_column})", data=features_df).fit()
        anova_table = sm.stats.anova_lm(model, typ=2)
        # the first row of the table is the grouping term; use .iloc because
        # the table is indexed by term label, not by integer position
        anova_p_value = anova_table["PR(>F)"].iloc[0]

        # pairwise post-hoc comparisons between the groups
        tukeyhsd = pairwise_tukeyhsd(features_df[feature], features_df[Metdata_column])
        # summary() is the public accessor for the results table; its first
        # data row is the header. Splitting the header off before building the
        # frame lets pandas infer the column dtypes from the values alone.
        header, *rows = tukeyhsd.summary().data
        tukey_df = pd.DataFrame(rows, columns=header)
        tukey_df["feature"] = feature
        tukey_df["anova_p_value"] = anova_p_value
        per_feature_results.append(tukey_df)

    return pd.concat(per_feature_results, axis=0, ignore_index=True)

In [3]:
# relative path to the input parquet file (mean-aggregated data, per the file name)
file_path = pathlib.Path("../../data/5.converted_data/mean_aggregated_data.parquet")
# read the parquet file into a dataframe
df = pd.read_parquet(file_path)
# show the first rows as a quick sanity check of the loaded table
df.head()

Unnamed: 0,Metadata_genotype,Metadata_replicate,Metadata_side,AreaShape_Area,AreaShape_CentralMoment_0_0,AreaShape_CentralMoment_0_1,AreaShape_CentralMoment_0_2,AreaShape_CentralMoment_0_3,AreaShape_CentralMoment_1_0,AreaShape_CentralMoment_1_1,...,Texture_SumEntropy_OP_3_02_256,Texture_SumEntropy_OP_3_03_256,Texture_SumVariance_OP_3_00_256,Texture_SumVariance_OP_3_01_256,Texture_SumVariance_OP_3_02_256,Texture_SumVariance_OP_3_03_256,Texture_Variance_OP_3_00_256,Texture_Variance_OP_3_01_256,Texture_Variance_OP_3_02_256,Texture_Variance_OP_3_03_256
0,high,1,L,-0.178797,-0.178797,0.003558,-0.386796,0.280118,0.037998,0.434046,...,-0.627762,-0.64499,-0.903354,-0.890939,-0.896995,-0.90287,-0.896145,-0.898448,-0.89888,-0.896609
1,high,1,R,1.371763,1.371763,0.538195,0.288322,0.501618,4.163833,-1.36192,...,0.419551,0.406918,-0.188358,-0.159114,-0.189615,-0.186308,-0.218981,-0.225754,-0.219073,-0.225134
2,high,10,L,1.706234,1.706234,2.652373,3.280425,-1.992966,0.18463,-1.404376,...,0.841475,0.834574,1.149186,1.1522,1.078905,1.144848,1.040009,1.02574,1.045492,1.027617
3,high,10,R,0.771674,0.771674,-1.332747,0.164304,0.244371,0.17376,-1.885144,...,0.45248,0.436138,0.112178,0.131982,0.108602,0.1257,0.075717,0.071207,0.076477,0.067922
4,high,11,L,2.180858,2.180858,2.142686,1.838038,-0.077515,0.00865,0.876597,...,0.494813,0.490844,-0.01614,0.001041,-0.021997,-0.020992,-0.033934,-0.042635,-0.034983,-0.038763


In [4]:
# split the features and the metadata
# boolean mask over the columns: True where the column name contains "Metadata"
metadata = df.columns.str.contains("Metadata")
# filter the metadata: keep only the columns selected by the mask
metadata_df = df.loc[:, metadata]
# filter the features: keep only the columns NOT selected by the mask
features_df = df.loc[:, ~metadata]

## ANOVA for genotype only

In [5]:
# build the ANOVA input: a copy of the feature table with the genotype label
# appended as its last column
anova_input_df = features_df.assign(
    Metadata_genotype=metadata_df["Metadata_genotype"]
)
# run the per-feature ANOVA + Tukey HSD, grouping by genotype
anova_output_df = anova_function(
    features_df=anova_input_df,
    Metdata_column="Metadata_genotype",
)
print(anova_output_df.shape)
anova_output_df.head()

  0%|                                                                   | 0/244 [00:00<?, ?it/s]

  0%|▏                                                          | 1/244 [00:00<00:30,  8.03it/s]

  1%|▍                                                          | 2/244 [00:00<00:29,  8.32it/s]

  1%|▋                                                          | 3/244 [00:00<00:28,  8.45it/s]

  2%|▉                                                          | 4/244 [00:00<00:28,  8.44it/s]

  2%|█▏                                                         | 5/244 [00:00<00:28,  8.50it/s]

  2%|█▍                                                         | 6/244 [00:00<00:27,  8.69it/s]

  3%|█▋                                                         | 7/244 [00:00<00:27,  8.63it/s]

  3%|█▉                                                         | 8/244 [00:00<00:27,  8.62it/s]

  4%|██▏                                                        | 9/244 [00:01<00:27,  8.55it/s]

  4%|██▍                                                       | 10/244 [00:01<00:27,  8.53it/s]

  5%|██▌                                                       | 11/244 [00:01<00:27,  8.54it/s]

  5%|██▊                                                       | 12/244 [00:01<00:27,  8.51it/s]

  5%|███                                                       | 13/244 [00:01<00:27,  8.50it/s]

  6%|███▎                                                      | 14/244 [00:01<00:27,  8.49it/s]

  6%|███▌                                                      | 15/244 [00:01<00:26,  8.50it/s]

  7%|███▊                                                      | 16/244 [00:01<00:26,  8.49it/s]

  7%|████                                                      | 17/244 [00:01<00:26,  8.46it/s]

  7%|████▎                                                     | 18/244 [00:02<00:26,  8.45it/s]

  8%|████▌                                                     | 19/244 [00:02<00:26,  8.44it/s]

  8%|████▊                                                     | 20/244 [00:02<00:26,  8.46it/s]

  9%|████▉                                                     | 21/244 [00:02<00:26,  8.48it/s]

  9%|█████▏                                                    | 22/244 [00:02<00:26,  8.49it/s]

  9%|█████▍                                                    | 23/244 [00:02<00:25,  8.51it/s]

 10%|█████▋                                                    | 24/244 [00:02<00:25,  8.55it/s]

 10%|█████▉                                                    | 25/244 [00:02<00:25,  8.53it/s]

 11%|██████▏                                                   | 26/244 [00:03<00:25,  8.57it/s]

 11%|██████▍                                                   | 27/244 [00:03<00:25,  8.56it/s]

 11%|██████▋                                                   | 28/244 [00:03<00:25,  8.57it/s]

 12%|██████▉                                                   | 29/244 [00:03<00:25,  8.56it/s]

 12%|███████▏                                                  | 30/244 [00:03<00:25,  8.53it/s]

 13%|███████▎                                                  | 31/244 [00:03<00:24,  8.53it/s]

 13%|███████▌                                                  | 32/244 [00:03<00:24,  8.51it/s]

 14%|███████▊                                                  | 33/244 [00:03<00:24,  8.55it/s]

 14%|████████                                                  | 34/244 [00:03<00:24,  8.55it/s]

 14%|████████▎                                                 | 35/244 [00:04<00:24,  8.59it/s]

 15%|████████▌                                                 | 36/244 [00:04<00:24,  8.37it/s]

 15%|████████▊                                                 | 37/244 [00:04<00:24,  8.41it/s]

 16%|█████████                                                 | 38/244 [00:04<00:24,  8.46it/s]

 16%|█████████▎                                                | 39/244 [00:04<00:24,  8.47it/s]

 16%|█████████▌                                                | 40/244 [00:04<00:23,  8.51it/s]

 17%|█████████▋                                                | 41/244 [00:04<00:23,  8.52it/s]

 17%|█████████▉                                                | 42/244 [00:04<00:23,  8.51it/s]

 18%|██████████▏                                               | 43/244 [00:05<00:23,  8.51it/s]

 18%|██████████▍                                               | 44/244 [00:05<00:23,  8.46it/s]

 18%|██████████▋                                               | 45/244 [00:05<00:23,  8.43it/s]

 19%|██████████▉                                               | 46/244 [00:05<00:23,  8.44it/s]

 19%|███████████▏                                              | 47/244 [00:05<00:23,  8.49it/s]

 20%|███████████▍                                              | 48/244 [00:05<00:23,  8.47it/s]

 20%|███████████▋                                              | 49/244 [00:05<00:23,  8.43it/s]

 20%|███████████▉                                              | 50/244 [00:05<00:22,  8.50it/s]

 21%|████████████                                              | 51/244 [00:05<00:22,  8.51it/s]

 21%|████████████▎                                             | 52/244 [00:06<00:22,  8.35it/s]

 22%|████████████▌                                             | 53/244 [00:06<00:22,  8.42it/s]

 22%|████████████▊                                             | 54/244 [00:06<00:22,  8.47it/s]

 23%|█████████████                                             | 55/244 [00:06<00:22,  8.51it/s]

 23%|█████████████▎                                            | 56/244 [00:06<00:21,  8.61it/s]

 23%|█████████████▌                                            | 57/244 [00:06<00:21,  8.62it/s]

 24%|█████████████▊                                            | 58/244 [00:06<00:21,  8.62it/s]

 24%|██████████████                                            | 59/244 [00:06<00:21,  8.62it/s]

 25%|██████████████▎                                           | 60/244 [00:07<00:21,  8.62it/s]

 25%|██████████████▌                                           | 61/244 [00:07<00:21,  8.63it/s]

 25%|██████████████▋                                           | 62/244 [00:07<00:21,  8.61it/s]

 26%|██████████████▉                                           | 63/244 [00:07<00:21,  8.60it/s]

 26%|███████████████▏                                          | 64/244 [00:07<00:20,  8.59it/s]

 27%|███████████████▍                                          | 65/244 [00:07<00:21,  8.45it/s]

 27%|███████████████▋                                          | 66/244 [00:07<00:21,  8.35it/s]

 27%|███████████████▉                                          | 67/244 [00:07<00:21,  8.39it/s]

 28%|████████████████▏                                         | 68/244 [00:07<00:20,  8.42it/s]

 28%|████████████████▍                                         | 69/244 [00:08<00:20,  8.50it/s]

 29%|████████████████▋                                         | 70/244 [00:08<00:20,  8.57it/s]

 29%|████████████████▉                                         | 71/244 [00:08<00:20,  8.59it/s]

 30%|█████████████████                                         | 72/244 [00:08<00:19,  8.62it/s]

 30%|█████████████████▎                                        | 73/244 [00:08<00:19,  8.63it/s]

 30%|█████████████████▌                                        | 74/244 [00:08<00:19,  8.58it/s]

 31%|█████████████████▊                                        | 75/244 [00:08<00:19,  8.59it/s]

 31%|██████████████████                                        | 76/244 [00:08<00:19,  8.53it/s]

 32%|██████████████████▎                                       | 77/244 [00:09<00:19,  8.54it/s]

 32%|██████████████████▌                                       | 78/244 [00:09<00:19,  8.61it/s]

 32%|██████████████████▊                                       | 79/244 [00:09<00:19,  8.65it/s]

 33%|███████████████████                                       | 80/244 [00:09<00:19,  8.61it/s]

 33%|███████████████████▎                                      | 81/244 [00:09<00:18,  8.66it/s]

 34%|███████████████████▍                                      | 82/244 [00:09<00:18,  8.66it/s]

 34%|███████████████████▋                                      | 83/244 [00:09<00:18,  8.73it/s]

 34%|███████████████████▉                                      | 84/244 [00:09<00:18,  8.72it/s]

 35%|████████████████████▏                                     | 85/244 [00:09<00:18,  8.68it/s]

 35%|████████████████████▍                                     | 86/244 [00:10<00:18,  8.69it/s]

 36%|████████████████████▋                                     | 87/244 [00:10<00:19,  7.86it/s]

 36%|████████████████████▉                                     | 88/244 [00:10<00:19,  8.10it/s]

 36%|█████████████████████▏                                    | 89/244 [00:10<00:18,  8.30it/s]

 37%|█████████████████████▍                                    | 90/244 [00:10<00:18,  8.42it/s]

 37%|█████████████████████▋                                    | 91/244 [00:10<00:19,  7.92it/s]

 38%|█████████████████████▊                                    | 92/244 [00:10<00:21,  7.15it/s]

 38%|██████████████████████                                    | 93/244 [00:11<00:22,  6.65it/s]

 39%|██████████████████████▎                                   | 94/244 [00:11<00:23,  6.34it/s]

 39%|██████████████████████▌                                   | 95/244 [00:11<00:25,  5.79it/s]

 39%|██████████████████████▊                                   | 96/244 [00:11<00:25,  5.77it/s]

 40%|███████████████████████                                   | 97/244 [00:11<00:25,  5.79it/s]

 40%|███████████████████████▎                                  | 98/244 [00:12<00:28,  5.04it/s]

 41%|███████████████████████▌                                  | 99/244 [00:12<00:30,  4.78it/s]

 41%|███████████████████████▎                                 | 100/244 [00:12<00:26,  5.51it/s]

 41%|███████████████████████▌                                 | 101/244 [00:12<00:24,  5.78it/s]

 42%|███████████████████████▊                                 | 102/244 [00:12<00:24,  5.79it/s]

 42%|████████████████████████                                 | 103/244 [00:12<00:24,  5.78it/s]

 43%|████████████████████████▎                                | 104/244 [00:13<00:24,  5.77it/s]

 43%|████████████████████████▌                                | 105/244 [00:13<00:24,  5.78it/s]

 43%|████████████████████████▊                                | 106/244 [00:13<00:23,  5.80it/s]

 44%|████████████████████████▉                                | 107/244 [00:13<00:29,  4.67it/s]

 44%|█████████████████████████▏                               | 108/244 [00:13<00:27,  4.93it/s]

 45%|█████████████████████████▍                               | 109/244 [00:14<00:25,  5.26it/s]

 45%|█████████████████████████▋                               | 110/244 [00:14<00:22,  5.95it/s]

 45%|█████████████████████████▉                               | 111/244 [00:14<00:21,  6.05it/s]

 46%|██████████████████████████▏                              | 112/244 [00:14<00:22,  5.95it/s]

 46%|██████████████████████████▍                              | 113/244 [00:14<00:22,  5.87it/s]

 47%|██████████████████████████▋                              | 114/244 [00:14<00:22,  5.83it/s]

 47%|██████████████████████████▊                              | 115/244 [00:15<00:22,  5.79it/s]

 48%|███████████████████████████                              | 116/244 [00:15<00:22,  5.78it/s]

 48%|███████████████████████████▎                             | 117/244 [00:15<00:25,  5.01it/s]

 48%|███████████████████████████▌                             | 118/244 [00:15<00:24,  5.08it/s]

 49%|███████████████████████████▊                             | 119/244 [00:15<00:23,  5.35it/s]

 49%|████████████████████████████                             | 120/244 [00:15<00:22,  5.60it/s]

 50%|████████████████████████████▎                            | 121/244 [00:16<00:21,  5.64it/s]

 50%|████████████████████████████▌                            | 122/244 [00:16<00:21,  5.65it/s]

 50%|████████████████████████████▋                            | 123/244 [00:16<00:21,  5.66it/s]

 51%|████████████████████████████▉                            | 124/244 [00:16<00:21,  5.68it/s]

 51%|█████████████████████████████▏                           | 125/244 [00:16<00:20,  5.67it/s]

 52%|█████████████████████████████▍                           | 126/244 [00:17<00:20,  5.68it/s]

 52%|█████████████████████████████▋                           | 127/244 [00:17<00:20,  5.70it/s]

 52%|█████████████████████████████▉                           | 128/244 [00:17<00:20,  5.73it/s]

 53%|██████████████████████████████▏                          | 129/244 [00:17<00:19,  5.86it/s]

 53%|██████████████████████████████▎                          | 130/244 [00:17<00:17,  6.48it/s]

 54%|██████████████████████████████▌                          | 131/244 [00:17<00:16,  7.02it/s]

 54%|██████████████████████████████▊                          | 132/244 [00:17<00:14,  7.50it/s]

 55%|███████████████████████████████                          | 133/244 [00:18<00:15,  7.31it/s]

 55%|███████████████████████████████▎                         | 134/244 [00:18<00:16,  6.77it/s]

 55%|███████████████████████████████▌                         | 135/244 [00:18<00:16,  6.43it/s]

 56%|███████████████████████████████▊                         | 136/244 [00:18<00:17,  6.23it/s]

 56%|████████████████████████████████                         | 137/244 [00:18<00:17,  6.06it/s]

 57%|████████████████████████████████▏                        | 138/244 [00:18<00:17,  5.97it/s]

 57%|████████████████████████████████▍                        | 139/244 [00:19<00:17,  5.91it/s]

 57%|████████████████████████████████▋                        | 140/244 [00:19<00:17,  5.87it/s]

 58%|████████████████████████████████▉                        | 141/244 [00:19<00:17,  5.87it/s]

 58%|█████████████████████████████████▏                       | 142/244 [00:19<00:17,  5.84it/s]

 59%|█████████████████████████████████▍                       | 143/244 [00:19<00:16,  5.95it/s]

 59%|█████████████████████████████████▋                       | 144/244 [00:19<00:15,  6.54it/s]

 59%|█████████████████████████████████▊                       | 145/244 [00:20<00:15,  6.51it/s]

 60%|██████████████████████████████████                       | 146/244 [00:20<00:15,  6.25it/s]

 60%|██████████████████████████████████▎                      | 147/244 [00:20<00:15,  6.09it/s]

 61%|██████████████████████████████████▌                      | 148/244 [00:20<00:16,  5.98it/s]

 61%|██████████████████████████████████▊                      | 149/244 [00:20<00:16,  5.91it/s]

 61%|███████████████████████████████████                      | 150/244 [00:20<00:16,  5.81it/s]

 62%|███████████████████████████████████▎                     | 151/244 [00:21<00:19,  4.77it/s]

 62%|███████████████████████████████████▌                     | 152/244 [00:21<00:19,  4.70it/s]

 63%|███████████████████████████████████▋                     | 153/244 [00:21<00:18,  4.98it/s]

 63%|███████████████████████████████████▉                     | 154/244 [00:21<00:16,  5.58it/s]

 64%|████████████████████████████████████▏                    | 155/244 [00:21<00:14,  6.05it/s]

 64%|████████████████████████████████████▍                    | 156/244 [00:22<00:14,  6.00it/s]

 64%|████████████████████████████████████▋                    | 157/244 [00:22<00:14,  5.92it/s]

 65%|████████████████████████████████████▉                    | 158/244 [00:22<00:14,  5.86it/s]

 65%|█████████████████████████████████████▏                   | 159/244 [00:22<00:14,  5.82it/s]

 66%|█████████████████████████████████████▍                   | 160/244 [00:22<00:15,  5.31it/s]

 66%|█████████████████████████████████████▌                   | 161/244 [00:23<00:17,  4.86it/s]

 66%|█████████████████████████████████████▊                   | 162/244 [00:23<00:16,  4.88it/s]

 67%|██████████████████████████████████████                   | 163/244 [00:23<00:15,  5.13it/s]

 67%|██████████████████████████████████████▎                  | 164/244 [00:23<00:14,  5.33it/s]

 68%|██████████████████████████████████████▌                  | 165/244 [00:23<00:13,  5.68it/s]

 68%|██████████████████████████████████████▊                  | 166/244 [00:23<00:13,  5.69it/s]

 68%|███████████████████████████████████████                  | 167/244 [00:24<00:13,  5.68it/s]

 69%|███████████████████████████████████████▏                 | 168/244 [00:24<00:13,  5.73it/s]

 69%|███████████████████████████████████████▍                 | 169/244 [00:24<00:12,  5.85it/s]

 70%|███████████████████████████████████████▋                 | 170/244 [00:24<00:12,  5.86it/s]

 70%|███████████████████████████████████████▉                 | 171/244 [00:24<00:12,  5.87it/s]

 70%|████████████████████████████████████████▏                | 172/244 [00:24<00:12,  5.86it/s]

 71%|████████████████████████████████████████▍                | 173/244 [00:25<00:12,  5.83it/s]

 71%|████████████████████████████████████████▋                | 174/244 [00:25<00:12,  5.80it/s]

 72%|████████████████████████████████████████▉                | 175/244 [00:25<00:11,  5.89it/s]

 72%|█████████████████████████████████████████                | 176/244 [00:25<00:10,  6.25it/s]

 73%|█████████████████████████████████████████▎               | 177/244 [00:25<00:09,  6.84it/s]

 73%|█████████████████████████████████████████▌               | 178/244 [00:25<00:09,  7.29it/s]

 73%|█████████████████████████████████████████▊               | 179/244 [00:25<00:08,  7.48it/s]

 74%|██████████████████████████████████████████               | 180/244 [00:26<00:11,  5.76it/s]

 74%|██████████████████████████████████████████▎              | 181/244 [00:26<00:12,  5.12it/s]

 75%|██████████████████████████████████████████▌              | 182/244 [00:26<00:13,  4.74it/s]

 75%|██████████████████████████████████████████▊              | 183/244 [00:26<00:12,  5.02it/s]

 75%|██████████████████████████████████████████▉              | 184/244 [00:27<00:11,  5.25it/s]

 76%|███████████████████████████████████████████▏             | 185/244 [00:27<00:10,  5.40it/s]

 76%|███████████████████████████████████████████▍             | 186/244 [00:27<00:11,  5.18it/s]

 77%|███████████████████████████████████████████▋             | 187/244 [00:27<00:12,  4.47it/s]

 77%|███████████████████████████████████████████▉             | 188/244 [00:27<00:13,  4.22it/s]

 77%|████████████████████████████████████████████▏            | 189/244 [00:28<00:12,  4.58it/s]

 78%|████████████████████████████████████████████▍            | 190/244 [00:28<00:10,  5.32it/s]

 78%|████████████████████████████████████████████▌            | 191/244 [00:28<00:09,  5.44it/s]

 79%|████████████████████████████████████████████▊            | 192/244 [00:28<00:09,  5.55it/s]

 79%|█████████████████████████████████████████████            | 193/244 [00:28<00:09,  5.60it/s]

 80%|█████████████████████████████████████████████▎           | 194/244 [00:29<00:09,  5.27it/s]

 80%|█████████████████████████████████████████████▌           | 195/244 [00:29<00:10,  4.71it/s]

 80%|█████████████████████████████████████████████▊           | 196/244 [00:29<00:09,  4.99it/s]

 81%|██████████████████████████████████████████████           | 197/244 [00:29<00:09,  5.21it/s]

 81%|██████████████████████████████████████████████▎          | 198/244 [00:29<00:08,  5.36it/s]

 82%|██████████████████████████████████████████████▍          | 199/244 [00:29<00:08,  5.45it/s]

 82%|██████████████████████████████████████████████▋          | 200/244 [00:30<00:08,  5.01it/s]

 82%|██████████████████████████████████████████████▉          | 201/244 [00:30<00:09,  4.68it/s]

 83%|███████████████████████████████████████████████▏         | 202/244 [00:30<00:07,  5.41it/s]

 83%|███████████████████████████████████████████████▍         | 203/244 [00:30<00:08,  4.65it/s]

 84%|███████████████████████████████████████████████▋         | 204/244 [00:31<00:09,  4.43it/s]

 84%|███████████████████████████████████████████████▉         | 205/244 [00:31<00:08,  4.45it/s]

 84%|████████████████████████████████████████████████         | 206/244 [00:31<00:08,  4.68it/s]

 85%|████████████████████████████████████████████████▎        | 207/244 [00:31<00:07,  4.94it/s]

 85%|████████████████████████████████████████████████▌        | 208/244 [00:31<00:07,  5.00it/s]

 86%|████████████████████████████████████████████████▊        | 209/244 [00:32<00:06,  5.18it/s]

 86%|█████████████████████████████████████████████████        | 210/244 [00:32<00:07,  4.40it/s]

 86%|█████████████████████████████████████████████████▎       | 211/244 [00:32<00:07,  4.29it/s]

 87%|█████████████████████████████████████████████████▌       | 212/244 [00:32<00:07,  4.05it/s]

 87%|█████████████████████████████████████████████████▊       | 213/244 [00:33<00:06,  4.56it/s]

 88%|█████████████████████████████████████████████████▉       | 214/244 [00:33<00:06,  4.84it/s]

 88%|██████████████████████████████████████████████████▏      | 215/244 [00:33<00:05,  5.06it/s]

 89%|██████████████████████████████████████████████████▍      | 216/244 [00:33<00:05,  5.22it/s]

 89%|██████████████████████████████████████████████████▋      | 217/244 [00:33<00:05,  5.37it/s]

 89%|██████████████████████████████████████████████████▉      | 218/244 [00:33<00:04,  5.46it/s]

 90%|███████████████████████████████████████████████████▏     | 219/244 [00:34<00:04,  5.52it/s]

 90%|███████████████████████████████████████████████████▍     | 220/244 [00:34<00:04,  5.57it/s]

 91%|███████████████████████████████████████████████████▋     | 221/244 [00:34<00:04,  5.60it/s]

 91%|███████████████████████████████████████████████████▊     | 222/244 [00:34<00:03,  5.61it/s]

 91%|████████████████████████████████████████████████████     | 223/244 [00:34<00:03,  5.64it/s]

 92%|████████████████████████████████████████████████████▎    | 224/244 [00:34<00:03,  5.68it/s]

 92%|████████████████████████████████████████████████████▌    | 225/244 [00:35<00:03,  5.76it/s]

 93%|████████████████████████████████████████████████████▊    | 226/244 [00:35<00:02,  6.11it/s]

 93%|█████████████████████████████████████████████████████    | 227/244 [00:35<00:02,  6.70it/s]

 93%|█████████████████████████████████████████████████████▎   | 228/244 [00:35<00:02,  7.17it/s]

 94%|█████████████████████████████████████████████████████▍   | 229/244 [00:35<00:01,  7.50it/s]

 94%|█████████████████████████████████████████████████████▋   | 230/244 [00:35<00:02,  6.94it/s]

 95%|█████████████████████████████████████████████████████▉   | 231/244 [00:35<00:01,  6.55it/s]

 95%|██████████████████████████████████████████████████████▏  | 232/244 [00:36<00:01,  6.27it/s]

 95%|██████████████████████████████████████████████████████▍  | 233/244 [00:36<00:01,  6.06it/s]

 96%|██████████████████████████████████████████████████████▋  | 234/244 [00:36<00:01,  5.52it/s]

 96%|██████████████████████████████████████████████████████▉  | 235/244 [00:36<00:01,  5.55it/s]

 97%|███████████████████████████████████████████████████████▏ | 236/244 [00:36<00:01,  5.57it/s]

 97%|███████████████████████████████████████████████████████▎ | 237/244 [00:37<00:01,  5.60it/s]

 98%|███████████████████████████████████████████████████████▌ | 238/244 [00:37<00:01,  5.62it/s]

 98%|███████████████████████████████████████████████████████▊ | 239/244 [00:37<00:00,  5.62it/s]

 98%|████████████████████████████████████████████████████████ | 240/244 [00:37<00:00,  6.08it/s]

 99%|████████████████████████████████████████████████████████▎| 241/244 [00:37<00:00,  6.41it/s]

 99%|████████████████████████████████████████████████████████▌| 242/244 [00:37<00:00,  6.00it/s]

100%|████████████████████████████████████████████████████████▊| 243/244 [00:38<00:00,  5.58it/s]

100%|█████████████████████████████████████████████████████████| 244/244 [00:38<00:00,  5.62it/s]

100%|█████████████████████████████████████████████████████████| 244/244 [00:38<00:00,  6.37it/s]

(732, 9)





Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject,feature,anova_p_value
0,high,unsel,-1.5096,0.0,-1.8942,-1.125,True,AreaShape_Area,1.0895449999999999e-20
1,high,wt,-2.0158,0.0,-2.3969,-1.6347,True,AreaShape_Area,1.0895449999999999e-20
2,unsel,wt,-0.5062,0.0066,-0.8908,-0.1216,True,AreaShape_Area,1.0895449999999999e-20
3,high,unsel,-1.5096,0.0,-1.8942,-1.125,True,AreaShape_CentralMoment_0_0,1.0895449999999999e-20
4,high,wt,-2.0158,0.0,-2.3969,-1.6347,True,AreaShape_CentralMoment_0_0,1.0895449999999999e-20


In [6]:
# save the results
# relative path of the parquet file that will hold the anova_function output
output_file = pathlib.Path(
    "../../data/6.analysis_results/mean_aggregated_anova_results.parquet"
)
# create the output directory (and any missing parents); no error if it already exists
output_file.parent.mkdir(exist_ok=True, parents=True)
# write the results dataframe to parquet
anova_output_df.to_parquet(output_file)

## Levene's test for homogeneity of variance

In [7]:
# partition the rows of the full table by genotype label
high_df = df.loc[df["Metadata_genotype"] == "high"]
unsel_df = df.loc[df["Metadata_genotype"] == "unsel"]
wt_df = df.loc[df["Metadata_genotype"] == "wt"]

# column-oriented accumulator; the feature names are known up front, the
# statistics are filled in one feature at a time
levene_test_results = {
    "feature": features_df.columns.tolist(),
    "levene_statistic": [],
    "levene_p_value": [],
}
for feature in tqdm.tqdm(levene_test_results["feature"]):
    # Levene's test across the three genotype groups for this feature
    levene_results = levene(wt_df[feature], unsel_df[feature], high_df[feature])
    levene_test_results["levene_statistic"].append(levene_results.statistic)
    levene_test_results["levene_p_value"].append(levene_results.pvalue)

levene_test_results_df = pd.DataFrame(data=levene_test_results)
levene_test_results_df

  0%|                                                                   | 0/244 [00:00<?, ?it/s]

 97%|█████████████████████████████████████████████████████▍ | 237/244 [00:00<00:00, 2369.70it/s]

100%|███████████████████████████████████████████████████████| 244/244 [00:00<00:00, 2319.63it/s]




Unnamed: 0,feature,levene_statistic,levene_p_value
0,AreaShape_Area,4.808285,0.010668
1,AreaShape_CentralMoment_0_0,4.808285,0.010668
2,AreaShape_CentralMoment_0_1,9.124611,0.000269
3,AreaShape_CentralMoment_0_2,9.782865,0.000158
4,AreaShape_CentralMoment_0_3,3.424782,0.037402
...,...,...,...
239,Texture_SumVariance_OP_3_03_256,7.546945,0.000994
240,Texture_Variance_OP_3_00_256,8.691200,0.000384
241,Texture_Variance_OP_3_01_256,8.906770,0.000322
242,Texture_Variance_OP_3_02_256,8.563326,0.000427


## Calculate Levene's test statistic for the equality of variances (pairwise and across all genotypes)

In [8]:
# split the df into three genotypes
high_df = df[df["Metadata_genotype"] == "high"]
unsel_df = df[df["Metadata_genotype"] == "unsel"]
wt_df = df[df["Metadata_genotype"] == "wt"]
# comparison name -> the genotype sub-frames that take part in it.
# Each list is ordered to match its key; Levene's test is symmetric in its
# samples, so the order is for readability only.
group_dict = {
    "high_vs_unsel": [high_df, unsel_df],
    "high_vs_wt": [high_df, wt_df],
    "unsel_vs_wt": [unsel_df, wt_df],
    "all": [high_df, unsel_df, wt_df],
}


# column-oriented accumulator: one entry per (comparison, feature) pair
levene_test_results = {
    "feature": [],
    "levene_statistic": [],
    "levene_p_value": [],
    "group": [],
}
for group in tqdm.tqdm(group_dict.keys()):
    for feature in features_df.columns:
        # calculate the levene test for each feature; unpacking the samples
        # lets the pairwise and the three-way comparisons share one code path
        levene_results = levene(
            *(group_df[feature] for group_df in group_dict[group])
        )
        levene_test_results["feature"].append(feature)
        levene_test_results["levene_statistic"].append(levene_results.statistic)
        levene_test_results["levene_p_value"].append(levene_results.pvalue)
        levene_test_results["group"].append(group)

levene_test_results_df = pd.DataFrame(levene_test_results)

# make sure the levene p-value is a float so the sort below is numeric
levene_test_results_df["levene_p_value"] = levene_test_results_df[
    "levene_p_value"
].astype(float)
# sort by p-value, largest first
levene_test_results_df = levene_test_results_df.sort_values(
    "levene_p_value", ascending=False
)
levene_test_results_df

  0%|                                                                     | 0/4 [00:00<?, ?it/s]

  W = numer / denom
 50%|██████████████████████████████▌                              | 2/4 [00:00<00:00, 12.70it/s]

100%|█████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 11.60it/s]

100%|█████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 11.60it/s]




Unnamed: 0,feature,levene_statistic,levene_p_value,group
437,Texture_AngularSecondMoment_OP_3_01_256,0.000013,9.971029e-01,high_vs_wt
675,RadialDistribution_ZernikePhase_OP_9_1,0.000477,9.826518e-01,unsel_vs_wt
539,AreaShape_NormalizedMoment_3_2,0.000572,9.810111e-01,unsel_vs_wt
87,AreaShape_Zernike_7_7,0.001244,9.719976e-01,high_vs_unsel
436,Texture_AngularSecondMoment_OP_3_00_256,0.001246,9.719719e-01,high_vs_wt
...,...,...,...,...
887,RadialDistribution_ZernikeMagnitude_OP_8_2,26.848462,1.198789e-09,all
266,AreaShape_HuMoment_2,54.658521,9.472909e-10,high_vs_wt
872,RadialDistribution_ZernikeMagnitude_OP_4_0,27.984815,6.108337e-10,all
929,Texture_Contrast_OP_3_01_256,30.070757,1.823590e-10,all


In [9]:
# Write the per-feature Levene results to the analysis output directory.
out_dir = pathlib.Path("../../data/6.analysis_results/")
# make sure the directory (and any missing parents) exists before writing
out_dir.mkdir(parents=True, exist_ok=True)
levene_test_results_path = out_dir / "mean_aggregated_levene_test_results.csv"
# no index=False here, so the DataFrame index is written as the first CSV column
levene_test_results_df.to_csv(levene_test_results_path)

### Calculate Levene's test statistic for the aggregated data across feature types and genotypes

In [10]:
# Load the mean-aggregated profiles and reshape them into a long table with
# one row per (sample, feature) and the feature name split into its tokens.
data_path = pathlib.Path(
    "../../data/5.converted_data/mean_aggregated_data.parquet"
).resolve(strict=True)  # strict=True raises immediately if the file is missing
# Read the data
data = pd.read_parquet(data_path)

# Drop all metadata except for the genotype data
features_df = data.drop(columns=data.filter(like="Metadata").columns)
features_df["Metadata_genotype"] = data["Metadata_genotype"]

# turn the features into a long format
features_long_df = features_df.melt(
    id_vars="Metadata_genotype", var_name="feature", value_name="value"
)

# Separate the feature into different parts.
# The split is capped at len(feature_name_parts) tokens and padded to exactly
# that many columns, so the assignment does not fail when the longest feature
# name has fewer or more "_"-separated tokens than there are target columns
# (any surplus tokens stay joined in the last column).
feature_name_parts = [
    "feature_group",
    "measurement",
    "bone",
    "parameter1",
    "parameter2",
    "parameter3",
]
features_long_df[feature_name_parts] = (
    features_long_df["feature"]
    .str.split("_", n=len(feature_name_parts) - 1, expand=True)
    .reindex(columns=range(len(feature_name_parts)))
)

# Replace the Metadata_genotype with the actual genotype name
features_long_df["Metadata_genotype"] = features_long_df["Metadata_genotype"].replace(
    {"high": "High-Severity", "unsel": "Mid-Severity", "wt": "Wild Type"}
)
features_long_df.head()

Unnamed: 0,Metadata_genotype,feature,value,feature_group,measurement,bone,parameter1,parameter2,parameter3
0,High-Severity,AreaShape_Area,-0.178797,AreaShape,Area,,,,
1,High-Severity,AreaShape_Area,1.371763,AreaShape,Area,,,,
2,High-Severity,AreaShape_Area,1.706234,AreaShape,Area,,,,
3,High-Severity,AreaShape_Area,0.771674,AreaShape,Area,,,,
4,High-Severity,AreaShape_Area,2.180858,AreaShape,Area,,,,


In [11]:
# Levene's test per feature group: for every group, the values of all features
# in that group are pooled per genotype and the genotypes are compared pairwise.

# break each genotype into a separate dataframe
high_df = features_long_df[features_long_df["Metadata_genotype"] == "High-Severity"]
unsel_df = features_long_df[features_long_df["Metadata_genotype"] == "Mid-Severity"]
wt_df = features_long_df[features_long_df["Metadata_genotype"] == "Wild Type"]
genotype_dfs = {"high": high_df, "unsel": unsel_df, "wt": wt_df}

# feature group -> short label used inside the comparison names
# NOTE(review): the variance summary later in this notebook also lists a
# "Texture" feature group, which is not compared here - confirm this is intended.
feature_group_labels = {
    "AreaShape": "area",
    "Intensity": "intensity",
    "Neighbors": "neighbors",
    "RadialDistribution": "radial",
    "Granularity": "granularity",
}

# (genotype named first, genotype named second, first sample, second sample)
# the unsel-vs-wt comparison passes the wt sample first
genotype_pairs = [
    ("high", "unsel", "high", "unsel"),
    ("high", "wt", "high", "wt"),
    ("unsel", "wt", "wt", "unsel"),
]

# feature group -> comparison name -> [first sample frame, second sample frame]
group_dict = {}
for feature_group, label in feature_group_labels.items():
    # rows of this feature group, separately for every genotype
    per_genotype = {
        genotype: genotype_df[genotype_df["feature_group"] == feature_group]
        for genotype, genotype_df in genotype_dfs.items()
    }
    group_dict[feature_group] = {
        f"{name_a}_{label}_v_{name_b}_{label}": [
            per_genotype[first],
            per_genotype[second],
        ]
        for name_a, name_b, first, second in genotype_pairs
    }

# levene test for each feature group
levene_test_results = {
    "feature_group": [],
    "levene_statistic": [],
    "levene_p_value": [],
    "group": [],
}
for feature_group, comparisons in tqdm.tqdm(group_dict.items()):
    for group, (first_df, second_df) in comparisons.items():
        levene_results = levene(first_df["value"], second_df["value"])
        levene_test_results["feature_group"].append(feature_group)
        levene_test_results["levene_statistic"].append(levene_results.statistic)
        levene_test_results["levene_p_value"].append(levene_results.pvalue)
        levene_test_results["group"].append(group)

levene_test_results_df = pd.DataFrame(levene_test_results)
levene_test_results_df.head()

  0%|                                                                     | 0/5 [00:00<?, ?it/s]

100%|████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 998.55it/s]




Unnamed: 0,feature_group,levene_statistic,levene_p_value,group
0,AreaShape,392.323288,2.4330170000000003e-84,high_area_v_unsel_area
1,AreaShape,796.248155,1.069709e-163,high_area_v_wt_area
2,AreaShape,36.56823,1.573327e-09,unsel_area_v_wt_area
3,Intensity,65.355599,2.227985e-15,high_intensity_v_unsel_intensity
4,Intensity,0.834856,0.3611348,high_intensity_v_wt_intensity


In [12]:
# Write the per-feature-group Levene results to the analysis output directory.
out_dir = pathlib.Path("../../data/6.analysis_results/")
# make sure the directory (and any missing parents) exists before writing
out_dir.mkdir(parents=True, exist_ok=True)
levene_test_results_path = out_dir.joinpath(
    "mean_aggregated_levene_test_results_feature_types.csv"
)
# the index is not written to the file
levene_test_results_df.to_csv(levene_test_results_path, index=False)

In [13]:
# Rebuild the long-format table from `data`, this time keeping the original
# genotype labels and the unsplit feature names.

# every "Metadata" column is removed; the genotype column is appended back at the end
features_df = data.drop(columns=data.filter(like="Metadata").columns).assign(
    Metadata_genotype=data["Metadata_genotype"]
)

# one row per (sample, feature)
features_long_df = pd.melt(
    features_df,
    id_vars="Metadata_genotype",
    var_name="feature",
    value_name="value",
)
features_long_df

Unnamed: 0,Metadata_genotype,feature,value
0,high,AreaShape_Area,-0.178797
1,high,AreaShape_Area,1.371763
2,high,AreaShape_Area,1.706234
3,high,AreaShape_Area,0.771674
4,high,AreaShape_Area,2.180858
...,...,...,...
20247,wt,Texture_Variance_OP_3_03_256,1.434540
20248,wt,Texture_Variance_OP_3_03_256,1.956497
20249,wt,Texture_Variance_OP_3_03_256,1.223845
20250,wt,Texture_Variance_OP_3_03_256,-1.017697


In [14]:
# Variance of every feature within every genotype; the aggregated "value"
# column is renamed to "variance" to say what it now holds.
var_df = (
    features_long_df.groupby(["Metadata_genotype", "feature"])
    .var()
    .reset_index()
    .rename(columns={"value": "variance"})
)

In [15]:
# Label every per-feature variance with its feature group and a readable
# genotype name, drop the feature name, and save the table.
var_df = var_df.assign(
    # the feature group is the first "_"-separated token of the feature name;
    # only that token is needed, so the name is not split into fixed columns
    feature_group=var_df["feature"].str.split("_", n=1).str[0],
    # Replace the Metadata_genotype with the actual genotype name
    Metadata_genotype=var_df["Metadata_genotype"].replace(
        {"high": "High-Severity", "unsel": "Mid-Severity", "wt": "Wild Type"}
    ),
).drop(columns=["feature"])

# save the variance results (out_dir is defined in an earlier cell)
var_path = out_dir / "mean_aggregated_variance_results_feature_types.csv"
var_df.to_csv(var_path, index=False)

In [16]:
# Summarise the variances within every genotype / feature group
# (mean, standard deviation, max, min and number of entries).
var_df = (
    var_df.groupby(["Metadata_genotype", "feature_group"])
    .agg(["mean", "std", "max", "min", "count"])
    .reset_index()
)
# collapse the two-level column header into single "<column>_<statistic>" names
var_df.columns = [
    "_".join(name_levels).strip() for name_levels in var_df.columns.to_flat_index()
]
# the two key columns have an empty second level, which leaves a trailing "_"
var_df = var_df.rename(
    columns={
        "Metadata_genotype_": "Metadata_genotype",
        "feature_group_": "feature_group",
    }
)
var_df

Unnamed: 0,Metadata_genotype,feature_group,variance_mean,variance_std,variance_max,variance_min,variance_count
0,High-Severity,AreaShape,0.774588,0.66997,3.855982,5e-06,98
1,High-Severity,Granularity,0.27819,0.357954,0.981199,0.015287,6
2,High-Severity,Intensity,0.529629,0.281565,1.157014,0.129043,15
3,High-Severity,Neighbors,0.730238,0.237898,0.953424,0.479949,3
4,High-Severity,RadialDistribution,0.670404,0.430295,1.751219,0.0,70
5,High-Severity,Texture,0.300315,0.206077,0.719217,0.032302,52
6,Mid-Severity,AreaShape,0.608826,0.372412,2.166008,0.160115,98
7,Mid-Severity,Granularity,0.31479,0.143204,0.539786,0.191056,6
8,Mid-Severity,Intensity,0.285852,0.211016,0.824411,0.076053,15
9,Mid-Severity,Neighbors,1.02523,0.424006,1.469152,0.624438,3


In [17]:
# Write the per-group variance statistics next to the other analysis results
# (out_dir is defined in an earlier cell); the index is not written.
var_path = out_dir.joinpath(
    "mean_aggregated_variance_results_feature_types_stats.csv"
)
var_df.to_csv(var_path, index=False)