This notebook calculates the Mahalanobis distance between points in PCA space.
I also document what the Mahalanobis distance is and how it is calculated in this notebook.

In [1]:
import itertools
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm
from scipy.spatial.distance import mahalanobis
from scipy.stats import f_oneway as anova
from scipy.stats import ttest_ind
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [2]:
# Locate the mean-aggregated PCA table; strict resolution raises immediately
# if the file is missing instead of failing later at read time.
analysis_results_file = pathlib.Path(
    "../../data/6.analysis_results/mean_aggregated_pca.parquet"
)
mean_aggregated_data_pca_path = analysis_results_file.resolve(strict=True)

# Load the table, report its dimensions, and preview the first rows.
mean_aggregated_data_pca = pd.read_parquet(mean_aggregated_data_pca_path)
print(mean_aggregated_data_pca.shape)
mean_aggregated_data_pca.head()

(83, 86)


Unnamed: 0,Metadata_genotype,Metadata_replicate,Metadata_side,PC1,PC2,PC3,PC4,PC5,PC6,PC7,...,PC74,PC75,PC76,PC77,PC78,PC79,PC80,PC81,PC82,PC83
0,High-Severity,1,L,-7.71026,-4.264421,1.521397,-1.390342,9.41062,-6.054486,1.336078,...,0.213169,0.034717,0.129185,-0.14664,-0.081667,-0.075119,-0.020639,0.058634,0.028028,9.906943e-16
1,High-Severity,1,R,0.6958,8.170808,-1.21778,-2.386591,-2.60968,1.468471,4.371038,...,-0.215559,-0.068413,-0.266401,-0.0235,-0.0542,-0.023511,0.119934,0.145838,0.035305,-8.556710000000001e-17
2,High-Severity,10,L,6.910961,11.25212,-2.831527,-3.131419,-2.634531,3.551757,-2.684211,...,-0.075988,-0.123323,-0.210824,0.063699,0.14891,0.04104,0.020007,0.161693,-0.143457,-4.445563e-16
3,High-Severity,10,R,2.179877,6.184598,-3.46374,-0.260566,-3.067898,1.597567,2.061996,...,0.416289,0.348893,0.15141,0.326023,-0.207744,0.234788,-0.024102,-0.15051,0.049366,3.319675e-17
4,High-Severity,11,L,1.33669,9.305025,3.27923,0.938479,6.459164,-4.94401,-1.393619,...,-0.028287,0.092703,0.038004,0.095451,0.043627,0.006402,-0.030578,-0.017234,0.008486,-1.12952e-15


# Mahalanobis Distance
For more in depth information on Mahalanobis distance, please refer to this [link](https://medium.com/@the_daft_introvert/mahalanobis-distance-5c11a757b099).
Mahalanobis distance is a measure of the distance between a point P and a distribution D. 
It is a multi-dimensional generalization of the idea of measuring how many standard deviations away P is from the mean of D. 
This distance is zero if P is at the mean of D, and grows as P moves away from the mean along each principal component axis.
The formula for Mahalanobis distance is given by:
### $D^2 = (x - \mu)^T \Sigma^{-1} (x - \mu)$
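where:
- $D$ is the Mahalanobis distance
- $x$ is the point being measured
- $\mu$ is the mean of the distribution
- $\Sigma^{-1}$ is the inverse of the covariance matrix of the distribution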

## Calculating Mahalanobis Distance on PCA 
Here I calculate the Mahalanobis distance between every pair of points within each of the three genotypes. Note that the code below uses all of the principal components in the table, not only the first two.

In [3]:
# Partition the table into one frame per genotype using a boolean mask per label.
genotype_labels = mean_aggregated_data_pca["Metadata_genotype"]

high_severity = mean_aggregated_data_pca.loc[genotype_labels == "High-Severity"]
# NOTE(review): this frame is named `low_severity` but selects the "Mid-Severity"
# label — confirm the naming is intentional.
low_severity = mean_aggregated_data_pca.loc[genotype_labels == "Mid-Severity"]
wt = mean_aggregated_data_pca.loc[genotype_labels == "Wild Type"]

# Row counts per genotype, in the order high / low / wild type.
print(*(len(genotype_df) for genotype_df in (high_severity, low_severity, wt)))

28 27 28


In [4]:
# Select every column whose name contains "Metadata" and remove those columns
# from each genotype frame, leaving only the PCA component columns.
is_metadata_column = high_severity.columns.str.contains("Metadata")
metadata_columns = high_severity.columns[is_metadata_column]

high_severity = high_severity.drop(columns=metadata_columns)
low_severity = low_severity.drop(columns=metadata_columns)
wt = wt.drop(columns=metadata_columns)

# Convert each frame to a plain array (one row per observation) for the
# distance calculations, then report the array shapes.
high_severity_coords, low_severity_coords, wt_coords = (
    genotype_df.to_numpy() for genotype_df in (high_severity, low_severity, wt)
)
print(high_severity_coords.shape, low_severity_coords.shape, wt_coords.shape)

(28, 83) (27, 83) (28, 83)


#### High-Severity

In [5]:
# Mean Mahalanobis distance over all unordered pairs of High-Severity points,
# using the inverse of this genotype's own covariance matrix.
# NOTE(review): there are fewer observations than PCA components here (see the
# shapes printed above), so the sample covariance cannot be full rank and its
# inverse is numerically unreliable. Pairs whose distance evaluates to NaN are
# silently dropped below. Consider fewer components or a regularised covariance
# estimate — TODO confirm.
cov_matrix = np.cov(high_severity_coords, rowvar=False)
inv_cov_matrix = np.linalg.inv(cov_matrix)

# Lazily compute the distance for every pair of rows.
pairwise_distances = (
    mahalanobis(first_point, second_point, inv_cov_matrix)
    for first_point, second_point in itertools.combinations(high_severity_coords, 2)
)
# Keep only the pairs whose distance is a number.
high_severity_mahalanobis_distances = [
    pair_distance for pair_distance in pairwise_distances if not np.isnan(pair_distance)
]
mean_high_severity_mahalanobis_distance = np.mean(high_severity_mahalanobis_distances)
print(
    f"The mean mahalanobis distance for the High-Severity genotype is {mean_high_severity_mahalanobis_distance}"
)

The mean mahalanobis distance for the High-Severity genotype is 11.762056342410409


  return np.sqrt(m)


#### Low-Severity

In [6]:
# Mean Mahalanobis distance over all unordered pairs of points in the
# `low_severity` frame, using the inverse of that frame's own covariance matrix.
# NOTE(review): there are fewer observations than PCA components here (see the
# shapes printed above), so the sample covariance cannot be full rank and its
# inverse is numerically unreliable. Pairs whose distance evaluates to NaN are
# silently dropped below. Consider fewer components or a regularised covariance
# estimate — TODO confirm.
cov_matrix = np.cov(low_severity_coords, rowvar=False)
inv_cov_matrix = np.linalg.inv(cov_matrix)

# Lazily compute the distance for every pair of rows.
pairwise_distances = (
    mahalanobis(first_point, second_point, inv_cov_matrix)
    for first_point, second_point in itertools.combinations(low_severity_coords, 2)
)
# Keep only the pairs whose distance is a number.
low_severity_mahalanobis_distances = [
    pair_distance for pair_distance in pairwise_distances if not np.isnan(pair_distance)
]
mean_low_severity_mahalanobis_distance = np.mean(low_severity_mahalanobis_distances)
print(
    f"The mean mahalanobis distance for the low-Severity genotype is {mean_low_severity_mahalanobis_distance}"
)

The mean mahalanobis distance for the low-Severity genotype is 13.336934733749693


#### Wild Type

In [7]:
# Mean Mahalanobis distance over all unordered pairs of Wild Type points,
# using the inverse of this genotype's own covariance matrix.
# NOTE(review): there are fewer observations than PCA components here (see the
# shapes printed above), so the sample covariance cannot be full rank and its
# inverse is numerically unreliable. Pairs whose distance evaluates to NaN are
# silently dropped below. Consider fewer components or a regularised covariance
# estimate — TODO confirm.
cov_matrix = np.cov(wt_coords, rowvar=False)
inv_cov_matrix = np.linalg.inv(cov_matrix)

# Lazily compute the distance for every pair of rows.
pairwise_distances = (
    mahalanobis(first_point, second_point, inv_cov_matrix)
    for first_point, second_point in itertools.combinations(wt_coords, 2)
)
# Keep only the pairs whose distance is a number.
wt_mahalanobis_distances = [
    pair_distance for pair_distance in pairwise_distances if not np.isnan(pair_distance)
]
mean_wt_mahalanobis_distance = np.mean(wt_mahalanobis_distances)
print(
    f"The mean mahalanobis distance for the Wild Type genotype is {mean_wt_mahalanobis_distance}"
)

The mean mahalanobis distance for the Wild Type genotype is 10.91427607576582


## Calculate the Mahalanobis distance for randomly sampled data points

In [8]:
# Summarise each PCA component per genotype. The per-component mean and standard
# deviation extracted here parameterise the distributions that are sampled from
# in the simulation that follows.
high_severity_summary = high_severity.describe()
low_severity_summary = low_severity.describe()
wt_summary = wt.describe()

# per-component standard deviation
high_severity_pca_component_distribution_std = high_severity_summary.loc["std"]
low_severity_pca_component_distribution_std = low_severity_summary.loc["std"]
wt_pca_component_distribution_std = wt_summary.loc["std"]

# per-component mean
high_severity_pca_component_distribution_mean = high_severity_summary.loc["mean"]
low_severity_pca_component_distribution_mean = low_severity_summary.loc["mean"]
wt_pca_component_distribution_mean = wt_summary.loc["mean"]

In [9]:
# Simulated baseline for the within-genotype Mahalanobis distances.
# For each trial and each genotype, `num_samples` synthetic points are drawn with
# every PCA component sampled independently from a normal distribution whose mean
# and standard deviation are the observed per-component values. The mean pairwise
# Mahalanobis distance of the synthetic points is recorded per trial.


def _sample_component_distributions(component_means, component_stds, n_points, rng):
    """Draw synthetic points with independent, normally distributed components.

    Parameters
    ----------
    component_means, component_stds : pd.Series
        Per-component mean and standard deviation, in component order.
    n_points : int
        Number of synthetic points (rows) to draw.
    rng : np.random.Generator
        Random generator used for the draw.

    Returns
    -------
    pd.DataFrame
        `n_points` rows with one "PC<k>" column per component.
    """
    locs = component_means.to_numpy()
    scales = component_stds.to_numpy()
    # loc/scale broadcast across rows, so column k is drawn from N(locs[k], scales[k])
    samples = rng.normal(loc=locs, scale=scales, size=(n_points, locs.shape[0]))
    return pd.DataFrame(
        samples, columns=[f"PC{k + 1}" for k in range(locs.shape[0])]
    )


def _pairwise_mahalanobis_distances(points):
    """Return the Mahalanobis distance of every unordered pair of rows in `points`.

    The inverse covariance matrix is estimated from `points` itself. Pairs whose
    distance evaluates to NaN are omitted from the returned list.
    """
    coords = points.to_numpy()
    inv_cov = np.linalg.inv(np.cov(coords, rowvar=False))
    pair_distances = (
        mahalanobis(first_point, second_point, inv_cov)
        for first_point, second_point in itertools.combinations(coords, 2)
    )
    return [
        pair_distance for pair_distance in pair_distances if not np.isnan(pair_distance)
    ]


# set the output mean Mahalanobis distance lists per genotype
mean_high_severity_sampled_mahalanobis_distances_from_trials = []
mean_low_severity_sampled_mahalanobis_distances_from_trials = []
mean_wt_sampled_mahalanobis_distances_from_trials = []

num_pca_components = high_severity_pca_component_distribution_std.shape[0]
num_samples = 100  # synthetic points drawn per genotype in each trial
num_trials = 1000  # number of independent sampling trials

# seed the generator so the simulated baseline is reproducible across runs
random_seed = 0
rng = np.random.default_rng(random_seed)

for trial in tqdm.tqdm(range(num_trials)):
    # draw the synthetic points for each genotype
    high_severity_sampled_points = _sample_component_distributions(
        high_severity_pca_component_distribution_mean,
        high_severity_pca_component_distribution_std,
        num_samples,
        rng,
    )
    low_severity_sampled_points = _sample_component_distributions(
        low_severity_pca_component_distribution_mean,
        low_severity_pca_component_distribution_std,
        num_samples,
        rng,
    )
    wt_sampled_points = _sample_component_distributions(
        wt_pca_component_distribution_mean,
        wt_pca_component_distribution_std,
        num_samples,
        rng,
    )

    # pairwise distances among the synthetic points of each genotype
    high_severity_sampled_mahalanobis_distances = _pairwise_mahalanobis_distances(
        high_severity_sampled_points
    )
    low_severity_sampled_mahalanobis_distances = _pairwise_mahalanobis_distances(
        low_severity_sampled_points
    )
    wt_sampled_mahalanobis_distances = _pairwise_mahalanobis_distances(
        wt_sampled_points
    )

    mean_high_severity_sampled_mahalanobis_distance = np.mean(
        high_severity_sampled_mahalanobis_distances
    )
    mean_low_severity_sampled_mahalanobis_distance = np.mean(
        low_severity_sampled_mahalanobis_distances
    )
    mean_wt_sampled_mahalanobis_distance = np.mean(wt_sampled_mahalanobis_distances)

    # append the average mahalanobis distance from the trial to the list for the trial per genotype
    mean_high_severity_sampled_mahalanobis_distances_from_trials.append(
        mean_high_severity_sampled_mahalanobis_distance
    )
    mean_low_severity_sampled_mahalanobis_distances_from_trials.append(
        mean_low_severity_sampled_mahalanobis_distance
    )
    mean_wt_sampled_mahalanobis_distances_from_trials.append(
        mean_wt_sampled_mahalanobis_distance
    )

  0%|                                                                  | 0/1000 [00:00<?, ?it/s]

  0%|                                                          | 1/1000 [00:00<10:29,  1.59it/s]

  0%|                                                          | 2/1000 [00:01<11:39,  1.43it/s]

  0%|▏                                                         | 3/1000 [00:02<12:54,  1.29it/s]

  0%|▏                                                         | 4/1000 [00:02<10:29,  1.58it/s]

  0%|▎                                                         | 5/1000 [00:03<13:26,  1.23it/s]

  1%|▎                                                         | 6/1000 [00:04<15:34,  1.06it/s]

  1%|▍                                                         | 7/1000 [00:06<16:05,  1.03it/s]

  1%|▍                                                         | 8/1000 [00:07<17:46,  1.07s/it]

  1%|▌                                                         | 9/1000 [00:08<17:30,  1.06s/it]

  1%|▌                                                        | 10/1000 [00:08<14:58,  1.10it/s]

  1%|▋                                                        | 11/1000 [00:09<11:30,  1.43it/s]

  1%|▋                                                        | 12/1000 [00:09<08:53,  1.85it/s]

  1%|▋                                                        | 13/1000 [00:09<07:04,  2.32it/s]

  1%|▊                                                        | 14/1000 [00:09<06:58,  2.36it/s]

  2%|▊                                                        | 15/1000 [00:10<07:57,  2.06it/s]

  2%|▉                                                        | 16/1000 [00:12<13:57,  1.17it/s]

  2%|▉                                                        | 17/1000 [00:12<12:20,  1.33it/s]

  2%|█                                                        | 18/1000 [00:13<12:45,  1.28it/s]

  2%|█                                                        | 19/1000 [00:14<12:22,  1.32it/s]

  2%|█▏                                                       | 20/1000 [00:14<09:48,  1.66it/s]

  2%|█▏                                                       | 21/1000 [00:15<12:44,  1.28it/s]

  2%|█▎                                                       | 22/1000 [00:16<13:19,  1.22it/s]

  2%|█▎                                                       | 23/1000 [00:17<13:04,  1.25it/s]

  2%|█▎                                                       | 24/1000 [00:18<13:17,  1.22it/s]

  2%|█▍                                                       | 25/1000 [00:19<15:54,  1.02it/s]

  3%|█▍                                                       | 26/1000 [00:20<14:30,  1.12it/s]

  3%|█▌                                                       | 27/1000 [00:21<14:34,  1.11it/s]

  3%|█▌                                                       | 28/1000 [00:22<15:10,  1.07it/s]

  3%|█▋                                                       | 29/1000 [00:23<15:06,  1.07it/s]

  3%|█▋                                                       | 30/1000 [00:23<12:45,  1.27it/s]

  3%|█▊                                                       | 31/1000 [00:23<09:43,  1.66it/s]

  3%|█▊                                                       | 32/1000 [00:23<07:39,  2.10it/s]

  3%|█▉                                                       | 33/1000 [00:24<06:40,  2.42it/s]

  3%|█▉                                                       | 34/1000 [00:24<07:15,  2.22it/s]

  4%|█▉                                                       | 35/1000 [00:25<10:25,  1.54it/s]

  4%|██                                                       | 36/1000 [00:26<10:06,  1.59it/s]

  4%|██                                                       | 37/1000 [00:26<08:39,  1.85it/s]

  4%|██▏                                                      | 38/1000 [00:27<09:34,  1.67it/s]

  4%|██▏                                                      | 39/1000 [00:28<09:19,  1.72it/s]

  4%|██▎                                                      | 40/1000 [00:28<09:59,  1.60it/s]

  4%|██▎                                                      | 41/1000 [00:29<09:36,  1.66it/s]

  4%|██▍                                                      | 42/1000 [00:30<11:13,  1.42it/s]

  4%|██▍                                                      | 43/1000 [00:31<11:44,  1.36it/s]

  4%|██▌                                                      | 44/1000 [00:31<11:42,  1.36it/s]

  4%|██▌                                                      | 45/1000 [00:32<09:33,  1.67it/s]

  5%|██▌                                                      | 46/1000 [00:32<08:12,  1.94it/s]

  5%|██▋                                                      | 47/1000 [00:33<10:17,  1.54it/s]

  5%|██▋                                                      | 48/1000 [00:34<12:27,  1.27it/s]

  5%|██▊                                                      | 49/1000 [00:35<15:09,  1.05it/s]

  5%|██▊                                                      | 50/1000 [00:36<13:31,  1.17it/s]

  5%|██▉                                                      | 51/1000 [00:37<13:12,  1.20it/s]

  5%|██▉                                                      | 52/1000 [00:38<13:26,  1.18it/s]

  5%|███                                                      | 53/1000 [00:39<15:58,  1.01s/it]

  5%|███                                                      | 54/1000 [00:40<16:19,  1.04s/it]

  6%|███▏                                                     | 55/1000 [00:41<13:18,  1.18it/s]

  6%|███▏                                                     | 56/1000 [00:41<10:49,  1.45it/s]

  6%|███▏                                                     | 57/1000 [00:42<11:27,  1.37it/s]

  6%|███▎                                                     | 58/1000 [00:43<13:07,  1.20it/s]

  6%|███▎                                                     | 59/1000 [00:44<16:02,  1.02s/it]

  6%|███▍                                                     | 60/1000 [00:45<13:06,  1.20it/s]

  6%|███▍                                                     | 61/1000 [00:46<15:57,  1.02s/it]

  6%|███▌                                                     | 62/1000 [00:47<16:29,  1.06s/it]

  6%|███▌                                                     | 63/1000 [00:48<15:57,  1.02s/it]

  6%|███▋                                                     | 64/1000 [00:49<15:03,  1.04it/s]

  6%|███▋                                                     | 65/1000 [00:50<15:18,  1.02it/s]

  7%|███▊                                                     | 66/1000 [00:52<18:24,  1.18s/it]

  7%|███▊                                                     | 67/1000 [00:52<13:40,  1.14it/s]

  7%|███▉                                                     | 68/1000 [00:53<13:03,  1.19it/s]

  7%|███▉                                                     | 69/1000 [00:54<15:12,  1.02it/s]

  7%|███▉                                                     | 70/1000 [00:55<15:09,  1.02it/s]

  7%|████                                                     | 71/1000 [00:55<11:25,  1.36it/s]

  7%|████                                                     | 72/1000 [00:55<08:59,  1.72it/s]

  7%|████▏                                                    | 73/1000 [00:56<09:51,  1.57it/s]

  7%|████▏                                                    | 74/1000 [00:56<08:55,  1.73it/s]

  8%|████▎                                                    | 75/1000 [00:57<10:34,  1.46it/s]

  8%|████▎                                                    | 76/1000 [00:58<10:42,  1.44it/s]

  8%|████▍                                                    | 77/1000 [00:59<11:08,  1.38it/s]

  8%|████▍                                                    | 78/1000 [01:00<11:00,  1.40it/s]

  8%|████▌                                                    | 79/1000 [01:01<13:37,  1.13it/s]

  8%|████▌                                                    | 80/1000 [01:02<12:56,  1.18it/s]

  8%|████▌                                                    | 81/1000 [01:02<12:00,  1.28it/s]

  8%|████▋                                                    | 82/1000 [01:02<09:26,  1.62it/s]

  8%|████▋                                                    | 83/1000 [01:03<10:57,  1.40it/s]

  8%|████▊                                                    | 84/1000 [01:05<14:05,  1.08it/s]

  8%|████▊                                                    | 85/1000 [01:05<12:37,  1.21it/s]

  9%|████▉                                                    | 86/1000 [01:06<12:47,  1.19it/s]

  9%|████▉                                                    | 87/1000 [01:07<13:25,  1.13it/s]

  9%|█████                                                    | 88/1000 [01:08<11:38,  1.31it/s]

  9%|█████                                                    | 89/1000 [01:09<11:36,  1.31it/s]

  9%|█████▏                                                   | 90/1000 [01:09<10:30,  1.44it/s]

  9%|█████▏                                                   | 91/1000 [01:09<09:05,  1.67it/s]

  9%|█████▏                                                   | 92/1000 [01:10<09:00,  1.68it/s]

  9%|█████▎                                                   | 93/1000 [01:11<08:51,  1.71it/s]

  9%|█████▎                                                   | 94/1000 [01:11<09:25,  1.60it/s]

 10%|█████▍                                                   | 95/1000 [01:13<12:36,  1.20it/s]

 10%|█████▍                                                   | 96/1000 [01:13<11:59,  1.26it/s]

 10%|█████▌                                                   | 97/1000 [01:14<10:46,  1.40it/s]

 10%|█████▌                                                   | 98/1000 [01:15<12:02,  1.25it/s]

 10%|█████▋                                                   | 99/1000 [01:16<12:26,  1.21it/s]

 10%|█████▌                                                  | 100/1000 [01:17<12:45,  1.18it/s]

 10%|█████▋                                                  | 101/1000 [01:17<09:41,  1.55it/s]

 10%|█████▋                                                  | 102/1000 [01:17<09:00,  1.66it/s]

 10%|█████▊                                                  | 103/1000 [01:19<11:39,  1.28it/s]

 10%|█████▊                                                  | 104/1000 [01:20<14:02,  1.06it/s]

 10%|█████▉                                                  | 105/1000 [01:21<13:30,  1.10it/s]

 11%|█████▉                                                  | 106/1000 [01:22<14:55,  1.00s/it]

 11%|█████▉                                                  | 107/1000 [01:23<14:37,  1.02it/s]

 11%|██████                                                  | 108/1000 [01:24<13:50,  1.07it/s]

 11%|██████                                                  | 109/1000 [01:24<10:25,  1.42it/s]

 11%|██████▏                                                 | 110/1000 [01:24<08:19,  1.78it/s]

 11%|██████▏                                                 | 111/1000 [01:24<06:44,  2.20it/s]

 11%|██████▎                                                 | 112/1000 [01:25<05:48,  2.55it/s]

 11%|██████▎                                                 | 113/1000 [01:25<04:51,  3.04it/s]

 11%|██████▍                                                 | 114/1000 [01:25<04:53,  3.02it/s]

 12%|██████▍                                                 | 115/1000 [01:25<04:11,  3.52it/s]

 12%|██████▍                                                 | 116/1000 [01:25<03:42,  3.97it/s]

 12%|██████▌                                                 | 117/1000 [01:26<03:33,  4.14it/s]

 12%|██████▌                                                 | 118/1000 [01:26<03:14,  4.52it/s]

 12%|██████▋                                                 | 119/1000 [01:26<03:03,  4.80it/s]

 12%|██████▋                                                 | 120/1000 [01:26<02:55,  5.02it/s]

 12%|██████▊                                                 | 121/1000 [01:26<03:00,  4.87it/s]

 12%|██████▊                                                 | 122/1000 [01:27<02:53,  5.06it/s]

 12%|██████▉                                                 | 123/1000 [01:27<02:48,  5.19it/s]

 12%|██████▉                                                 | 124/1000 [01:27<02:51,  5.11it/s]

 12%|███████                                                 | 125/1000 [01:27<02:46,  5.27it/s]

 13%|███████                                                 | 126/1000 [01:27<02:51,  5.11it/s]

 13%|███████                                                 | 127/1000 [01:28<03:08,  4.63it/s]

 13%|███████▏                                                | 128/1000 [01:28<03:00,  4.84it/s]

 13%|███████▏                                                | 129/1000 [01:28<02:52,  5.06it/s]

 13%|███████▎                                                | 130/1000 [01:28<02:58,  4.88it/s]

 13%|███████▎                                                | 131/1000 [01:28<02:52,  5.04it/s]

 13%|███████▍                                                | 132/1000 [01:29<02:47,  5.19it/s]

 13%|███████▍                                                | 133/1000 [01:29<03:04,  4.69it/s]

 13%|███████▌                                                | 134/1000 [01:29<02:55,  4.94it/s]

 14%|███████▌                                                | 135/1000 [01:29<03:26,  4.18it/s]

 14%|███████▌                                                | 136/1000 [01:29<03:19,  4.32it/s]

 14%|███████▋                                                | 137/1000 [01:30<03:05,  4.64it/s]

 14%|███████▋                                                | 138/1000 [01:30<02:59,  4.80it/s]

 14%|███████▊                                                | 139/1000 [01:30<02:51,  5.03it/s]

 14%|███████▊                                                | 140/1000 [01:30<02:44,  5.22it/s]

 14%|███████▉                                                | 141/1000 [01:30<02:44,  5.23it/s]

 14%|███████▉                                                | 142/1000 [01:31<02:40,  5.34it/s]

 14%|████████                                                | 143/1000 [01:31<02:44,  5.21it/s]

 14%|████████                                                | 144/1000 [01:31<02:51,  4.98it/s]

 14%|████████                                                | 145/1000 [01:31<03:29,  4.09it/s]

 15%|████████▏                                               | 146/1000 [01:32<03:11,  4.46it/s]

 15%|████████▏                                               | 147/1000 [01:32<03:19,  4.28it/s]

 15%|████████▎                                               | 148/1000 [01:32<03:25,  4.15it/s]

 15%|████████▎                                               | 149/1000 [01:32<04:00,  3.54it/s]

 15%|████████▍                                               | 150/1000 [01:33<03:36,  3.93it/s]

 15%|████████▍                                               | 151/1000 [01:33<03:40,  3.86it/s]

 15%|████████▌                                               | 152/1000 [01:33<03:32,  4.00it/s]

 15%|████████▌                                               | 153/1000 [01:33<03:56,  3.59it/s]

 15%|████████▌                                               | 154/1000 [01:34<03:33,  3.95it/s]

 16%|████████▋                                               | 155/1000 [01:34<04:24,  3.19it/s]

 16%|████████▋                                               | 156/1000 [01:34<04:08,  3.40it/s]

 16%|████████▊                                               | 157/1000 [01:35<04:36,  3.05it/s]

 16%|████████▊                                               | 158/1000 [01:35<05:00,  2.81it/s]

 16%|████████▉                                               | 159/1000 [01:36<04:51,  2.88it/s]

 16%|████████▉                                               | 160/1000 [01:36<05:00,  2.80it/s]

 16%|█████████                                               | 161/1000 [01:36<04:34,  3.06it/s]

 16%|█████████                                               | 162/1000 [01:36<04:23,  3.18it/s]

 16%|█████████▏                                              | 163/1000 [01:37<04:07,  3.38it/s]

 16%|█████████▏                                              | 164/1000 [01:37<03:40,  3.79it/s]

 16%|█████████▏                                              | 165/1000 [01:37<04:10,  3.33it/s]

 17%|█████████▎                                              | 166/1000 [01:38<04:14,  3.28it/s]

 17%|█████████▎                                              | 167/1000 [01:38<03:47,  3.65it/s]

 17%|█████████▍                                              | 168/1000 [01:38<03:50,  3.61it/s]

 17%|█████████▍                                              | 169/1000 [01:38<03:36,  3.84it/s]

 17%|█████████▌                                              | 170/1000 [01:39<03:51,  3.58it/s]

 17%|█████████▌                                              | 171/1000 [01:39<03:26,  4.02it/s]

 17%|█████████▋                                              | 172/1000 [01:39<03:07,  4.41it/s]

 17%|█████████▋                                              | 173/1000 [01:39<03:13,  4.28it/s]

 17%|█████████▋                                              | 174/1000 [01:39<03:19,  4.14it/s]

 18%|█████████▊                                              | 175/1000 [01:40<03:19,  4.15it/s]

 18%|█████████▊                                              | 176/1000 [01:40<03:03,  4.49it/s]

 18%|█████████▉                                              | 177/1000 [01:40<02:58,  4.62it/s]

 18%|█████████▉                                              | 178/1000 [01:40<03:13,  4.26it/s]

 18%|██████████                                              | 179/1000 [01:41<02:58,  4.59it/s]

 18%|██████████                                              | 180/1000 [01:41<02:50,  4.80it/s]

 18%|██████████▏                                             | 181/1000 [01:41<03:17,  4.16it/s]

 18%|██████████▏                                             | 182/1000 [01:41<03:27,  3.95it/s]

 18%|██████████▏                                             | 183/1000 [01:42<03:21,  4.05it/s]

 18%|██████████▎                                             | 184/1000 [01:42<03:04,  4.42it/s]

 18%|██████████▎                                             | 185/1000 [01:42<03:09,  4.30it/s]

 19%|██████████▍                                             | 186/1000 [01:42<03:17,  4.12it/s]

 19%|██████████▍                                             | 187/1000 [01:43<03:23,  4.00it/s]

 19%|██████████▌                                             | 188/1000 [01:43<03:02,  4.45it/s]

 19%|██████████▌                                             | 189/1000 [01:43<03:06,  4.34it/s]

 19%|██████████▋                                             | 190/1000 [01:43<03:16,  4.13it/s]

 19%|██████████▋                                             | 191/1000 [01:43<03:00,  4.47it/s]

 19%|██████████▊                                             | 192/1000 [01:44<02:53,  4.65it/s]

 19%|██████████▊                                             | 193/1000 [01:44<03:07,  4.31it/s]

 19%|██████████▊                                             | 194/1000 [01:44<03:12,  4.18it/s]

 20%|██████████▉                                             | 195/1000 [01:44<03:30,  3.82it/s]

 20%|██████████▉                                             | 196/1000 [01:45<03:09,  4.25it/s]

 20%|███████████                                             | 197/1000 [01:45<03:30,  3.81it/s]

 20%|███████████                                             | 198/1000 [01:45<03:48,  3.51it/s]

 20%|███████████▏                                            | 199/1000 [01:46<03:48,  3.50it/s]

 20%|███████████▏                                            | 200/1000 [01:46<03:36,  3.70it/s]

 20%|███████████▎                                            | 201/1000 [01:46<03:37,  3.67it/s]

 20%|███████████▎                                            | 202/1000 [01:46<03:36,  3.69it/s]

 20%|███████████▎                                            | 203/1000 [01:47<03:38,  3.65it/s]

 20%|███████████▍                                            | 204/1000 [01:47<03:57,  3.36it/s]

 20%|███████████▍                                            | 205/1000 [01:47<04:26,  2.99it/s]

 21%|███████████▌                                            | 206/1000 [01:48<03:50,  3.44it/s]

 21%|███████████▌                                            | 207/1000 [01:48<03:59,  3.31it/s]

 21%|███████████▋                                            | 208/1000 [01:48<04:10,  3.17it/s]

 21%|███████████▋                                            | 209/1000 [01:48<03:53,  3.39it/s]

 21%|███████████▊                                            | 210/1000 [01:49<03:52,  3.39it/s]

 21%|███████████▊                                            | 211/1000 [01:49<04:18,  3.05it/s]

 21%|███████████▊                                            | 212/1000 [01:49<03:56,  3.33it/s]

 21%|███████████▉                                            | 213/1000 [01:50<03:35,  3.65it/s]

 21%|███████████▉                                            | 214/1000 [01:50<03:26,  3.81it/s]

 22%|████████████                                            | 215/1000 [01:50<03:07,  4.19it/s]

 22%|████████████                                            | 216/1000 [01:50<02:59,  4.38it/s]

 22%|████████████▏                                           | 217/1000 [01:51<03:05,  4.23it/s]

 22%|████████████▏                                           | 218/1000 [01:51<03:28,  3.76it/s]

 22%|████████████▎                                           | 219/1000 [01:51<03:06,  4.19it/s]

 22%|████████████▎                                           | 220/1000 [01:51<03:32,  3.67it/s]

 22%|████████████▍                                           | 221/1000 [01:52<03:27,  3.76it/s]

 22%|████████████▍                                           | 222/1000 [01:52<03:07,  4.15it/s]

 22%|████████████▍                                           | 223/1000 [01:52<02:52,  4.50it/s]

 22%|████████████▌                                           | 224/1000 [01:52<03:02,  4.26it/s]

 22%|████████████▌                                           | 225/1000 [01:53<03:23,  3.81it/s]

 23%|████████████▋                                           | 226/1000 [01:53<03:41,  3.50it/s]

 23%|████████████▋                                           | 227/1000 [01:53<03:29,  3.69it/s]

 23%|████████████▊                                           | 228/1000 [01:53<03:27,  3.72it/s]

 23%|████████████▊                                           | 229/1000 [01:54<03:01,  4.24it/s]

 23%|████████████▉                                           | 230/1000 [01:54<03:19,  3.85it/s]

 23%|████████████▉                                           | 231/1000 [01:54<03:01,  4.23it/s]

 23%|████████████▉                                           | 232/1000 [01:54<02:47,  4.60it/s]

 23%|█████████████                                           | 233/1000 [01:54<02:37,  4.88it/s]

 23%|█████████████                                           | 234/1000 [01:55<02:31,  5.06it/s]

 24%|█████████████▏                                          | 235/1000 [01:55<02:25,  5.27it/s]

 24%|█████████████▏                                          | 236/1000 [01:55<02:23,  5.32it/s]

 24%|█████████████▎                                          | 237/1000 [01:55<02:21,  5.39it/s]

 24%|█████████████▎                                          | 238/1000 [01:55<02:26,  5.22it/s]

 24%|█████████████▍                                          | 239/1000 [01:56<02:19,  5.44it/s]

 24%|█████████████▍                                          | 240/1000 [01:56<02:19,  5.47it/s]

 24%|█████████████▍                                          | 241/1000 [01:56<02:17,  5.53it/s]

 24%|█████████████▌                                          | 242/1000 [01:56<03:03,  4.14it/s]

 24%|█████████████▌                                          | 243/1000 [01:56<02:47,  4.53it/s]

 24%|█████████████▋                                          | 244/1000 [01:57<02:54,  4.33it/s]

 24%|█████████████▋                                          | 245/1000 [01:57<02:42,  4.66it/s]

 25%|█████████████▊                                          | 246/1000 [01:57<03:17,  3.82it/s]

 25%|█████████████▊                                          | 247/1000 [01:58<03:41,  3.40it/s]

 25%|█████████████▉                                          | 248/1000 [01:58<03:22,  3.72it/s]

 25%|█████████████▉                                          | 249/1000 [01:58<03:01,  4.15it/s]

 25%|██████████████                                          | 250/1000 [01:58<02:48,  4.45it/s]

 25%|██████████████                                          | 251/1000 [01:58<02:43,  4.57it/s]

 25%|██████████████                                          | 252/1000 [01:59<02:34,  4.83it/s]

 25%|██████████████▏                                         | 253/1000 [01:59<03:14,  3.84it/s]

 25%|██████████████▏                                         | 254/1000 [01:59<03:44,  3.32it/s]

 26%|██████████████▎                                         | 255/1000 [02:00<03:46,  3.29it/s]

 26%|██████████████▎                                         | 256/1000 [02:00<03:35,  3.45it/s]

 26%|██████████████▍                                         | 257/1000 [02:00<03:08,  3.94it/s]

 26%|██████████████▍                                         | 258/1000 [02:00<02:52,  4.31it/s]

 26%|██████████████▌                                         | 259/1000 [02:00<02:40,  4.61it/s]

 26%|██████████████▌                                         | 260/1000 [02:01<02:38,  4.68it/s]

 26%|██████████████▌                                         | 261/1000 [02:01<02:49,  4.35it/s]

 26%|██████████████▋                                         | 262/1000 [02:01<02:50,  4.33it/s]

 26%|██████████████▋                                         | 263/1000 [02:01<02:39,  4.61it/s]

 26%|██████████████▊                                         | 264/1000 [02:02<02:34,  4.75it/s]

 26%|██████████████▊                                         | 265/1000 [02:02<02:26,  5.00it/s]

 27%|██████████████▉                                         | 266/1000 [02:02<02:23,  5.13it/s]

 27%|██████████████▉                                         | 267/1000 [02:02<02:53,  4.22it/s]

 27%|███████████████                                         | 268/1000 [02:02<02:39,  4.59it/s]

 27%|███████████████                                         | 269/1000 [02:03<02:34,  4.73it/s]

 27%|███████████████                                         | 270/1000 [02:03<02:26,  5.00it/s]

 27%|███████████████▏                                        | 271/1000 [02:03<03:00,  4.03it/s]

 27%|███████████████▏                                        | 272/1000 [02:03<02:45,  4.41it/s]

 27%|███████████████▎                                        | 273/1000 [02:04<02:56,  4.11it/s]

 27%|███████████████▎                                        | 274/1000 [02:04<03:37,  3.33it/s]

 28%|███████████████▍                                        | 275/1000 [02:04<03:46,  3.20it/s]

 28%|███████████████▍                                        | 276/1000 [02:05<03:59,  3.02it/s]

 28%|███████████████▌                                        | 277/1000 [02:05<04:01,  3.00it/s]

 28%|███████████████▌                                        | 278/1000 [02:05<04:12,  2.86it/s]

 28%|███████████████▌                                        | 279/1000 [02:06<03:50,  3.13it/s]

 28%|███████████████▋                                        | 280/1000 [02:06<03:19,  3.60it/s]

 28%|███████████████▋                                        | 281/1000 [02:06<02:57,  4.04it/s]

 28%|███████████████▊                                        | 282/1000 [02:06<03:11,  3.75it/s]

 28%|███████████████▊                                        | 283/1000 [02:07<03:14,  3.68it/s]

 28%|███████████████▉                                        | 284/1000 [02:07<03:32,  3.37it/s]

 28%|███████████████▉                                        | 285/1000 [02:07<03:15,  3.65it/s]

 29%|████████████████                                        | 286/1000 [02:07<02:54,  4.09it/s]

 29%|████████████████                                        | 287/1000 [02:08<02:39,  4.47it/s]

 29%|████████████████▏                                       | 288/1000 [02:08<02:32,  4.67it/s]

 29%|████████████████▏                                       | 289/1000 [02:08<02:45,  4.29it/s]

 29%|████████████████▏                                       | 290/1000 [02:08<02:48,  4.21it/s]

 29%|████████████████▎                                       | 291/1000 [02:09<02:49,  4.19it/s]

 29%|████████████████▎                                       | 292/1000 [02:09<02:50,  4.15it/s]

 29%|████████████████▍                                       | 293/1000 [02:09<02:59,  3.94it/s]

 29%|████████████████▍                                       | 294/1000 [02:09<02:58,  3.96it/s]

 30%|████████████████▌                                       | 295/1000 [02:10<02:46,  4.23it/s]

 30%|████████████████▌                                       | 296/1000 [02:10<02:51,  4.11it/s]

 30%|████████████████▋                                       | 297/1000 [02:10<02:49,  4.16it/s]

 30%|████████████████▋                                       | 298/1000 [02:10<02:48,  4.16it/s]

 30%|████████████████▋                                       | 299/1000 [02:10<02:49,  4.13it/s]

 30%|████████████████▊                                       | 300/1000 [02:11<03:03,  3.81it/s]

 30%|████████████████▊                                       | 301/1000 [02:11<02:46,  4.20it/s]

 30%|████████████████▉                                       | 302/1000 [02:11<02:49,  4.12it/s]

 30%|████████████████▉                                       | 303/1000 [02:11<02:39,  4.38it/s]

 30%|█████████████████                                       | 304/1000 [02:12<02:56,  3.95it/s]

 30%|█████████████████                                       | 305/1000 [02:12<02:45,  4.19it/s]

 31%|█████████████████▏                                      | 306/1000 [02:12<02:33,  4.52it/s]

 31%|█████████████████▏                                      | 307/1000 [02:13<03:10,  3.65it/s]

 31%|█████████████████▏                                      | 308/1000 [02:13<03:31,  3.27it/s]

 31%|█████████████████▎                                      | 309/1000 [02:13<03:29,  3.30it/s]

 31%|█████████████████▎                                      | 310/1000 [02:14<03:40,  3.13it/s]

 31%|█████████████████▍                                      | 311/1000 [02:14<03:16,  3.50it/s]

 31%|█████████████████▍                                      | 312/1000 [02:14<03:22,  3.39it/s]

 31%|█████████████████▌                                      | 313/1000 [02:14<03:31,  3.25it/s]

 31%|█████████████████▌                                      | 314/1000 [02:15<03:52,  2.95it/s]

 32%|█████████████████▋                                      | 315/1000 [02:15<03:55,  2.91it/s]

 32%|█████████████████▋                                      | 316/1000 [02:15<03:42,  3.07it/s]

 32%|█████████████████▊                                      | 317/1000 [02:16<04:00,  2.84it/s]

 32%|█████████████████▊                                      | 318/1000 [02:16<03:24,  3.33it/s]

 32%|█████████████████▊                                      | 319/1000 [02:16<03:13,  3.52it/s]

 32%|█████████████████▉                                      | 320/1000 [02:17<03:01,  3.74it/s]

 32%|█████████████████▉                                      | 321/1000 [02:17<03:20,  3.39it/s]

 32%|██████████████████                                      | 322/1000 [02:17<03:21,  3.36it/s]

 32%|██████████████████                                      | 323/1000 [02:17<03:13,  3.49it/s]

 32%|██████████████████▏                                     | 324/1000 [02:18<03:14,  3.47it/s]

 32%|██████████████████▏                                     | 325/1000 [02:18<03:09,  3.56it/s]

 33%|██████████████████▎                                     | 326/1000 [02:18<02:48,  4.00it/s]

 33%|██████████████████▎                                     | 327/1000 [02:18<02:50,  3.95it/s]

 33%|██████████████████▎                                     | 328/1000 [02:19<02:48,  3.98it/s]

 33%|██████████████████▍                                     | 329/1000 [02:19<02:35,  4.32it/s]

 33%|██████████████████▍                                     | 330/1000 [02:19<02:24,  4.64it/s]

 33%|██████████████████▌                                     | 331/1000 [02:19<02:19,  4.79it/s]

 33%|██████████████████▌                                     | 332/1000 [02:20<02:25,  4.58it/s]

 33%|██████████████████▋                                     | 333/1000 [02:20<02:17,  4.87it/s]

 33%|██████████████████▋                                     | 334/1000 [02:20<02:10,  5.09it/s]

 33%|██████████████████▋                                     | 334/1000 [02:20<04:39,  2.38it/s]




In [None]:
# Report, per genotype, the average of the sampled Mahalanobis distances collected
# over the trials (rounded to the nearest whole number).
# NOTE(review): presumably each list holds one mean distance per trial — verify
# against the sampling loop that fills them.
for genotype_label, sampled_trial_distances in (
    ("high", mean_high_severity_sampled_mahalanobis_distances_from_trials),
    ("low", mean_low_severity_sampled_mahalanobis_distances_from_trials),
    ("wt", mean_wt_sampled_mahalanobis_distances_from_trials),
):
    # the spelling "Mahalanobis" matches the rest of the notebook's output
    print(
        f"The {genotype_label} Mahalanobis distance for {num_trials} trials is: "
        f"{round(np.mean(sampled_trial_distances), 0)}"
    )

### Show the Mahalanobis distance for each genotype

In [None]:
# Print the mean Mahalanobis distance of each genotype, one line per genotype.
actual_mean_distance_by_genotype = {
    "High-Severity": mean_high_severity_mahalanobis_distance,
    "Low-Severity": mean_low_severity_mahalanobis_distance,
    "Wild Type": mean_wt_mahalanobis_distance,
}
for genotype_label, mean_distance in actual_mean_distance_by_genotype.items():
    # two positional arguments: print() inserts its own separator before the value
    print(f"Mahalanobis distance for {genotype_label}: ", mean_distance)

In [None]:
# Two-sample t-test per genotype: the distances of the actual points versus the
# distances collected from the sampled trials. Only the p-value is kept.
# NOTE(review): ttest_ind is called with its defaults (no keyword arguments);
# confirm that the equal-variance assumption is acceptable for these two groups.

high_severity_ttest_result = ttest_ind(
    high_severity_mahalanobis_distances,
    mean_high_severity_sampled_mahalanobis_distances_from_trials,
)
high_severity_p_value = high_severity_ttest_result.pvalue

low_severity_ttest_result = ttest_ind(
    low_severity_mahalanobis_distances,
    mean_low_severity_sampled_mahalanobis_distances_from_trials,
)
low_severity_p_value = low_severity_ttest_result.pvalue

wt_ttest_result = ttest_ind(
    wt_mahalanobis_distances,
    mean_wt_sampled_mahalanobis_distances_from_trials,
)
wt_p_value = wt_ttest_result.pvalue

# one report line per genotype
for genotype_label, genotype_p_value in (
    ("High-Severity", high_severity_p_value),
    ("Low-Severity", low_severity_p_value),
    ("Wild Type", wt_p_value),
):
    print(
        "The p-value for the difference between the Mahalanobis distance of the "
        f"sampled points and the actual points for the {genotype_label} genotype "
        f"is {genotype_p_value}"
    )

In [None]:
# One-way ANOVA on the per-point Mahalanobis distances of the three genotypes
anova_result_across_genotypes = anova(
    high_severity_mahalanobis_distances,
    low_severity_mahalanobis_distances,
    wt_mahalanobis_distances,
)

print(
    f"The p-value for the ANOVA across the genotypes is {anova_result_across_genotypes.pvalue}"
)

# Tukey's HSD post-hoc test: all distances in one vector, with a parallel vector
# of group labels of the same length
tukeys_result_across_genotypes = pairwise_tukeyhsd(
    np.concatenate(
        [
            high_severity_mahalanobis_distances,
            low_severity_mahalanobis_distances,
            wt_mahalanobis_distances,
        ]
    ),
    np.concatenate(
        [
            ["High Severity"] * len(high_severity_mahalanobis_distances),
            ["Low Severity"] * len(low_severity_mahalanobis_distances),
            ["Wild Type"] * len(wt_mahalanobis_distances),
        ]
    ),
)

# Convert the Tukey HSD summary table to a dataframe through the public
# `summary()` accessor instead of the private `_results_table` attribute.
# The first row of the table holds the column headers.
# NOTE(review): the values in the summary table appear to be rounded for display;
# confirm whether full-precision p-values are needed downstream.
tukeys_summary_rows = tukeys_result_across_genotypes.summary().data
tukeys_result_across_genotypes_df = pd.DataFrame(
    data=tukeys_summary_rows[1:],
    columns=tukeys_summary_rows[0],
)
tukeys_result_across_genotypes_df

### Write the Mahalanobis distance stats to a file

In [None]:
# Output location for the summary table; the directory is created on demand
mahalanobis_output_dir = pathlib.Path("../results/mean_aggregated_results/").resolve()
mahalanobis_output_dir.mkdir(parents=True, exist_ok=True)

mahalanobis_output_file_path = (
    mahalanobis_output_dir / "mean_aggregated_mahalanobis_distance_results.csv"
).resolve()

# Adjusted p-values of the three pairwise Tukey comparisons, picked by row position.
# NOTE(review): this relies on the Tukey table listing the pairs in the order
# High/Low, High/Wild Type, Low/Wild Type — verify against the table shown above.
tukey_p_adj = tukeys_result_across_genotypes_df["p-adj"]
high_vs_low_p_adj = tukey_p_adj.loc[0]
high_vs_wt_p_adj = tukey_p_adj.loc[1]
low_vs_wt_p_adj = tukey_p_adj.loc[2]

# Mean of the sampled distances per genotype, in High / Low / Wild Type order
sampled_mean_distances = [
    np.mean(sampled_trial_distances)
    for sampled_trial_distances in (
        mean_high_severity_sampled_mahalanobis_distances_from_trials,
        mean_low_severity_sampled_mahalanobis_distances_from_trials,
        mean_wt_sampled_mahalanobis_distances_from_trials,
    )
]

# One row per genotype; a genotype compared with itself is recorded as "NA"
mahalanobis_results_df = pd.DataFrame(
    {
        "Genotype": ["High-Severity", "Low-Severity", "Wild Type"],
        "Actual Mean Mahalanobis Distance": [
            mean_high_severity_mahalanobis_distance,
            mean_low_severity_mahalanobis_distance,
            mean_wt_mahalanobis_distance,
        ],
        "Sampled Mean Mahalanobis Distance": sampled_mean_distances,
        "p-Value for Actual compared to sampled": [
            high_severity_p_value,
            low_severity_p_value,
            wt_p_value,
        ],
        "ANOVA Compared to High-Severity p-adj": [
            "NA",
            high_vs_low_p_adj,
            high_vs_wt_p_adj,
        ],
        "ANOVA Compared to Low-Severity p-adj": [
            high_vs_low_p_adj,
            "NA",
            low_vs_wt_p_adj,
        ],
        "ANOVA Compared to Wild Type p-adj": [
            high_vs_wt_p_adj,
            low_vs_wt_p_adj,
            "NA",
        ],
    }
)
mahalanobis_results_df

In [None]:
# Persist the compiled summary table as CSV, leaving out the dataframe index
mahalanobis_results_df.to_csv(
    path_or_buf=mahalanobis_output_file_path,
    index=False,
)

#### Visualization of the last trial's sampled points

In [None]:
# annotate the genotypes of the sampled points
# NOTE(review): the low-severity samples are labelled "Mid-Severity" here, while the
# summary table written by this notebook uses "Low-Severity" — confirm which label
# matches the values in mean_aggregated_data_pca["Metadata_genotype"].
high_severity_sampled_points["Metadata_genotype"] = "High-Severity"
low_severity_sampled_points["Metadata_genotype"] = "Mid-Severity"
wt_sampled_points["Metadata_genotype"] = "Wild Type"
# concat the genotype sampled points
sampled_points = pd.concat(
    [wt_sampled_points, low_severity_sampled_points, high_severity_sampled_points]
)

# Build one label -> color mapping shared by both panels. Without it each panel
# assigns colors by the order in which labels appear in its own data, so the same
# genotype can be drawn in different colors in the two panels.
genotype_labels = (
    pd.concat(
        [
            mean_aggregated_data_pca["Metadata_genotype"],
            sampled_points["Metadata_genotype"],
        ]
    )
    .dropna()
    .unique()
)
genotype_palette = dict(
    zip(genotype_labels, sns.color_palette(n_colors=len(genotype_labels)))
)

# First two PCA components: actual data (left) next to the sampled data (right)
fig, (real_ax, sampled_ax) = plt.subplots(1, 2, figsize=(10, 5))
for panel_ax, panel_points, panel_title in (
    (real_ax, mean_aggregated_data_pca, "Real PCA"),
    (sampled_ax, sampled_points, "Sampled PCA"),
):
    sns.scatterplot(
        x="PC1",
        y="PC2",
        data=panel_points,
        hue="Metadata_genotype",
        palette=genotype_palette,
        alpha=0.5,
        ax=panel_ax,
    )
    panel_ax.set_title(panel_title)
plt.show()