This notebook calculates the Mahalanobis distance between points in PCA space.
I will document more about what Mahalanobis distance is and how it is calculated in this notebook.

In [1]:
import itertools
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tqdm
from scipy.spatial.distance import mahalanobis
from scipy.spatial.distance import pdist
from scipy.stats import f_oneway as anova
from scipy.stats import ttest_ind
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [2]:
# Locate the mean-aggregated PCA table; strict resolution raises if the file is missing
mean_aggregated_data_pca_path = pathlib.Path(
    "../../data/6.analysis_results/mean_aggregated_pca.parquet"
)
mean_aggregated_data_pca_path = mean_aggregated_data_pca_path.resolve(strict=True)

# Load the table, report its dimensions, and preview the first rows
mean_aggregated_data_pca = pd.read_parquet(path=mean_aggregated_data_pca_path)
print(mean_aggregated_data_pca.shape)
mean_aggregated_data_pca.head()

(83, 86)


Unnamed: 0,Metadata_genotype,Metadata_replicate,Metadata_side,PC1,PC2,PC3,PC4,PC5,PC6,PC7,...,PC74,PC75,PC76,PC77,PC78,PC79,PC80,PC81,PC82,PC83
0,High-Severity,1,L,-7.71026,-4.264421,1.521397,-1.390342,9.41062,-6.054486,1.336078,...,0.213169,0.034717,0.129185,-0.14664,-0.081667,-0.075119,-0.020639,0.058634,0.028028,9.906943e-16
1,High-Severity,1,R,0.6958,8.170808,-1.21778,-2.386591,-2.60968,1.468471,4.371038,...,-0.215559,-0.068413,-0.266401,-0.0235,-0.0542,-0.023511,0.119934,0.145838,0.035305,-8.556710000000001e-17
2,High-Severity,10,L,6.910961,11.25212,-2.831527,-3.131419,-2.634531,3.551757,-2.684211,...,-0.075988,-0.123323,-0.210824,0.063699,0.14891,0.04104,0.020007,0.161693,-0.143457,-4.445563e-16
3,High-Severity,10,R,2.179877,6.184598,-3.46374,-0.260566,-3.067898,1.597567,2.061996,...,0.416289,0.348893,0.15141,0.326023,-0.207744,0.234788,-0.024102,-0.15051,0.049366,3.319675e-17
4,High-Severity,11,L,1.33669,9.305025,3.27923,0.938479,6.459164,-4.94401,-1.393619,...,-0.028287,0.092703,0.038004,0.095451,0.043627,0.006402,-0.030578,-0.017234,0.008486,-1.12952e-15


# Mahalanobis Distance
For more in depth information on Mahalanobis distance, please refer to this [link](https://medium.com/@the_daft_introvert/mahalanobis-distance-5c11a757b099).
Mahalanobis distance is a measure of the distance between a point P and a distribution D. 
It is a multi-dimensional generalization of the idea of measuring how many standard deviations away P is from the mean of D. 
This distance is zero if P is at the mean of D, and grows as P moves away from the mean along each principal component axis.
The formula for Mahalanobis distance is given by:
### $D^2 = (x - \mu)^T \Sigma^{-1} (x - \mu)$
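where:
- $D$ is the Mahalanobis distance
- $x$ is the point being measured
- $\mu$ is the mean of the distribution
- $\Sigma^{-1}$ is the inverse of the covariance matrix of the distribution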

## Calculating Mahalanobis Distance on PCA 
Here I calculate the pairwise Mahalanobis distance between the points within each of the three genotypes. Note that the code below uses every principal component column in the table, not only the first two.

In [3]:
# Partition the PCA table into one frame per genotype label
genotype_labels = mean_aggregated_data_pca["Metadata_genotype"]
high_severity = mean_aggregated_data_pca[genotype_labels == "High-Severity"]
# NOTE(review): this variable is named low_severity but selects the "Mid-Severity" label
low_severity = mean_aggregated_data_pca[genotype_labels == "Mid-Severity"]
wt = mean_aggregated_data_pca[genotype_labels == "Wild Type"]
# report the number of rows in each genotype
print(*(len(genotype_df) for genotype_df in (high_severity, low_severity, wt)))

28 27 28


In [4]:
# Remove every column whose name contains "Metadata" so only PCA components remain
is_metadata_column = high_severity.columns.str.contains("Metadata")
metadata_columns = high_severity.columns[is_metadata_column]
high_severity = high_severity.drop(columns=metadata_columns)
low_severity = low_severity.drop(columns=metadata_columns)
wt = wt.drop(columns=metadata_columns)

# Coordinates as plain arrays: one row per observation, one column per component
high_severity_coords = high_severity.to_numpy()
low_severity_coords = low_severity.to_numpy()
wt_coords = wt.to_numpy()
print(high_severity_coords.shape, low_severity_coords.shape, wt_coords.shape)

(28, 83) (27, 83) (28, 83)


#### High-Severity

In [5]:
# Pairwise Mahalanobis distances within the High-Severity genotype.
# The inverse covariance matrix is estimated from the High-Severity points themselves.
cov_matrix = np.cov(high_severity_coords, rowvar=False)
# NOTE(review): the shapes printed above show fewer rows than columns, so the
# sample covariance cannot be full rank. Inverting it is numerically unstable and
# can give negative squared distances (NaN after the square root). Report the
# problem instead of hiding it; the computation itself is left unchanged.
cov_rank = np.linalg.matrix_rank(cov_matrix)
if cov_rank < cov_matrix.shape[0]:
    print(
        f"WARNING: the High-Severity covariance matrix is rank deficient "
        f"(rank {cov_rank} of {cov_matrix.shape[0]}); the inverse is unreliable"
    )
inv_cov_matrix = np.linalg.inv(cov_matrix)

# distance for every unordered pair of points; NaN results are dropped but counted
high_severity_mahalanobis_distances = []
num_skipped_pairs = 0
for point_a, point_b in itertools.combinations(high_severity_coords, 2):
    distance = mahalanobis(point_a, point_b, inv_cov_matrix)
    if np.isnan(distance):
        num_skipped_pairs += 1
        continue
    high_severity_mahalanobis_distances.append(distance)
if num_skipped_pairs:
    print(
        f"WARNING: skipped {num_skipped_pairs} High-Severity point pairs "
        f"with a NaN Mahalanobis distance"
    )

mean_high_severity_mahalanobis_distance = np.mean(high_severity_mahalanobis_distances)
print(
    f"The mean mahalanobis distance for the High-Severity genotype is {mean_high_severity_mahalanobis_distance}"
)

The mean mahalanobis distance for the High-Severity genotype is 11.762056342410409


  return np.sqrt(m)


#### Low-Severity

In [6]:
# Pairwise Mahalanobis distances within the low-severity genotype.
# The inverse covariance matrix is estimated from the low-severity points themselves.
cov_matrix = np.cov(low_severity_coords, rowvar=False)
# NOTE(review): the shapes printed above show fewer rows than columns, so the
# sample covariance cannot be full rank. Inverting it is numerically unstable and
# can give negative squared distances (NaN after the square root). Report the
# problem instead of hiding it; the computation itself is left unchanged.
cov_rank = np.linalg.matrix_rank(cov_matrix)
if cov_rank < cov_matrix.shape[0]:
    print(
        f"WARNING: the Low-Severity covariance matrix is rank deficient "
        f"(rank {cov_rank} of {cov_matrix.shape[0]}); the inverse is unreliable"
    )
inv_cov_matrix = np.linalg.inv(cov_matrix)

# distance for every unordered pair of points; NaN results are dropped but counted
low_severity_mahalanobis_distances = []
num_skipped_pairs = 0
for point_a, point_b in itertools.combinations(low_severity_coords, 2):
    distance = mahalanobis(point_a, point_b, inv_cov_matrix)
    if np.isnan(distance):
        num_skipped_pairs += 1
        continue
    low_severity_mahalanobis_distances.append(distance)
if num_skipped_pairs:
    print(
        f"WARNING: skipped {num_skipped_pairs} Low-Severity point pairs "
        f"with a NaN Mahalanobis distance"
    )

mean_low_severity_mahalanobis_distance = np.mean(low_severity_mahalanobis_distances)
print(
    f"The mean mahalanobis distance for the low-Severity genotype is {mean_low_severity_mahalanobis_distance}"
)

The mean mahalanobis distance for the low-Severity genotype is 13.336934733749693


#### Wild Type

In [7]:
# Pairwise Mahalanobis distances within the Wild Type genotype.
# The inverse covariance matrix is estimated from the Wild Type points themselves.
cov_matrix = np.cov(wt_coords, rowvar=False)
# NOTE(review): the shapes printed above show fewer rows than columns, so the
# sample covariance cannot be full rank. Inverting it is numerically unstable and
# can give negative squared distances (NaN after the square root). Report the
# problem instead of hiding it; the computation itself is left unchanged.
cov_rank = np.linalg.matrix_rank(cov_matrix)
if cov_rank < cov_matrix.shape[0]:
    print(
        f"WARNING: the Wild Type covariance matrix is rank deficient "
        f"(rank {cov_rank} of {cov_matrix.shape[0]}); the inverse is unreliable"
    )
inv_cov_matrix = np.linalg.inv(cov_matrix)

# distance for every unordered pair of points; NaN results are dropped but counted
wt_mahalanobis_distances = []
num_skipped_pairs = 0
for point_a, point_b in itertools.combinations(wt_coords, 2):
    distance = mahalanobis(point_a, point_b, inv_cov_matrix)
    if np.isnan(distance):
        num_skipped_pairs += 1
        continue
    wt_mahalanobis_distances.append(distance)
if num_skipped_pairs:
    print(
        f"WARNING: skipped {num_skipped_pairs} Wild Type point pairs "
        f"with a NaN Mahalanobis distance"
    )

mean_wt_mahalanobis_distance = np.mean(wt_mahalanobis_distances)
print(
    f"The mean mahalanobis distance for the Wild Type genotype is {mean_wt_mahalanobis_distance}"
)

The mean mahalanobis distance for the Wild Type genotype is 10.91427607576582


## Calculate the Mahalanobis distance for random shuffled data points

In [8]:
# Summarise each genotype once; the "mean" and "std" rows describe the
# distribution of every PCA component and are used to draw synthetic points later
high_severity_summary = high_severity.describe()
low_severity_summary = low_severity.describe()
wt_summary = wt.describe()

# per-component standard deviations
high_severity_pca_component_distribution_std = high_severity_summary.loc["std"]
low_severity_pca_component_distribution_std = low_severity_summary.loc["std"]
wt_pca_component_distribution_std = wt_summary.loc["std"]

# per-component means
high_severity_pca_component_distribution_mean = high_severity_summary.loc["mean"]
low_severity_pca_component_distribution_mean = low_severity_summary.loc["mean"]
wt_pca_component_distribution_mean = wt_summary.loc["mean"]

In [9]:
# Null model: in every trial, draw `num_samples` synthetic points per genotype by
# sampling each PCA component independently from a normal distribution with that
# genotype's observed per-component mean and standard deviation, then record the
# mean pairwise Mahalanobis distance of the synthetic points.


def _sample_component_points(rng, component_means, component_stds, num_points, names):
    """Draw synthetic points whose components are independent normals.

    Parameters
    ----------
    rng : np.random.Generator
        Random generator used for the draws.
    component_means, component_stds : pd.Series
        Per-component mean and standard deviation, in component order.
    num_points : int
        Number of points (rows) to draw.
    names : list of str
        Column names for the returned frame, one per component.

    Returns
    -------
    pd.DataFrame
        Frame of shape (num_points, number of components).
    """
    # to_numpy() gives positional access; loc/scale broadcast across the rows
    samples = rng.normal(
        loc=component_means.to_numpy(),
        scale=component_stds.to_numpy(),
        size=(num_points, len(names)),
    )
    return pd.DataFrame(samples, columns=names)


def _mean_pairwise_mahalanobis(points):
    """Return the mean Mahalanobis distance over all unordered pairs of rows.

    Parameters
    ----------
    points : np.ndarray
        2D array with one observation per row and one feature per column.

    Returns
    -------
    float
        Mean pairwise distance, using the inverse sample covariance of `points`;
        NaN distances are excluded from the mean.
    """
    inv_cov_matrix = np.linalg.inv(np.cov(points, rowvar=False))
    distances = pdist(points, metric="mahalanobis", VI=inv_cov_matrix)
    return np.mean(distances[~np.isnan(distances)])


# set the output mean Mahalanobis distance lists per genotype (one entry per trial)
mean_high_severity_sampled_mahalanobis_distances_from_trials = []
mean_low_severity_sampled_mahalanobis_distances_from_trials = []
mean_wt_sampled_mahalanobis_distances_from_trials = []

num_pca_components = high_severity_pca_component_distribution_std.shape[0]
pca_component_names = [f"PC{i + 1}" for i in range(num_pca_components)]
# synthetic points per genotype per trial; keep this above the number of PCA
# components, otherwise the sample covariance of the synthetic points is singular
num_samples = 100
num_trials = 1000
# fixed seed so that a fresh "Restart & Run All" reproduces the same trials
random_seed = 0
rng = np.random.default_rng(random_seed)

for trial in tqdm.tqdm(range(num_trials)):
    # the frames from the final trial stay in scope for the visualization cell
    high_severity_sampled_points = _sample_component_points(
        rng,
        high_severity_pca_component_distribution_mean,
        high_severity_pca_component_distribution_std,
        num_samples,
        pca_component_names,
    )
    low_severity_sampled_points = _sample_component_points(
        rng,
        low_severity_pca_component_distribution_mean,
        low_severity_pca_component_distribution_std,
        num_samples,
        pca_component_names,
    )
    wt_sampled_points = _sample_component_points(
        rng,
        wt_pca_component_distribution_mean,
        wt_pca_component_distribution_std,
        num_samples,
        pca_component_names,
    )

    mean_high_severity_sampled_mahalanobis_distance = _mean_pairwise_mahalanobis(
        high_severity_sampled_points.to_numpy()
    )
    mean_low_severity_sampled_mahalanobis_distance = _mean_pairwise_mahalanobis(
        low_severity_sampled_points.to_numpy()
    )
    mean_wt_sampled_mahalanobis_distance = _mean_pairwise_mahalanobis(
        wt_sampled_points.to_numpy()
    )

    # append the average mahalanobis distance from the trial to the list per genotype
    mean_high_severity_sampled_mahalanobis_distances_from_trials.append(
        mean_high_severity_sampled_mahalanobis_distance
    )
    mean_low_severity_sampled_mahalanobis_distances_from_trials.append(
        mean_low_severity_sampled_mahalanobis_distance
    )
    mean_wt_sampled_mahalanobis_distances_from_trials.append(
        mean_wt_sampled_mahalanobis_distance
    )

  0%|                                                                              | 0/1000 [00:00<?, ?it/s]

  0%|                                                                      | 1/1000 [00:00<02:44,  6.08it/s]

  0%|▏                                                                     | 2/1000 [00:00<03:20,  4.98it/s]

  0%|▏                                                                     | 3/1000 [00:00<04:10,  3.98it/s]

  0%|▎                                                                     | 4/1000 [00:00<04:05,  4.05it/s]

  0%|▎                                                                     | 5/1000 [00:01<04:33,  3.64it/s]

  1%|▍                                                                     | 6/1000 [00:01<04:39,  3.55it/s]

  1%|▍                                                                     | 7/1000 [00:01<04:28,  3.70it/s]

  1%|▌                                                                     | 8/1000 [00:01<03:57,  4.18it/s]

  1%|▋                                                                     | 9/1000 [00:02<03:36,  4.57it/s]

  1%|▋                                                                    | 10/1000 [00:02<03:45,  4.39it/s]

  1%|▊                                                                    | 11/1000 [00:02<03:27,  4.77it/s]

  1%|▊                                                                    | 12/1000 [00:02<03:29,  4.72it/s]

  1%|▉                                                                    | 13/1000 [00:03<03:39,  4.50it/s]

  1%|▉                                                                    | 14/1000 [00:03<03:52,  4.25it/s]

  2%|█                                                                    | 15/1000 [00:03<03:53,  4.21it/s]

  2%|█                                                                    | 16/1000 [00:03<03:34,  4.59it/s]

  2%|█▏                                                                   | 17/1000 [00:03<03:21,  4.88it/s]

  2%|█▏                                                                   | 18/1000 [00:04<03:11,  5.12it/s]

  2%|█▎                                                                   | 19/1000 [00:04<03:31,  4.63it/s]

  2%|█▍                                                                   | 20/1000 [00:04<03:40,  4.45it/s]

  2%|█▍                                                                   | 21/1000 [00:04<03:45,  4.34it/s]

  2%|█▌                                                                   | 22/1000 [00:04<03:28,  4.69it/s]

  2%|█▌                                                                   | 23/1000 [00:05<03:16,  4.97it/s]

  2%|█▋                                                                   | 24/1000 [00:05<03:17,  4.94it/s]

  2%|█▋                                                                   | 25/1000 [00:05<03:35,  4.52it/s]

  3%|█▊                                                                   | 26/1000 [00:06<04:22,  3.72it/s]

  3%|█▊                                                                   | 27/1000 [00:06<04:20,  3.73it/s]

  3%|█▉                                                                   | 28/1000 [00:06<04:41,  3.45it/s]

  3%|██                                                                   | 29/1000 [00:06<04:28,  3.61it/s]

  3%|██                                                                   | 30/1000 [00:07<04:55,  3.28it/s]

  3%|██▏                                                                  | 31/1000 [00:07<05:09,  3.13it/s]

  3%|██▏                                                                  | 32/1000 [00:08<05:44,  2.81it/s]

  3%|██▎                                                                  | 33/1000 [00:08<05:46,  2.79it/s]

  3%|██▎                                                                  | 34/1000 [00:08<04:52,  3.30it/s]

  4%|██▍                                                                  | 35/1000 [00:08<05:03,  3.18it/s]

  4%|██▍                                                                  | 36/1000 [00:09<04:43,  3.40it/s]

  4%|██▌                                                                  | 37/1000 [00:09<04:33,  3.52it/s]

  4%|██▌                                                                  | 38/1000 [00:09<04:08,  3.87it/s]

  4%|██▋                                                                  | 39/1000 [00:09<04:06,  3.91it/s]

  4%|██▊                                                                  | 40/1000 [00:10<04:35,  3.48it/s]

  4%|██▊                                                                  | 41/1000 [00:10<04:20,  3.68it/s]

  4%|██▉                                                                  | 42/1000 [00:10<04:43,  3.38it/s]

  4%|██▉                                                                  | 43/1000 [00:11<04:29,  3.56it/s]

  4%|███                                                                  | 44/1000 [00:11<04:45,  3.35it/s]

  4%|███                                                                  | 45/1000 [00:11<04:26,  3.58it/s]

  5%|███▏                                                                 | 46/1000 [00:11<04:17,  3.71it/s]

  5%|███▏                                                                 | 47/1000 [00:12<04:04,  3.89it/s]

  5%|███▎                                                                 | 48/1000 [00:12<04:24,  3.60it/s]

  5%|███▍                                                                 | 49/1000 [00:12<04:40,  3.38it/s]

  5%|███▍                                                                 | 50/1000 [00:13<04:54,  3.22it/s]

  5%|███▌                                                                 | 51/1000 [00:13<04:16,  3.70it/s]

  5%|███▌                                                                 | 52/1000 [00:13<04:10,  3.78it/s]

  5%|███▋                                                                 | 53/1000 [00:13<04:47,  3.29it/s]

  5%|███▋                                                                 | 54/1000 [00:14<04:11,  3.77it/s]

  6%|███▊                                                                 | 55/1000 [00:14<04:07,  3.82it/s]

  6%|███▊                                                                 | 56/1000 [00:14<04:02,  3.89it/s]

  6%|███▉                                                                 | 57/1000 [00:14<04:30,  3.48it/s]

  6%|████                                                                 | 58/1000 [00:15<03:58,  3.95it/s]

  6%|████                                                                 | 59/1000 [00:15<03:35,  4.38it/s]

  6%|████▏                                                                | 60/1000 [00:15<03:23,  4.61it/s]

  6%|████▏                                                                | 61/1000 [00:15<03:11,  4.90it/s]

  6%|████▎                                                                | 62/1000 [00:15<03:22,  4.62it/s]

  6%|████▎                                                                | 63/1000 [00:16<03:46,  4.13it/s]

  6%|████▍                                                                | 64/1000 [00:16<04:15,  3.67it/s]

  6%|████▍                                                                | 65/1000 [00:16<03:47,  4.11it/s]

  7%|████▌                                                                | 66/1000 [00:16<03:40,  4.23it/s]

  7%|████▌                                                                | 67/1000 [00:17<03:23,  4.57it/s]

  7%|████▋                                                                | 68/1000 [00:17<03:21,  4.64it/s]

  7%|████▊                                                                | 69/1000 [00:17<03:28,  4.46it/s]

  7%|████▊                                                                | 70/1000 [00:17<03:14,  4.77it/s]

  7%|████▉                                                                | 71/1000 [00:17<03:04,  5.03it/s]

  7%|████▉                                                                | 72/1000 [00:18<02:59,  5.17it/s]

  7%|█████                                                                | 73/1000 [00:18<03:10,  4.86it/s]

  7%|█████                                                                | 74/1000 [00:18<03:02,  5.09it/s]

  8%|█████▏                                                               | 75/1000 [00:18<03:02,  5.06it/s]

  8%|█████▏                                                               | 76/1000 [00:18<02:56,  5.23it/s]

  8%|█████▎                                                               | 77/1000 [00:19<03:11,  4.81it/s]

  8%|█████▍                                                               | 78/1000 [00:19<03:09,  4.86it/s]

  8%|█████▍                                                               | 79/1000 [00:19<03:00,  5.09it/s]

  8%|█████▌                                                               | 80/1000 [00:19<03:14,  4.74it/s]

  8%|█████▌                                                               | 81/1000 [00:20<03:41,  4.16it/s]

  8%|█████▋                                                               | 82/1000 [00:20<03:22,  4.53it/s]

  8%|█████▋                                                               | 83/1000 [00:20<03:12,  4.75it/s]

  8%|█████▊                                                               | 84/1000 [00:20<03:04,  4.97it/s]

  8%|█████▊                                                               | 85/1000 [00:20<02:57,  5.16it/s]

  9%|█████▉                                                               | 86/1000 [00:21<02:59,  5.08it/s]

  9%|██████                                                               | 87/1000 [00:21<03:13,  4.73it/s]

  9%|██████                                                               | 88/1000 [00:21<03:31,  4.31it/s]

  9%|██████▏                                                              | 89/1000 [00:21<03:41,  4.11it/s]

  9%|██████▏                                                              | 90/1000 [00:22<03:29,  4.34it/s]

  9%|██████▎                                                              | 91/1000 [00:22<03:20,  4.53it/s]

  9%|██████▎                                                              | 92/1000 [00:22<04:00,  3.77it/s]

  9%|██████▍                                                              | 93/1000 [00:22<03:35,  4.20it/s]

  9%|██████▍                                                              | 94/1000 [00:23<03:45,  4.02it/s]

 10%|██████▌                                                              | 95/1000 [00:23<03:53,  3.88it/s]

 10%|██████▌                                                              | 96/1000 [00:23<03:50,  3.93it/s]

 10%|██████▋                                                              | 97/1000 [00:23<03:28,  4.34it/s]

 10%|██████▊                                                              | 98/1000 [00:23<03:12,  4.68it/s]

 10%|██████▊                                                              | 99/1000 [00:24<03:01,  4.95it/s]

 10%|██████▊                                                             | 100/1000 [00:24<03:13,  4.65it/s]

 10%|██████▊                                                             | 101/1000 [00:24<03:21,  4.46it/s]

 10%|██████▉                                                             | 102/1000 [00:24<03:24,  4.40it/s]

 10%|███████                                                             | 103/1000 [00:25<03:29,  4.29it/s]

 10%|███████                                                             | 104/1000 [00:25<03:12,  4.66it/s]

 10%|███████▏                                                            | 105/1000 [00:25<03:01,  4.94it/s]

 11%|███████▏                                                            | 106/1000 [00:25<02:53,  5.16it/s]

 11%|███████▎                                                            | 107/1000 [00:25<03:09,  4.71it/s]

 11%|███████▎                                                            | 108/1000 [00:26<03:24,  4.35it/s]

 11%|███████▍                                                            | 109/1000 [00:26<03:10,  4.69it/s]

 11%|███████▍                                                            | 110/1000 [00:26<02:59,  4.96it/s]

 11%|███████▌                                                            | 111/1000 [00:26<02:52,  5.17it/s]

 11%|███████▌                                                            | 112/1000 [00:26<02:46,  5.33it/s]

 11%|███████▋                                                            | 113/1000 [00:26<02:42,  5.46it/s]

 11%|███████▊                                                            | 114/1000 [00:27<02:39,  5.55it/s]

 12%|███████▊                                                            | 115/1000 [00:27<02:37,  5.61it/s]

 12%|███████▉                                                            | 116/1000 [00:27<02:49,  5.21it/s]

 12%|███████▉                                                            | 117/1000 [00:27<03:04,  4.79it/s]

 12%|████████                                                            | 118/1000 [00:27<02:54,  5.06it/s]

 12%|████████                                                            | 119/1000 [00:28<02:47,  5.25it/s]

 12%|████████▏                                                           | 120/1000 [00:28<03:04,  4.76it/s]

 12%|████████▏                                                           | 121/1000 [00:28<02:57,  4.95it/s]

 12%|████████▎                                                           | 122/1000 [00:28<02:49,  5.17it/s]

 12%|████████▎                                                           | 123/1000 [00:28<02:44,  5.33it/s]

 12%|████████▍                                                           | 124/1000 [00:29<02:40,  5.45it/s]

 12%|████████▌                                                           | 125/1000 [00:29<03:03,  4.78it/s]

 13%|████████▌                                                           | 126/1000 [00:29<02:53,  5.04it/s]

 13%|████████▋                                                           | 127/1000 [00:29<02:46,  5.24it/s]

 13%|████████▋                                                           | 128/1000 [00:29<02:41,  5.39it/s]

 13%|████████▊                                                           | 129/1000 [00:30<02:38,  5.49it/s]

 13%|████████▊                                                           | 130/1000 [00:30<03:29,  4.14it/s]

 13%|████████▉                                                           | 131/1000 [00:30<03:12,  4.52it/s]

 13%|████████▉                                                           | 132/1000 [00:30<03:17,  4.40it/s]

 13%|█████████                                                           | 133/1000 [00:31<03:30,  4.12it/s]

 13%|█████████                                                           | 134/1000 [00:31<04:21,  3.32it/s]

 14%|█████████▏                                                          | 135/1000 [00:31<03:48,  3.79it/s]

 14%|█████████▏                                                          | 136/1000 [00:31<03:43,  3.86it/s]

 14%|█████████▎                                                          | 137/1000 [00:32<04:01,  3.57it/s]

 14%|█████████▍                                                          | 138/1000 [00:32<03:33,  4.03it/s]

 14%|█████████▍                                                          | 139/1000 [00:32<03:37,  3.95it/s]

 14%|█████████▌                                                          | 140/1000 [00:33<03:58,  3.61it/s]

 14%|█████████▌                                                          | 141/1000 [00:33<03:49,  3.74it/s]

 14%|█████████▋                                                          | 142/1000 [00:33<03:44,  3.82it/s]

 14%|█████████▋                                                          | 143/1000 [00:33<03:40,  3.89it/s]

 14%|█████████▊                                                          | 144/1000 [00:34<03:57,  3.61it/s]

 14%|█████████▊                                                          | 145/1000 [00:34<03:31,  4.05it/s]

 15%|█████████▉                                                          | 146/1000 [00:34<03:12,  4.43it/s]

 15%|█████████▉                                                          | 147/1000 [00:34<03:17,  4.32it/s]

 15%|██████████                                                          | 148/1000 [00:34<03:21,  4.22it/s]

 15%|██████████▏                                                         | 149/1000 [00:35<03:05,  4.58it/s]

 15%|██████████▏                                                         | 150/1000 [00:35<02:54,  4.86it/s]

 15%|██████████▎                                                         | 151/1000 [00:35<02:47,  5.08it/s]

 15%|██████████▎                                                         | 152/1000 [00:35<02:51,  4.94it/s]

 15%|██████████▍                                                         | 153/1000 [00:35<03:02,  4.64it/s]

 15%|██████████▍                                                         | 154/1000 [00:36<03:02,  4.64it/s]

 16%|██████████▌                                                         | 155/1000 [00:36<02:51,  4.92it/s]

 16%|██████████▌                                                         | 156/1000 [00:36<03:02,  4.63it/s]

 16%|██████████▋                                                         | 157/1000 [00:36<03:09,  4.45it/s]

 16%|██████████▋                                                         | 158/1000 [00:37<02:57,  4.74it/s]

 16%|██████████▊                                                         | 159/1000 [00:37<02:57,  4.74it/s]

 16%|██████████▉                                                         | 160/1000 [00:37<03:08,  4.45it/s]

 16%|██████████▉                                                         | 160/1000 [00:37<03:17,  4.26it/s]




In [None]:
# Average, over all recorded trials, of the per-trial mean distance for each genotype
for genotype_label, trial_means in (
    ("high", mean_high_severity_sampled_mahalanobis_distances_from_trials),
    ("low", mean_low_severity_sampled_mahalanobis_distances_from_trials),
    ("wt", mean_wt_sampled_mahalanobis_distances_from_trials),
):
    print(
        f"The {genotype_label} Mahalonobis distance for {num_trials} trials is: {round(np.mean(trial_means),0)}"
    )

### Show the mahalanobis distance for each genotype

In [None]:
# Mean pairwise distance measured on the real points of each genotype
for genotype_name, mean_distance in (
    ("High-Severity", mean_high_severity_mahalanobis_distance),
    ("Low-Severity", mean_low_severity_mahalanobis_distance),
    ("Wild Type", mean_wt_mahalanobis_distance),
):
    print(f"Mahalanobis distance for {genotype_name}: ", mean_distance)

In [None]:
# Two-sample t-test per genotype: pairwise distances between the real points
# versus the per-trial mean distances of the sampled points
high_severity_ttest = ttest_ind(
    high_severity_mahalanobis_distances,
    mean_high_severity_sampled_mahalanobis_distances_from_trials,
)
low_severity_ttest = ttest_ind(
    low_severity_mahalanobis_distances,
    mean_low_severity_sampled_mahalanobis_distances_from_trials,
)
wt_ttest = ttest_ind(
    wt_mahalanobis_distances,
    mean_wt_sampled_mahalanobis_distances_from_trials,
)

# keep only the p-values
high_severity_p_value = high_severity_ttest.pvalue
low_severity_p_value = low_severity_ttest.pvalue
wt_p_value = wt_ttest.pvalue

for genotype_name, p_value in (
    ("High-Severity", high_severity_p_value),
    ("Low-Severity", low_severity_p_value),
    ("Wild Type", wt_p_value),
):
    print(
        f"The p-value for the difference between the Mahalanobis distance of the sampled points and the actual points for the {genotype_name} genotype is {p_value}"
    )

In [None]:
# One-way ANOVA on the pairwise Mahalanobis distances of the three genotypes
anova_result_across_genotypes = anova(
    high_severity_mahalanobis_distances,
    low_severity_mahalanobis_distances,
    wt_mahalanobis_distances,
)

print(
    f"The p-value for the ANOVA across the genotypes is {anova_result_across_genotypes.pvalue}"
)

# Tukey's HSD needs one flat array of values and a matching array of group labels
distances_by_genotype = {
    "High Severity": high_severity_mahalanobis_distances,
    "Low Severity": low_severity_mahalanobis_distances,
    "Wild Type": wt_mahalanobis_distances,
}
all_distances = np.concatenate(list(distances_by_genotype.values()))
all_genotype_labels = np.repeat(
    list(distances_by_genotype.keys()),
    [len(distances) for distances in distances_by_genotype.values()],
)
tukeys_result_across_genotypes = pairwise_tukeyhsd(all_distances, all_genotype_labels)

# get the tukeys HSD results as a dataframe; summary() is the public accessor for
# the results table (first row is the header, remaining rows are the comparisons)
# NOTE(review): later cells index rows 0-2 of this frame by position; confirm the
# order of the group pairs in the table matches what they assume
tukeys_table = tukeys_result_across_genotypes.summary().data
tukeys_result_across_genotypes_df = pd.DataFrame(
    data=tukeys_table[1:],
    columns=tukeys_table[0],
)
tukeys_result_across_genotypes_df

### Write the Mahalanobis distance stats to a file

In [None]:
# Output directory (created when missing) and the CSV path inside it
mahalanobis_output_dir = pathlib.Path("../results/mean_aggregated_results/").resolve()
mahalanobis_output_dir.mkdir(parents=True, exist_ok=True)
mahalanobis_output_file_path = (
    mahalanobis_output_dir / "mean_aggregated_mahalanobis_distance_results.csv"
).resolve()

# Adjusted p-values taken from rows 0, 1 and 2 of the Tukey table
tukey_p_adj = tukeys_result_across_genotypes_df["p-adj"]
p_adj_row_0 = tukey_p_adj.loc[0]
p_adj_row_1 = tukey_p_adj.loc[1]
p_adj_row_2 = tukey_p_adj.loc[2]

# Per-genotype columns, each ordered High-Severity, Low-Severity, Wild Type
actual_means = [
    mean_high_severity_mahalanobis_distance,
    mean_low_severity_mahalanobis_distance,
    mean_wt_mahalanobis_distance,
]
sampled_means = [
    np.mean(trial_means)
    for trial_means in (
        mean_high_severity_sampled_mahalanobis_distances_from_trials,
        mean_low_severity_sampled_mahalanobis_distances_from_trials,
        mean_wt_sampled_mahalanobis_distances_from_trials,
    )
]
actual_vs_sampled_p_values = [high_severity_p_value, low_severity_p_value, wt_p_value]

# compile the results into a df; "NA" marks a genotype compared with itself
mahalanobis_results_df = pd.DataFrame(
    {
        "Genotype": ["High-Severity", "Low-Severity", "Wild Type"],
        "Actual Mean Mahalanobis Distance": actual_means,
        "Sampled Mean Mahalanobis Distance": sampled_means,
        "p-Value for Actual compared to sampled": actual_vs_sampled_p_values,
        "ANOVA Compared to High-Severity p-adj": ["NA", p_adj_row_0, p_adj_row_1],
        "ANOVA Compared to Low-Severity p-adj": [p_adj_row_0, "NA", p_adj_row_2],
        "ANOVA Compared to Wild Type p-adj": [p_adj_row_1, p_adj_row_2, "NA"],
    }
)
mahalanobis_results_df

In [None]:
# Save the compiled results table as CSV, without the row index
mahalanobis_results_df.to_csv(path_or_buf=mahalanobis_output_file_path, index=False)

#### Visualization of the last trial's sampled points

In [None]:
# Tag the sampled points from the last trial with their genotype label
for sampled_frame, genotype_label in (
    (high_severity_sampled_points, "High-Severity"),
    (low_severity_sampled_points, "Mid-Severity"),
    (wt_sampled_points, "Wild Type"),
):
    sampled_frame["Metadata_genotype"] = genotype_label

# Stack the three genotypes into one frame for plotting
sampled_points = pd.concat(
    [wt_sampled_points, low_severity_sampled_points, high_severity_sampled_points]
)

# Side-by-side scatter plots of PC1 vs PC2: real data (left), sampled data (right)
fig, (real_ax, sampled_ax) = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

sns.scatterplot(
    data=mean_aggregated_data_pca,
    x="PC1",
    y="PC2",
    hue="Metadata_genotype",
    alpha=0.5,
    ax=real_ax,
)
real_ax.set_title("Real PCA")

sns.scatterplot(
    data=sampled_points,
    x="PC1",
    y="PC2",
    hue="Metadata_genotype",
    alpha=0.5,
    ax=sampled_ax,
)
sampled_ax.set_title("Sampled PCA")
plt.show()