In [None]:
import sys
import os
from icecream import ic

from pathlib import Path

import utils_behavior

from utils_behavior import Ballpushing_utils
from utils_behavior import Utils
from utils_behavior import Processing
from utils_behavior import HoloviewsTemplates

import pandas as pd
import hvplot.pandas
import numpy as np

from scipy import stats
from statsmodels.stats.multitest import multipletests

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns


import importlib

import holoviews as hv

hv.extension("bokeh")

# Get the path to either save or load experiments

In [None]:
savepath = Utils.get_labserver() / "Experimental_data/MultiMazeRecorder/Datasets/240306_TNT_Fine_Experiments.pkl"


# Get the list of experiments

Use the cells below to run the code from the beginning. Useful if something looks wrong in the already pre-saved dataset.

In [None]:
# Root directory that holds all recorded experiments
Datapath = Utils.get_data_path()

# Collect the names of the tracked "TNT_Fine" experiment directories
Folders = []
for entry in os.listdir(Datapath):
    is_tnt_fine_tracked = "TNT_Fine" in entry and "Tracked" in entry
    if is_tnt_fine_tracked and os.path.isdir(Datapath / entry):
        Folders.append(entry)

Folders

In [None]:
# Generate Experiment objects from each folder

Experiments = [Ballpushing_utils.Experiment(Datapath / f) for f in Folders]

In [None]:
Ballpushing_utils.save_object(Experiments, savepath.as_posix())

# Load pre-saved data from lab server

To quickly reload already built Experiments, use cells below.

In [None]:
# Reload the list of Experiment objects previously written by `save_object`.
# Uses the directly imported `Ballpushing_utils` module, like the other save/load cells.
# NOTE(review): the file is a .pkl — if `load_object` unpickles it, only load files you trust.
Experiments = Ballpushing_utils.load_object(savepath.as_posix())

# Make the dataset from experiments

In [None]:
data = Ballpushing_utils.Dataset(Experiments)

In [None]:
print(data)

In [None]:
# Build the per-fly summary table; the filter below reads it from `data.data`
data.generate_dataset("summary")

# Discard the flies whose genotype is "M6", "M7", "PR" or "CS"
is_excluded = data.data["Genotype"].isin(["M6", "M7", "PR", "CS"])
data.data = data.data[~is_excluded]

In [None]:
mydata = data.data

# Plotting methods

## Set save folder

This also needs to be updated if you want to generate a new set of plots.

In [None]:
savepath = Utils.get_labserver() / "Experimental_data/MultiMazeRecorder/Plots/240418_summaries"

In [None]:
# Summary metrics to plot, one jitter boxplot per metric.
# NOTE: the same list is re-declared in later cells; keep them in sync when editing.
metrics = [
    "NumberEvents",
    "FinalEvent",
    "FinalTime",
    "SignificantEvents",
    "SignificantFirst",
    "SignificantFirstTime",
    "Pushes",
    "Pulls",
    "PullingRatio",
    "InteractionProportion",
    "AhaMoment",
    "AhaMomentIndex",
    "InsightEffect",
    "TimeToFinish",
    "SignificantRatio",
]

# Loop over the metrics
for metric in metrics:
    # Generate the jitter boxplot for the current metric
    # NOTE(review): `savepath` defined in the previous cell is not passed here;
    # the output location is decided inside `jitter_boxplot` — confirm where files land.
    HoloviewsTemplates.jitter_boxplot(
        data.data,
        metric,
        show=True,
        save=True,
        metadata=data.metadata,
        bs_controls=True,
        sort_by="median",
        hline_method="boxplot",
        readme=None,
    )

In some cases, one might only want to save one metric; here's how:

In [None]:
# Resave one of the metrics
HoloviewsTemplates.jitter_boxplot(
    data.data,
    "PullingRatio",
    show=False,
    save=True,
    metadata=data.metadata,
    bs_controls=True,
    sort_by="median",
    hline_method="boxplot",
)

# Statistics

In this chapter we try some statistics. The data has too many outliers to allow for parametric tests, so we'll use non-parametric tests. We'll also do a PCA to see if any pattern is immediately obvious.

## Non-parametric tests

In [None]:
savepath = (
    Utils.get_labserver() / "Experimental_data/MultiMazeRecorder/Datasets/Stats_TNT_Fine"
)

In [None]:
# Summary metrics tested against the "TNTxM7" control, one metric at a time
metrics = [
    "NumberEvents",
    "FinalEvent",
    "FinalTime",
    "SignificantEvents",
    "SignificantFirst",
    "SignificantFirstTime",
    "Pushes",
    "Pulls",
    "PullingRatio",
    "InteractionProportion",
    "AhaMoment",
    "AhaMomentIndex",
    "InsightEffect",
    "TimeToFinish",
    "SignificantRatio",
]

# One table of significant comparisons per metric (only metrics with at least one hit)
Significant_results = []

for metric in metrics:

    # Ensure the data only contains numeric values
    if not pd.api.types.is_numeric_dtype(data.data[metric]):
        print(f"non-numeric metric: {metric}")
        data.data[metric] = pd.to_numeric(data.data[metric], errors="coerce")

    data_clean = data.data.dropna(subset=[metric])

    # Omnibus Kruskal-Wallis H-test across all labels.
    # Its result is computed for reference only; it does not gate the pairwise tests below.
    groups = [group[metric].values for name, group in data_clean.groupby("label")]
    H, p_kruskal = stats.kruskal(*groups)

    # Pairwise two-sided Mann-Whitney U tests: every non-control label vs. the control genotype
    p_values = []
    labels = []
    control_group = data_clean[data_clean["Genotype"] == "TNTxM7"][metric].values

    for name, group in data_clean[data_clean["Genotype"] != "TNTxM7"].groupby("label"):
        test_group = group[metric].values
        _, p = stats.mannwhitneyu(control_group, test_group, alternative="two-sided")
        p_values.append(p)
        labels.append(name)

    # Apply Bonferroni correction
    reject, p_values_corrected, _, _ = multipletests(p_values, method="bonferroni")

    # Create a DataFrame with the results
    results = pd.DataFrame(
        {
            "label": labels,
            "p_value": p_values,
            "p_value_corrected": p_values_corrected,
            "reject": reject,
        }
    )

    # Take an explicit copy: the "metric" column added below must be written to an
    # independent frame, not to a filtered view of `results` (SettingWithCopyWarning)
    significant_results = results[results["reject"]].copy()

    # If significant_result has values, print a message and save
    if significant_results.shape[0] > 0:
        print(f"Significant results for {metric}")
        print(significant_results)
        significant_results["metric"] = metric
        Significant_results.append(significant_results)
    else:
        print(f"No significant results for {metric}")
    # Save the table as csv with the metric name (written even when it is empty)
    significant_results.to_csv(savepath / f"{metric}_stats_TNTPR.csv", index=False)

# Concatenate all significant results and save to a single CSV
if Significant_results:
    Significant_results_df = pd.concat(Significant_results)
    Significant_results_df.to_csv(savepath / "all_significant_results_TNTPR.csv", index=False)

# PCA on the data

Here I'll try to do PCA on the data to see if I can get something interesting by reducing the dimensionality of the data, including all the summary metrics.

In [None]:
# Load the brain region table

brain_regions_path = Utils.get_labserver() / "Experimental_data/Region_map_240312.csv"

registry = pd.read_csv(brain_regions_path)

registry

In [None]:
# Attach the registry annotations to every fly by matching rows on the shared "Genotype" column.
# Left join: flies whose genotype is missing from the registry are kept, with NaN annotations.
mydata = pd.merge(data.data, registry, how="left", on="Genotype")

mydata.head()

In [None]:
# Rename the "Nickname_y" column (the "_y" suffix marks the right-hand frame of the merge,
# i.e. the registry) to "Nickname"
mydata = mydata.rename(columns={"Nickname_y": "Nickname"})

### Subsetting the data

Some genotypes are removed from the analysis because PCA is sensitive to large variations. In particular, some, like G74 and G75, are obviously unhealthy flies judging from the videos, whereas PR and CS flies, which weren't crossed with TNT, are less sensitive to starvation and show much lower activity levels.

In [None]:
# Subset the data to remove some of the genotypes: the M6, M7 and PR genotypes,
# plus the TNTxG74, TNTxG75 and TNTxZ1633 crosses.

subset = mydata[~mydata["Genotype"].isin(["M6", "M7", "PR", "TNTxG74", "TNTxG75", "TNTxZ1633"])]

Some metrics are not relevant for the analysis; also, some, like Genotype, are more labels than metrics. Here we only keep the metrics that can explain variability.

In [None]:
# Subset the data to only include the label and metrics of interest.
# The first 13 columns are the numeric metrics fed to the PCA ("Pushes" and "Pulls" from the
# plotting list are left out here); the last 6 are identifying columns dropped before scaling.
subset = subset[
    [
        "NumberEvents",
        "FinalEvent",
        "FinalTime",
        "SignificantEvents",
        "SignificantFirst",
        "SignificantFirstTime",
        "PullingRatio",
        "InteractionProportion",
        "AhaMoment",
        "AhaMomentIndex",
        "InsightEffect",
        "TimeToFinish",
        "SignificantRatio",
        "label",
        "Brain region",
        "fly",
        "Genotype",
        "Nickname",
        "Simplified Nickname",
    ]
]

In [None]:
# Keep only the numeric metrics: drop the columns that identify a fly
features = subset.drop(
    columns=["label", "Brain region", "fly", "Genotype", "Nickname", "Simplified Nickname"]
)

# Despite its name, `nan_indices` holds the index labels of the rows WITHOUT any NaN.
# Later cells use it to pull the matching metadata rows out of `subset`.
nan_indices = features.dropna().index
features = features.loc[nan_indices].reset_index(drop=True)

# Standardise the features; the fitted `scaler` is reused later on another dataset
scaler = StandardScaler()
features_normalized = scaler.fit_transform(features)


In [None]:

# Fit a 2-component PCA on the standardised features
pca = PCA(n_components=2)  # Adjust n_components as needed
principalComponents = pca.fit_transform(features_normalized)

# One row per fly, one column per principal component
PCA_components = pd.DataFrame(principalComponents, columns=["PC1", "PC2"])

# Re-attach the metadata of the rows that were kept after NaN removal
for column in ["label", "Brain region", "fly", "Genotype", "Nickname", "Simplified Nickname"]:
    PCA_components[column] = subset.loc[nan_indices, column].values

> Here we only kept 2 components to get a 2D representation. We can also compute a version with 3 (or more) components like this:

In [None]:
# Same analysis with three components
pca3 = PCA(n_components=3)
principalComponents3 = pca3.fit_transform(features_normalized)

# One row per fly, one column per principal component
PCA_components3 = pd.DataFrame(principalComponents3, columns=["PC1", "PC2", "PC3"])

# Re-attach the metadata of the rows that were kept after NaN removal
for column in ["label", "Brain region", "fly", "Genotype", "Nickname", "Simplified Nickname"]:
    PCA_components3[column] = subset.loc[nan_indices, column].values

In [None]:
# Print the composition of the principal components
PCs_compo3 = pd.DataFrame(pca3.components_, columns=features.columns, index=["PC1", "PC2", "PC3"])

# Print the explained variance ratio
print(f"Explained variance of PC1, PC2 and PC3 : {pca3.explained_variance_ratio_}")

PCs_compo3

Out of curiosity, let's check how much variance each of the 10 first PCs can explain

In [None]:
# Same analysis with ten components
pca10 = PCA(n_components=10)
principalComponents10 = pca10.fit_transform(features_normalized)

# One row per fly, columns named PC1 ... PC10
pc10_names = [f"PC{i}" for i in range(1, 11)]
PCA_components10 = pd.DataFrame(principalComponents10, columns=pc10_names)

# Re-attach the metadata of the rows that were kept after NaN removal
for column in ["label", "Brain region", "fly", "Genotype", "Nickname", "Simplified Nickname"]:
    PCA_components10[column] = subset.loc[nan_indices, column].values

# Share of the total variance carried by each of the ten components
print(f"Explained variance of the first 10 PCs : {pca10.explained_variance_ratio_}")

In [None]:
# Loadings of every original metric on each of the 10 principal components
PCs_compo10 = pd.DataFrame(pca10.components_, columns=features.columns, index=[f"PC{i}" for i in range(1, 11)])


def _loading_color(value):
    """Return the CSS colour of one loading: green below -0.3, red above 0.3, none otherwise."""
    if value < -0.3:
        return "color: green"
    if value > 0.3:
        return "color: red"
    return ""


# `Styler.applymap` is deprecated in recent pandas in favour of `Styler.map`;
# call whichever one the installed pandas provides.
_styler = PCs_compo10.style
(_styler.map if hasattr(_styler, "map") else _styler.applymap)(_loading_color)

# PCA summaries

In [None]:
# Print the composition of the principal components
PCs_compo = pd.DataFrame(pca.components_, columns=features.columns, index=["PC1", "PC2"])

# Print the explained variance ratio
print(f"Explained variance of PC1 and PC2 : {pca.explained_variance_ratio_}")

PCs_compo

In [None]:
# Save the composition of the principal components and the explained variance ratio
PCs_compo.to_csv(savepath/"PCs_composition.csv")
pd.DataFrame(pca.explained_variance_ratio_, index=["PC1", "PC2"], columns=["Explained variance"]).to_csv(savepath/"PCA_Explained_variance.csv")

In [None]:
# get all data unique Genotype values

unique_genotypes = subset["Genotype"].unique()

# Check if there is one called "TNTxZ2018"

"TNTxZ2018" in unique_genotypes

In [None]:
# Find the label associated with Genotype "TNTxZ2018"
TNTxZ2018_label = mydata[mydata["Genotype"] == "TNTxZ2018"]["label"].values[0]

TNTxZ2018_label

In [None]:
import holoviews as hv

# Separate the "TNTxZ2018" data from the rest of the data
TNTxZ2018_data = PCA_components[PCA_components["label"] == TNTxZ2018_label]
other_data = PCA_components[PCA_components["label"] != TNTxZ2018_label]

# The "TNTxZ2018" overlay is the same in every panel, so build it once
plot2 = TNTxZ2018_data.hvplot.scatter(
    x="PC1",
    y="PC2",
    by="label",
    hover_cols=["fly"],
    color="black",
    marker="x",
    size=100,
)

# Generate one plot per Brain region.
# Regions are taken from `other_data` and NaN is dropped, so no panel is built from an
# empty frame (missing region, or a region that only contains the "TNTxZ2018" flies).
region_plots = []
for brain_region in other_data["Brain region"].dropna().unique():
    df_brain_region = other_data[other_data["Brain region"] == brain_region]

    # Scatter of the non-"TNTxZ2018" flies of this region, coloured by label
    plot1 = df_brain_region.hvplot.scatter(
        x="PC1", y="PC2", by="label", hover_cols=["fly"], cmap="nipy_spectral"
    )

    # Combine the plots
    final_plot = (plot1 * plot2).opts(width=1000, height=750)

    region_plots.append(final_plot.relabel(f"PCA - Brain Region: {brain_region}"))

# Assemble all panels into a single Layout
plots = hv.Layout(region_plots)

# Save the Layout
hvplot.save(plots.cols(1), savepath / "240306_PCA_plots.html")
# Display the Layout
#hvplot.show(plots.cols(1))

# Plotting the PCs separately

In [None]:
PCA_components

In [None]:
# Make a new column called "Efficiency" that is the inverted values of PC1 (typically, -2 will be 2)

PCA_components["Efficiency"] = -PCA_components["PC1"]

In [None]:
importlib.reload(HoloviewsTemplates)

In [None]:
# Plot "Efficiency" (the sign-inverted PC1 created above) as a jitter boxplot, grouped by label
HoloviewsTemplates.jitter_boxplot(
    PCA_components,
    "Efficiency",
    folder="240426_TNT_New",
    kdims="label",
    plot_options=HoloviewsTemplates.hv_slides,
    show=True,
    save=True,
    metadata=[],
    bs_controls=True,
    sort_by="median",
    hline_method="boxplot",
    readme=None,
)

In [None]:
hv.help(hv.BoxWhisker)

In [None]:
# Same with PC2
HoloviewsTemplates.jitter_boxplot(
    PCA_components,
    "PC2",
    show=True,
    save=True,
    metadata=[],
    bs_controls=True,
    sort_by="median",
    hline_method="boxplot",
    readme=None,
)

In [None]:
# TODO : Find flies with particular pulling

# Statistical analysis

In [None]:
import pandas as pd
from scipy import stats
from statsmodels.stats.multitest import multipletests

# Omnibus Kruskal-Wallis H-test on PC1 across genotypes
groups = [pc1_group["PC1"].values for _, pc1_group in PCA_components.groupby("Genotype")]
H, p_kruskal = stats.kruskal(*groups)

# Pairwise comparisons: each non-control label against the "TNTxM6" control
is_tntxm6 = PCA_components["Genotype"] == "TNTxM6"
control_group = PCA_components.loc[is_tntxm6, "PC1"].values

p_values, labels, genotypes, brain_regions = [], [], [], []
for name, group in PCA_components[~is_tntxm6].groupby("label"):
    # Two-sided Mann-Whitney U test of this label's PC1 values against the control
    _, p = stats.mannwhitneyu(control_group, group["PC1"].values, alternative="two-sided")
    p_values.append(p)
    labels.append(name)
    # The first row stands for the whole group:
    # assumes each label maps to a single genotype and a single brain region
    first_row = group.iloc[0]
    genotypes.append(first_row["Genotype"])
    brain_regions.append(first_row["Brain region"])

# Bonferroni correction over all pairwise tests
reject, p_values_corrected, _, _ = multipletests(p_values, method="bonferroni")

# Gather everything in one table, one row per tested label
results = pd.DataFrame(
    dict(
        [
            ("label", labels),
            ("Genotype", genotypes),
            ("Brain region", brain_regions),
            ("p_value", p_values),
            ("p_value_corrected", p_values_corrected),
            ("reject", reject),
        ]
    )
)

In [None]:
results

In [None]:
results[results["reject"] == True]

In [None]:
results[results["p_value"] < 0.05]

In [None]:
results.to_csv(savepath / "PC1_p_values_TNTEmptySplit.csv", index=False)

In [None]:
# Keep only the comparisons flagged by the Bonferroni-corrected test (`reject` is True)
significant_results = results[results["reject"]]

significant_results

In [None]:
# PC1 values per label (not used by the bootstrap below)
groups = [pc1_group["PC1"].values for _, pc1_group in PCA_components.groupby("label")]

# Genotypes pooled together to form the control distribution
control_genotypes = ["TNTxZ2035", "TNTxZ2018", "TNTxM7"]  # Replace with your list of genotypes
is_pooled_control = PCA_components["Genotype"].isin(control_genotypes)
control_group = PCA_components.loc[is_pooled_control, "PC1"].values

# Bootstrapped confidence interval of the pooled controls
# (presumably a (lower, upper) pair — it is indexed with [0] and [1] below)
ctrl_bci = Processing.draw_bs_ci(control_group)

Effect_Sizes = []
for name, group in PCA_components[~is_pooled_control].groupby("label"):
    bci = Processing.draw_bs_ci(group["PC1"].values)
    Effect_Sizes.append(
        {
            # Region of the first row, taken as the region of the whole label
            "Brain region": group["Brain region"].iloc[0],
            "label": name,
            # Difference between the two intervals, bound by bound
            "effect_size": (bci[0] - ctrl_bci[1], bci[1] - ctrl_bci[0]),
            "bs_ci": bci,
        }
    )

results = pd.DataFrame(Effect_Sizes)

# Keep the labels whose two effect-size bounds share the same sign (both positive or both negative)
significant_results = results[results["effect_size"].apply(lambda bounds: bounds[0] * bounds[1] > 0)]

In [None]:
significant_results

In [None]:
ctrl = pd.DataFrame([{"Brain region": "Control", "label": "Control", "effect_size": None, "bs_ci": ctrl_bci}])

In [None]:
# Control row first, then the significant labels. `ignore_index=True` gives every row a
# unique label (the control row would otherwise share index 0 with a row of the results).
BsCi_dataset = pd.concat([ctrl, significant_results], ignore_index=True)

In [None]:
BsCi_dataset

In [None]:
# Split each bootstrap interval into its two bounds
BsCi_dataset[["bs_ci_lower", "bs_ci_upper"]] = pd.DataFrame(
    BsCi_dataset["bs_ci"].tolist(), index=BsCi_dataset.index
)

# Absolute values of the bounds (extra columns of the table; not used by the plot below)
BsCi_dataset["lower"] = abs(BsCi_dataset["bs_ci_lower"])
BsCi_dataset["upper"] = abs(BsCi_dataset["bs_ci_upper"])

# Compute the middle point of each interval
BsCi_dataset['middle'] = (BsCi_dataset['bs_ci_lower'] + BsCi_dataset['bs_ci_upper']) / 2

# Compute the lengths of the error bars
BsCi_dataset["lower_length"] = BsCi_dataset["middle"] - BsCi_dataset["bs_ci_lower"]
BsCi_dataset["upper_length"] = BsCi_dataset["bs_ci_upper"] - BsCi_dataset["middle"]

# Sort the DataFrame by 'Brain region' and 'middle'
BsCi_dataset_sorted = BsCi_dataset.sort_values(["Brain region", "middle"])

# Shade the control confidence interval as a horizontal span.
# `.iloc[0]` selects the control row by position, independently of the row labels.
Ctrl = BsCi_dataset_sorted[BsCi_dataset_sorted["label"] == "Control"]
hv_hline = hv.HSpan(Ctrl["bs_ci_lower"].iloc[0], Ctrl["bs_ci_upper"].iloc[0]).opts(
    fill_alpha=0.2, color="red"
)

# Create the plot: interval centre plus lower / upper error lengths, coloured by brain region
plot = hv.ErrorBars(
    BsCi_dataset_sorted,
    kdims=["label"],
    vdims=["middle", "lower_length", "upper_length", "Brain region"],
).opts(invert_axes=True, color='Brain region', line_width =2)

plot = plot*hv_hline


# Display the plot
hv.extension("bokeh")
plot.opts(width=600, height=1000, xrotation=90)

In [None]:
hv.save(plot, savepath/"PC1_bs_ci_TNT__PooledControls.html")

In [None]:
results = pd.DataFrame(Effect_Sizes)

In [None]:
BsCi_dataset.to_csv(savepath / "PC1_bs_ci_TNTPR.csv", index=False)

In [None]:
BsCi_dataset["Brain region"] ="NA"

In [None]:
importlib.reload(HoloviewsTemplates)

In [None]:
# Split the 'bs_ci' column into two columns 'bs_ci_lower' and 'bs_ci_upper'
BsCi_dataset[["bs_ci_lower", "bs_ci_upper"]] = pd.DataFrame(
    BsCi_dataset["bs_ci"].tolist(), index=BsCi_dataset.index
)

# ErrorBars takes a centre value followed by the negative / positive error lengths.
# `BsCi_dataset` has no "PC1" column, so derive the centre and the lengths from the bounds.
BsCi_dataset["middle"] = (BsCi_dataset["bs_ci_lower"] + BsCi_dataset["bs_ci_upper"]) / 2
BsCi_dataset["lower_length"] = BsCi_dataset["middle"] - BsCi_dataset["bs_ci_lower"]
BsCi_dataset["upper_length"] = BsCi_dataset["bs_ci_upper"] - BsCi_dataset["middle"]

# Create the plot
plot = hv.ErrorBars(
    BsCi_dataset, kdims=["label"], vdims=["middle", "lower_length", "upper_length"]
).opts()

# Display the plot
# hv.extension("bokeh")
plot.opts(width=600, height=1000, xrotation=90, invert_axes = True)

In [None]:
BsCi_dataset['bs_ci_lower']

## Applying the PCA to another dataset

In [None]:
# Get the data path
Datapath = Utils.get_data_path()

# Get all directories whose name contains "feedingstate", "tracked" and "pm" (case-insensitive).
# The match is done on the directory NAME only, so a parent directory of the data path can
# never satisfy a filter (the "pm" token is short); plain files are ignored.
# Sorting makes the order, and therefore `Experiments[0]`, reproducible.
Folders = []
for folder in sorted(Datapath.iterdir()):
    minfolder = folder.name.lower()
    if (
        folder.is_dir()
        and "feedingstate" in minfolder
        and "tracked" in minfolder
        and "pm" in minfolder
    ):
        Folders.append(folder)


Folders

In [None]:
importlib.reload(Ballpushing_utils)

In [None]:
directory = Path("/mnt/labserver/DURRIEU_Matthias/Experimental_data/MultiMazeRecorder/Videos/230704_FeedingState_1_PM_Videos_Tracked")

In [None]:
# Every path below `directory` that directly contains at least one .mp4 file
mp4_directories = [
    candidate for candidate in directory.rglob("*") if any(candidate.glob("*.mp4"))
]

# For each of those directories keep the video named after the directory itself,
# or, failing that, "<parent name>_corridor_<last character of the directory name>.mp4".
# The walrus assignments bind `mp4_file` to whichever candidate was tested last.
mp4_files = [
    mp4_file
    for video_dir in mp4_directories
    if (
        (mp4_file := video_dir / f"{video_dir.name}.mp4").exists()
        or (
            mp4_file := video_dir / f"{video_dir.parent.name}_corridor_{video_dir.name[-1]}.mp4"
        ).exists()
    )
]

mp4_files

In [None]:
Experiments = [Ballpushing_utils.Experiment(Datapath / f) for f in Folders]

In [None]:
Experiments

In [None]:
Exp1 = Experiments[0]

In [None]:
TestExp = Ballpushing_utils.Experiment(Path("/mnt/labserver/DURRIEU_Matthias/Experimental_data/MultiMazeRecorder/Videos/230704_FeedingState_1_PM_Videos_Tracked"))

In [None]:
TestFly = Ballpushing_utils.Fly(Path("/mnt/labserver/DURRIEU_Matthias/Experimental_data/MultiMazeRecorder/Videos/230704_FeedingState_1_PM_Videos_Tracked/arena4/corridor4/"))

In [None]:
Exp1.flies

In [None]:
savepath = Utils.get_labserver() / "Experimental_data/MultiMazeRecorder/Datasets/240422_FeedingState_Experiments.pkl"


In [None]:
Ballpushing_utils.save_object(Experiments, savepath.as_posix())

In [None]:
Experiments = Ballpushing_utils.load_object(savepath.as_posix())

In [None]:
FeedingStateData = Ballpushing_utils.Dataset(Experiments)

In [None]:
print(FeedingStateData)

In [None]:
FeedingStateData

In [None]:
FeedingStateData.generate_dataset("summary")

In [None]:
fs_data = FeedingStateData.data

In [None]:
# Get how many rows of Light values I have grouped by unique Light values

FeedingStateData.data.groupby("Light").size()

# Plotting the metrics

In [None]:
importlib.reload(HoloviewsTemplates)

In [None]:
FeedingStateData.data["Condition"] = (
    FeedingStateData.data["Light"].astype(str)
    + "_"
    + FeedingStateData.data["FeedingState"].astype(str)
)

In [None]:
HoloviewsTemplates.jitter_boxplot(
    FeedingStateData.data,
    vdim="NumberEvents",
    kdims="Condition",
    show=True,
    save=True,
    metadata=FeedingStateData.metadata,
    bs_controls=True,
    sort_by="median",
    hline_method="boxplot",
    readme=None,
)

In [None]:
importlib.reload(HoloviewsTemplates)

In [None]:
metrics = [
    "NumberEvents",
    "FinalEvent",
    "FinalTime",
    "SignificantEvents",
    "SignificantFirst",
    "SignificantFirstTime",
    "Pushes",
    "Pulls",
    "PullingRatio",
    "InteractionProportion",
    "AhaMoment",
    "AhaMomentIndex",
    "InsightEffect",
    "TimeToFinish",
    "SignificantRatio",
]

# Loop over the metrics
for metric in metrics:
    # Generate the jitter boxplot for the current metric
    HoloviewsTemplates.jitter_boxplot(
        FeedingStateData.data,
        vdim=metric,
        folder = "240422_FeedingState_Light_summaries/byFeedingState",
        kdims="FeedingState",
        show=True,
        save=True,
        metadata=FeedingStateData.metadata,
        bs_controls=True,
        sort_by="median",
        hline_method="boxplot",
        readme=None,
    )

# Some statistics

In [None]:
# Generate Bootstrapped confidence intervals of InsightEffect grouped by Light

grouped = FeedingStateData.data.groupby("Light")


In [None]:
# Flies with a defined InsightEffect, split by light condition
with_insight = FeedingStateData.data.dropna(subset=["InsightEffect"])
Lighton = with_insight[with_insight["Light"] == "on"]
LightOff = with_insight[with_insight["Light"] == "off"]

In [None]:
on_bc = Processing.draw_bs_ci(Lighton['InsightEffect'])

on_bc

In [None]:
off_bc = Processing.draw_bs_ci(LightOff["InsightEffect"])

off_bc

In [None]:
EffectSize = (on_bc[0]/off_bc[1],on_bc[1] / off_bc[0])

EffectSize

In [None]:

# Bootstrapped confidence interval of InsightEffect for each Light condition.
# The loop no longer rebinds the global `data` (the Dataset object built earlier in the
# notebook), and NaNs are dropped as in the Lighton / LightOff cells above.
results = {}
for name, group in grouped:
    insight_values = group["InsightEffect"].dropna().values
    results[name] = Processing.draw_bs_ci(insight_values)

results

In [None]:
# Subset the data to only include the label and metrics of interest
data_fs = FeedingStateData.data
subset_fs = data_fs[
    [
        "NumberEvents",
        "FinalEvent",
        "FinalTime",
        "SignificantEvents",
        "SignificantFirst",
        "SignificantFirstTime",
        "PullingRatio",
        "InteractionProportion",
        "AhaMoment",
        "AhaMomentIndex",
        "InsightEffect",
        "TimeToFinish",
        "SignificantRatio",
        "label",
        "fly",
        "Light",
        "FeedingState",
        "Period",
        "Orientation",
    ]
]

In [None]:
# Duplicate "label" and "fly" under new names. `assign` returns a new frame, so nothing is
# written into a slice of `data_fs` (no SettingWithCopyWarning) and the cell can be re-run.
subset_fs = subset_fs.assign(
    new_label=subset_fs["label"],
    new_fly=subset_fs["fly"],
)

In [None]:
# Keep only the numeric metrics of the FeedingState dataset
new_features = subset_fs.drop(
    columns=["label", "fly", "new_label", "new_fly", "Light", "FeedingState", "Period", "Orientation"]
)

# Index labels of the rows WITHOUT any NaN.
# NOTE: this overwrites the `nan_indices` computed for the TNT dataset earlier in the notebook.
nan_indices = new_features.dropna().index
new_features = new_features.loc[nan_indices].reset_index(drop=True)

In [None]:
# Project the new dataset into the PCA space fitted on the original dataset.
# `new_features` must be preprocessed the same way as the original features; the already
# fitted `scaler` and `pca` are only applied here (transform), never re-fitted.
new_features_normalized = scaler.transform(new_features)
new_principalComponents = pca.transform(new_features_normalized)

# One row per fly, one column per principal component
new_PCA_components = pd.DataFrame(new_principalComponents, columns=["PC1", "PC2"])

# Re-attach the metadata of the rows that were kept after NaN removal
for column in ["new_label", "new_fly", "Light", "FeedingState", "Period"]:
    new_PCA_components[column] = subset_fs.loc[nan_indices, column].values

In [None]:
new_PCA_components["Orientation"] = subset_fs.loc[nan_indices, "Orientation"].values

# Plotting

In [None]:
new_PCA_components

In [None]:
new_PCA_components["Brain region"] = "None"

new_PCA_components

In [None]:
new_PCA_components["Condition"] = (
    new_PCA_components["Light"].astype(str)
    + "_"
    + new_PCA_components["FeedingState"].astype(str)
)

In [None]:
# Change the "new_fly" column name to "fly"
new_PCA_components = new_PCA_components.rename(columns={"new_fly": "fly"})

In [None]:
HoloviewsTemplates.jitter_boxplot(
    new_PCA_components,
    folder="240422_FeedingState_Light_summaries",
    vdim="PC1",
    kdims="Condition",
    show=True,
    save=True,
    metadata=[],
    bs_controls=True,
    sort_by="median",
    hline_method="boxplot",
    readme=None,
)

In [None]:
# Scatter of the projected FeedingState flies in the PC1 / PC2 plane, coloured by FeedingState
PCplot = new_PCA_components.hvplot.scatter(
    x="PC1", y="PC2", by="FeedingState", hover_cols=["fly"], cmap="nipy_spectral"
).opts(width=1000, height = 750)

PCplot

In [None]:
output_path = (
    Utils.get_labserver()
    / "Experimental_data"
    / "MultiMazeRecorder"
    / "Plots"
    / "240422_FeedingState_Light_summaries"
    / "byFeedingState"
    / "PCA_Fullplot.html"
)

hv.save(PCplot, output_path)

# Ball types experiments

In [None]:
# Get the data path
Datapath = Utils.get_data_path()

# Get all directories whose name contains "balltype" and "tracked" (case-insensitive).
# The match is done on the directory NAME only, not on the full path, and plain files are
# ignored; sorting makes the experiment order reproducible.
Folders = []
for folder in sorted(Datapath.iterdir()):
    minfolder = folder.name.lower()
    if folder.is_dir() and "balltype" in minfolder and "tracked" in minfolder:
        Folders.append(folder)


print(Folders)

In [None]:
BallExps = [Ballpushing_utils.Experiment(Datapath/f) for f in Folders]

In [None]:
BallData = Ballpushing_utils.Dataset(BallExps)
BallData.generate_dataset("summary")

In [None]:
importlib.reload(HoloviewsTemplates)

In [None]:
metrics = [
    "NumberEvents",
    "FinalEvent",
    "FinalTime",
    "SignificantEvents",
    "SignificantFirst",
    "SignificantFirstTime",
    "Pushes",
    "Pulls",
    "PullingRatio",
    "InteractionProportion",
    "AhaMoment",
    "AhaMomentIndex",
    "InsightEffect",
    "TimeToFinish",
    "SignificantRatio",
]

# Loop over the metrics
for metric in metrics:
    # Generate the jitter boxplot for the current metric
    HoloviewsTemplates.jitter_boxplot(
        BallData.data,
        vdim=metric,
        kdims="BallType",
        show=True,
        save=True,
        metadata=BallData.metadata,
        bs_controls=True,
        sort_by="median",
        hline_method="boxplot",
        readme=None,
    )

In [None]:
HoloviewsTemplates.jitter_boxplot(
    BallData.data,
    vdim="InsightEffect",
    kdims="BallType",
    show=True,
    save=True,
    metadata=BallData.metadata,
    bs_controls=True,
    sort_by="median",
    hline_method="boxplot",
    readme=None,
)

# Full plot from data

In [None]:
# Work on a copy: the cells below modify `pooled` in place (label replacement, categorical
# conversion, sorting) and must not alter `PCA_components`, which other cells still use.
pooled = PCA_components.copy()

pooled.head()

In [None]:
# Find the "Simplified Nickname" values in which the word "MBON" appears at least two times.
# Missing nicknames give a NaN count, which compares as False and is therefore excluded.
mbon_counts = pooled["Simplified Nickname"].str.count("MBON")
MBONs = pooled.loc[mbon_counts >= 2, "Simplified Nickname"].unique()

# Show all matches rather than one hard-coded position, which may not exist
MBONs


In [None]:
# NOTE(review): FuncTickFormatter is not used in this cell; newer Bokeh releases may only
# provide it under the name CustomJSTickFormatter — confirm against the installed version.
from bokeh.models import FuncTickFormatter

# Define the long labels
long_labels = ["MBON-16-GaL4   MBON-17-Gal4 ", "MBON-08-GaL4  MBON-09-GaL4 "]

# Insert a newline after the FIRST space of each label (count=1), i.e. between the two names
new_labels = [label.replace(" ", " \n", 1) for label in long_labels]

# In pooled, replace the long labels with the new labels.
# Assigning the result back is reliable; an in-place replace on a column selection may act
# on a temporary object and leave `pooled` unchanged.
pooled["Simplified Nickname"] = pooled["Simplified Nickname"].replace(long_labels, new_labels)

In [None]:
# Reverse the values to the old long labels (assigned back instead of an in-place replace
# on a column selection, which may not modify `pooled`)
pooled["Simplified Nickname"] = pooled["Simplified Nickname"].replace(new_labels, long_labels)

In [None]:
# Check if the new labels are in the "Simplified Nickname" column
pooled[pooled["Simplified Nickname"].isin(new_labels)]

# Sorting data by brain region and Nickname

In [None]:
# Calculate the median PC2 for each ('Brain region', 'Simplified Nickname') pair.
# observed=True keeps only combinations present in the data, so the cell gives the same
# result when re-run after both columns have been converted to categoricals below.
median_values = pooled.groupby(["Brain region", "Simplified Nickname"], observed=True)["PC2"].median()

# Sort 'Brain region' by the median of its nickname medians
region_order = list(
    median_values.groupby("Brain region", observed=True).median().sort_values().index
)

# Within each 'Brain region', sort 'Nickname' by its median
nickname_order_within_region = median_values.groupby("Brain region", observed=True).apply(
    lambda x: x.sort_values().index.get_level_values("Simplified Nickname")
)

# Create a new category type for 'Brain region' with the calculated order
pooled["Brain region"] = pd.Categorical(
    pooled["Brain region"], categories=region_order, ordered=True
)

# Create a list to hold the correct order of 'Nickname' across all 'Brain regions'
correct_order_global = []

# For each 'Brain region', add the 'Nickname' order to the global list
for region in region_order:
    correct_order_global.extend(nickname_order_within_region[region])

# Categories must be unique: keep the first position of a nickname found in several regions
correct_order_global = list(dict.fromkeys(correct_order_global))

# Convert 'Nickname' to a categorical type with the global order
pooled["Simplified Nickname"] = pd.Categorical(
    pooled["Simplified Nickname"], categories=correct_order_global, ordered=True
)

# Now you can sort
pooled.sort_values(by=["Brain region", "Simplified Nickname"], inplace=True)

# Compute and represent the control area

In [None]:
# Interquartile band (25% and 75% quantiles of PC2) of the "TNTxZ2018" control flies
control_data = pooled[pooled["Genotype"] == "TNTxZ2018"]
hline_values = tuple(control_data["PC2"].quantile(q) for q in (0.25, 0.75))

In [None]:
# Get the limits for the y axis: the full PC2 range of the pooled data
y_min = pooled["PC2"].min()
# y_max is the maximum of the data (not a percentile)
y_max = pooled["PC2"].max()

In [None]:
from bokeh.models import HoverTool

# Tooltip definition: (label shown to the user, "@column" read from the plotted data)
tooltips = [
    ("Fly", "@fly"),
    # Use the literal label: "PC2".capitalize() evaluates to "Pc2".
    ("PC2", "@PC2"),
]


hover = HoverTool(tooltips=tooltips)

# Create the boxplot

In [None]:
hv.extension("bokeh")

# Option sets for the Bokeh figure, keyed by the layer they are applied to:
# "boxwhisker" and "scatter" are passed to the individual elements,
# "plot" to the combined overlay.
pooled_opts = dict(
    boxwhisker=dict(
        # Outlier markers get no fill and no outline
        outlier_fill_color=None,
        outlier_line_color=None,
        framewise=True,
    ),
    scatter=dict(
        jitter=0.15,
        color="black",
        alpha=0.8,
        size=2,
        framewise=True,
    ),
    plot=dict(
        width=1100,
        height=1423,
        show_legend=False,
        xlabel="",
        invert_axes=True,
        show_grid=True,
        fontscale=1,
        title="",
    ),
)

In [None]:
# Aspect ratio computation

# At 1100 width the height is 1423. What height keeps the same aspect ratio at 1500 width?
# 1500 * 1423 / 1100 = 1940 (rounded from 1940.45)

# And at 1800 width?
# 1800 * 1423 / 1100 = 2329 (rounded from 2328.5)

In [None]:
from bokeh.themes import Theme

# Custom theme that sets Arial on the text elements of the figure.
# Keys under "attrs" are Bokeh model names and the inner keys are properties of that
# model: axis titles and tick labels are properties of "Axis", and legend entries use
# "label_text_font".
theme = Theme(
    json={
        "attrs": {
            "Title": {"text_font": "Arial"},
            "Axis": {
                "axis_label_text_font": "Arial",
                "major_label_text_font": "Arial",
            },
            "Legend": {"label_text_font": "Arial"},
        }
    }
)

# Apply the theme to the HoloViews Bokeh renderer
hv.renderer("bokeh").theme = theme

In [None]:
# The plotting cells read their options through this alias of the Bokeh option set
plot_options = pooled_opts

# Distinct values of the "Brain region" column; one boxplot layer is built per value
brain_regions = pooled.loc[:, "Brain region"].unique()


In [None]:
# One BoxWhisker layer per brain region, each filled with its own Category10 colour.
# The colours are taken from the cycle's `values` list (plain colour strings) and indexed
# modulo its length, so every region gets a colour even if there are more regions than
# palette entries.
region_colors = hv.Cycle("Category10").values

boxplot = hv.Overlay(
    [
        hv.BoxWhisker(
            pooled[pooled["Brain region"] == region],
            kdims="Simplified Nickname",
            vdims="PC2",
        ).opts(
            **plot_options["boxwhisker"],
            box_color=region_colors[idx % len(region_colors)],
        )
        for idx, region in enumerate(brain_regions)
    ]
)

# Display the overlaid boxplots
# boxplot.opts(show_legend=False)

In [None]:
#boxplot

# Create the scatterplot

In [None]:
# Jittered scatter of the individual flies on top of the boxplots.
# Each column is declared once: "Simplified Nickname" is the key dimension, "PC2" is the
# plotted value, and the remaining value dimensions carry metadata for the hover tool.
scatterplot = hv.Scatter(
    data=pooled,
    kdims=["Simplified Nickname"],
    vdims=["PC2", "fly", "Brain region", "Genotype", "label"],
).opts(**plot_options["scatter"], tools=[hover], ylim=(y_min, y_max))

# Create the control area

In [None]:
# Shaded red band between the two control quantiles stored in hline_values
hv_hline = hv.HSpan(*hline_values[:2]).opts(color="red", fill_alpha=0.2)

# Combine the plots

In [None]:
# Distinct simplified labels present in the pooled data
unique_labels = pooled["Simplified Nickname"].unique()
unique_labels  # not rendered: only the last expression of a cell is displayed

# "Nickname" values of the rows whose "Simplified Nickname" is missing (NaN)
nan_simplified_nickname = pooled.loc[
    pooled["Simplified Nickname"].isna(), "Nickname"
].unique()

nan_simplified_nickname

In [None]:
# Stack the layers: control band at the bottom, then the boxplots, then the fly scatter
jitterboxplot = hv_hline * boxplot * scatterplot

# Shared figure options first ...
jitterboxplot = jitterboxplot.opts(ylabel="PC2", **plot_options["plot"])

# ... then the overrides: grid off (the shared options turn it on) and smaller y tick labels
jitterboxplot = jitterboxplot.opts(show_grid=False, fontsize={"yticks": 10})

In [None]:
# Display the combined Bokeh figure as the cell output
jitterboxplot

In [None]:
# Write the Bokeh figure to an HTML file located under `savepath`.
# NOTE(review): at the top of the notebook `savepath` is assigned a path ending in ".pkl";
# confirm it has been rebound to a directory before this cell runs.
hv.save(jitterboxplot, filename=savepath / "240315_PCA2_jitterboxplot.html")

In [None]:
hv.extension("bokeh")

from bokeh.io import export_svgs

# Custom theme that sets Arial on the text elements of the figure.
# Keys under "attrs" are Bokeh model names and the inner keys are properties of that
# model: axis titles and tick labels are properties of "Axis", and legend entries use
# "label_text_font".
theme = Theme(
    json={
        "attrs": {
            "Title": {"text_font": "Arial"},
            "Axis": {
                "axis_label_text_font": "Arial",
                "major_label_text_font": "Arial",
            },
            "Legend": {"label_text_font": "Arial"},
        }
    }
)

# Apply the theme to the HoloViews Bokeh renderer
hv.renderer("bokeh").theme = theme


def export_svg(obj, filename):
    """Render a HoloViews object with the Bokeh renderer and export it as SVG.

    Args:
        obj: HoloViews object to render.
        filename: Output location, forwarded to bokeh's ``export_svgs``.
    """
    figure = hv.renderer("bokeh").get_plot(obj).state
    # The Bokeh figure must use the SVG output backend before it is exported
    figure.output_backend = "svg"
    export_svgs(figure, filename=filename)


# Export the Bokeh figure as an SVG file located under `savepath`.
# NOTE(review): at the top of the notebook `savepath` is assigned a path ending in ".pkl";
# confirm it has been rebound to a directory before this cell runs.
export_svg(obj=jitterboxplot, filename=savepath / "240306_PCA_jitterboxplot.svg")

In [None]:
# Size of the 1100 x 1423 pixel figure expressed in inches at the chosen DPI
dpi = 96  # adjust this to match your intended DPI
width_in, height_in = (pixels / dpi for pixels in (1100, 1423))

In [None]:
# Option sets for the matplotlib version of the figure, keyed like the Bokeh option set
# ("boxwhisker" / "scatter" for the elements, "plot" for the combined overlay).

pooled_opts_matplotlib = dict(
    boxwhisker=dict(
        showfliers=False,  # do not draw outlier markers
        notch=False,  # plain, un-notched boxes
    ),
    scatter=dict(
        # no jitter option is set for this backend
        color="black",
        alpha=0.8,
        s=4,  # marker size
    ),
    plot=dict(
        # NOTE(review): HoloViews appears to treat fig_size as a percentage scaling of the
        # figure rather than a size in inches - confirm that 2000 is the intended value.
        fig_size=2000,
        show_legend=False,
        xlabel="",
        invert_axes=True,
        show_grid=True,
        # Explicit font sizes per text element
        fontsize=dict(title=16, labels=14, xticks=12, yticks=12),
        title="",
    ),
)

In [None]:
# Remake the jitterboxplot with the matplotlib backend:
# switch the HoloViews extension before the elements are rebuilt in the cells below.

hv.extension("matplotlib")

In [None]:
from cycler import cycler
import matplotlib.pyplot as plt

# Colour cycle built from matplotlib's tab10 palette
color_cycler = cycler(color=plt.cm.tab10.colors)

# Flatten the cycle into a plain list of colours
color_list = [c["color"] for c in color_cycler]

# One BoxWhisker layer per brain region, each with its own face colour.
# The colour is picked modulo the palette length so that no region is dropped when there
# are more regions than colours (zipping the two sequences stops at the shorter one).
# NOTE(review): this cell plots "PC1" by "Nickname", whereas the Bokeh cells above plot
# "PC2" by "Simplified Nickname" - confirm which component is intended here.
boxplot = hv.Overlay(
    [
        hv.BoxWhisker(
            pooled[pooled["Brain region"] == region], kdims="Nickname", vdims="PC1"
        ).opts(
            **pooled_opts_matplotlib["boxwhisker"],
            boxprops=dict(
                color="black", facecolor=color_list[idx % len(color_list)]
            ),
        )
        for idx, region in enumerate(brain_regions)
    ]
)

In [None]:
# Single BoxWhisker over the whole pooled frame; this rebinds `boxplot` and replaces any
# overlay previously stored under that name.
# NOTE(review): "Brain region" is passed as `by=` and is not one of the declared
# dimensions - confirm that hv.BoxWhisker accepts this keyword.
boxplot = hv.BoxWhisker(pooled, kdims="Nickname", vdims="PC1", by="Brain region")
boxplot = boxplot.opts(cmap="category10", **pooled_opts_matplotlib["boxwhisker"])

In [None]:
# Display the boxplot as the cell output
boxplot

# Create the scatterplot

In [None]:
# Scatter of the individual PC1 values per "Nickname".
# NOTE(review): y_min / y_max are computed from the "PC2" column earlier in this notebook,
# while this scatter plots "PC1" - confirm the limits are the intended ones.
scatterplot = hv.Scatter(pooled, kdims=["Nickname"], vdims=["PC1"]).opts(
    ylim=(y_min, y_max), **pooled_opts_matplotlib["scatter"]
)

In [None]:
#scatterplot

# Create the control area

In [None]:
# Shaded red band between the two control quantiles stored in hline_values
hv_hline = hv.HSpan(*hline_values[:2]).opts(color="red", alpha=0.2)

# Combine the plots

In [None]:
# Stack the layers (control band, boxplot, scatter), then apply the matplotlib figure options
jitterboxplot = hv_hline * boxplot * scatterplot
jitterboxplot = jitterboxplot.opts(ylabel="PC1", **pooled_opts_matplotlib["plot"])

In [None]:
# Display the combined matplotlib figure as the cell output
jitterboxplot

In [None]:
# Write the matplotlib figure to a PNG file located under `savepath`.
# NOTE(review): at the top of the notebook `savepath` is assigned a path ending in ".pkl";
# confirm it has been rebound to a directory before this cell runs.
hv.save(
    jitterboxplot,
    filename=savepath / "240306_PCA_jitterboxplot_matplotlib.png",
    fmt="png",
)