In [None]:
import sys
import os
from icecream import ic

from pathlib import Path

sys.path.insert(0, "..")
sys.path.insert(0, "../../../Utilities")

sys.path.insert(0, "../../..")

import Ballpushing_utils
import Utils
import Processing
import HoloviewsTemplates

import pandas as pd
import hvplot.pandas

import importlib

import holoviews as hv

hv.extension("bokeh")

# Get the list of experiments

In [None]:
# Get the data path
Datapath = Utils.get_data_path()

# Collect the names of every directory in Datapath whose name contains both
# "TNT_Fine" and "Tracked".
# The names are sorted because directory listing order is arbitrary, and later
# cells index into the resulting Experiments list by position.
Folders = sorted(
    entry.name
    for entry in Datapath.iterdir()
    if entry.is_dir() and "TNT_Fine" in entry.name and "Tracked" in entry.name
)

Folders

In [None]:
importlib.reload(Ballpushing_utils)

In [None]:
# Generate Experiment objects from each folder

Experiments = [Ballpushing_utils.Experiment(Datapath / f) for f in Folders]

In [None]:
# Check some flies nicknames

TestFly = Experiments[15].flies[3].nickname

In [None]:
TestFly

In [None]:
# Pickle file used to cache the list of Experiment objects.
# NOTE(review): the name `savepath` is reassigned further down to the plots directory;
# re-running the save/load cells after that point would target that directory instead of this file.
savepath = Utils.get_labserver() / "Experimental_data/MultiMazeRecorder/Datasets/240306_TNT_Fine_Experiments.pkl"


In [None]:
Ballpushing_utils.save_object(Experiments, savepath.as_posix())

In [None]:
# Load the experiments from the saved file
Experiments = Ballpushing_utils.load_object(savepath.as_posix())

In [None]:
type(Experiments)

In [None]:
type(Experiments[0])

In [None]:
importlib.reload(Ballpushing_utils)

In [None]:
data = Ballpushing_utils.Dataset(Experiments)

In [None]:
print(data)

In [None]:
# For each fly in the dataset, if they have 2 nicknames, just keep the first one
# for fly in data.flies:
#     if len(fly.nickname) > 1:
#         fly.nickname = fly.nickname[0]

I used the commented-out snippet above as a hack to drop a supplementary nickname in PR flies. This is now fixed directly in the brain region registry, so the snippet is no longer needed.

In [None]:
data.generate_dataset("summary")

# Discard every fly whose genotype is M6, M7, PR or CS
excluded_genotypes = ["M6", "M7", "PR", "CS"]
is_excluded = data.data["Genotype"].isin(excluded_genotypes)
data.data = data.data[~is_excluded]

In [None]:
mydata = data.data

# Plotting methods

In [None]:
# Get the type of the label column
mydata["label"].dtype

In [None]:
savepath = Utils.get_labserver() / "Experimental_data/MultiMazeRecorder/Plots/240306_summaries"

In [None]:
importlib.reload(HoloviewsTemplates)

In [None]:
HoloviewsTemplates.jitter_boxplot(
    data.data,
    "NumberEvents",
    show=True,
    save=True,
    metadata=data.metadata,
    bs_controls=True,
    sort_by="median",
    hline_method="boxplot",
    readme=None,
)

In [None]:
# Summary metrics for which a jitter boxplot is produced
metrics = [
    "NumberEvents",
    "FinalEvent",
    "FinalTime",
    "SignificantEvents",
    "SignificantFirst",
    "SignificantFirstTime",
    "Pushes",
    "Pulls",
    "PullingRatio",
    "InteractionProportion",
    "AhaMoment",
    "AhaMomentIndex",
    "InsightEffect",
    "TimeToFinish",
    "SignificantRatio",
]

# Keyword arguments shared by every plot; only the metric name changes between calls
boxplot_kwargs = dict(
    show=True,
    save=True,
    metadata=data.metadata,
    bs_controls=True,
    sort_by="median",
    hline_method="boxplot",
    readme=None,
)

# One jitter boxplot per metric
for metric in metrics:
    HoloviewsTemplates.jitter_boxplot(data.data, metric, **boxplot_kwargs)

In [None]:
# Resave one of the metrics
HoloviewsTemplates.jitter_boxplot(
    data.data,
    "PullingRatio",
    show=False,
    save=True,
    metadata=data.metadata,
    bs_controls=True,
    sort_by="median",
    hline_method="boxplot",
)

# PCA on the data

Here I'll try to do PCA on the data to see if I can get something interesting by reducing the dimensionality of the data, including all the summary metrics.

In [None]:
mydata = data.data

In [None]:
# Load the brain region table

brain_regions_path = Utils.get_labserver() / "Experimental_data/Region_map_240312.csv"

registry = pd.read_csv(brain_regions_path)

registry

In [None]:
# Get the row 38 of the registry
registry.iloc[38]

In [None]:
# Attach the brain region registry columns to every fly by matching on the shared
# "Genotype" column. The left join keeps flies that have no registry entry.
mydata = data.data.merge(registry, on="Genotype", how="left")

mydata.head()

In [None]:
# Expose the merge-suffixed "Nickname_y" column under the plain "Nickname" name
# (presumably the registry's nickname, since "_y" is the right-hand frame of the merge).
mydata = mydata.rename(columns={"Nickname_y": "Nickname"})

In [None]:
# Remove the genotypes that should not enter the PCA:
# M6, M7, PR, TNTxG74, TNTxG75 and TNTxZ1633.
excluded_from_pca = ["M6", "M7", "PR", "TNTxG74", "TNTxG75", "TNTxZ1633"]
subset = mydata.loc[~mydata["Genotype"].isin(excluded_from_pca)]

In [None]:
# Numeric summary metrics that feed the PCA
pca_metric_columns = [
    "NumberEvents",
    "FinalEvent",
    "FinalTime",
    "SignificantEvents",
    "SignificantFirst",
    "SignificantFirstTime",
    "PullingRatio",
    "InteractionProportion",
    "AhaMoment",
    "AhaMomentIndex",
    "InsightEffect",
    "TimeToFinish",
    "SignificantRatio",
]

# Descriptive columns kept alongside the metrics so each row can still be identified
pca_descriptor_columns = [
    "label",
    "Brain region",
    "fly",
    "Genotype",
    "Nickname",
    "Simplified Nickname",
]

# Keep only the metrics and descriptors, in that order
subset = subset[pca_metric_columns + pca_descriptor_columns]

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Columns that identify a fly rather than measure it; they are excluded from the PCA
# input and re-attached to the PCA scores afterwards.
descriptor_columns = [
    "label",
    "Brain region",
    "fly",
    "Genotype",
    "Nickname",
    "Simplified Nickname",
]

features = subset.drop(columns=descriptor_columns)

# Keep only complete rows. Despite its name, nan_indices holds the index labels of the
# rows WITHOUT any NaN; it is reused below to pick the matching descriptor rows.
nan_indices = features.dropna().index
features = features.loc[nan_indices].reset_index(drop=True)

# Standardise each metric before the decomposition
scaler = StandardScaler()
features_normalized = scaler.fit_transform(features)

# Project onto the first two principal components
pca = PCA(n_components=2)  # Adjust n_components as needed
principalComponents = pca.fit_transform(features_normalized)

# One row of PC scores per retained fly
PCA_components = pd.DataFrame(principalComponents, columns=["PC1", "PC2"])

# Re-attach the descriptors; .values is used so rows are matched by position,
# since the PCA frame has a fresh 0..n-1 index.
for column in descriptor_columns:
    PCA_components[column] = subset.loc[nan_indices, column].values

# PCA summaries

In [None]:
# Print the composition of the principal components
PCs_compo = pd.DataFrame(pca.components_, columns=features.columns, index=["PC1", "PC2"])

# Print the explained variance ratio
print(f"Explained variance of PC1 and PC2 : {pca.explained_variance_ratio_}")

PCs_compo

In [None]:
# Save the composition of the principal components and the explained variance ratio
PCs_compo.to_csv(savepath/"PCs_composition.csv")
pd.DataFrame(pca.explained_variance_ratio_, index=["PC1", "PC2"], columns=["Explained variance"]).to_csv(savepath/"PCA_Explained_variance.csv")

In [None]:
# get all data unique Genotype values

unique_genotypes = subset["Genotype"].unique()

# Check if there is one called "TNTxZ2018"

"TNTxZ2018" in unique_genotypes

In [None]:
# Look up the label of the first fly whose Genotype is "TNTxZ2018"
is_TNTxZ2018 = mydata["Genotype"] == "TNTxZ2018"
TNTxZ2018_label = mydata.loc[is_TNTxZ2018, "label"].iloc[0]

TNTxZ2018_label

In [None]:
import holoviews as hv

# Split the PCA scores into the "TNTxZ2018" flies and all remaining flies
TNTxZ2018_data = PCA_components[PCA_components["label"] == TNTxZ2018_label]
other_data = PCA_components[PCA_components["label"] != TNTxZ2018_label]

# The TNTxZ2018 layer is identical in every panel, so it is built once
# (black crosses) instead of once per brain region.
plot2 = TNTxZ2018_data.hvplot.scatter(
    x="PC1",
    y="PC2",
    by="label",
    hover_cols=["fly"],
    color="black",
    marker="x",
    size=100,
)

# Initialize an empty Layout
plots = hv.Layout()

# One panel per brain region. Only regions that actually contain non-TNTxZ2018 flies are
# visited: a missing (NaN) region never compares equal with ==, and a region absent from
# other_data would only produce an empty scatter.
for brain_region in other_data["Brain region"].dropna().unique():
    df_brain_region = other_data[other_data["Brain region"] == brain_region]

    # Flies of this brain region, coloured by label
    plot1 = df_brain_region.hvplot.scatter(
        x="PC1", y="PC2", by="label", hover_cols=["fly"], cmap="nipy_spectral"
    )

    # Overlay the region's flies with the TNTxZ2018 flies
    final_plot = (plot1 * plot2).opts(width=1000, height=750)

    # Add the panel to the Layout
    plots += final_plot.relabel(f"PCA - Brain Region: {brain_region}")

# Save the Layout as a single-column HTML page
hvplot.save(plots.cols(1), savepath / "240306_PCA_plots.html")
# Display the Layout
# hvplot.show(plots.cols(1))

# Plotting the PCs separately

In [None]:
PCA_components

In [None]:
importlib.reload(HoloviewsTemplates)

In [None]:
# Plot PC1 and PC2 as jitterboxplots
HoloviewsTemplates.jitter_boxplot(
    PCA_components,
    "PC1",
    show=True,
    save=True,
    metadata=[],
    bs_controls=True,
    sort_by="median",
    hline_method="boxplot",
    readme=None,
)

In [None]:
# Same with PC2
HoloviewsTemplates.jitter_boxplot(
    PCA_components,
    "PC2",
    show=True,
    save=True,
    metadata=[],
    bs_controls=True,
    sort_by="median",
    hline_method="boxplot",
    readme=None,
)

In [None]:
# TODO : Find flies with particular pulling

# Full plot from data

In [None]:
# Work on a copy: the following cells modify `pooled` in place (label replacement,
# categorical conversion, sorting), and those changes should not leak back into
# PCA_components.
pooled = PCA_components.copy()

pooled.head()

In [None]:
# Find the "Simplified Nickname" values that mention "MBON".
# na=False treats missing nicknames as non-matching, so the boolean mask stays usable
# for indexing even when some rows have no Simplified Nickname.
has_mbon = pooled["Simplified Nickname"].str.contains("MBON", na=False)
MBONs = pooled.loc[has_mbon, "Simplified Nickname"].unique()

MBONs[10]


In [None]:
try:
    from bokeh.models import FuncTickFormatter
except ImportError:
    # Bokeh 3.0 renamed FuncTickFormatter to CustomJSTickFormatter
    from bokeh.models import CustomJSTickFormatter as FuncTickFormatter

# Define the long labels
long_labels = ["MBON-16-GaL4   MBON-17-Gal4 ", "MBON-08-GaL4  MBON-09-GaL4 "]

# Insert a line break right after the first space of each label
# (str.replace with count=1 only touches the first occurrence)
new_labels = [label.replace(" ", " \n", 1) for label in long_labels]

# In pooled, replace the long labels with the new labels.
# The result is assigned back to the column: calling replace(..., inplace=True) on
# pooled["..."] acts on an intermediate Series and is not guaranteed to update the
# DataFrame (it does not under pandas Copy-on-Write).
pooled["Simplified Nickname"] = pooled["Simplified Nickname"].replace(long_labels, new_labels)

In [None]:
# Restore the original long labels.
# Assigned back to the column rather than using replace(..., inplace=True) on the
# selected Series, which is not guaranteed to update the DataFrame under pandas
# Copy-on-Write.
pooled["Simplified Nickname"] = pooled["Simplified Nickname"].replace(new_labels, long_labels)

In [None]:
# Check if the new labels are in the "Simplified Nickname" column
pooled[pooled["Simplified Nickname"].isin(new_labels)]

# Sorting data by brain region and Nickname

In [None]:
# Median PC2 for every ("Brain region", "Simplified Nickname") pair present in the data.
# observed=True matters once these columns are categorical, i.e. when this cell is run a
# second time: with observed=False pandas adds a NaN row for every unobserved pair, which
# would repeat each nickname under every region and make the category list below invalid.
median_values = pooled.groupby(
    ["Brain region", "Simplified Nickname"], observed=True
)["PC2"].median()

# Brain regions ordered by the median of their nicknames' medians
region_order = (
    median_values.groupby("Brain region", observed=True)
    .median()
    .sort_values()
    .index.tolist()
)

# For each brain region, its nicknames ordered by their median PC2
nickname_order_within_region = {
    region: median_values.xs(region, level="Brain region").sort_values().index.tolist()
    for region in region_order
}

# Make 'Brain region' an ordered categorical following region_order
pooled["Brain region"] = pd.Categorical(
    pooled["Brain region"], categories=region_order, ordered=True
)

# Global nickname order: regions in region_order, nicknames in their within-region order.
# Categories must be unique, so a nickname that occurs in several regions keeps the
# position of its first occurrence (dict.fromkeys preserves insertion order).
correct_order_global = list(
    dict.fromkeys(
        nickname
        for region in region_order
        for nickname in nickname_order_within_region[region]
    )
)

# Make 'Simplified Nickname' an ordered categorical following the global order
pooled["Simplified Nickname"] = pd.Categorical(
    pooled["Simplified Nickname"], categories=correct_order_global, ordered=True
)

# Sort the rows by region, then by nickname, using the categorical orders defined above
pooled.sort_values(by=["Brain region", "Simplified Nickname"], inplace=True)

# Compute and represent the control area

In [None]:
# Interquartile range (25% and 75% quantiles) of PC2 for the TNTxZ2018 control flies
control_data = pooled[pooled["Genotype"] == "TNTxZ2018"]
lower_quartile, upper_quartile = control_data["PC2"].quantile([0.25, 0.75])
hline_values = (lower_quartile, upper_quartile)

In [None]:
# Use the full observed range of PC2 (minimum to maximum) as the y-axis limits
pc2_scores = pooled["PC2"]
y_min, y_max = pc2_scores.min(), pc2_scores.max()

In [None]:
from bokeh.models import HoverTool

# Fields shown when hovering a point: the fly identifier and its PC2 score.
# The label is written literally because "PC2".capitalize() yields "Pc2".
tooltips = [
    ("Fly", "@fly"),
    ("PC2", "@PC2"),
]


hover = HoverTool(tooltips=tooltips)

# Create the boxplot

In [None]:
hv.extension("bokeh")
# Bokeh styling, grouped by the layer each set of options is applied to
pooled_opts = dict(
    # Box layer: outlier markers get no fill and no outline
    boxwhisker=dict(
        outlier_fill_color=None,
        outlier_line_color=None,
        framewise=True,
    ),
    # Individual flies: small, slightly transparent, jittered black dots
    scatter=dict(
        jitter=0.15,
        color="black",
        alpha=0.8,
        size=2,
        framewise=True,
    ),
    # Options applied to the combined figure
    plot=dict(
        width=1100,
        height=1423,
        show_legend=False,
        xlabel="",
        invert_axes=True,
        show_grid=True,
        fontscale=1,
        title="",
    ),
)

In [None]:
# Aspect ratio computation

# For 1100 width I have 1423 height. What about if I have 1500 width?
# 1500 * 1423 / 1100 = 1935

# For 1800 width?
# 1800 * 1423 / 1100 = 2323

In [None]:
from bokeh.themes import Theme

# Create a custom theme
theme = Theme(
    json={
        "attrs": {
            "Title": {"text_font": "Arial"},
            "AxisLabel": {"text_font": "Arial"},
            "Legend": {"text_font": "Arial"},
            "TickLabel": {"text_font": "Arial"},
        }
    }
)

# Apply the theme
hv.renderer("bokeh").theme = theme

In [None]:
# Brain regions present in the data. Missing values are dropped because a NaN region
# can never be matched with == when the per-region layers are built.
brain_regions = pooled["Brain region"].dropna().unique()

plot_options = pooled_opts


In [None]:
from itertools import cycle

# Pair each brain region with one colour of the Category10 palette.
# hv.Cycle is a style object; its .values attribute holds the actual colour list.
# NOTE(review): iterating the hv.Cycle object itself does not appear to yield individual
# colours — confirm against the HoloViews version in use.
# itertools.cycle repeats the palette so zip() does not silently drop regions when there
# are more regions than colours.
region_colors = cycle(hv.Cycle("Category10").values)

# One BoxWhisker layer per brain region, all overlaid on the same axes
boxplot = hv.Overlay(
    [
        hv.BoxWhisker(
            pooled[pooled["Brain region"] == region],
            kdims="Simplified Nickname",
            vdims="PC2",
        ).opts(**plot_options["boxwhisker"], box_color=color)
        for region, color in zip(brain_regions, region_colors)
    ]
)

# Display the overlaid boxplots
# boxplot.opts(show_legend=False)

In [None]:
#boxplot

# Create the scatterplot

In [None]:
# Point layer: one dot per fly, PC2 (the first value dimension) against Simplified Nickname.
# The remaining value dimensions are carried along so the hover tool can look them up.
# Each dimension is listed exactly once: "PC2" is no longer repeated and the key
# dimension is not duplicated among the value dimensions.
scatterplot = hv.Scatter(
    data=pooled,
    kdims=["Simplified Nickname"],
    vdims=["PC2", "fly", "Brain region", "Genotype", "label"],
).opts(**plot_options["scatter"], tools=[hover], ylim=(y_min, y_max))

# Create the control area

In [None]:
hv_hline = hv.HSpan(hline_values[0], hline_values[1]).opts(fill_alpha=0.2, color="red")

# Combine the plots

In [None]:
# Get unique values of simplified labels
unique_labels = pooled["Simplified Nickname"].unique()
unique_labels

# Find the "Simplified Nickname" that is nan and find the associated "Nickname"
nan_simplified_nickname = pooled[pooled["Simplified Nickname"].isna()]["Nickname"].unique()

nan_simplified_nickname

In [None]:
jitterboxplot = (
    (hv_hline * boxplot * scatterplot)
    .opts(ylabel="PC2", **plot_options["plot"])
    .opts(show_grid=False, fontsize={"yticks": 10})
)

In [None]:
jitterboxplot

In [None]:
# Save the plot
hv.save(jitterboxplot, savepath/"240315_PCA2_jitterboxplot.html")

In [None]:
hv.extension("bokeh")

from bokeh.io import export_svgs

# Create a custom theme
theme = Theme(
    json={
        "attrs": {
            "Title": {"text_font": "Arial"},
            "AxisLabel": {"text_font": "Arial"},
            "Legend": {"text_font": "Arial"},
            "TickLabel": {"text_font": "Arial"},
        }
    }
)

# Apply the theme
hv.renderer("bokeh").theme = theme


def export_svg(obj, filename):
    """Render a HoloViews object with the Bokeh backend and write it out as SVG.

    Args:
        obj: HoloViews object to render.
        filename: Destination passed through to bokeh's ``export_svgs``.
    """
    bokeh_renderer = hv.renderer("bokeh")
    figure = bokeh_renderer.get_plot(obj).state
    # Switch the rendered Bokeh model to the SVG backend before exporting
    figure.output_backend = "svg"
    export_svgs(figure, filename=filename)


export_svg(jitterboxplot, savepath / "240306_PCA_jitterboxplot.svg")

In [None]:
dpi = 96  # adjust this to match your intended DPI
width_in = 1100 / dpi
height_in = 1423 / dpi

In [None]:
# Redo the same plotting with matplotlib backend

# Matplotlib-backend counterpart of the Bokeh styling, grouped by layer
pooled_opts_matplotlib = dict(
    boxwhisker=dict(
        showfliers=False,  # do not draw outlier markers
        notch=False,
    ),
    scatter=dict(
        color="black",
        alpha=0.8,
        s=4,  # marker size
    ),
    plot=dict(
        # NOTE(review): the previous comment described this value as inches; HoloViews
        # documents fig_size as a percentage scaling of the figure — confirm.
        fig_size=2000,
        show_legend=False,
        xlabel="",
        invert_axes=True,
        show_grid=True,
        # Explicit per-element font sizes
        fontsize=dict(
            title=16,
            labels=14,
            xticks=12,
            yticks=12,
        ),
        title="",
    ),
)

In [None]:
# Remake the jitterboxplot with the matplotlib backend

hv.extension("matplotlib")

In [None]:
from cycler import cycler
import matplotlib.pyplot as plt

# Create a color cycle
color_cycler = cycler(color=plt.cm.tab10.colors)

# Convert the color cycle to a list of colors
color_list = [c["color"] for c in color_cycler]

# Now you can use color_list in your plot
boxplot = hv.Overlay(
    [
        hv.BoxWhisker(
            pooled[pooled["Brain region"] == region], kdims="Nickname", vdims="PC1"
        ).opts(**pooled_opts_matplotlib["boxwhisker"], boxprops=dict(color="black", facecolor = color))
        for region, color in zip(brain_regions, color_list)
    ]
)

In [None]:
boxplot = hv.BoxWhisker(pooled, kdims="Nickname", vdims="PC1", by="Brain region").opts(
    **pooled_opts_matplotlib["boxwhisker"], cmap="category10")

In [None]:
boxplot

# Create the scatterplot

In [None]:
# This scatter shows PC1, so its y-range is taken from the PC1 scores rather than
# from the y_min / y_max limits, which were computed from PC2.
pc1_scores = pooled["PC1"]

scatterplot = hv.Scatter(
    data=pooled,
    vdims=["PC1"],
    kdims=["Nickname"],
).opts(**pooled_opts_matplotlib["scatter"], ylim=(pc1_scores.min(), pc1_scores.max()))

In [None]:
#scatterplot

# Create the control area

In [None]:
# The matplotlib figure shows PC1, so the control band spans the 25%-75% quantiles of PC1
# for the TNTxZ2018 flies instead of the PC2-based hline_values.
control_pc1 = pooled.loc[pooled["Genotype"] == "TNTxZ2018", "PC1"]
hv_hline = hv.HSpan(control_pc1.quantile(0.25), control_pc1.quantile(0.75)).opts(alpha=0.2, color="red")

# Combine the plots

In [None]:
jitterboxplot = (hv_hline * boxplot * scatterplot).opts(
    ylabel="PC1", **pooled_opts_matplotlib["plot"]
)

In [None]:
jitterboxplot

In [None]:
# Save the plot as a matplotlib png
hv.save(jitterboxplot, savepath/"240306_PCA_jitterboxplot_matplotlib.png", fmt="png")