This notebook explores methods for computing fly velocity during ball-pushing experiments from SLEAP-based fly tracking data. It is also the first notebook in which I experiment with dataset handling using polars (in the later parts).

# Imports

In [None]:
from pathlib import Path
import json

import numpy as np

import h5py

import pandas as pd
import platform

import sys
sys.path.insert(0, "../../..")

from Utilities.Utils import *
from Utilities.Processing import *

import holoviews as hv
import hvplot.pandas


hv.extension('bokeh')

import iqplot
import bokeh.io
import bokeh.models
import bokeh.plotting

bokeh.io.output_notebook()

# Get a list of the directories containing the tracking data

In [None]:
# Locate the folder holding the recorded videos on the lab server.
# The server is mounted at a different place on macOS and on Linux.
if platform.system() == "Darwin":
    DataPath = Path("/Volumes/Ramdya-Lab/DURRIEU_Matthias/Experimental_data/MultiMazeRecorder/Videos")
elif platform.system() == "Linux":
    DataPath = Path("/mnt/labserver/DURRIEU_Matthias/Experimental_data/MultiMazeRecorder/Videos")
else:
    # Without this branch DataPath would simply stay undefined and the notebook
    # would fail later with an unrelated-looking NameError.
    raise RuntimeError(
        f"No data path configured for platform {platform.system()!r}; "
        "add the lab-server mount point for this system."
    )

print(DataPath)

Make a list of the folders to use. Here I keep the folders that have "tnt" in their name, since I will explore velocities for different crosses with UAS-TNT, that have already been tracked ("tracked" in the name), and that contain flies tested in the afternoon ("pm" in the name).

In [None]:
# Keywords that must all appear (case-insensitively) in an experiment folder's name.
REQUIRED_KEYWORDS = ("tnt", "tracked", "pm")

# The keywords are matched against the folder's own name rather than its full path,
# so a parent directory that happens to contain one of them cannot cause a false match.
# The result is sorted because iterdir() yields entries in arbitrary order, which would
# otherwise make the fly numbering assigned while loading vary from run to run.
Folders = sorted(
    folder
    for folder in DataPath.iterdir()
    if folder.is_dir()
    and all(keyword in folder.name.lower() for keyword in REQUIRED_KEYWORDS)
)

Folders

## Importing the tracking data and generating the dataset

In this part, we import the Metadata .json file and the tracking data .h5 files. Then we compute smoothed y positions and generate a time column.

In [None]:
# Build the dataset: one record per tracked file first, then one row per video frame.
#
# NOTE(review): fly tracks ("*tracked_fly*.analysis.h5") used to be read in a separate
# loop here, but only the last file read survived that loop and its values were never
# stored in the dataset, so that dead read has been dropped. Cells further down that
# expect a 'yfly_SG' column need fly positions to be loaded per corridor and stored
# next to 'yball' before they can run.

N_ARENAS = 9  # Metadata.json is indexed with the keys "Arena1" ... "Arena9"

records = []  # plain dicts; converted to a DataFrame once, after the loops
Flynum = 0

for folder in Folders:
    print(f"Adding experiment {folder} to the dataset...")

    # Metadata.json lists the variable names under "Variable"; each "ArenaN" entry is
    # a list indexed by the position of the variable in that list.
    with open(folder / "Metadata.json", "r") as f:
        metadata = json.load(f)

    variables = metadata["Variable"]
    metadata_dict = {
        var: {
            f"Arena{arena}": metadata[f"Arena{arena}"][var_index]
            for arena in range(1, N_ARENAS + 1)
        }
        for var_index, var in enumerate(variables)
    }
    print (metadata_dict)

    # Sorted so that fly numbers are assigned in a reproducible order.
    # NOTE(review): this pattern also matches "*tracked_fly*.analysis.h5" files if they
    # live in the same tree, in which case fly tracks would end up in 'yball' — confirm.
    for file in sorted(folder.glob("**/*.analysis.h5")):
        with h5py.File(file, "r") as f:
            locations = f["tracks"][:].T

        # y coordinate of the first node of the first track, for every frame.
        # Assumes the transposed "tracks" array is (frames, nodes, xy, tracks) — TODO confirm.
        yball = locations[:, 0, 1, 0]
        if "Flipped" in folder.name:
            # Flipped recordings get their y values negated. The values are read first
            # and negated afterwards: negating before assignment either fails on the
            # first file or silently reuses the previous file's data.
            yball = -yball

        # The corridor is the file's parent folder, the arena its grandparent folder
        arena = file.parent.parent.name
        corridor = file.parent.name

        # Metadata keys are capitalised ("Arena1"), whatever the folder's casing is
        arena_key = arena.capitalize()

        Flynum += 1

        # coordinates.npy holds two values, unpacked as the start and end coordinates
        # (assumed to be scalars — TODO confirm)
        start, end = np.load(file.parent / 'coordinates.npy')

        record = {
            "Fly": f"Fly{Flynum}",
            "yball": yball.tolist(),
            "experiment": folder.name,
            "arena": arena,
            "corridor": corridor,
            # Kept numeric (not categorical) because they are used in arithmetic later
            "start": float(start),
            "end": float(end),
        }
        # One column per metadata variable, holding this arena's value
        record.update({var: per_arena[arena_key] for var, per_arena in metadata_dict.items()})
        records.append(record)

if not records:
    raise RuntimeError("No *.analysis.h5 file found in the selected folders; nothing to load.")

# Create the frame once instead of concatenating inside the loop (which is quadratic),
# then explode 'yball' to get one row per timepoint.
Dataset = pd.DataFrame(records).explode('yball')
Dataset['yball'] = Dataset['yball'].astype(float)

# Parameters for butter_lowpass_filter; no active code in the visible cells uses them.
cutoff = 0.0015  # cutoff frequency passed to the filter
order = 1  # filter order


In [None]:

FRAME_RATE = 30  # frames per second, used to turn a frame index into seconds

# Start from a unique 0..N-1 index: after explode() every frame of a fly carries the
# same index label, which makes label-based alignment of new columns ambiguous.
Dataset = Dataset.reset_index(drop=True)

# Absolute distance between the ball y position and the corridor start coordinate.
# 'start' is cast to float so the subtraction also works if it was stored as a
# categorical or object column.
Dataset['yball_relative'] = (Dataset['yball'] - Dataset['start'].astype(float)).abs()

# Fill missing values by linear interpolation, one fly at a time, so that a gap at the
# beginning or end of one recording is never bridged with another fly's values.
# limit_direction='both' also fills gaps at the very start of a recording.
Dataset['yball_relative'] = (
    Dataset.groupby('Fly', sort=False)['yball_relative']
    .transform(lambda s: s.interpolate(method='linear', limit_direction='both'))
)

# Smooth each fly's trace separately: filtering the concatenated column would blend
# the end of one recording into the start of the next.
# NOTE(review): 221 and 1 are presumably the Savitzky-Golay window length and
# polynomial order expected by savgol_lowpass_filter — confirm against its definition.
Dataset['yball_relative_SG'] = (
    Dataset.groupby('Fly', sort=False)['yball_relative']
    .transform(lambda s: savgol_lowpass_filter(s, 221, 1))
)

# NOTE(review): cells further down read a 'yfly_SG' column. Nothing in the visible
# cells creates it (the corresponding lines were commented out); it has to be
# produced the same way from fly positions before those cells can run.

print('Defining frame and time columns...')
# Frame index within each fly's recording, converted to seconds
Dataset["time"] = Dataset.groupby("Fly", sort=False).cumcount() / FRAME_RATE

Dataset.head()

## Saving the dataframe

In [None]:
# From here on DataPath points to the "Datasets" folder, which sits next to the
# "Videos" folder used for loading. Deriving it from the platform-specific DataPath
# (instead of hard-coding the Linux mount) keeps the notebook usable on macOS too;
# on Linux the resulting path is unchanged. Re-running this cell is harmless.
# NOTE(review): assumes "Datasets" is a sibling of "Videos" on every mount — confirm.
DataPath = DataPath.with_name("Datasets")

Dataset.to_feather(DataPath / "230913_Velocity.feather")

In [None]:
# Cumulative distance travelled along y, computed separately for every fly.
# Differencing and summing over the whole column would (a) count the jump between the
# last frame of one fly and the first frame of the next as movement and (b) make each
# fly's total include the totals of all flies before it.
# NOTE(review): 'yfly_SG' is not created by any active code in the visible cells
# (only by commented-out lines); it must exist in Dataset before this cell can run.
frame_step = Dataset.groupby('Fly', sort=False)['yfly_SG'].diff().abs()
Dataset['CumulDist'] = frame_step.groupby(Dataset['Fly'], sort=False).cumsum()

In [None]:
# Largest cumulative distance reached by each fly, repeated on every row of that fly
per_fly_max = Dataset.groupby('Fly')['CumulDist'].transform('max')
Dataset['MaxDist'] = per_fly_max

# Unit conversion: divided by 17, then by 10
# (presumably pixels -> mm at 17 px/mm, then mm -> cm — TODO confirm the calibration)
Dataset['MaxDist_cm'] = (per_fly_max / 17) / 10

Dataset['MaxDist_cm']

In [None]:
# One row per (Genotype, Fly) with that fly's distance in cm.
# observed=True keeps only combinations that actually occur, which matters if the
# grouping columns are categorical (otherwise every Genotype x Fly pair is emitted).
GroupedDF_TNT = (
    Dataset
    .groupby(['Genotype', 'Fly'], observed=True)['MaxDist_cm']
    .mean()
    .reset_index()
)

# A bare number (670131.334511951) used to be the last expression of this cell, so the
# cell displayed that number instead of the table. It is kept here only as a record;
# presumably it was a pasted intermediate value.
GroupedDF_TNT

In [None]:
# All distinct fly identifiers present in the dataset
FlyList = Dataset['Fly'].unique()

# Rows of the grouped table whose 'Fly' value appears more than once
# (keep=False flags every occurrence, not only the repeats)
is_repeated_fly = GroupedDF_TNT.duplicated(subset=['Fly'], keep=False)
duplicates = GroupedDF_TNT.loc[is_repeated_fly]

duplicates

In [None]:
from bokeh.models import Range1d

# Horizontal strip + box plot of the per-fly distance, one category per genotype
p = iqplot.stripbox(
    data=GroupedDF_TNT,
    q="MaxDist_cm",
    cats="Genotype",
    spread='jitter',
)

# Series.max() is vectorised and ignores NaN; the builtin max() walks every frame-level
# row in Python and gives an order-dependent result as soon as a NaN is present.
p.x_range = Range1d(0, Dataset['MaxDist_cm'].max())
p.xaxis.axis_label = 'Distance travelled (cm)'


bokeh.io.show(p)

In [None]:
# Write the horizontal distance figure as an HTML file under DataPath
distance_html = DataPath / "Plots/230913_Grouped_TNT_Distance.html"
bokeh.plotting.save(p, distance_html)

In [None]:
from bokeh.models import Range1d

# Same strip + box plot as the horizontal version, with the quantitative axis vertical
pv = iqplot.stripbox(
    data=GroupedDF_TNT,
    q="MaxDist_cm",
    cats="Genotype",
    spread='jitter',
    q_axis = "y",
)

# Series.max() is vectorised and ignores NaN; the builtin max() walks every frame-level
# row in Python and gives an order-dependent result as soon as a NaN is present.
pv.y_range = Range1d(0, Dataset['MaxDist_cm'].max())
pv.yaxis.axis_label = 'Distance travelled (cm)'


bokeh.io.show(pv)

In [None]:
# Write the vertical distance figure as an HTML file under DataPath
distance_vertical_html = DataPath / "Plots/230913_Grouped_TNT_Distance_Vertical.html"
bokeh.plotting.save(pv, distance_vertical_html)

In [None]:
from bokeh.models import Range1d

# Vertical strip + box plot with a legend: genotypes are identified by colour in the
# legend instead of by tick labels on the x axis.
pv = iqplot.stripbox(
    data=GroupedDF_TNT,
    q="MaxDist_cm",
    cats="Genotype",
    spread='jitter',
    q_axis = "y",
    show_legend = True,
    #legend_location= "center",
    color_column = "Genotype",
)

# Series.max() is vectorised and ignores NaN; the builtin max() walks every frame-level
# row in Python and gives an order-dependent result as soon as a NaN is present.
pv.y_range = Range1d(0, Dataset['MaxDist_cm'].max())
pv.yaxis.axis_label = 'Distance travelled (cm)'

# Remove all text from the x-axis (the legend carries the genotype names)
pv.xaxis.axis_label = None
pv.xaxis.major_label_text_font_size = '0pt'

#pv.legend.title = 'Genotype'


bokeh.io.show(pv)

In [None]:
# Write the vertical distance figure (legend variant) as an HTML file under DataPath
distance_legend_html = DataPath / "Plots/230913_Grouped_TNT_Distance_Vertical_legend.html"
bokeh.plotting.save(pv, distance_legend_html)

It is odd that the distances are so tightly grouped across flies; perhaps the recording day matters? Let's check that.

In [None]:
from bokeh.layouts import gridplot
from bokeh.transform import factor_cmap

from bokeh.palettes import Category10

# Shared x-range so the per-date panels are directly comparable. Deliberately not
# called `range`: binding that name would shadow the builtin for every later cell.
dist_range = bokeh.models.Range1d(0, Dataset['MaxDist_cm'].max())

# Genotype -> colour mapping. No visible cell defines `color_map`; if it is not
# already available in the kernel (e.g. through one of the star imports), build one
# from the Category10 palette so the notebook still runs from a fresh kernel.
# NOTE(review): confirm whether a project-wide colour map is supposed to be used here.
try:
    color_map
except NameError:
    _palette = Category10[10]
    color_map = {
        genotype: _palette[i % len(_palette)]
        for i, genotype in enumerate(Dataset['Genotype'].unique())
    }

# Create a new column 'color' in the DataFrame that maps the 'Genotype' values to colors
Dataset['color'] = Dataset['Genotype'].map(color_map)


dates = Dataset['Date'].unique()
plots = []

# One strip + box plot per recording date
for d in dates:
    day_frames = Dataset[Dataset['Date'] == d]

    # Per-fly distance for this date only
    day_grouped = (
        day_frames
        .groupby(['Genotype', 'Fly'], observed=True)['MaxDist_cm']
        .mean()
        .reset_index()
    )

    # Re-attach each genotype's colour, which the aggregation dropped
    day_merged = pd.merge(
        day_grouped,
        day_frames[['Genotype', 'color']].drop_duplicates(),
        on='Genotype',
        how='left',
    )

    p = iqplot.stripbox(
        data=day_merged,
        q="MaxDist_cm",
        cats="Genotype",
        spread='jitter',
        color_column='color',
    )
    p.x_range = dist_range
    p.xaxis.axis_label = 'Distance travelled (cm)'

    # Each figure is wrapped in its own list; the layout cell consumes that shape
    plots.append([p])


In [None]:
from bokeh.layouts import layout

N_COLUMNS = 2  # panels per row

# Arrange however many per-date panels exist into rows of N_COLUMNS, instead of
# indexing exactly four of them (which fails for any other number of dates).
# np.arange is used rather than range() so this cell keeps working even if the name
# `range` has been rebound in the kernel.
grid_rows = [plots[i:i + N_COLUMNS] for i in np.arange(0, len(plots), N_COLUMNS)]
grid = layout(grid_rows)

# Show the grid
bokeh.io.show(grid)


In [None]:
# Write the per-date grid of distance figures as an HTML file under DataPath
datewise_html = DataPath / "Plots/230913_Datewise_TNT_Distance.html"
bokeh.plotting.save(grid, datewise_html)

## Velocity computation

In [None]:
# Fly speed along y: displacement between two consecutive frames, multiplied by 30
# (the time column is built as frame / 30) to express it per second.
# The difference is taken within each fly, so the jump from the last frame of one
# recording to the first frame of the next is not counted as movement; the first frame
# of every fly is therefore NaN.
# NOTE(review): 'yfly_SG' is not created by any active code in the visible cells
# (only by commented-out lines); it must exist in Dataset before this cell can run.
Dataset['Velocity'] = Dataset.groupby('Fly', sort=False)['yfly_SG'].diff().abs() * 30

# Unit conversion: divided by 17, then by 10
# (presumably pixels -> mm at 17 px/mm, then mm -> cm — TODO confirm the calibration)
Dataset['Velocity_cm'] = (Dataset['Velocity'] / 17) / 10

# Plot the velocity of the first fly
hv.Curve(Dataset[Dataset['Fly'] == 'Fly1'], 'time', 'Velocity_cm')

In [None]:
# Smooth the velocity with the Savitzky-Golay helper and store it in a new column.
# The filter is applied to each fly separately: filtering the concatenated column
# would blend the end of one recording into the start of the next.
# NOTE(review): 221 and 1 are presumably the window length and polynomial order
# expected by savgol_lowpass_filter; also check how it treats NaN, since the velocity
# column can start with a NaN — confirm against the helper's definition.
Dataset['Velocity_SG'] = (
    Dataset.groupby('Fly', sort=False)['Velocity_cm']
    .transform(lambda v: savgol_lowpass_filter(v, 221, 1))
)

In [None]:
# Smoothed velocity over time for the fly labelled 'Fly1'
is_first_fly = Dataset['Fly'] == 'Fly1'
hv.Curve(Dataset[is_first_fly], 'time', 'Velocity_SG')

In [None]:
# Genotypes to compare in the figure
selected_genotypes = ["PR", "TNTxE-PG", "TNTxTH"]

# Mean smoothed velocity at every timepoint, per genotype
Grouped_velocity = (
    Dataset
    .groupby(['Genotype', 'time'])['Velocity_SG']
    .mean()
    .reset_index()
)

# Keep only the rows of the selected genotypes
keep_rows = Grouped_velocity['Genotype'].isin(selected_genotypes)
subset_df = Grouped_velocity[keep_rows]

# Mean velocity across time, one line per genotype
plot = subset_df.hvplot.line(
    x='time',
    y='Velocity_SG',
    by='Genotype',
    width=600,
    height=400,
)
plot.opts(xlabel="time (s)", ylabel="Average Velocity (cm/s)")




In [None]:
# Display the grouped velocity figure (the cell used to contain the truncated name
# `plo`, which raises a NameError)
plot

In [None]:
# Write the grouped velocity figure as an HTML file under DataPath
velocity_html = DataPath / "Plots/230913_Grouped_TNT_Velocity.html"
hv.save(plot, velocity_html)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Create a figure and axes
fig, ax = plt.subplots()

# Get the unique genotypes
genotypes = Dataset['Genotype'].unique()

# For each genotype, draw a strip plot of the smoothed relative ball position over time
# on the shared axes.
# NOTE(review): every call plots 'yball_relative_SG' on the y axis, yet the ticks set
# below label the y axis with genotype names and the separators are drawn at i + 0.5.
# The intended figure probably has one row per genotype — confirm and adjust x/y/hue.
# NOTE(review): this draws one marker per frame for every fly, which is likely to be
# very slow on the full dataset.
for i, genotype in enumerate(genotypes):
    genotype_frames = Dataset[Dataset['Genotype'] == genotype]
    sns.stripplot(
        x=genotype_frames['time'],
        y=genotype_frames['yball_relative_SG'],
        hue=genotype_frames['yball_relative_SG'],
        ax=ax,
        dodge=True,
        jitter=True,
        palette='viridis',
    )

    # Add a separator line between consecutive genotypes for clarity
    if i < len(genotypes) - 1:
        ax.axhline(i + 0.5, color='grey', linestyle='--')

# Set the yticks to be the genotypes.
# np.arange is used rather than range() so this cell keeps working even if the name
# `range` has been rebound in the kernel.
ax.set_yticks(np.arange(len(genotypes)))
ax.set_yticklabels(genotypes)

# Show the plot
plt.show()
