In [1]:
import pandas as pd
import seaborn as sns

from pathlib import Path
import sys

# Make the sibling ``src`` directory (next to the notebook's working directory)
# importable from this notebook.
src_path = Path.cwd().parent / "src"
# sys.path holds plain strings, so the membership test must use the string form.
# Comparing the Path object itself never matches, which appended a duplicate
# entry every time this cell was re-run.
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

# Loading a dataset

In [2]:
# Location of the pooled summary table (feather file) for the TNT screen.
summary_path = "/mnt/upramdya_data/MD/Ballpushing_TNTScreen/Datasets/250414_summary_TNT_screen_Data/summary/pooled_summary.feather"

# Read the table into a DataFrame.
summary_data = pd.read_feather(summary_path)

# Preview the first rows to check that the table loaded as expected.
summary_data.head()

Unnamed: 0,index,nb_events,max_event,max_event_time,max_distance,final_event,final_event_time,nb_significant_events,significant_ratio,first_significant_event,...,Brain region,Simplified Nickname,Split,Date,Genotype,Period,FeedingState,Orientation,Light,Crossing
0,fly_0_ball_0,2.81384,9.0,715.068966,209.832451,9.0,715.068966,3.0,0.3,1.0,...,MB,KC-α/βc-GaL4,y,231121,TNTxG78,PM14,starved_noWater,std,on,1
1,fly_0_ball_0,1.927508,,,214.106526,5.0,802.517241,2.0,0.333333,2.0,...,MB,KC-α/βc-GaL4,y,231121,TNTxG78,PM14,starved_noWater,std,on,1
2,fly_0_ball_0,1.689074,5.0,795.137931,212.178843,5.0,795.137931,5.0,0.833333,0.0,...,MB,KC-α/βc-GaL4,y,231121,TNTxG78,PM14,starved_noWater,std,on,1
3,fly_0_ball_0,1.399817,4.0,190.310345,234.114862,4.0,190.310345,5.0,1.0,0.0,...,MB,KC-α/βc-GaL4,y,231121,TNTxG78,PM14,starved_noWater,std,on,1
4,fly_0_ball_0,0.84707,2.0,80.965517,230.840794,2.0,80.965517,2.0,0.666667,0.0,...,MB,KC-α/βc-GaL4,y,231121,TNTxG78,PM14,starved_noWater,std,on,1


In [3]:
# Count the NaN entries of every column, then keep only the columns that
# actually contain at least one missing value.
null_counts = summary_data.isna().sum()
missing_values = null_counts.loc[null_counts > 0]

print("Missing values in summary data:")
print(missing_values)

Missing values in summary data:
max_event                         1326
max_event_time                    1326
first_significant_event             28
first_significant_event_time        28
major_event                         66
major_event_time                    66
insight_effect                     483
insight_effect_log                 483
pulling_ratio                       28
exit_time                         3201
avg_displacement_after_success     676
avg_displacement_after_failure    1451
influence_ratio                   1619
dtype: int64


Check some overarching statistics about the data

In [None]:
# Handle the missing values of the summary table.
#
# All fills go through a single DataFrame-level ``fillna`` whose result is
# assigned back to ``summary_data``. Column-wise
# ``summary_data[col].fillna(..., inplace=True)`` is a chained assignment:
# pandas 2.x warns about it and under Copy-on-Write it leaves the frame
# untouched. With a dict, ``fillna`` only touches the listed columns and
# silently skips keys that are not columns of the frame.
fill_values = {
    # Event indices: a missing event is set to -1.
    "max_event": -1,
    "first_significant_event": -1,
    "major_event": -1,
    "final_event": -1,
    # Event times: a missing time is set to 3600
    # (presumably the recording length in seconds -- TODO confirm).
    "max_event_time": 3600,
    "first_significant_event_time": 3600,
    "major_event_time": 3600,
    "final_event_time": 3600,
    # Ratios and displacements: a missing value is set to 0.
    "pulling_ratio": 0,
    "avg_displacement_after_success": 0,
    # This column has missing values in the report above but was never filled.
    "avg_displacement_after_failure": 0,
    # Kept from the original cell; skipped by fillna if the column is absent.
    "avg_displacement_before_success": 0,
    "influence_ratio": 0,
}

# Columns removed from the table instead of being filled.
dropped_columns = ["insight_effect", "insight_effect_log", "exit_time"]

# errors="ignore" keeps the cell re-runnable once the columns are already gone.
summary_data = summary_data.fillna(value=fill_values).drop(
    columns=dropped_columns, errors="ignore"
)

In [None]:
# Number of distinct fly IDs across the whole summary table
# (dropna=False so that a missing ID, if present, is counted as one value).

summary_data["fly"].nunique(dropna=False)

# Event metrics

In [2]:
# Location of the pooled event-metrics table (feather file) for the TNT screen.
events_path = "/mnt/upramdya_data/MD/Ballpushing_TNTScreen/Datasets/250414_summary_TNT_screen_Data/event_metrics/pooled_event_metrics.feather"

# Read the table into a DataFrame.
events_data = pd.read_feather(events_path)

# Preview the first rows to check that the table loaded as expected.
events_data.head()

Unnamed: 0,index,start_time,end_time,duration,displacement,start_distance,end_distance,direction,significant,major_event,...,Brain region,Simplified Nickname,Split,Date,Genotype,Period,FeedingState,Orientation,Light,Crossing
0,0,563.0,571.793103,8.793103,32.136499,2.261024,34.284172,1,1,1,...,LH,LH1139,y,240220,TNTxZ1866,PM15,starved_noWater,std,on,1
1,1,678.448276,692.655172,14.206897,160.88383,48.156891,209.04072,1,1,1,...,LH,LH1139,y,240220,TNTxZ1866,PM15,starved_noWater,std,on,1
2,2,1072.241379,1081.034483,8.793103,0.052046,196.999768,197.051799,0,0,0,...,LH,LH1139,y,240220,TNTxZ1866,PM15,starved_noWater,std,on,1
3,3,1246.137931,1260.344828,14.206897,6.574149,211.884069,205.468388,-1,1,0,...,LH,LH1139,y,240220,TNTxZ1866,PM15,starved_noWater,std,on,1
4,0,2265.137931,2276.0,10.862069,10.391194,1.986432,8.412389,-1,1,0,...,LH,LH1139,y,240220,TNTxZ1866,PM15,starved_noWater,std,on,1


In [3]:
# Collect the column labels of the event table into a plain Python list
# and print them.
event_columns = list(events_data.columns)

print("Event columns:")
print(event_columns)

Event columns:
['index', 'start_time', 'end_time', 'duration', 'displacement', 'start_distance', 'end_distance', 'direction', 'significant', 'major_event', 'max_event', 'final_event', 'ball_velocity', 'efficiency_diff', 'event_type', 'fly_idx', 'ball_idx', 'event_id', 'fly', 'flypath', 'experiment', 'Nickname', 'Brain region', 'Simplified Nickname', 'Split', 'Date', 'Genotype', 'Period', 'FeedingState', 'Orientation', 'Light', 'Crossing']
