In [5]:
import pandas as pd
import json

# Load train_split.csv (per-frame table; later cells read its `video_id`,
# `frame_number` and `phase` columns).
csv_file = "../../data/splits/train_split.csv"
df = pd.read_csv(csv_file)

# Load train_phases.json (phase -> list of "<video_id>/frames <start> - <end>"
# entries, as parsed by the alignment-check cell).
# JSON is decoded explicitly as UTF-8 instead of relying on the platform's
# locale encoding, which differs between machines (e.g. cp1252 on Windows).
json_file = "../../data/splits/train_phases.json"
with open(json_file, "r", encoding="utf-8") as f:
    phase_data = json.load(f)


In [None]:
# Convert train_split.csv into a dictionary for fast lookup:
#   "<video_id>/frames <frame_number>" -> phase
# The three columns are zipped directly instead of walking the frame with
# DataFrame.iterrows(): iterrows() packs every row into a single Series, which
# does not preserve per-column dtypes (an integer frame number can come back
# as 5.0 and silently change the key), and it is much slower on large frames.
# NOTE: if the CSV repeats a (video_id, frame_number) pair, the last row wins.
csv_dict = {
    f"{video_id}/frames {frame_number}": frame_phase
    for video_id, frame_number, frame_phase in zip(
        df["video_id"], df["frame_number"], df["phase"]
    )
}


In [None]:
# Accumulators for the alignment check:
#   missing_frames          - lookup keys listed in train_phases.json that are
#                             absent from train_split.csv
#   wrong_phase_assignments - (key, phase in CSV, phase in JSON) tuples
missing_frames, wrong_phase_assignments = [], []

In [None]:
# Iterate through train_phases.json and check alignment with train_split.csv.
# Every entry is parsed as "<video_id>/frames <start> - <end>".

# Start from empty result lists so that re-running this cell does not append
# a second copy of every finding.
missing_frames = []
wrong_phase_assignments = []

FRAME_STEP = 5  # step between the frame numbers that are checked

for phase, video_ranges in phase_data.items():
    for range_entry in video_ranges:
        video_id, frame_range = range_entry.split("/frames ")
        start_frame, end_frame = map(int, frame_range.split(" - "))

        # Walk the inclusive range [start_frame, end_frame]. The stop value is
        # end_frame + 1 rather than end_frame + FRAME_STEP: when the span is
        # not a multiple of the step, the larger stop would emit a frame past
        # end_frame and wrongly report it as missing or mislabelled.
        for frame_number in range(start_frame, end_frame + 1, FRAME_STEP):
            key = f"{video_id}/frames {frame_number}"

            if key not in csv_dict:
                missing_frames.append(key)
                continue

            csv_phase = csv_dict[key]
            # JSON object keys are always strings, whereas pandas may have
            # parsed the CSV phase column as a number; compare textual forms
            # so that 3 (CSV) and "3" (JSON) count as the same phase.
            if str(csv_phase) != phase:
                wrong_phase_assignments.append((key, csv_phase, phase))

In [None]:
# Report the outcome of the alignment check; at most 10 examples are shown
# per category to keep the output short.
if missing_frames or wrong_phase_assignments:
    if missing_frames:
        missing_count = len(missing_frames)
        print(f"{missing_count} frames in train_phases.json are missing from train_split.csv:")
        print(missing_frames[:10])

    if wrong_phase_assignments:
        mismatch_count = len(wrong_phase_assignments)
        print(f"⚠️ {mismatch_count} frames have incorrect phase assignments:")
        # Each record is (key, phase found in the CSV, phase expected from the JSON).
        for key, csv_phase, expected_phase in wrong_phase_assignments[:10]:
            print(f"Expected {expected_phase}, but train_split.csv has {csv_phase} for {key}")
else:
    print("All frames and phases in train_phases.json align correctly with train_split.csv!")
