## Gaze Analysis from FacesDir JSON files

In [1]:
import os
import re
import tarfile
import shutil
import json
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
# --- Config ---
FPS = int(1800/120) #frame number in chunk / chunk duration
base_path = Path("/home/liubov/Desktop/BNF/work")
directory = '8-5-2024_#18_INDIVIDUAL_[12]'
json_folder = 'FacesDir'
directory_path = base_path / directory / json_folder
directory_path.mkdir(parents=True, exist_ok=True)

mapping_csv = base_path / directory / "first_id_mapping_camera_a.csv"

# --- Read mapping ---
mapping_df = pd.read_csv(mapping_csv)
mapping_df.drop(columns='Unnamed: 0', inplace=True)


In [4]:
# --- Config ---
FPS = int(1800/120)  # frame number in chunk / chunk duration
base_path = Path("/home/liubov/Desktop/BNF/work")
directory = '8-5-2024_#18_INDIVIDUAL_[12]'
json_folder = 'FacesDir'
directory_path = base_path / directory / json_folder
directory_path.mkdir(parents=True, exist_ok=True)
mapping_csv = base_path / directory / "first_id_mapping_camera_a.csv"

# --- Read mapping ---
print(f"Reading camera mapping from: {mapping_csv}")
mapping_df = pd.read_csv(mapping_csv)
if 'Unnamed: 0' in mapping_df.columns:
    mapping_df.drop(columns='Unnamed: 0', inplace=True)

print(f"Camera mapping loaded with {len(mapping_df)} entries")
print(f"Mapping columns: {list(mapping_df.columns)}")
print(f"Processing directory: {directory_path}")

# Create ID to camera mapping dictionary
# Assuming the CSV has columns like 'camera_id', 'participant_id', or similar
# Adjust these column names based on your actual CSV structure
try:
    if 'camera_id' in mapping_df.columns and 'participant_id' in mapping_df.columns:
        id_to_camera = dict(zip(mapping_df['participant_id'], mapping_df['camera_id']))
    elif 'id' in mapping_df.columns and 'camera' in mapping_df.columns:
        id_to_camera = dict(zip(mapping_df['id'], mapping_df['camera']))
    else:
        # Fallback: use first two columns
        cols = mapping_df.columns[:2]
        id_to_camera = dict(zip(mapping_df[cols[0]], mapping_df[cols[1]]))
        print(f"Using columns {cols[0]} -> {cols[1]} for ID to camera mapping")
    
    print(f"ID to camera mapping: {id_to_camera}")
except Exception as e:
    print(f"Error creating mapping: {e}")
    id_to_camera = {}

# ---------------- 1. Extract tar.gz and flatten Face JSONs ----------------
face_files_with_id = []  # List of tuples: (file_path, archive_id)
chunk_dirs = [d for d in directory_path.iterdir() if d.is_dir()]

print(f"Found {len(chunk_dirs)} chunk directories")

for chunk_dir in chunk_dirs:
    tar_files = list(chunk_dir.glob("*.tar.gz"))
    if not tar_files:
        continue
    
    print(f"Processing chunk directory: {chunk_dir.name}")
    
    for tar_file in tar_files:
        # Extract ID from archive name (e.g., "1.tar.gz" -> ID 1)
        archive_name = tar_file.stem  # removes .tar.gz
        archive_name = archive_name.strip()  # Remove any whitespace
        
        print(f"Debug: Processing archive '{archive_name}' from file '{tar_file.name}'")
        
        try:
            archive_id = int(archive_name)
        except ValueError:
            # Try to extract number from more complex names
            number_match = re.search(r'(\d+)', archive_name)
            if number_match:
                archive_id = int(number_match.group(1))
                print(f"Debug: Extracted ID {archive_id} from '{archive_name}'")
            else:
                print(f"Warning: Could not extract ID from archive name '{tar_file.name}', skipping...")
                continue
            
        print(f"Extracting face data from {tar_file} (ID: {archive_id})...")
        extract_dir = chunk_dir / f"{tar_file.stem}_extracted"
        extract_dir.mkdir(exist_ok=True)
        
        try:
            with tarfile.open(tar_file, "r:gz") as tar:
                for member in tar.getmembers():
                    member.path = Path(member.name).name  # flatten
                    tar.extract(member, path=extract_dir)
        except Exception as e:
            print(f"Error extracting {tar_file}: {e}")
            continue
            
        # Look for JSON files (face detection results)
        json_count = 0
        for jf in extract_dir.rglob("*.json"):
            final_path = directory_path / jf.name
            if final_path.exists():
                final_path.unlink()
            shutil.move(str(jf), final_path)
            face_files_with_id.append((final_path, archive_id))
            json_count += 1
            
        print(f"  Extracted {json_count} JSON files for ID {archive_id}")
            
        # Cleanup extracted folder
        for f in extract_dir.rglob("*"):
            if f.is_file():
                f.unlink()
        if extract_dir.exists():
            extract_dir.rmdir()

print(f"Total face JSON files collected: {len(face_files_with_id)}")

# ---------------- 2. Sort face JSON files by frame number within each ID ----------------
def get_frame_number_from_face_file(file_path):
    """Extract frame number from face detection JSON filename"""
    name = file_path.name.strip()
    # Look for patterns like frame_000001.json, 000001.json, etc.
    frame_patterns = [
        r'frame_(\d+)\.json$',
        r'(\d+)\.json$',
        r'face_(\d+)\.json$',
        r'faces_(\d+)\.json$'
    ]
    
    for pattern in frame_patterns:
        m = re.search(pattern, name)
        if m:
            return int(m.group(1))
    
    return float('inf')

def frame_to_timestamp(frame_number, fps=FPS):
    """Convert frame number to timestamp in seconds"""
    return frame_number / fps if fps > 0 else 0

# Group files by archive ID
faces_by_id = {}
for file_path, archive_id in face_files_with_id:
    if archive_id not in faces_by_id:
        faces_by_id[archive_id] = []
    faces_by_id[archive_id].append(file_path)

# Sort files by frame number within each ID group
for id_key in faces_by_id:
    faces_by_id[id_key].sort(key=get_frame_number_from_face_file)
    print(f"ID {id_key}: {len(faces_by_id[id_key])} face detection files")

# ---------------- 3. Merge Face JSONs by archive ID with enhanced metadata ----------------
def parse_face_filename_info(file_path):
    """Extract frame number from face detection filename"""
    frame_value = get_frame_number_from_face_file(file_path)
    return frame_value if frame_value != float('inf') else None

merged_faces_folder = directory_path / "merged_faces"
merged_faces_folder.mkdir(exist_ok=True)

# Create merged file for each archive ID
for archive_id, file_list in faces_by_id.items():
    merged_face_data = []
    camera_id = id_to_camera.get(archive_id, f"unknown_camera_{archive_id}")
    
    print(f"Processing ID {archive_id} (Camera: {camera_id}) - {len(file_list)} files...")
    
    for file_path in file_list:
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
                frame_value = parse_face_filename_info(file_path)
                timestamp = frame_to_timestamp(frame_value, FPS)
                
                # Enhanced metadata for face data
                metadata = {
                    'participant_id': archive_id,
                    'camera_id': camera_id,
                    'frame_number': frame_value,
                    'timestamp_seconds': timestamp,
                    'fps': FPS,
                    'source_file': file_path.name
                }
                
                # Inject metadata into face data
                if isinstance(data, dict):
                    data.update(metadata)
                    # If there are detected faces, add metadata to each face
                    if 'faces' in data and isinstance(data['faces'], list):
                        for i, face in enumerate(data['faces']):
                            if isinstance(face, dict):
                                face.update(metadata)
                                face['face_id'] = f"{archive_id}_{frame_value}_{i}"
                    elif 'detections' in data and isinstance(data['detections'], list):
                        for i, detection in enumerate(data['detections']):
                            if isinstance(detection, dict):
                                detection.update(metadata)
                                detection['face_id'] = f"{archive_id}_{frame_value}_{i}"
                                
                elif isinstance(data, list):
                    # If data is a list of face detections
                    for i, entry in enumerate(data):
                        if isinstance(entry, dict):
                            entry.update(metadata)
                            entry['face_id'] = f"{archive_id}_{frame_value}_{i}"
                
                merged_face_data.append(data)
                
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            print(f"Error reading {file_path}: {e}")
            continue
        except Exception as e:
            print(f"Unexpected error processing {file_path}: {e}")
            continue
    
    # Create output file for this archive ID
    output_file = merged_faces_folder / f"faces_id_{archive_id}_camera_{camera_id}_merged.json"
    
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(merged_face_data, f, indent=4, ensure_ascii=False)
    
    print(f"✓ Merged {len(merged_face_data)} face JSON files for ID {archive_id} -> {output_file}")

print(f"\nCreated {len(faces_by_id)} merged face files, one for each participant ID")

# ---------------- 4. Create enhanced summary statistics ----------------
summary_file = merged_faces_folder / "face_detection_summary.json"
detailed_summary_file = merged_faces_folder / "face_detection_detailed_summary.csv"

summary_stats = {}
detailed_records = []

for archive_id, file_list in faces_by_id.items():
    camera_id = id_to_camera.get(archive_id, f"unknown_camera_{archive_id}")
    total_frames = len(file_list)
    frames_with_faces = 0
    total_faces_detected = 0
    max_faces_in_frame = 0
    
    # Count faces across all frames for this ID
    for file_path in file_list:
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
                faces_in_frame = 0
                
                if isinstance(data, dict):
                    if 'faces' in data and isinstance(data['faces'], list):
                        faces_in_frame = len(data['faces'])
                    elif 'detections' in data and isinstance(data['detections'], list):
                        faces_in_frame = len(data['detections'])
                elif isinstance(data, list):
                    faces_in_frame = len(data)
                
                if faces_in_frame > 0:
                    frames_with_faces += 1
                    total_faces_detected += faces_in_frame
                    max_faces_in_frame = max(max_faces_in_frame, faces_in_frame)
                    
                # Add to detailed records
                frame_number = get_frame_number_from_face_file(file_path)
                detailed_records.append({
                    'participant_id': archive_id,
                    'camera_id': camera_id,
                    'frame_number': frame_number,
                    'timestamp_seconds': frame_to_timestamp(frame_number, FPS),
                    'faces_detected': faces_in_frame,
                    'source_file': file_path.name
                })
                
        except Exception as e:
            print(f"Error processing {file_path} for summary: {e}")
            continue
    
    avg_faces_per_frame = total_faces_detected / frames_with_faces if frames_with_faces > 0 else 0
    
    summary_stats[f"id_{archive_id}"] = {
        "participant_id": archive_id,
        "camera_id": camera_id,
        "total_frames": total_frames,
        "frames_with_faces": frames_with_faces,
        "total_faces_detected": total_faces_detected,
        "max_faces_in_frame": max_faces_in_frame,
        "avg_faces_per_frame": round(avg_faces_per_frame, 2),
        "face_detection_rate": round(frames_with_faces / total_frames, 4) if total_frames > 0 else 0,
        "fps_config": FPS
    }

# Save summary statistics
with open(summary_file, "w", encoding="utf-8") as f:
    json.dump(summary_stats, f, indent=4, ensure_ascii=False)

# Save detailed CSV
if detailed_records:
    detailed_df = pd.DataFrame(detailed_records)
    detailed_df.to_csv(detailed_summary_file, index=False)
    print(f"Detailed summary saved to {detailed_summary_file}")

print(f"Face detection summary saved to {summary_file}")

# ---------------- 5. Cleanup intermediate JSONs ----------------
all_face_files = []
for file_list in faces_by_id.values():
    all_face_files.extend(file_list)

for file_path in all_face_files:
    if file_path.exists():
        file_path.unlink()

print(f"\nAll {len(all_face_files)} intermediate face JSON files removed.")
print("Only final merged files remain.")

# ---------------- 6. Final Report ----------------
print("\n" + "="*60)
print("FACE PROCESSING COMPLETE")
print("="*60)
print(f"Configuration:")
print(f"  - FPS: {FPS}")
print(f"  - Directory: {directory}")
print(f"  - Base path: {base_path}")
print(f"  - JSON folder: {json_folder}")
print(f"\nResults:")
for archive_id in sorted(faces_by_id.keys()):
    stats = summary_stats[f"id_{archive_id}"]
    camera = stats['camera_id']
    print(f"  ID {archive_id} (Camera {camera}): {stats['total_frames']} frames, "
          f"{stats['frames_with_faces']} with faces ({stats['face_detection_rate']:.1%}), "
          f"avg {stats['avg_faces_per_frame']:.1f} faces/frame")

print(f"\nOutput files created in: {merged_faces_folder}")
print(f"  - {len(faces_by_id)} merged JSON files")
print(f"  - 1 summary statistics file")
print(f"  - 1 detailed CSV file")

Reading camera mapping from: /home/liubov/Desktop/BNF/work/8-5-2024_#18_INDIVIDUAL_[12]/first_id_mapping_camera_a.csv
Camera mapping loaded with 18 entries
Mapping columns: ['frame_number', 'original_id', 'updated_id']
Processing directory: /home/liubov/Desktop/BNF/work/8-5-2024_#18_INDIVIDUAL_[12]/FacesDir
Using columns frame_number -> original_id for ID to camera mapping
ID to camera mapping: {1800: 3, 3600: 3, 5400: 3, 7200: 3, 9000: 3, 10800: 3}
Found 8 chunk directories
Processing chunk directory: chunk_01_00120-00240s
Debug: Processing archive '2.tar' from file '2.tar.gz'
Debug: Extracted ID 2 from '2.tar'
Extracting face data from /home/liubov/Desktop/BNF/work/8-5-2024_#18_INDIVIDUAL_[12]/FacesDir/chunk_01_00120-00240s/2.tar.gz (ID: 2)...
  Extracted 1801 JSON files for ID 2
Debug: Processing archive '3.tar' from file '3.tar.gz'
Debug: Extracted ID 3 from '3.tar'
Extracting face data from /home/liubov/Desktop/BNF/work/8-5-2024_#18_INDIVIDUAL_[12]/FacesDir/chunk_01_00120-00240s/3

In [6]:
filepath = '/home/liubov/Desktop/BNF/work/8-5-2024_#18_INDIVIDUAL_[12]/FacesDir/merged_faces/faces_id_1_camera_unknown_camera_1_merged.json'

with open(filepath, 'r') as f:
    data = json.load(f)  

df = pd.DataFrame(data)

     gaze_right_3d     gaze_left_3d gaze_averaged_angle_3d  \
0  [0.0, 0.0, 0.0]  [0.0, 0.0, 0.0]             [0.0, 0.0]   
1  [0.0, 0.0, 0.0]  [0.0, 0.0, 0.0]             [0.0, 0.0]   
2  [0.0, 0.0, 0.0]  [0.0, 0.0, 0.0]             [0.0, 0.0]   
3  [0.0, 0.0, 0.0]  [0.0, 0.0, 0.0]             [0.0, 0.0]   
4  [0.0, 0.0, 0.0]  [0.0, 0.0, 0.0]             [0.0, 0.0]   

                              eye_right_keypoints_2d  \
0  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
1  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
2  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
3  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
4  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   

                               eye_left_keypoints_2d  \
0  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
1  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
2  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
3  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
4  [0.0, 0

In [9]:
df.drop(["camera_id", "fps", "source_file"], axis=1,inplace=True)

In [13]:
df.columns

Index(['gaze_right_3d', 'gaze_left_3d', 'gaze_averaged_angle_3d',
       'eye_right_keypoints_2d', 'eye_left_keypoints_2d',
       'eye_right_keypoints_3d', 'eye_left_keypoints_3d', 'R', 't',
       'face_keypoints_2d', 'face_keypoints_3d', 'pdm_rigid_parameters',
       'pdm_non_rigid_parameters', 'action_units_score', 'action_units_binary',
       'participant_id', 'frame_number', 'timestamp_seconds'],
      dtype='object')

In [41]:
# df.columns
# Index(['gaze_right_3d', 'gaze_left_3d', 'gaze_averaged_angle_3d',
#        'eye_right_keypoints_2d', 'eye_left_keypoints_2d',
#        'eye_right_keypoints_3d', 'eye_left_keypoints_3d', 'R', 't',
#        'face_keypoints_2d', 'face_keypoints_3d', 'pdm_rigid_parameters',
#        'pdm_non_rigid_parameters', 'action_units_score', 'action_units_binary',
#        'participant_id', 'frame_number', 'timestamp_seconds'],
#       dtype='object')
# len(df.eye_right_keypoints_2d.loc[0])=56
# len(df.eye_right_keypoints_3d.loc[0])=84
#df.R.value_counts() [-0.367, 0.451, 0.015]    11785 Name: count, dtype: int64 Looks like the same for all rows
# df.t.value_counts() t [-28757728.0, -16176221.0, 39317204.0]    11785 Name: count, dtype: int64
# df.face_keypoints_2d.value_counts() [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...]    11785
# df.face_keypoints_3d.value_counts() face_keypoints_3d  [-546.9, -307.6, 747.7, -545.6,  -493.0, ...]    11785  Name: count, dtype: int64
# df.pdm_rigid_parameters.value_counts() [1.0, 0.0, 0.0, 0.0, 0.0, 0.0]    11785 Name: count, dtype: int64
# pdm_non_rigid_parameters [0.0, 0.0, 0.0, 0.0, 0.0, 0.0,... 0.0, 0.0, 0.0, 0.0, 0.0]    11785
# action_units_score [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]    11785 Name: count, dtype: int64 
# action_units_binary [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]    11785 Name: count, dtype: int64