In [1]:
import os
import json

from concurrent.futures import ThreadPoolExecutor, as_completed

import uuid
import ollama
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from tqdm import tqdm
from PIL import Image
from torchvision import transforms as T
from transformers import CLIPProcessor, CLIPModel, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Locations of the validation split on the external volume, plus the size of
# the frame history window. The three paths share a common root, so they are
# derived from it instead of being spelled out three times.
_THESIS_DATA_ROOT = "/media/luchocode/Extra vol/thesis/data"
_VAL_LABELS_DIR = f"{_THESIS_DATA_ROOT}/new_labels/val"

PATH_TO_IMG_LABELS = f"{_VAL_LABELS_DIR}/labels.csv"          # labels CSV that gets read and rewritten
IMG_DIR = f"{_THESIS_DATA_ROOT}/selected_exoimages/val"       # root folder holding one sub-folder per class
PROGRESS_FILE = f"{_VAL_LABELS_DIR}/llava_progress.json"      # checkpoint written by store_progress()
FRAMES = 6                                                    # frames kept in the sliding history window

In [102]:
def get_image_path(row, root_dir):
    """Build the on-disk path of the JPEG that belongs to one labels row.

    Args:
        row: Mapping-like record exposing the keys 'video', 'frame' and 'class'.
        root_dir: Directory that contains one sub-folder per class.

    Returns:
        str: ``<root_dir>/<class>/['<video>'] frame <frame>.jpg``.
    """
    video_id, frame_no = row['video'], row['frame']
    jpeg_name = "['{}'] frame {}.jpg".format(video_id, frame_no)
    class_dir = row['class']
    return os.path.join(root_dir, class_dir, jpeg_name)

def transform_image(img_path):
    """Load an image from disk and reduce it to the tile used in the grid.

    The file is opened inside a ``with`` block so the underlying handle is
    released deterministically, even when decoding fails part-way.

    Args:
        img_path: Path of the image file to load.

    Returns:
        PIL.Image.Image: RGB image after ``T.Resize(256)`` followed by
        ``T.CenterCrop(224)``.

    Raises:
        Whatever Pillow raises when the file is missing or cannot be decoded;
        no error is swallowed here.
    """
    with Image.open(img_path) as raw_image:
        # convert() yields an independent image, so it stays usable after the
        # source file has been closed.
        rgb_image = raw_image.convert('RGB')

    vis_transform = T.Compose([
        T.Resize(256),
        T.CenterCrop(224)
    ])
    return vis_transform(rgb_image)


# Build a 3x2 grid image (row-major, oldest frame first) from 6 frame paths
def create_context_strip(image_paths):
    """Compose six frames into a single 3-column by 2-row grid image.

    Tiles are placed left-to-right, top-to-bottom in the order given, so the
    last path ends up bottom-right (the target frame).

    Args:
        image_paths: Sequence of exactly 6 image file paths.

    Returns:
        PIL.Image.Image: RGB canvas of 672x448 pixels (3*224 by 2*224).

    Raises:
        ValueError: If ``image_paths`` does not contain exactly 6 entries.
    """
    if len(image_paths) != 6:
        raise ValueError(f"Must provide exactly 6 image paths. Current len: {len(image_paths)}")

    tile_size = 224
    n_cols, n_rows = 3, 2

    images = [transform_image(p) for p in image_paths]
    # Force every tile to tile_size x tile_size so the paste offsets line up.
    resized_imgs = [img.resize((tile_size, tile_size)) for img in images]

    grid_img = Image.new('RGB', (tile_size * n_cols, tile_size * n_rows))

    for position, tile in enumerate(resized_imgs):
        grid_row, grid_col = divmod(position, n_cols)
        grid_img.paste(tile, (grid_col * tile_size, grid_row * tile_size))

    return grid_img


def get_new_label(img):
    """Ask the 'llava' Ollama model to classify the last frame of a history grid.

    Args:
        img: Image entry forwarded unchanged in the message's 'images' list
            (the caller in this file passes the path of a saved JPEG).

    Returns:
        The model reply parsed from JSON. The schema sent with the request
        asks for the keys 'label', 'reasoning' and 'confidence'.
    """
    # Allowed label codes, in the same order as the numbered list in the prompt.
    label_codes = [
        "LG-S", "LG-T-IS", "LG-T-DS", "LG-T-DW", "LG-T-O",
        "IS-S", "IS-T-LG", "IS-T-DW",
        "DS-S", "DS-T-LG", "DS-T-DW",
        "DW-S"
    ]

    # JSON schema passed as `format` so the reply is structured.
    exonet_schema = {
        "type": "object",
        "properties": {
            "label": {
                "type": "string",
                "description": "The ExoNet classification code",
                "enum": label_codes
            },
            "reasoning": {
                "type": "string",
                "description": "Brief explanation of why this label was chosen based on visual evidence"
            },
            "confidence": {
                "type": "number",
                "description": "Confidence score between 0.0 and 1.0"
            }
        },
        "required": ["label", "reasoning", "confidence"]
    }

    # Temperature 0 is used so that repeated runs give repeatable labels.
    generation_options = {
        "num_ctx": 100000,
        "temperature": 0
    }

    prompt = """
    You are an autonomous exoskeleton vision system.
    The image is a "History Grid" of 6 frames (Order: Top-Left to Bottom-Right).
    - Frames 1-5: Past history.
    - Frame 6 (Bottom-Right): The CURRENT VIEW.

    YOUR TASK: Classify the "Locomotion Mode" for Frame 6.

    ### THE 12 EXONET CLASSES:
    1. LG-S     (Level Ground - Steady)
    2. LG-T-IS  (Level Ground -> Transition to Incline Stairs)
    3. LG-T-DS  (Level Ground -> Transition to Decline Stairs)
    4. LG-T-DW  (Level Ground -> Transition to Door/Wall)
    5. LG-T-O   (Level Ground -> Transition to Obstacle)
    6. IS-S     (Incline Stairs - Steady)
    7. IS-T-LG  (Incline Stairs -> Transition to Level Ground)
    8. IS-T-DW  (Incline Stairs -> Transition to Door/Wall)
    9. DS-S     (Decline Stairs - Steady)
    10. DS-T-LG (Decline Stairs -> Transition to Level Ground)
    11. DS-T-DW (Decline Stairs -> Transition to Door/Wall)
    12. DW-S    (Door/Wall - Steady)

    ### DECISION LOGIC:
    - Focus on Frame 6.
    - If the terrain is consistent for 2 meters -> STEADY (-S).
    - If a NEW terrain (Stairs/Door) is within 2 steps -> TRANSITION (-T-).

    Return ONLY the exact Label Code.
    """

    user_message = {
        'role': 'user',
        'content': prompt,
        'images': [img]
    }
    response = ollama.chat(
        model='llava',
        messages=[user_message],
        options=generation_options,
        format=exonet_schema,
    )
    raw_reply = response['message']['content']
    return json.loads(raw_reply.strip())


def store_progress(frame, video, idx):
    """Overwrite PROGRESS_FILE with the position of the latest processed frame.

    Args:
        frame: Frame number; coerced with ``int()`` before being serialised.
        video: Video identifier, stored as given.
        idx: Row index, stored as given.
    """
    checkpoint = dict(frame=int(frame), video=video, idx=idx)
    with open(PROGRESS_FILE, 'w') as progress_fh:
        json.dump(checkpoint, progress_fh)


def process_frames(path_to_images, save_interval=1):
    """Sequentially relabel every frame listed in a CSV, one video at a time.

    For each video a sliding window of the last FRAMES image paths is kept
    (at the first frame of a video the window is filled with that frame),
    rendered to "grid_image.jpg" in the working directory and passed to
    get_new_label(). The returned label is written to the "new_class" column
    and the CSV at ``path_to_images`` is rewritten in place.

    Args:
        path_to_images: Path of the labels CSV. It must provide the columns
            'video', 'frame' and 'class'.
        save_interval: Rewrite the CSV and the progress file after this many
            newly labelled frames. The default of 1 saves after every frame,
            which rewrites the whole CSV each time; raise it on large
            datasets to cut the I/O cost.

    Raises:
        ValueError: If ``save_interval`` is smaller than 1.
    """
    if save_interval < 1:
        raise ValueError(f"save_interval must be >= 1, got {save_interval}")

    df = pd.read_csv(path_to_images)

    # Labels are strings. A "new_class" column that is missing, or that was
    # read back entirely empty (float64 NaN), must be object dtype before
    # strings are assigned into it.
    if "new_class" in df.columns:
        df["new_class"] = df["new_class"].astype("object")
    else:
        df["new_class"] = pd.Series(index=df.index, dtype="object")

    def _checkpoint(frame, video, index):
        # Persist labels first, then record how far we got.
        df.to_csv(path_to_images, index=False)
        store_progress(frame=frame, video=video, idx=index)

    videos = df["video"].unique()
    unsaved = 0          # frames labelled since the last checkpoint
    last_done = None     # (frame, video, index) of the most recent labelled row

    for idx_video, video in enumerate(videos, start=1):
        print(f"\nVideo ({idx_video}/{len(videos)}): '{video}'")

        # reset_index() keeps each row's original label in an "index" column,
        # which is what lets results be written back into `df` below.
        df_video: pd.DataFrame = df[df['video'] == video].reset_index()
        n_frames = len(df_video)

        print(f"Frames to process: {n_frames}")

        img_buffer = []
        for idx in range(n_frames):
            row = df_video.iloc[idx]
            image_path = get_image_path(row, IMG_DIR)

            if not img_buffer:
                # Start of a video: no history yet, so repeat the first frame.
                img_buffer = [image_path] * FRAMES
            else:
                img_buffer.pop(0)
                img_buffer.append(image_path)

            grid_image = create_context_strip(img_buffer)
            grid_image.save("grid_image.jpg", format='JPEG', quality=95)
            new_label_assignation = get_new_label("grid_image.jpg")

            index = int(row["index"])
            df.at[index, "new_class"] = new_label_assignation["label"]

            last_done = (row["frame"], video, index)
            unsaved += 1
            if unsaved >= save_interval:
                _checkpoint(*last_done)
                unsaved = 0

            if idx % 100 == 0:
                # idx counts frames within the current video, so report it
                # against this video's length rather than the whole dataset.
                print(f"Frames processed: {idx}/{n_frames}")

    # Flush labels that have not reached a checkpoint yet.
    if unsaved:
        _checkpoint(*last_done)
    

In [None]:
# --- CONFIGURATION ---
# NOTE(review): same path as PATH_TO_IMG_LABELS; the run cell passes
# PATH_TO_IMG_LABELS to process_frames_parallel, so CSV_PATH looks unused - confirm.
CSV_PATH = "/media/luchocode/Extra vol/thesis/data/new_labels/val/labels.csv"
TEMP_DIR = "temp_grids"               # Directory for per-row grid images (relative to the working directory)
MAX_WORKERS = 8                       # Thread-pool size used by process_frames_parallel. NOTE(review): an earlier note suggested 4 for 32GB VRAM - confirm 8 is intended
SAVE_INTERVAL = 50                    # Save CSV every 50 processed rows

# Create the temp folder if it doesn't exist (runs whenever this cell is executed)
os.makedirs(TEMP_DIR, exist_ok=True)

# --- HELPER FUNCTIONS ---

def get_image_path(row, root_dir):
    """Return the file path of the frame image described by ``row``.

    Args:
        row: Mapping-like record with 'video', 'frame' and 'class' entries.
        root_dir: Folder that holds one sub-folder per class.

    Returns:
        str: ``root_dir`` joined with the class folder and the file name
        ``['<video>'] frame <frame>.jpg``.
    """
    # Adjust this logic to match your actual file structure
    video, frame = row['video'], row['frame']
    file_name = f"['{video}'] frame {frame}.jpg"
    class_folder = os.path.join(root_dir, row['class'])
    return os.path.join(class_folder, file_name)

def transform_image(img_path):
    """Load an image and reduce it to a 224x224 tile, never raising.

    The file is opened inside a ``with`` block so its handle is released
    deterministically, which matters when many worker threads load images
    at the same time.

    Args:
        img_path: Path of the image file to load.

    Returns:
        PIL.Image.Image: RGB image after ``T.Resize(256)`` and
        ``T.CenterCrop(224)``. If anything goes wrong, the error is printed
        and a black 224x224 image is returned instead.
    """
    try:
        with Image.open(img_path) as raw_image:
            # convert() yields an independent image, so it stays usable after
            # the source file has been closed.
            rgb_image = raw_image.convert('RGB')

        vis_transform = T.Compose([
            T.Resize(256),
            T.CenterCrop(224)
        ])
        return vis_transform(rgb_image)
    except Exception as e:
        print(f"Error loading image {img_path}: {e}")
        # Deliberate best-effort: return a black square on failure so one bad
        # file does not abort the run.
        return Image.new('RGB', (224, 224))

def create_context_strip(image_paths):
    """Arrange six frames on a 3-column by 2-row canvas.

    Frames are placed left-to-right, top-to-bottom in the order given, so the
    sixth path (the target frame) lands in the bottom-right cell.

    Args:
        image_paths: Sequence of exactly 6 image file paths.

    Returns:
        PIL.Image.Image: RGB canvas of 672x448 pixels.

    Raises:
        ValueError: If ``image_paths`` does not hold exactly 6 entries.
    """
    n_paths = len(image_paths)
    if n_paths != 6:
        raise ValueError(f"Must provide exactly 6 image paths. Current len: {n_paths}")

    tiles = [transform_image(path) for path in image_paths]

    tile_size = 224
    columns, rows = 3, 2
    canvas = Image.new('RGB', (tile_size * columns, tile_size * rows))

    for slot, tile in enumerate(tiles):
        # slot 0..2 fill the top row, slot 3..5 the bottom row.
        grid_row, grid_col = divmod(slot, columns)
        canvas.paste(tile, (grid_col * tile_size, grid_row * tile_size))

    return canvas

def get_context_window(df, current_idx):
    """
    Collect the 6 image paths (5 of history + the current frame) for one row.

    The function keeps no state between calls: the window is rebuilt from the
    dataframe each time. Rows are selected with label-based ``.loc`` slicing
    from ``current_idx - 5`` (clamped at 0) through ``current_idx``, so it
    presumably expects the default integer index - verify if the frame is
    ever re-indexed.

    Args:
        df: Labels dataframe with at least a 'video' column plus whatever
            get_image_path() reads.
        current_idx: Index label of the row to build the window for.

    Returns:
        list: 6 image paths, oldest first, current frame last. Near the start
        of a video the oldest available path is repeated at the front.
    """
    video_id = df.loc[current_idx, 'video']

    # Look back at most 5 rows without running past the top of the frame.
    first_label = max(0, current_idx - 5)
    lookback = df.loc[first_label:current_idx]

    # Drop rows that belong to a different video.
    same_video = lookback.loc[lookback['video'] == video_id]

    paths = [get_image_path(record, IMG_DIR) for _, record in same_video.iterrows()]

    # Left-pad with the oldest available frame until the window is full.
    missing = 6 - len(paths)
    if missing > 0:
        paths = [paths[0]] * missing + paths

    return paths

def get_new_label(img_path):
    """Ask the 'minicpm-v:latest' Ollama model to classify a history grid.

    This is a best-effort call: any failure (request error, malformed JSON
    reply, ...) is reported on stdout and turned into ``None`` so the caller
    can skip the row instead of crashing.

    Args:
        img_path: Path of the grid image, forwarded in the message's
            'images' list.

    Returns:
        The model reply parsed from JSON (the schema asks for 'label',
        'reasoning' and 'confidence'), or ``None`` if anything failed.
    """
    prompt = """
    You are the vision system for a robotic exoskeleton.
    
    ### INPUT DATA
    The image is a composite grid of 6 frames showing the user's movement history.
    - **Layout:** 2 Rows x 3 Columns.
    - **Sequence:** Frame 1 (Top-Left) is the oldest. Frame 6 (Bottom-Right) is the CURRENT VIEW.
    
    ### TASK
    Analyze **Frame 6 (Bottom-Right)** to classify the user's current locomotion mode.
    
    ### THE 12 EXONET CLASSES
    1. LG-S     (Level Ground - Steady)
    2. LG-T-IS  (Level Ground -> Transition to Incline Stairs)
    3. LG-T-DS  (Level Ground -> Transition to Decline Stairs)
    4. LG-T-DW  (Level Ground -> Transition to Door/Wall)
    5. LG-T-O   (Level Ground -> Transition to Obstacle)
    6. IS-S     (Incline Stairs - Steady)
    7. IS-T-LG  (Incline Stairs -> Transition to Level Ground)
    8. IS-T-DW  (Incline Stairs -> Transition to Door/Wall)
    9. DS-S     (Decline Stairs - Steady)
    10. DS-T-LG (Decline Stairs -> Transition to Level Ground)
    11. DS-T-DW (Decline Stairs -> Transition to Door/Wall)
    12. DW-S    (Door/Wall - Steady)

    ### GEOMETRIC DECISION LOGIC (The "2-Meter Rule")
    Look at the floor in Frame 6. Estimate the path for the next 2 meters (approx 2 human steps).
    - **STEADY (-S):** The terrain type (flat, stairs) remains exactly the same for the next 2 meters.
    - **TRANSITION (-T-):** A *new* terrain feature (e.g., the first step of a staircase, a door threshold) is visible and will be reached within 2 meters.
    """

    # JSON schema passed as `format` so the reply is structured.
    exonet_schema = {
        "type": "object",
        "properties": {
            "label": {
                "type": "string",
                "enum": [
                    "LG-S", "LG-T-IS", "LG-T-DS", "LG-T-DW", "LG-T-O",
                    "IS-S", "IS-T-LG", "IS-T-DW",
                    "DS-S", "DS-T-LG", "DS-T-DW",
                    "DW-S"
                ]
            },
            "reasoning": {"type": "string"},
            "confidence": {"type": "number"}
        },
        "required": ["label", "reasoning", "confidence"]
    }

    OLLAMA_OPTIONS = {
        "num_ctx": 16384,  # context window requested from Ollama
        "temperature": 0   # temperature 0 for repeatable labels
    }

    try:
        response = ollama.chat(
            model='minicpm-v:latest',
            messages=[{'role': 'user', 'content': prompt, 'images': [img_path]}],
            options=OLLAMA_OPTIONS,
            format=exonet_schema
        )
        return json.loads(response['message']['content'])
    except Exception as e:
        # Keep the best-effort contract (return None), but say why the row
        # failed instead of hiding the cause.
        print(f"Ollama Error for {img_path}: {e}")
        return None

# --- WORKER FUNCTION ---

def process_single_row(index, df_ref):
    """
    Label ONE row independently of all others (safe to run in a worker thread).

    Steps: build the 6-frame context window, render it to a uniquely named
    JPEG inside TEMP_DIR, run inference on that file, then delete the file.
    The temp file is removed in a ``finally`` clause so it is cleaned up even
    when a step fails, and a cleanup problem never discards a label that was
    already obtained.

    Args:
        index: Index label of the row to process.
        df_ref: Dataframe used to look up the row and its history.

    Returns:
        tuple: ``(index, label_data)`` where ``label_data`` is whatever
        get_new_label() returned, or ``None`` if any step raised.
    """
    temp_path = None
    try:
        # 1. Get Context Paths (Stateless)
        paths = get_context_window(df_ref, index)

        # 2. Create Grid
        grid = create_context_strip(paths)

        # 3. Save to Unique Temp File (Thread Safety)
        unique_name = f"grid_{index}_{uuid.uuid4().hex[:6]}.jpg"
        temp_path = os.path.join(TEMP_DIR, unique_name)
        grid.save(temp_path, format='JPEG', quality=95)

        # 4. Inference
        return index, get_new_label(temp_path)

    except Exception as e:
        print(f"Row {index} failed: {e}")
        return index, None

    finally:
        # 5. Cleanup - runs on success and on failure alike.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except FileNotFoundError:
                pass  # never written, or already gone
            except OSError as cleanup_error:
                print(f"Could not remove temp file {temp_path}: {cleanup_error}")

# --- MAIN EXECUTION ---

def process_frames_parallel(csv_path, skip_labeled=False):
    """Label every frame of a CSV using a pool of worker threads.

    Each row is handed to process_single_row(). Results are collected in the
    calling thread, written to the "new_class" column, and the CSV at
    ``csv_path`` is rewritten in place every SAVE_INTERVAL completed rows and
    once more when the loop ends.

    Args:
        csv_path: Path of the labels CSV to read and overwrite.
        skip_labeled: When True, rows whose "new_class" is already filled are
            not processed again, which lets an interrupted run be resumed.
            Defaults to False (process every row).
    """
    print(f"Loading Dataset: {csv_path}")
    df = pd.read_csv(csv_path)

    # Labels are strings. A "new_class" column that is missing, or that was
    # read back entirely empty (float64 NaN), must be object dtype before
    # strings are assigned into it.
    if "new_class" in df.columns:
        df["new_class"] = df["new_class"].astype("object")
    else:
        df["new_class"] = pd.Series(index=df.index, dtype="object")

    to_process = df[df["new_class"].isna()] if skip_labeled else df

    print(f"Starting Parallel Processing on {len(to_process)} frames...")
    print(f"Workers: {MAX_WORKERS}")

    # Workers only read, but `df` is written to below while they run. Give
    # them a private snapshot so no thread reads a frame that is being
    # modified by another.
    df_snapshot = df.copy()

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Submit all tasks
        future_to_idx = {
            executor.submit(process_single_row, idx, df_snapshot): idx
            for idx in to_process.index
        }

        try:
            # Process as they complete
            progress = tqdm(as_completed(future_to_idx), total=len(future_to_idx))
            for completed, future in enumerate(progress, start=1):
                idx = future_to_idx[future]
                _, data = future.result()

                if data and "label" in data:
                    # Only this thread writes to `df`.
                    df.at[idx, "new_class"] = data["label"]
                else:
                    print(f"Failed to get label for index {idx}")

                # Save periodically
                if completed % SAVE_INTERVAL == 0:
                    df.to_csv(csv_path, index=False)
        finally:
            # Reached on normal completion and on interruption alike. Drop
            # work that has not started (a no-op for finished futures) so
            # leaving the executor does not wait for the whole queue, then
            # persist every label collected so far.
            for pending in future_to_idx:
                pending.cancel()
            df.to_csv(csv_path, index=False)

    print("Processing Complete. Saved to CSV.")

    # Clean up temp dir
    try:
        os.rmdir(TEMP_DIR)
    except OSError:
        pass  # Directory likely not empty or permission issue, ignore

In [None]:
# Start the labeling run. The CSV at PATH_TO_IMG_LABELS is read and then
# overwritten in place by process_frames_parallel.
process_frames_parallel(PATH_TO_IMG_LABELS)

Loading Dataset: /media/luchocode/Extra vol/thesis/data/new_labels/val/labels.csv
Starting Parallel Processing on 181087 frames...
Workers: 8


  1%|          | 912/181087 [20:09<66:23:56,  1.33s/it]
