# Multi-Prompt Dataset Generation for SigLIP Training

This notebook demonstrates generating multiple prompts per video following the SigLIP-inspired approach.

In [None]:
import sys
import pandas as pd
import yaml
from pathlib import Path

# Add dataset_creation to path
sys.path.append('dataset_creation')

from generate_dataset_multiprompt import (
    generate_multiprompt_dataset,
    process_dataset_multiprompt,
    create_default_config
)

## Configuration

In [None]:
# Build the default multi-prompt configuration and show it as block-style YAML
config = create_default_config()

print("Configuration:")
config_as_yaml = yaml.dump(config, default_flow_style=False)
print(config_as_yaml)

## Process Full Dataset

This will process the full parquet file and generate multiple prompts per video.

In [None]:
# Paths for the full-dataset run: the source parquet file and the output
# directory handed to process_dataset_multiprompt below.
output_dir = 'outputs/multiprompt_full'
input_path = (
    '/media/data1/datasets/DeepCoro/'
    '2b_CathReport_HEMO_MHI_MERGED_2017-2024_VIDEO_LEVEL.parquet'
)

# Processing the full dataset takes a while, so the call is left disabled.
# Uncomment to run:
# process_dataset_multiprompt(input_path, output_dir, config)

## Test with Sample Data

Let's test with a smaller sample to verify the approach.

In [None]:
# Read the source parquet and keep only its first 100 rows as a test sample
df_sample = pd.read_parquet(input_path).iloc[:100]

# Report the sample's dimensions (row count, then column count)
n_sample_rows, n_sample_cols = df_sample.shape
print(f"Loaded {n_sample_rows} rows")
print(f"Columns: {n_sample_cols}")

In [None]:
# Generate multi-prompts for the sample and report how many were produced
prompt_df = generate_multiprompt_dataset(df_sample)

n_videos = len(df_sample)
n_prompts = len(prompt_df)
print(f"Generated {n_prompts} prompts from {n_videos} videos")

# Guard the ratio: an empty sample would otherwise raise ZeroDivisionError
avg_prompts_per_video = n_prompts / n_videos if n_videos else 0.0
print(f"Average prompts per video: {avg_prompts_per_video:.1f}")

In [None]:
# Summarize the generated prompts: row count per prompt type, followed by the
# first non-null weight found within each prompt type.
print("Prompt type distribution:")
type_counts = prompt_df['prompt_type'].value_counts()
print(type_counts)

print("\nPrompt weights:")
weights_by_type = prompt_df.groupby('prompt_type')['prompt_weight']
print(weights_by_type.first())

In [None]:
# Print one example prompt for every prompt type present in prompt_df
print("Sample prompts:")
print("=" * 80)

type_column = prompt_df['prompt_type']
for kind in type_column.unique():
    # First row carrying this prompt type, in DataFrame order
    example = prompt_df.loc[type_column == kind].iloc[0]
    weight = example['prompt_weight']
    study_uid = example['StudyInstanceUID']
    text = example['prompt_text']

    print(f"\nType: {kind} (weight={weight})")
    print(f"Study: {study_uid}")
    print(f"Text: {text}")
    print("-" * 40)

In [None]:
# Save the sample prompts to a parquet file
output_path = Path('outputs/multiprompt_sample.parquet')

# parents=True creates any missing intermediate directories, so the cell keeps
# working if the target is moved to a nested location; exist_ok=True makes the
# cell safe to re-run.
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False: the DataFrame index is not written to the file
prompt_df.to_parquet(output_path, index=False)
print(f"Saved to {output_path}")

## Prompt Type Details

The system generates 4 types of prompts per video:

1. **Global Summary** (weight=0.5): Complete study description preserving distribution
2. **Abnormal Focus** (weight=1.0): Critical findings ≥70% stenosis for high-signal training
3. **Atomic Lesions** (weight=0.6): Individual lesion descriptions for compositional understanding
4. **Negative Coverage** (weight=0.6): Normal territory descriptions for comprehensive coverage

In [None]:
# List every prompt generated for a single study: the study of the first row
study_id = prompt_df['StudyInstanceUID'].iloc[0]
belongs_to_study = prompt_df['StudyInstanceUID'] == study_id
study_prompts = prompt_df[belongs_to_study]

print(f"All prompts for study {study_id}:")
print("=" * 80)

# The row index is not needed, only the row contents
for _, prompt_row in study_prompts.iterrows():
    kind = prompt_row['prompt_type']
    weight = prompt_row['prompt_weight']
    print(f"\n[{kind}] (weight={weight})")
    print(f"  {prompt_row['prompt_text']}")