# Phase 4: Visual Deduplication & Enriched Transcript
## Final step: Cluster similar frames and link speech to visuals

This notebook:
- Loads Phase 1, 2, 3 outputs
- Uses CLIP embeddings to find visually similar frames
- Clusters duplicates using DBSCAN
- Selects best representative per cluster
- Creates final enriched transcript linking speech → visuals

**Output:** JSON with timestamped transcript + visual database

In [None]:
# ====================================================================
# CELL 1: Install Dependencies
# ====================================================================
print("üì¶ Installing dependencies...")

# Core dependencies
!pip install -q transformers torch pillow scikit-learn numpy

# CLIP model
!pip install -q sentence-transformers  # Includes CLIP

print("‚úÖ Dependencies installed!")

In [None]:
# ====================================================================
# CELL 2: Import Libraries
# ====================================================================
import gc
import json
import os
from collections import defaultdict
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import torch
from PIL import Image
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity

# Report which device will be used; on GPU, also drop cached allocations
# and run a garbage-collection pass before any model is loaded.
cuda_ready = torch.cuda.is_available()
device_label = 'CUDA (GPU)' if cuda_ready else 'CPU'
print(f"üñ•Ô∏è  Device: {device_label}")

if cuda_ready:
    torch.cuda.empty_cache()
    gc.collect()

print("‚úÖ Libraries imported")

In [None]:
# ====================================================================
# CELL 3: Configuration
# ====================================================================

# Every tunable setting for this phase lives in this one dictionary.
CONFIG = dict(
    # Outputs of the previous phases -- UPDATE these three paths before running
    phase1_file="/kaggle/working/output/transcript_XXXXX.json",
    phase2_file="/kaggle/working/output/phase2_references_XXXXX.json",
    phase3_file="/kaggle/working/output/phase3_frames_XXXXX.json",

    # CLIP checkpoint used for visual similarity.
    # Options: clip-ViT-B-32, clip-ViT-L-14 (larger, better)
    clip_model="clip-ViT-B-32",

    # DBSCAN clustering parameters
    clustering=dict(
        eps=0.3,          # distance threshold (0.2-0.4, lower = stricter)
        min_samples=1,    # minimum frames to form a cluster
        metric="cosine",  # cosine distance for CLIP embeddings
    ),

    # How a cluster's representative is picked:
    # "quality" or "central" (most similar to the cluster center)
    select_best_by="quality",

    # Output locations
    output_dir="/kaggle/working/output",
    final_output_file="enriched_transcript.json",
    save_intermediate=True,
)

# Make sure the output directory exists before any cell writes to it.
os.makedirs(CONFIG['output_dir'], exist_ok=True)

print("‚úÖ Configuration loaded")
for _label, _value in (
    ("CLIP model", CONFIG['clip_model']),
    ("DBSCAN eps", CONFIG['clustering']['eps']),
    ("Min samples", CONFIG['clustering']['min_samples']),
):
    print(f"   {_label}: {_value}")

In [None]:
# ====================================================================
# CELL 4: Load All Previous Phase Data
# ====================================================================

print("üìÇ Loading Phase 1, 2, 3 outputs...\n")


def _read_json(path):
    """Parse the UTF-8 JSON file at ``path`` and return the decoded object."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# Phase 1 (transcript)
phase1_data = _read_json(CONFIG['phase1_file'])
print(f"‚úÖ Phase 1: {len(phase1_data['utterances'])} utterances")

# Phase 2 (references)
phase2_data = _read_json(CONFIG['phase2_file'])
print(f"‚úÖ Phase 2: {len(phase2_data['references'])} references")

# Phase 3 (frames)
phase3_data = _read_json(CONFIG['phase3_file'])
print(f"‚úÖ Phase 3: {len(phase3_data['frames'])} frames")

video_id = phase1_data['video_id']
frames = phase3_data['frames']

# Keep only the frames whose 'frame_path' is set and exists on disk.
valid_frames = []
for candidate in frames:
    image_path = candidate.get('frame_path')
    if image_path and os.path.exists(image_path):
        valid_frames.append(candidate)

print(f"\nüìä Summary:")
print(f"   Video ID: {video_id}")
print(f"   Total frames: {len(frames)}")
print(f"   Valid frames: {len(valid_frames)}")

if not valid_frames:
    print("\n‚ö†Ô∏è  No valid frames found! Check Phase 3 output.")

In [None]:
# ====================================================================
# CELL 5: CLIP Embedding Functions
# ====================================================================

class CLIPEmbedder:
    """Generate CLIP embeddings for images through a SentenceTransformer model.

    Attributes:
        model: The loaded SentenceTransformer model.
        device: "cuda" when torch reports a GPU, otherwise "cpu".
        last_valid_indices: Set by ``embed_images_batch``. Positions in the
            ``image_paths`` argument of the most recent call that were
            embedded; row ``k`` of the returned array belongs to
            ``image_paths[last_valid_indices[k]]``.
        last_failed_paths: Set by ``embed_images_batch``. Paths skipped by the
            most recent call because they could not be loaded.
    """

    def __init__(self, model_name: str = "clip-ViT-B-32"):
        """Load ``model_name`` onto the GPU when available, else the CPU."""
        print(f"ü§ñ Loading CLIP model: {model_name}...")
        device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = SentenceTransformer(model_name, device=device)
        self.device = device
        self.last_valid_indices: List[int] = []
        self.last_failed_paths: List[str] = []
        print(f"‚úÖ CLIP model loaded on {device}")

    @staticmethod
    def _load_rgb(image_path: str):
        """Return an in-memory RGB copy of the image, closing the file handle."""
        with Image.open(image_path) as img:
            # convert() returns a new image, so it stays usable after the
            # underlying file is closed on leaving the with-block.
            return img.convert('RGB')

    def embed_image(self, image_path: str) -> Optional[np.ndarray]:
        """Generate the embedding for a single image.

        Returns:
            The embedding, or ``None`` if the image could not be loaded or
            encoded (the error is printed, not raised).
        """
        try:
            img = self._load_rgb(image_path)
            return self.model.encode(img, convert_to_numpy=True)
        except Exception as e:
            print(f"‚ö†Ô∏è  Error encoding {image_path}: {e}")
            return None

    def embed_images_batch(self, image_paths: List[str], batch_size: int = 8) -> np.ndarray:
        """Generate embeddings for multiple images in batches.

        Images that fail to load are skipped (with a printed warning), so the
        result can have fewer rows than ``image_paths``. After the call,
        ``self.last_valid_indices`` maps each returned row back to its
        position in ``image_paths`` and ``self.last_failed_paths`` lists the
        skipped paths; callers that pair rows with other per-image data must
        use them to stay aligned.

        Args:
            image_paths: Paths of the images to embed.
            batch_size: Number of images loaded and encoded per model call.

        Returns:
            Array with one row per successfully loaded image, in input order.
            When no image could be embedded, an empty 2-D array of shape
            ``(0, 0)`` so that ``len(result) == 0`` and ``result.shape[1]``
            are both valid.
        """
        embeddings = []
        valid_indices: List[int] = []
        failed_paths: List[str] = []

        print(f"   Encoding {len(image_paths)} images in batches of {batch_size}...")

        for i in range(0, len(image_paths), batch_size):
            batch_paths = image_paths[i:i+batch_size]

            # Load the batch, remembering which input positions succeeded
            images = []
            for offset, path in enumerate(batch_paths):
                try:
                    images.append(self._load_rgb(path))
                    valid_indices.append(i + offset)
                except Exception as e:
                    print(f"‚ö†Ô∏è  Skipping {path}: {e}")
                    failed_paths.append(path)

            # Encode only the images that loaded
            if images:
                batch_embeddings = self.model.encode(
                    images,
                    convert_to_numpy=True,
                    show_progress_bar=False
                )
                embeddings.extend(batch_embeddings)

            # Progress line every 10th batch
            if (i // batch_size + 1) % 10 == 0:
                print(f"   Progress: {i+len(batch_paths)}/{len(image_paths)}")

        self.last_valid_indices = valid_indices
        self.last_failed_paths = failed_paths

        if failed_paths:
            print(
                f"   WARNING: {len(failed_paths)} of {len(image_paths)} images were skipped; "
                "use last_valid_indices / last_failed_paths to realign the results"
            )

        if not embeddings:
            return np.empty((0, 0), dtype=np.float32)
        return np.array(embeddings)

print("‚úÖ CLIP embedder class defined")

In [None]:
# ====================================================================
# CELL 6: Clustering & Selection Functions
# ====================================================================

def cluster_embeddings(embeddings: np.ndarray, eps: float = 0.3, 
                      min_samples: int = 1, metric: str = 'cosine') -> np.ndarray:
    """Assign a DBSCAN cluster label to every embedding.

    Args:
        embeddings: One embedding per row.
        eps: DBSCAN ``eps`` distance threshold.
        min_samples: DBSCAN ``min_samples``.
        metric: Distance metric name handed to DBSCAN.

    Returns:
        The label array produced by ``DBSCAN.fit_predict``; the label -1 is
        counted as noise in the printed summary.
    """
    print(f"\nüîç Clustering {len(embeddings)} embeddings...")
    print(f"   eps={eps}, min_samples={min_samples}, metric={metric}")

    labels = DBSCAN(eps=eps, min_samples=min_samples, metric=metric).fit_predict(embeddings)

    # Summarise: noise points carry label -1 and are not counted as a cluster
    label_list = list(labels)
    n_noise = label_list.count(-1)
    real_clusters = set(label_list)
    real_clusters.discard(-1)
    n_clusters = len(real_clusters)

    print(f"‚úÖ Found {n_clusters} clusters + {n_noise} noise points")

    return labels

def select_best_representative(frames: List[Dict], frame_indices: List[int],
                              embeddings: np.ndarray, method: str = "quality") -> int:
    """Select the best representative frame of one cluster.

    Args:
        frames: All frame records; ``frame_indices`` index into this list.
        frame_indices: Positions of the frames belonging to the cluster.
        embeddings: Embedding matrix indexed by the same positions
            (only read when ``method == "central"``).
        method: ``"quality"`` picks the frame with the highest
            ``frame['quality']['quality_score']``; a missing or null quality
            entry or score counts as 0. ``"central"`` picks the frame whose
            embedding has the highest cosine similarity to the cluster mean.
            Any other value falls back to the first index.

    Returns:
        The chosen element of ``frame_indices``.

    Raises:
        ValueError: If ``frame_indices`` is empty.
    """
    if not frame_indices:
        raise ValueError("frame_indices must contain at least one frame index")

    if len(frame_indices) == 1:
        return frame_indices[0]

    if method == "quality":
        def quality_score(idx):
            # Tolerate "quality": null and "quality_score": null in the input
            quality = frames[idx].get('quality') or {}
            score = quality.get('quality_score')
            return 0 if score is None else score

        # max() returns the first index among ties, like np.argmax did
        return max(frame_indices, key=quality_score)

    if method == "central":
        # Named member_embeddings so it no longer shadows cluster_embeddings()
        member_embeddings = embeddings[frame_indices]
        centroid = member_embeddings.mean(axis=0)

        # Pick the member most similar to the centroid
        similarities = cosine_similarity([centroid], member_embeddings)[0]
        return frame_indices[int(np.argmax(similarities))]

    return frame_indices[0]

print("‚úÖ Clustering functions defined")

In [None]:
# ====================================================================
# CELL 7: Generate CLIP Embeddings
# ====================================================================

# Embed every valid frame with CLIP. The later cells pair embeddings[i] with
# valid_frames[i], so the two must have the same length.
if len(valid_frames) > 0:
    print("="*70)
    print("üöÄ GENERATING CLIP EMBEDDINGS")
    print("="*70)
    
    # Initialize embedder
    embedder = CLIPEmbedder(CONFIG['clip_model'])
    
    # Extract frame paths
    frame_paths = [f['frame_path'] for f in valid_frames]
    
    # Generate embeddings
    embeddings = embedder.embed_images_batch(frame_paths, batch_size=8)
    
    # Release the model first so its memory is freed even if validation fails
    del embedder
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()
    
    # embed_images_batch skips images it cannot load. If that happened, the
    # rows no longer correspond to valid_frames and every later cell would
    # attach cluster labels to the wrong frames, so stop here instead.
    n_embedded = len(embeddings)
    if n_embedded != len(valid_frames):
        embeddings = np.array([])  # never leave a misaligned array in the kernel
        raise RuntimeError(
            f"Embedded {n_embedded} of {len(valid_frames)} frames: some images could "
            "not be loaded, so embeddings would not line up with valid_frames. "
            "Remove or repair the frames reported as skipped above, then re-run "
            "from Cell 4."
        )
    
    print(f"\n‚úÖ Generated {len(embeddings)} embeddings")
    print(f"   Embedding dimension: {embeddings.shape[1]}")
    
else:
    print("‚ö†Ô∏è  No valid frames to embed")
    embeddings = np.array([])

In [None]:
# ====================================================================
# CELL 8: Cluster Similar Frames
# ====================================================================

# Group visually similar frames and choose one representative per group.
if len(embeddings) > 0:
    print("\n" + "="*70)
    print("üîç CLUSTERING VISUAL FRAMES")
    print("="*70)

    # Run DBSCAN with the configured parameters
    dbscan_cfg = CONFIG['clustering']
    labels = cluster_embeddings(
        embeddings,
        eps=dbscan_cfg['eps'],
        min_samples=dbscan_cfg['min_samples'],
        metric=dbscan_cfg['metric'],
    )

    # clusters: visual id -> positions in valid_frames
    # frame_to_cluster: frame_id -> visual id
    clusters = defaultdict(list)
    frame_to_cluster = {}

    for position, (frame_record, cluster_label) in enumerate(zip(valid_frames, labels)):
        frame_key = frame_record['frame_id']

        # A noise point (label -1) becomes its own single-frame visual
        cluster_id = (
            f"VISUAL_{position:03d}_SINGLE"
            if cluster_label == -1
            else f"VISUAL_{cluster_label:03d}"
        )

        clusters[cluster_id].append(position)
        frame_to_cluster[frame_key] = cluster_id

    cluster_sizes = [len(members) for members in clusters.values()]
    print(f"\nüìä Clustering results:")
    print(f"   Total clusters: {len(clusters)}")
    print(f"   Average cluster size: {np.mean(cluster_sizes):.1f}")
    print(f"   Largest cluster: {max(cluster_sizes)} frames")

    # One representative frame position per cluster
    selection_method = CONFIG['select_best_by']
    print(f"\nüéØ Selecting best representatives (method: {selection_method})...")
    cluster_representatives = {
        cid: select_best_representative(
            valid_frames,
            members,
            embeddings,
            method=selection_method,
        )
        for cid, members in clusters.items()
    }

    print(f"‚úÖ Selected {len(cluster_representatives)} representatives")

else:
    print("‚ö†Ô∏è  No embeddings to cluster")
    clusters, frame_to_cluster, cluster_representatives = {}, {}, {}
    labels = np.array([])

In [None]:
# ====================================================================
# CELL 9: Create Visual Database
# ====================================================================

print("\n" + "="*70)
print("üé® CREATING VISUAL DATABASE")
print("="*70)


def _build_visual_entry(rep_frame, member_frames):
    """Describe one unique visual.

    ``rep_frame`` supplies the image paths, quality, OCR and AI description;
    ``member_frames`` (every frame of the cluster) supply the timestamps and
    frame ids.
    """
    appearances = [member['timestamp_ms'] for member in member_frames]

    entry = {
        "representative_frame": rep_frame['frame_path'],
        "thumbnail": rep_frame.get('thumbnail_path'),
        "appears_at_ms": sorted(appearances),
        "appearance_count": len(appearances),
        "quality": rep_frame.get('quality', {}),
        "frame_ids": [member['frame_id'] for member in member_frames],
    }

    # OCR fields are added only when the representative has non-empty OCR text
    ocr = rep_frame.get('ocr_data')
    if ocr and ocr.get('text'):
        entry['ocr_text'] = ocr['text']
        entry['ocr_confidence'] = ocr.get('confidence', 0)

    # AI description fields are added only when a description record exists
    ai = rep_frame.get('ai_description')
    if ai:
        entry['description'] = ai.get('description', '')
        entry['visual_type'] = ai.get('type', 'unknown')
        entry['concept'] = ai.get('concept', '')

    return entry


visuals = {}
for cluster_id, member_indices in clusters.items():
    if not member_indices:
        continue

    # Fall back to the first member when no representative was recorded
    rep_position = cluster_representatives.get(cluster_id, member_indices[0])
    visuals[cluster_id] = _build_visual_entry(
        valid_frames[rep_position],
        [valid_frames[pos] for pos in member_indices],
    )

print(f"‚úÖ Created visual database with {len(visuals)} unique visuals")

# Distribution: visuals seen once vs. more than once
recurring_count = len([v for v in visuals.values() if v['appearance_count'] > 1])
print(f"   Visuals appearing once: {len(visuals) - recurring_count}")
print(f"   Visuals appearing multiple times: {recurring_count}")

In [None]:
# ====================================================================
# CELL 10: Create Enriched Transcript
# ====================================================================

print("\n" + "="*70)
print("üìù CREATING ENRICHED TRANSCRIPT")
print("="*70)

# reference_id -> visual id. When several frames share a reference, the
# first one encountered in valid_frames wins.
reference_to_visual = {}
for frame_record in valid_frames:
    frame_key = frame_record['frame_id']
    ref_key = frame_record['reference_id']

    if frame_key not in frame_to_cluster:
        continue
    reference_to_visual.setdefault(ref_key, frame_to_cluster[frame_key])

# One transcript entry per Phase 2 reference
references = phase2_data.get('references', [])
transcript_entries = []

for ref in references:
    visual_id = reference_to_visual.get(ref['reference_id'])

    entry = {
        "timestamp_ms": ref['timestamp_ms'],
        "text": ref['text'],
        "visual_id": visual_id,
        "reference_type": ref.get('reference_type', 'unknown'),
        "detection_method": ref.get('detection_method', 'unknown'),
    }

    # Copy over the linked visual's metadata when the visual is known
    if visual_id and visual_id in visuals:
        linked_visual = visuals[visual_id]
        entry.update(
            visual_description=linked_visual.get('description'),
            visual_type=linked_visual.get('visual_type'),
            ocr_text=linked_visual.get('ocr_text'),
        )

    transcript_entries.append(entry)

print(f"‚úÖ Created {len(transcript_entries)} transcript entries")
print(f"   Entries with visuals: {sum(bool(e['visual_id']) for e in transcript_entries)}")

In [None]:
# ====================================================================
# CELL 11: Save Final Results
# ====================================================================

def _write_json(path, payload):
    """Write ``payload`` to ``path`` as indented UTF-8 JSON, keeping non-ASCII text."""
    with open(path, 'w', encoding='utf-8') as handle:
        json.dump(payload, handle, ensure_ascii=False, indent=2)


def _report_chunks():
    """Yield, in order, the text pieces of the human-readable report.

    Covers the first 20 visuals and the first 10 transcript entries.
    """
    rule = "="*70
    dashes = "-"*70

    yield "ENRICHED TRANSCRIPT REPORT\n"
    yield f"Video ID: {video_id}\n"
    yield rule + "\n\n"

    yield "VISUAL DATABASE\n"
    yield dashes + "\n"
    for visual_id, visual in list(visuals.items())[:20]:
        first_times = ', '.join(str(t/1000) + 's' for t in visual['appears_at_ms'][:5])
        yield f"\n{visual_id}:\n"
        yield f"  Appears: {visual['appearance_count']} times\n"
        yield f"  Times: {first_times}...\n"
        if visual.get('description'):
            yield f"  Description: {visual['description'][:100]}...\n"
        if visual.get('ocr_text'):
            yield f"  OCR: {visual['ocr_text'][:100]}...\n"

    if len(visuals) > 20:
        yield f"\n... and {len(visuals) - 20} more visuals\n"

    yield "\n" + rule + "\n"
    yield "\nTRANSCRIPT ENTRIES\n"
    yield dashes + "\n"
    for number, entry in enumerate(transcript_entries[:10], 1):
        yield f"\n[{number}] {entry['timestamp_ms']/1000:.1f}s\n"
        yield f"  Text: {entry['text'][:100]}...\n"
        yield f"  Visual: {entry['visual_id'] or 'None'}\n"
        if entry.get('visual_description'):
            yield f"  Description: {entry['visual_description'][:80]}...\n"


# Assemble the final enriched transcript document
enriched_transcript = {
    "video_id": video_id,
    "video_url": phase1_data.get('video_url', ''),
    "video_title": phase1_data.get('video_title', ''),
    "duration_ms": phase1_data['duration_ms'],

    # Main outputs
    "transcript": transcript_entries,
    "visuals": visuals,

    # Run metadata
    "metadata": dict(
        total_utterances=len(phase1_data['utterances']),
        total_references=len(references),
        total_frames_extracted=len(frames),
        valid_frames=len(valid_frames),
        unique_visuals=len(visuals),
        clip_model=CONFIG['clip_model'],
        clustering_eps=CONFIG['clustering']['eps'],
    ),
}

# Main output
output_file = os.path.join(CONFIG['output_dir'], CONFIG['final_output_file'])
_write_json(output_file, enriched_transcript)

# Clustering details (intermediate), only when enabled in CONFIG
if CONFIG['save_intermediate']:
    clustering_file = f"{CONFIG['output_dir']}/phase4_clustering_{video_id}.json"
    clustering_data = {
        "clusters": dict(clusters),
        "frame_to_cluster": frame_to_cluster,
        "cluster_representatives": cluster_representatives,
        "cluster_sizes": {cid: len(members) for cid, members in clusters.items()},
    }
    _write_json(clustering_file, clustering_data)
    print(f"üíæ Clustering details: {clustering_file}")

# Human-readable report
report_file = f"{CONFIG['output_dir']}/phase4_report_{video_id}.txt"
with open(report_file, 'w', encoding='utf-8') as report_handle:
    report_handle.writelines(_report_chunks())

file_size = os.path.getsize(output_file) / 1024
run_stats = enriched_transcript['metadata']

print("\n" + "="*70)
print("‚úÖ PHASE 4 COMPLETE - PIPELINE FINISHED!")
print("="*70)
print(f"üìä Final Statistics:")
print(f"   Video ID: {video_id}")
print(f"   Duration: {enriched_transcript['duration_ms']/1000:.1f}s")
print(f"   Total references: {run_stats['total_references']}")
print(f"   Unique visuals: {run_stats['unique_visuals']}")
print(f"   Frames analyzed: {run_stats['total_frames_extracted']}")
print(f"\nüíæ Output files:")
print(f"   Main: {output_file} ({file_size:.1f} KB)")
print(f"   Report: {report_file}")
print("="*70)

In [None]:
# ====================================================================
# CELL 12: Display Sample Results
# ====================================================================

def _print_preview(label, text, limit):
    """Print an indented ``label: text`` line with ``text`` cut to ``limit`` characters plus '...'."""
    print(f"  {label}: {text[:limit]}...")


print("\nüìã Sample Transcript Entries (first 5):")
print("="*70)

for number, entry in enumerate(transcript_entries[:5], 1):
    print(f"\n[{number}] {entry['timestamp_ms']/1000:.1f}s")
    _print_preview("Text", entry['text'], 100)
    print(f"  Visual ID: {entry['visual_id'] or 'None'}")

    if entry.get('visual_description'):
        _print_preview("Description", entry['visual_description'], 80)

    if entry.get('ocr_text'):
        _print_preview("OCR", entry['ocr_text'], 60)

print("\n" + "="*70)
print("\nüé® Top Recurring Visuals:")
print("="*70)

# Most frequently appearing visuals first
sorted_visuals = sorted(
    visuals.items(),
    key=lambda item: item[1]['appearance_count'],
    reverse=True,
)

for rank, (visual_id, visual) in enumerate(sorted_visuals[:5], 1):
    print(f"\n[{rank}] {visual_id}")
    print(f"  Appears: {visual['appearance_count']} times")
    print(f"  First at: {visual['appears_at_ms'][0]/1000:.1f}s")

    if visual.get('description'):
        _print_preview("Description", visual['description'], 80)

    if visual.get('visual_type'):
        print(f"  Type: {visual['visual_type']}")

print("\n" + "="*70)
print("\n‚úÖ All done! Your enriched transcript is ready.")
print(f"üìÅ Download from: {CONFIG['output_dir']}/")

In [None]:
# ====================================================================
# CELL 13: Visualize Clustering (Optional)
# ====================================================================

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Optional 2-D scatter plot of the frame embeddings, coloured by cluster.
if len(embeddings) <= 1:
    print("‚ö†Ô∏è  Not enough data for visualization")
else:
    print("\nüìä Generating cluster visualization...")

    # Project to 2-D: t-SNE above 50 points, PCA for small datasets
    n_points = len(embeddings)
    if n_points > 50:
        reducer = TSNE(n_components=2, random_state=42, perplexity=min(30, n_points - 1))
    else:
        from sklearn.decomposition import PCA
        reducer = PCA(n_components=2)
    embeddings_2d = reducer.fit_transform(embeddings)

    fig, ax = plt.subplots(figsize=(14, 10))

    # One scatter call per label; label -1 (noise) is drawn as black crosses
    unique_labels = set(labels)
    colors = plt.cm.rainbow(np.linspace(0, 1, len(unique_labels)))

    for cluster_label, rgba in zip(unique_labels, colors):
        is_noise = cluster_label == -1
        members = labels == cluster_label
        legend_name = 'Noise' if is_noise else f'Cluster {cluster_label}'

        ax.scatter(
            embeddings_2d[members, 0],
            embeddings_2d[members, 1],
            c=['black' if is_noise else rgba],
            marker='x' if is_noise else 'o',
            s=100,
            alpha=0.6,
            # Only noise and clusters 0-2 get a legend entry
            label=legend_name if cluster_label in [-1, 0, 1, 2] else None,
            edgecolors='black',
            linewidth=0.5,
        )

    # Ring the representative frame of every cluster in red
    rep_indices = list(cluster_representatives.values())
    rep_points = embeddings_2d[rep_indices]
    ax.scatter(
        rep_points[:, 0],
        rep_points[:, 1],
        c='none',
        marker='o',
        s=300,
        edgecolors='red',
        linewidth=2,
        label='Representatives',
    )

    ax.set_title('Visual Frame Clustering (CLIP + DBSCAN)\nRed circles = cluster representatives',
                 fontsize=14, fontweight='bold')
    ax.set_xlabel('Dimension 1', fontsize=12)
    ax.set_ylabel('Dimension 2', fontsize=12)
    ax.legend(loc='best', fontsize=10)
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig(f"{CONFIG['output_dir']}/phase4_clusters_{video_id}.png", 
                dpi=150, bbox_inches='tight')
    plt.show()

    print("‚úÖ Cluster visualization saved!")