In [None]:
# -*- coding: utf-8 -*-
"""
🎬 DSPy Multi-Agent Movie Recommendation System - OPTIMIZATION DEMONSTRATION

This notebook demonstrates how to optimize a sophisticated multi-agent system using DSPy.
We'll take the existing movie recommendation system and systematically improve it.

📊 What You'll See:
1. Original multi-agent system baseline
2. Training data generation for optimization
3. Custom evaluation metrics for movie recommendations
4. DSPy optimization applied to individual agents
5. Side-by-side comparison of original vs optimized
6. Performance analytics and improvement measurement
"""

# =============================================================================
# 📦 SECTION 1: Environment Setup & Dependencies
# =============================================================================

# Install required packages
!pip install dspy-ai
!pip install mlflow
!pip install requests
!pip install gradio
!pip install openai
!pip install -U 'mlflow[databricks]>=3.1'


Collecting dspy-ai
  Downloading dspy_ai-2.6.27-py3-none-any.whl.metadata (286 bytes)
Collecting dspy>=2.6.5 (from dspy-ai)
  Downloading dspy-2.6.27-py3-none-any.whl.metadata (7.0 kB)
Collecting backoff>=2.2 (from dspy>=2.6.5->dspy-ai)
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Collecting ujson>=5.8.0 (from dspy>=2.6.5->dspy-ai)
  Downloading ujson-5.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.3 kB)
Collecting datasets>=2.14.6 (from dspy>=2.6.5->dspy-ai)
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting optuna>=3.4.0 (from dspy>=2.6.5->dspy-ai)
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting magicattr>=0.1.6 (from dspy>=2.6.5->dspy-ai)
  Downloading magicattr-0.1.6-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting litellm>=1.60.3 (from dspy>=2.6.5->dspy-ai)
  Downloading litellm-1.73.6-py3-none-any.whl.metadata (39 kB)
Collecting diskcache>=5.6.0 (from dspy>=2.6.5->dspy-ai)
  Downloading

In [None]:

# Import required libraries
import dspy
import os
import json
import requests
import pandas as pd
import numpy as np
from typing import List, Dict, Optional, Any, Tuple
from dataclasses import dataclass
from datetime import datetime
import mlflow
import gradio as gr
import random
import re
from collections import defaultdict

print("✅ All packages installed successfully!")


✅ All packages installed successfully!


In [None]:

# =============================================================================
# 🔑 SECTION 2: API Configuration
# =============================================================================

from getpass import getpass

# Set up your API keys (prompted interactively so no secret is stored in the notebook)
OPENAI_API_KEY = getpass("Enter your OpenAI API key: ")
TMDB_API_KEY = getpass("Enter your TMDB API key (or use demo key): ")

# Configure environment
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Configure DSPy
llm = dspy.LM(model="openai/gpt-4o-mini", max_tokens=1000)
dspy.settings.configure(lm=llm)

# Set Databricks authentication details as environment variables
DATABRICKS_HOST = "https://<your_id>.cloud.databricks.com"
DATABRICKS_TOKEN = getpass("Enter your Databricks PAT: ")  # Prompt for new PAT

os.environ["DATABRICKS_HOST"] = DATABRICKS_HOST
os.environ["DATABRICKS_TOKEN"] = DATABRICKS_TOKEN

# Explicitly set tracking URI with Databricks format
mlflow.set_tracking_uri("databricks")

# Set experiment with correct user path (replace with your actual email)
mlflow.set_experiment("/Users/<email_id>/<experiment_name>")  # Adjust email
# To track without Databricks instead, skip the Databricks lines above and use:
#     mlflow.set_experiment("dspy-movie-optimization-demo")

# Enable autologging
mlflow.autolog()

# Printed exactly once: the earlier duplicate print and the commented-out copy of the
# DSPy/MLflow setup were removed so the cell output reflects a single configuration pass.
print("🔑 API configuration complete!")


Enter your OpenAI API key: ··········
Enter your TMDB API key (or use demo key): ··········
Enter your Databricks PAT: ··········


2025/06/30 00:56:09 INFO mlflow.tracking.fluent: Experiment with name '/Users/<email_id>/dspy-movie-optimization-demo' does not exist. Creating a new experiment.
2025/06/30 00:56:09 INFO mlflow.tracking.fluent: Autologging successfully enabled for dspy.
2025/06/30 00:56:10 INFO mlflow.tracking.fluent: Autologging successfully enabled for litellm.
2025/06/30 00:56:10 INFO mlflow.tracking.fluent: Autologging successfully enabled for openai.
2025/06/30 00:56:10 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.


🔑 API configuration complete!
🔑 API configuration complete!


In [None]:

# =============================================================================
# 🎬 SECTION 3: Enhanced TMDB Client (From Original System)
# =============================================================================

class EnhancedTMDBClient:
    """Client for the TMDB v3 REST API used by the recommendation tools.

    Searches for a title, fetches its details (with credits, keywords, similar and
    recommended movies appended) and flattens the response into a metadata dict.
    All network failures are reported with ``print`` and surfaced as an empty dict,
    so callers never see an exception from the HTTP layer.
    """

    def __init__(self, api_key: str, timeout: float = 10.0):
        """Store credentials and request settings.

        Args:
            api_key: TMDB API key sent as the ``api_key`` query parameter.
            timeout: Seconds to wait for each HTTP request. Without a timeout a
                stalled connection would block the notebook cell indefinitely.
        """
        self.api_key = api_key
        self.base_url = "https://api.themoviedb.org/3"
        self.timeout = timeout

    def search_movie(self, title: str) -> Dict:
        """Search for a movie by title

        Returns:
            The first search result, or ``{}`` when nothing matched or the request failed.
        """
        url = f"{self.base_url}/search/movie"
        params = {
            "api_key": self.api_key,
            "query": title,
            "language": "en-US"
        }
        try:
            response = requests.get(url, params=params, timeout=self.timeout)
            if response.status_code == 200:
                results = response.json().get("results", [])
                return results[0] if results else {}
            # Make rejected requests (e.g. a bad API key) visible instead of
            # silently looking like "no results".
            print(f"Error searching movie: HTTP {response.status_code}")
        except Exception as e:
            print(f"Error searching movie: {e}")
        return {}

    def get_movie_details(self, movie_id: int) -> Dict:
        """Get detailed movie information

        Returns:
            The decoded JSON payload for the movie, or ``{}`` when the request failed.
        """
        url = f"{self.base_url}/movie/{movie_id}"
        params = {
            "api_key": self.api_key,
            "append_to_response": "credits,keywords,similar,recommendations"
        }
        try:
            response = requests.get(url, params=params, timeout=self.timeout)
            if response.status_code == 200:
                return response.json()
            print(f"Error getting movie details: HTTP {response.status_code}")
        except Exception as e:
            print(f"Error getting movie details: {e}")
        return {}

    def extract_movie_metadata(self, movie_details: Dict) -> Dict:
        """Extract structured metadata from TMDB movie details

        Args:
            movie_details: Payload as returned by ``get_movie_details``.

        Returns:
            ``{}`` for an empty payload; otherwise a dict with title, release_date,
            overview, lower-cased genres, genre_ids, up to 10 lower-cased keyword
            themes, the first crew member whose job is "Director", the first 5 cast
            names, runtime, vote_average and tmdb_id.
        """
        if not movie_details:
            return {}

        # Extract genres
        genre_entries = movie_details.get("genres", [])
        genres = [genre["name"].lower() for genre in genre_entries]
        genre_ids = [genre["id"] for genre in genre_entries]

        # Extract cast (top 5)
        credits = movie_details.get("credits", {})
        cast = [actor.get("name", "") for actor in credits.get("cast", [])[:5]]

        # Extract director (first crew entry with that job, empty string if none)
        director = next(
            (person.get("name", "") for person in credits.get("crew", [])
             if person.get("job") == "Director"),
            "",
        )

        # Extract themes from keywords
        keywords_data = movie_details.get("keywords", {})
        themes = [kw["name"].lower() for kw in keywords_data.get("keywords", [])[:10]]

        return {
            "title": movie_details.get("title", ""),
            "release_date": movie_details.get("release_date", ""),
            "overview": movie_details.get("overview", ""),
            "genres": genres,
            "genre_ids": genre_ids,
            "themes": themes,
            "director": director,
            "cast": cast,
            "runtime": movie_details.get("runtime", 0),
            "vote_average": movie_details.get("vote_average", 0),
            "tmdb_id": movie_details.get("id", 0)
        }

    def get_comprehensive_movie_data(self, title: str) -> Dict:
        """Get comprehensive movie data for a title

        Returns:
            The metadata dict from ``extract_movie_metadata`` extended with
            ``similar_movies`` and ``recommended_movies`` (up to 5 titles each), or
            ``{"error": ...}`` when the search or the details lookup came back empty.
        """
        search_result = self.search_movie(title)
        if not search_result:
            return {"error": f"Movie '{title}' not found"}

        movie_id = search_result.get("id")
        details = self.get_movie_details(movie_id)
        if not details:
            return {"error": f"Could not retrieve details for '{title}'"}

        metadata = self.extract_movie_metadata(details)

        # Get recommendations
        similar_movies = [movie["title"] for movie in details.get("similar", {}).get("results", [])[:5]]
        recommendations = [movie["title"] for movie in details.get("recommendations", {}).get("results", [])[:5]]

        metadata["similar_movies"] = similar_movies
        metadata["recommended_movies"] = recommendations

        return metadata

# Initialize TMDB client
tmdb = EnhancedTMDBClient(TMDB_API_KEY)
print("🎬 TMDB client initialized!")


🎬 TMDB client initialized!


In [None]:

# =============================================================================
# 🛠️ SECTION 4: Original Agent Tools
# =============================================================================

def movie_metadata_lookup_tool(movie_title: str) -> str:
    """Tool for looking up movie metadata from TMDB API"""
    # NOTE(review): this function is passed to dspy.ReAct as a tool, so the docstring above
    # is presumably shown to the LLM as the tool description — left unchanged on purpose.
    # Every outcome is returned as a string; failures are reported as text, never raised.
    try:
        movie_data = tmdb.get_comprehensive_movie_data(movie_title)
        lookup_failed = "error" in movie_data
        return f"Error: {movie_data['error']}" if lookup_failed else json.dumps(movie_data, indent=2)
    except Exception as exc:
        return f"Error retrieving movie data: {exc}"

def movie_hypothesis_generator_tool(movie_title: str, metadata: str) -> str:
    """Tool for generating hypotheses about why user loved a movie"""
    # NOTE(review): the docstring above and the `desc` strings below are presumably sent to
    # the LLM by DSPy, so they are kept verbatim — confirm before rewording any of them.

    class HypothesisGenerator(dspy.Signature):
        movie_title = dspy.InputField(desc="The movie the user loved")
        movie_metadata = dspy.InputField(desc="Real movie metadata from TMDB")
        hypotheses = dspy.OutputField(desc="Three specific hypotheses about what drew them to the movie")

    # The signature class and its ChainOfThought predictor are rebuilt on every call.
    prediction = dspy.ChainOfThought(HypothesisGenerator)(
        movie_title=movie_title,
        movie_metadata=metadata,
    )
    return prediction.hypotheses

def recommendation_generator_tool(movie_title: str, user_hypothesis: str) -> str:
    """Tool for generating movie recommendations based on user preferences"""
    # NOTE(review): the docstring above and the `desc` strings below are presumably sent to
    # the LLM by DSPy, so they are kept verbatim — confirm before rewording any of them.

    class RecommendationGenerator(dspy.Signature):
        original_movie = dspy.InputField(desc="Movie the user loved")
        user_preference_hypothesis = dspy.InputField(desc="What the user likely enjoyed about the movie")
        recommendations = dspy.OutputField(desc="Three movie recommendations with brief explanations")

    # The signature class and its ChainOfThought predictor are rebuilt on every call.
    prediction = dspy.ChainOfThought(RecommendationGenerator)(
        original_movie=movie_title,
        user_preference_hypothesis=user_hypothesis,
    )
    return prediction.recommendations

def narrative_constructor_tool(movie_title: str, reason: str) -> str:
    """Tool for constructing compelling narrative explanations"""
    # NOTE(review): the docstring above and the `desc` strings below are presumably sent to
    # the LLM by DSPy, so they are kept verbatim — confirm before rewording any of them.

    class NarrativeConstructor(dspy.Signature):
        recommended_movie = dspy.InputField(desc="The movie being recommended")
        connection_reason = dspy.InputField(desc="Why this movie connects to user's taste")
        narrative_explanation = dspy.OutputField(desc="A compelling story-driven explanation")

    # The signature class and its ChainOfThought predictor are rebuilt on every call.
    prediction = dspy.ChainOfThought(NarrativeConstructor)(
        recommended_movie=movie_title,
        connection_reason=reason,
    )
    return prediction.narrative_explanation

print("🛠️ Original agent tools ready!")


🛠️ Original agent tools ready!


In [None]:

# =============================================================================
# 🤖 SECTION 5: Original Multi-Agent System
# =============================================================================

# Input/output contract for the movie-analysis ReAct agent:
# one string in (`movie_title`), one string out (`analysis_result`).
# NOTE(review): DSPy presumably uses the class docstring as the prompt instructions, so
# rewording it would change model behavior — confirm before editing.
class MovieAnalysisSignature(dspy.Signature):
    """Analyze a movie to understand user preferences and generate recommendations."""
    movie_title: str = dspy.InputField()
    analysis_result: str = dspy.OutputField(desc="Complete analysis with movie recommendations")

# Input/output contract for the narrative ReAct agent:
# one string in (`movie_recommendations`), one string out (`narrative_explanations`).
# NOTE(review): DSPy presumably uses the class docstring as the prompt instructions, so
# rewording it would change model behavior — confirm before editing.
class NarrativeSignature(dspy.Signature):
    """Create compelling narrative explanations for movie recommendations."""
    movie_recommendations: str = dspy.InputField()
    narrative_explanations: str = dspy.OutputField(desc="Compelling narrative explanations")

# Input/output contract for the top-level orchestrator agent:
# one string in (`user_input`), one string out (`final_recommendations`).
# The evaluation metrics in this notebook read `final_recommendations` from predictions.
# NOTE(review): DSPy presumably uses the class docstring as the prompt instructions, so
# rewording it would change model behavior — confirm before editing.
class OrchestratorSignature(dspy.Signature):
    """Master orchestrator coordinating movie analysis and narrative agents."""
    user_input: str = dspy.InputField()
    final_recommendations: str = dspy.OutputField(desc="Final movie recommendations with narratives")

# Create original agents
# Movie-analysis agent: ReAct loop over the TMDB lookup, hypothesis and recommendation tools.
original_movie_agent = dspy.ReAct(
    MovieAnalysisSignature,
    tools=[movie_metadata_lookup_tool, movie_hypothesis_generator_tool, recommendation_generator_tool]
)

# Narrative agent: ReAct loop whose only tool is the narrative constructor.
original_narrative_agent = dspy.ReAct(
    NarrativeSignature,
    tools=[narrative_constructor_tool]
)

# Tools for orchestrator
def call_movie_analysis_agent(movie_title: str) -> str:
    """Call the Movie Analysis Agent"""
    # Delegates to the module-level `original_movie_agent` and unwraps its single output.
    # The docstring is kept verbatim: this function is itself registered as a ReAct tool.
    return original_movie_agent(movie_title=movie_title).analysis_result

def call_narrative_agent(recommendations: str) -> str:
    """Call the Narrative Agent"""
    # Delegates to the module-level `original_narrative_agent` and unwraps its single output.
    # The docstring is kept verbatim: this function is itself registered as a ReAct tool.
    return original_narrative_agent(movie_recommendations=recommendations).narrative_explanations

# Original orchestrator
# Top-level ReAct agent: it can call the two sub-agents (through the wrapper functions
# above) and can also query TMDB directly. This is the program that gets evaluated and
# optimized later in the notebook.
original_orchestrator = dspy.ReAct(
    OrchestratorSignature,
    tools=[call_movie_analysis_agent, call_narrative_agent, movie_metadata_lookup_tool]
)

print("🤖 Original multi-agent system ready!")


🤖 Original multi-agent system ready!


In [None]:

# =============================================================================
# 📊 SECTION 6: Training Dataset Generation
# =============================================================================

def generate_training_dataset(size: int = 60) -> List[dspy.Example]:
    """Generate training dataset for optimization

    Builds examples from 5 hand-written patterns plus 20 patterns derived from
    genre clusters (each movie in a cluster is paired with the other three).

    Args:
        size: Maximum number of examples to return. Only 25 patterns exist, so any
            value above 25 still yields 25 examples; values of 0 or below yield an
            empty list.

    Returns:
        A list of ``dspy.Example`` objects whose only input field is ``user_input``.
        Each also carries ``final_recommendations`` (the reference answer) and the
        ``input_movie`` / ``expected_themes`` / ``expected_recs`` fields read by the
        evaluation metrics.
    """

    # Curated examples of good movie taste patterns
    training_patterns = [
        {
            "input_movie": "Inception",
            "expected_recommendations": ["Memento", "Shutter Island", "The Prestige"],
            "expected_themes": ["mind-bending", "psychological", "complex narrative"],
            "quality_narrative": "If you loved Inception's layered reality and complex storytelling..."
        },
        {
            "input_movie": "The Matrix",
            "expected_recommendations": ["Blade Runner 2049", "Ex Machina", "Ghost in the Shell"],
            "expected_themes": ["artificial intelligence", "reality questioning", "cyberpunk"],
            "quality_narrative": "Like The Matrix, these films explore the nature of reality..."
        },
        {
            "input_movie": "Pulp Fiction",
            "expected_recommendations": ["Reservoir Dogs", "Kill Bill", "Snatch"],
            "expected_themes": ["non-linear narrative", "crime", "dark humor"],
            "quality_narrative": "These films share Tarantino's distinctive storytelling style..."
        },
        {
            "input_movie": "Interstellar",
            "expected_recommendations": ["Arrival", "Contact", "2001: A Space Odyssey"],
            "expected_themes": ["space exploration", "scientific concepts", "emotional depth"],
            "quality_narrative": "Like Interstellar, these films blend hard science with human emotion..."
        },
        {
            "input_movie": "The Dark Knight",
            "expected_recommendations": ["Heat", "The Departed", "Zodiac"],
            "expected_themes": ["crime thriller", "moral complexity", "psychological depth"],
            "quality_narrative": "These films share The Dark Knight's serious approach to crime..."
        }
    ]

    # Generate more patterns programmatically
    additional_patterns = []
    movie_clusters = {
        "horror": ["The Exorcist", "Hereditary", "The Babadook", "Get Out"],
        "comedy": ["The Grand Budapest Hotel", "In Bruges", "Kiss Kiss Bang Bang", "The Nice Guys"],
        "drama": ["There Will Be Blood", "No Country for Old Men", "Moonlight", "Parasite"],
        "action": ["Mad Max: Fury Road", "John Wick", "The Raid", "Baby Driver"],
        "sci-fi": ["Blade Runner", "Alien", "Dune", "Her"]
    }

    for genre, movies in movie_clusters.items():
        for i, movie in enumerate(movies):
            # Every other movie in the same cluster is an expected recommendation
            recommendations = [m for j, m in enumerate(movies) if j != i][:3]
            additional_patterns.append({
                "input_movie": movie,
                "expected_recommendations": recommendations,
                "expected_themes": [genre, "quality filmmaking", "genre excellence"],
                "quality_narrative": f"As a fan of {movie}, you'll appreciate these {genre} masterpieces..."
            })

    # Slice the combined pool rather than only the generated part: the previous
    # `additional_patterns[:size - len(training_patterns)]` produced a negative slice
    # bound for size < 5 and returned far more than `size` examples.
    all_patterns = (training_patterns + additional_patterns)[:max(size, 0)]

    # Convert to DSPy examples
    training_examples = []
    for pattern in all_patterns:
        # Create the input as a user query
        user_query = f"I loved the movie {pattern['input_movie']}. Can you recommend similar movies?"

        # Create expected output format
        expected_output = f"""
**Recommendations for {pattern['input_movie']} lovers:**

1. **{pattern['expected_recommendations'][0]}**: {pattern['quality_narrative']}
2. **{pattern['expected_recommendations'][1]}**: Connected through {', '.join(pattern['expected_themes'][:2])}
3. **{pattern['expected_recommendations'][2]}**: Shares the same {pattern['expected_themes'][0]} appeal

**Analysis**: Based on your love for {pattern['input_movie']}, I identified these key themes: {', '.join(pattern['expected_themes'])}. These recommendations match those preferences perfectly.
"""

        example = dspy.Example(
            user_input=user_query,
            final_recommendations=expected_output,
            input_movie=pattern['input_movie'],
            expected_themes=pattern['expected_themes'],
            expected_recs=pattern['expected_recommendations']
        ).with_inputs("user_input")

        training_examples.append(example)

    return training_examples

# Generate training and validation sets
print("📊 Generating training dataset...")
full_dataset = generate_training_dataset(80)
# Shuffle with a dedicated, seeded RNG so the train/val split (and therefore the
# baseline and optimized scores) is the same after every Restart & Run All.
random.Random(42).shuffle(full_dataset)

# Split into train/val
train_size = int(0.7 * len(full_dataset))
trainset = full_dataset[:train_size]
valset = full_dataset[train_size:]

print(f"✅ Dataset created: {len(trainset)} training examples, {len(valset)} validation examples")
print("📋 Sample training example:")
print(f"Input: {trainset[0].user_input}")
print(f"Expected output (first 200 chars): {trainset[0].final_recommendations[:200]}...")


In [None]:

# =============================================================================
# 📏 SECTION 7: Custom Evaluation Metrics
# =============================================================================

def extract_recommended_movies(response: str) -> List[str]:
    """Pull up to three distinct movie titles out of a free-text agent response.

    Three case-insensitive patterns are tried in order; titles are kept in the order
    they are first found, must be longer than two characters after stripping, and
    exact duplicates are dropped.
    """
    title_patterns = (
        r'\d+\.\s*\*\*([^*]+)\*\*',      # "1. **Movie Title**"
        r'\*\*([^*]+)\*\*(?=:)',          # "**Movie Title**:"
        r'recommend[^:]*:\s*([^,\n]+)',   # "I recommend: Movie Title"
    )

    candidates = (
        hit.group(1).strip()
        for pattern in title_patterns
        for hit in re.finditer(pattern, response, re.IGNORECASE)
    )

    # dict keys keep first-seen order, which gives ordered de-duplication
    unique_titles = dict.fromkeys(title for title in candidates if len(title) > 2)
    return list(unique_titles)[:3]  # Return top 3

def recommendation_relevance_metric(example, pred, trace=None) -> float:
    """Score how well the predicted recommendations match the expected ones.

    Combines three parts: title overlap with ``example.expected_recs`` (40%), the
    share of ``example.expected_themes`` mentioned in the response (40%) and a set of
    surface quality checks (20%). The result lies between 0.0 and 1.0. Returns 0.0
    when no titles can be extracted, and also (after printing the error) when
    anything raises. ``trace`` is accepted but not used.
    """
    try:
        response_text = pred.final_recommendations
        predicted_movies = extract_recommended_movies(response_text)
        expected_movies = example.expected_recs
        expected_themes = {theme.lower() for theme in example.expected_themes}

        if not predicted_movies:
            return 0.0

        def title_overlap(predicted: str, expected: str) -> float:
            """1.0 for substring containment either way, 0.5 for a shared long word."""
            predicted_lc, expected_lc = predicted.lower(), expected.lower()
            if predicted_lc in expected_lc or expected_lc in predicted_lc:
                return 1.0
            expected_words = expected_lc.split()
            if any(word in expected_words for word in predicted_lc.split() if len(word) > 3):
                return 0.5
            return 0.0

        # Part 1: title matches over every (predicted, expected) pair, capped at one
        # full point per expected movie
        raw_movie_score = sum(
            (title_overlap(predicted, expected)
             for predicted in predicted_movies
             for expected in expected_movies),
            0.0,
        )
        movie_score = min(raw_movie_score, len(expected_movies)) / len(expected_movies)

        # Part 2: fraction of expected themes that literally appear in the response
        response_lower = response_text.lower()
        theme_hits = sum(theme in response_lower for theme in expected_themes)
        theme_score = theme_hits / len(expected_themes)

        # Part 3: surface quality checks
        quality_checks = (
            "recommendation" in response_lower,
            "similar" in response_lower,
            "love" in response_lower or "enjoy" in response_lower,
            len(response_text) > 200,  # Substantial response
            "**" in response_text,  # Formatted properly
        )
        quality_score = sum(quality_checks) / len(quality_checks)

        weighted_total = (movie_score * 0.4) + (theme_score * 0.4) + (quality_score * 0.2)
        return min(weighted_total, 1.0)

    except Exception as e:
        print(f"Error in relevance metric: {e}")
        return 0.0

def narrative_quality_metric(example, pred, trace=None) -> float:
    """Score the narrative quality of ``pred.final_recommendations``.

    Seven yes/no checks are evaluated (keyword presence in four vocabularies, markdown
    structure, length window, sentence count) and the score is the fraction that pass,
    so it lies between 0.0 and 1.0. Keyword checks are plain substring tests on the
    lower-cased response. Any exception is printed and scored as 0.0. ``example`` and
    ``trace`` are accepted but not used.
    """
    try:
        response = pred.final_recommendations
        lowered = response.lower()

        def mentions_any(terms) -> bool:
            return any(term in lowered for term in terms)

        checks = [
            # compelling language
            mentions_any((
                "captivating", "compelling", "brilliant", "masterpiece", "extraordinary",
                "remarkable", "stunning", "powerful", "moving", "unforgettable",
            )),
            # explains the connection to the user's taste
            mentions_any((
                "like", "similar to", "if you loved", "shares", "connects",
                "reminiscent of", "echoes", "parallels",
            )),
            # specific film-making details
            mentions_any((
                "director", "cinematography", "themes", "style", "genre",
                "plot", "character", "atmosphere",
            )),
            # emotional appeal
            mentions_any((
                "feel", "experience", "journey", "emotion", "heart",
                "soul", "passion", "depth",
            )),
            # structure: at least a few bold markers
            "**" in response and len(response.split("**")) >= 4,
            # length: Optimal length
            300 <= len(response) <= 1500,
            # coherence: Multiple sentences
            response.count(".") >= 5,
        ]

        return sum(checks) / len(checks)

    except Exception as e:
        print(f"Error in narrative quality metric: {e}")
        return 0.0

def combined_movie_recommendation_metric(example, pred, trace=None) -> float:
    """Blend the relevance and narrative-quality metrics into a single score.

    Relevance is weighted 0.6 and narrative quality 0.4; ``trace`` is forwarded
    unchanged to both underlying metrics.
    """
    # NOTE(review): the same float is returned whether or not `trace` is set. DSPy
    # optimizers may treat any truthy metric value as "pass" while bootstrapping demos —
    # confirm whether a thresholded bool is wanted in that mode.
    relevance_weight, narrative_weight = 0.6, 0.4

    relevance = recommendation_relevance_metric(example, pred, trace)
    narrative = narrative_quality_metric(example, pred, trace)

    return (relevance * relevance_weight) + (narrative * narrative_weight)

print("📏 Custom evaluation metrics ready!")


📏 Custom evaluation metrics ready!


In [None]:

# =============================================================================
# 🎯 SECTION 8: DSPy Optimization Process
# =============================================================================

def run_optimization():
    """Run the complete optimization process

    Scores ``original_orchestrator`` on the first 10 validation examples, compiles an
    optimized copy with MIPROv2 on the first 30 training examples, re-scores it with
    the same evaluator, and logs both scores and their difference to MLflow (baseline
    and optimization are recorded as two separate runs).

    Returns:
        Tuple of ``(optimized_orchestrator, baseline_score, optimized_score)``.
    """

    print("🎯 Starting DSPy optimization process...")

    # Set up the optimizer
    optimizer = dspy.MIPROv2(
        metric=combined_movie_recommendation_metric,
        auto="light",  # Use light mode for faster optimization
        num_threads=4,
        verbose=True
    )

    # Create evaluator for baseline measurement
    # (the same evaluator is reused below so both scores come from identical examples)
    evaluator = dspy.Evaluate(
        metric=combined_movie_recommendation_metric,
        devset=valset[:10],  # Use smaller set for demo
        display_table=True,
        display_progress=True
    )

    print("📊 Measuring baseline performance...")
    with mlflow.start_run(run_name="baseline_measurement"):
        # NOTE(review): the code below treats the evaluator result as a plain number —
        # confirm this holds for the installed DSPy version.
        baseline_score = evaluator(original_orchestrator)
        mlflow.log_metric("baseline_score", baseline_score)
        print(f"🔍 Baseline Score: {baseline_score:.3f}")

    print("🚀 Running optimization...")
    with mlflow.start_run(run_name="optimization_process"):
        optimized_orchestrator = optimizer.compile(
            original_orchestrator,
            trainset=trainset[:30],  # Use subset for demo
            valset=valset[:10],
            requires_permission_to_run=False
        )

        print("📊 Measuring optimized performance...")
        optimized_score = evaluator(optimized_orchestrator)
        improvement = optimized_score - baseline_score
        mlflow.log_metric("optimized_score", optimized_score)
        mlflow.log_metric("improvement", improvement)

        # A relative change is undefined for a zero baseline. Guarding here keeps a
        # ZeroDivisionError from discarding the result of the long optimization run.
        if baseline_score:
            improvement_pct = f"{(improvement / baseline_score * 100):.1f}%"
        else:
            improvement_pct = "n/a"

        print("🎉 Optimization Results:")
        print(f"   Baseline Score: {baseline_score:.3f}")
        print(f"   Optimized Score: {optimized_score:.3f}")
        print(f"   Improvement: {improvement:.3f} ({improvement_pct})")

    return optimized_orchestrator, baseline_score, optimized_score

# Run the optimization
print("⚡ Running optimization process...")
optimized_orchestrator, baseline_score, optimized_score = run_optimization()
print("✅ Optimization complete!")


In [None]:

# =============================================================================
# 🚀 SECTION 9: Comparison Interface
# =============================================================================

def create_comparison_interface():
    """Create interface showing original vs optimized system

    Builds a Gradio ``Interface`` with one text input (a movie title) and four text
    outputs: the original system's answer, the optimized system's answer, a textual
    comparison of the two, and the recorded evaluation scores. The interface is
    returned without being launched.
    """

    def improvement_pct_text() -> str:
        """Relative score improvement as 'X.X%', or 'n/a' when the baseline is zero."""
        # Guarded because dividing by a zero baseline would raise while the interface
        # description is being built, or inside every comparison request.
        if not baseline_score:
            return "n/a"
        return f"{((optimized_score - baseline_score) / baseline_score * 100):.1f}%"

    def compare_systems(movie_title: str):
        """Compare original vs optimized recommendations"""
        if not movie_title.strip():
            return "Please enter a movie title!", "", "", ""

        user_query = f"I loved the movie {movie_title}. Can you recommend similar movies?"

        try:
            # Get original system response
            print(f"🤖 Getting original system response for '{movie_title}'...")
            original_result = original_orchestrator(user_input=user_query)
            original_response = original_result.final_recommendations

            # Get optimized system response
            print(f"🎯 Getting optimized system response for '{movie_title}'...")
            optimized_result = optimized_orchestrator(user_input=user_query)
            optimized_response = optimized_result.final_recommendations

            # Analyze improvements
            improvements = analyze_improvements(original_response, optimized_response, movie_title)

            # Performance comparison
            performance_comparison = f"""
**🔍 Performance Analysis:**

**Baseline System Score**: {baseline_score:.3f}
**Optimized System Score**: {optimized_score:.3f}
**Improvement**: {optimized_score - baseline_score:.3f} ({improvement_pct_text()})

**Key Optimization Areas**:
• Recommendation Relevance: Better theme matching
• Narrative Quality: More compelling explanations
• Agent Coordination: Improved multi-agent workflow
• TMDB Integration: Enhanced real data usage

**Training Data**: {len(trainset)} examples
**Validation Data**: {len(valset)} examples
**Optimization Method**: DSPy MIPROv2 with custom metrics
"""

            return original_response, optimized_response, improvements, performance_comparison

        except Exception as e:
            # Errors are shown in the first output box so the UI keeps working
            error_msg = f"Error comparing systems: {str(e)}"
            return error_msg, "", "", ""

    def analyze_improvements(original: str, optimized: str, movie: str) -> str:
        """Analyze specific improvements between versions"""

        orig_movies = extract_recommended_movies(original)
        opt_movies = extract_recommended_movies(optimized)

        analysis = f"""
**🎯 Improvement Analysis for "{movie}":**

**Original Recommendations**: {', '.join(orig_movies) if orig_movies else 'None extracted'}
**Optimized Recommendations**: {', '.join(opt_movies) if opt_movies else 'None extracted'}

**Length Comparison**:
• Original Response: {len(original)} characters
• Optimized Response: {len(optimized)} characters

**Quality Indicators**:
• **Formatting**: {'✅' if '**' in optimized else '❌'} Better formatting
• **Detail Level**: {'✅' if len(optimized) > len(original) else '❌'} More detailed explanations
• **Movie Count**: {'✅' if len(opt_movies) >= len(orig_movies) else '❌'} Adequate recommendations
• **Narrative Elements**: {'✅' if any(word in optimized.lower() for word in ['love', 'enjoy', 'similar', 'like']) else '❌'} Compelling language

**Optimization Impact**:
The optimized system was trained on {len(trainset)} examples to improve:
1. **Thematic Relevance**: Better matching of movie themes and genres
2. **Narrative Coherence**: More compelling "why you'll love this" explanations
3. **Response Structure**: Clearer formatting and organization
4. **Agent Coordination**: Improved multi-agent collaboration

**Training Focus**: The optimization specifically targeted recommendation accuracy and narrative quality using custom evaluation metrics.
"""
        return analysis

    # Create Gradio interface
    interface = gr.Interface(
        fn=compare_systems,
        inputs=[
            gr.Textbox(
                label="🎬 Movie Title",
                placeholder="Try: Inception, The Matrix, Pulp Fiction, Interstellar",
                lines=1
            )
        ],
        outputs=[
            gr.Textbox(label="🤖 Original System Response", lines=15),
            gr.Textbox(label="🎯 Optimized System Response", lines=15),
            gr.Textbox(label="📊 Improvement Analysis", lines=12),
            gr.Textbox(label="⚖️ Performance Metrics", lines=10)
        ],
        title="🎯 DSPy Multi-Agent Optimization: Before vs After",
        description=f"""
        **Real DSPy Optimization Demonstration**

        🎯 **Optimization Results**: {optimized_score:.3f} vs {baseline_score:.3f} baseline ({improvement_pct_text()} improvement)

        📊 **What Was Optimized**:
        • **Training Data**: {len(trainset)} curated movie recommendation examples
        • **Evaluation Metrics**: Custom metrics for recommendation relevance + narrative quality
        • **Optimization Method**: DSPy MIPROv2 with multi-agent coordination
        • **Focus Areas**: Thematic accuracy, compelling narratives, better agent collaboration

        🎬 **Try It**: Enter any movie title to see how optimization improved the recommendations!
        """,
        examples=[
            ["Inception"],
            ["The Matrix"],
            ["Pulp Fiction"],
            ["Interstellar"],
            ["The Dark Knight"]
        ],
        theme=gr.themes.Soft()
    )

    return interface

# Create and launch the comparison interface
comparison_demo = create_comparison_interface()

print("🚀 Comparison interface ready!")


🚀 Comparison interface ready!


In [None]:

# =============================================================================
# 🎉 SECTION 10: Results Summary and Next Steps
# =============================================================================

def display_optimization_summary():
    """Display comprehensive summary of optimization results

    Formats the module-level ``baseline_score``, ``optimized_score`` and the size of
    ``trainset`` into a markdown report, prints it, and returns the same text. The
    relative improvement is shown as ``n/a`` when the baseline score is zero.
    """

    improvement = optimized_score - baseline_score
    # A relative change is undefined for a zero baseline; report "n/a" rather than
    # raising ZeroDivisionError at the very end of the notebook.
    if baseline_score:
        improvement_pct = f"{(improvement / baseline_score * 100):.1f}%"
    else:
        improvement_pct = "n/a"

    summary = f"""
# 🎯 DSPy Multi-Agent Optimization - Complete Results

## 📊 Performance Improvement
- **Baseline Score**: {baseline_score:.3f}
- **Optimized Score**: {optimized_score:.3f}
- **Improvement**: {improvement:.3f} ({improvement_pct})

## 🎬 System Architecture
**Original Multi-Agent Components**:
- Movie Analysis Agent (hypothesis generation)
- Narrative Agent (story construction)
- Master Orchestrator (agent coordination)
- TMDB Integration (real movie data)

**Optimization Enhancements**:
- Custom evaluation metrics for movie recommendations
- Training dataset with {len(trainset)} curated examples
- DSPy MIPROv2 optimizer with light configuration
- Focus on recommendation relevance + narrative quality

## 🎯 Key Improvements Demonstrated
1. **Better Recommendations**: More thematically relevant movie suggestions
2. **Enhanced Narratives**: More compelling "why you'll love this" explanations
3. **Improved Structure**: Better formatting and organization
4. **Smarter Coordination**: More efficient multi-agent collaboration

## 🚀 Technical Implementation
- **Framework**: DSPy ReAct agents with tool usage
- **Data Source**: TMDB API for real movie metadata
- **Optimization**: MIPROv2 with custom movie recommendation metrics
- **Evaluation**: Combined relevance + narrative quality scoring
- **Tracking**: MLflow for experiment management

## 🎬 Next Steps for Production
1. **Expand Training Data**: Include more diverse movie preferences
2. **Advanced Metrics**: Add user satisfaction and click-through rate simulation
3. **A/B Testing**: Deploy optimized system alongside baseline for comparison
4. **Continuous Learning**: Regular retraining with new user preference data
5. **Domain Expansion**: Apply same optimization approach to TV shows, books, music

## 🔧 Code Availability
This complete optimization pipeline demonstrates:
- Real multi-agent system optimization using DSPy
- Custom evaluation metrics for domain-specific tasks
- Training data generation for recommendation systems
- Before/after comparison interface
- Production-ready optimization workflow

🎉 **Ready to scale and deploy!**
"""

    print(summary)
    return summary

# Display final summary
optimization_summary = display_optimization_summary()

# A relative improvement is undefined for a zero baseline; show "n/a" instead of
# raising ZeroDivisionError right before the demo is launched.
if baseline_score:
    improvement_text = f"{((optimized_score - baseline_score) / baseline_score * 100):.1f}%"
else:
    improvement_text = "n/a"

print("\n" + "="*80)
print("🎉 DSPy MULTI-AGENT OPTIMIZATION DEMONSTRATION COMPLETE!")
print("="*80)
print(f"🎯 Achieved {improvement_text} improvement in movie recommendation quality")
print("🚀 Launch the Gradio interface below to try the optimized system!")
print("="*80)

# Launch the comparison interface
# share=True asks Gradio for a public share link; debug=True is passed through unchanged.
comparison_demo.launch(share=True, debug=True)