# 🤝 commonGround - Multi-Model Consensus Analyzer

A tool to compare responses from multiple LLMs and find common ground through consensus analysis.

In [None]:
# Install required packages
#!pip install gradio openai python-dotenv --quiet

In [1]:
import gradio as gr
import os
from openai import OpenAI
from dotenv import load_dotenv
import json
from typing import List, Dict, Tuple
import time

# Load environment variables; override=True lets values from .env replace
# variables that are already set in the process environment.
load_dotenv(override=True)
api_key = os.getenv('OPENROUTER_API_KEY')

# Fail fast when the key is missing. With api_key=None the OpenAI SDK falls
# back to the OPENAI_API_KEY environment variable, which would send an
# unrelated credential to openrouter.ai, or fail with an error message that
# never mentions OPENROUTER_API_KEY.
if not api_key:
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set. Add it to your .env file or environment."
    )

# Initialize OpenRouter client (OpenRouter is reached through the OpenAI SDK
# by pointing base_url at its endpoint)
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=api_key
)

In [3]:
# Available models (VERIFIED December 2025 - Free + Paid)
# Each entry is an OpenRouter model identifier; it is passed unchanged as the
# `model` argument of the chat-completions call and offered as a dropdown choice.
# NOTE: order matters. The UI picks its default dropdown selections by
# position (indices 0, 2, 3 and 6), so inserting or reordering entries
# changes which models are preselected.
AVAILABLE_MODELS = [
    # Google Gemini - Free, fast, reliable
    "google/gemini-2.0-flash-exp:free",
    "google/gemini-2.5-flash-lite-preview:free",
    
    # Meta Llama - Free, good quality
    "meta-llama/llama-3.3-70b-instruct:free",
    
    # Anthropic Claude - PAID but excellent
    "anthropic/claude-3.5-haiku",
    "anthropic/claude-3.5-sonnet",
    "anthropic/claude-haiku-4.5",
    
    # OpenAI - PAID, very good
    "openai/gpt-4o-mini",
    
    # DeepSeek - Free, excellent for reasoning
    "deepseek/deepseek-r1:free",
]

# Sample prompts shown in the UI's examples panel; the first entry is also
# used as the initial value of the question textbox.
EXAMPLE_QUESTIONS = [
    "What are the most important metrics for evaluating LLM performance?",
    "How do you design an effective prompt for summarization tasks?",
    "What's the difference between BLEU and ROUGE scores?",
    "Explain the concept of few-shot learning in LLMs"
]

In [4]:
def call_model(model: str, question: str, temperature: float, top_p: float, max_tokens: int = 500) -> str:
    """Call a single model and return its response text.

    This function never raises. Any failure is converted into a short
    markdown message that starts with a status emoji, which is how
    downstream steps recognise a failed call.

    Args:
        model: OpenRouter model identifier.
        question: User prompt, sent as a single user message.
        temperature: Sampling temperature; values above 1.0 are capped at 1.0
            before the request is sent.
        top_p: Nucleus-sampling parameter, forwarded unchanged.
        max_tokens: Upper bound on generated tokens.

    Returns:
        The stripped completion text, or an emoji-prefixed error message.
    """
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": question}],
            temperature=min(temperature, 1.0),
            top_p=top_p,
            max_tokens=max_tokens
        )
        content = response.choices[0].message.content
        # A completion can come back without text; report that explicitly
        # instead of surfacing an AttributeError from None.strip().
        if content is None or not content.strip():
            return f"‚ö†Ô∏è **Empty Response**: {model} returned no text"
        return content.strip()
    except Exception as e:
        raw = str(e)
        lowered = raw.lower()
        # API errors raised by the SDK expose the HTTP status; prefer it over
        # substring matching, which misfires on words such as "generate"
        # (contains "rate") or "author" (contains "auth").
        status = getattr(e, "status_code", None)

        def matches(code: int, keyword: str) -> bool:
            if isinstance(status, int):
                return status == code
            # No status available: fall back to the textual heuristics.
            return str(code) in raw or keyword in lowered

        if matches(429, "rate"):
            return f"‚ö†Ô∏è **Rate Limited**: {model} - Free tier exhausted"
        elif matches(402, "insufficient"):
            return f"üí≥ **Insufficient Credits**: Add credits for {model}"
        elif matches(404, "not found"):
            return f"‚ùå **Unavailable**: {model} temporarily down"
        elif matches(401, "auth"):
            return f"‚ùå **Auth Error**: Check OPENROUTER_API_KEY"
        else:
            return f"‚ùå **Error**: {raw[:150]}"

In [5]:
def generate_responses(
    question: str,
    model1: str, model2: str, model3: str, model4: str,
    temperature: float, top_p: float,
    progress=gr.Progress()
) -> Tuple[str, str, str, str, str, List[str], List[str]]:
    """Step 1: Generate responses from 4 models.

    NOTE: this notebook defines `generate_responses` in more than one cell;
    the definition executed last is the one the UI ends up calling.

    Args:
        question: Prompt sent to every model.
        model1, model2, model3, model4: Four distinct model identifiers.
        temperature: Sampling temperature forwarded to `call_model`.
        top_p: Nucleus-sampling value forwarded to `call_model`.
        progress: Gradio progress tracker, updated before each model call.

    Returns:
        A 7-tuple: four markdown panels (one per model), a status message,
        the raw response strings, and the model identifiers. On a validation
        failure the first panel carries the error and both lists are empty.
    """
    
    # Validate inputs (tolerates None as well as blank text)
    if not (question or "").strip():
        return ("‚ùå Please enter a question", "", "", "", 
                "‚ö†Ô∏è No question provided", [], [])
    
    models = [model1, model2, model3, model4]
    # An unselected model (None/"") would pass the uniqueness test alone and
    # then crash on model.split() below, so reject it here as well.
    if not all(models) or len(set(models)) != 4:
        return ("‚ùå Please select 4 different models", "", "", "",
                "‚ö†Ô∏è Duplicate models selected", [], [])
    
    # Generate responses with progress tracking
    responses = []
    response_texts = []
    
    for i, model in enumerate(models, 1):
        progress((i-1)/4, desc=f"ü§ñ Calling {model.split('/')[-1]}...")
        response = call_model(model, question, temperature, top_p)
        responses.append(response)
        response_texts.append(f"### ü§ñ Model {i}: {model}\n\n{response}")
    
    # call_model reports failures as text starting with one of these markers;
    # do not announce success when any call failed.
    failed = sum(r.startswith(("‚ùå", "‚ö†Ô∏è", "üí≥")) for r in responses)
    
    if failed:
        progress(1.0)
        status_msg = (
            f"‚ö†Ô∏è **{failed} of {len(models)} models failed.** "
            "Review the errors and regenerate before analyzing."
        )
    else:
        progress(1.0, desc="‚úÖ All responses generated!")
        status_msg = "‚úÖ **Responses generated successfully!** You can now analyze them."
    
    return (*response_texts, status_msg, responses, models)

In [None]:
def generate_responses(
    question: str,
    model1: str, model2: str, model3: str, model4: str,
    temperature: float, top_p: float,
    progress=gr.Progress()
) -> Tuple[str, str, str, str, str, List[str], List[str]]:
    """Step 1: Generate responses from 4 models.

    Variant that reports progress as a bare fraction (no description text).
    NOTE: this notebook defines `generate_responses` in more than one cell;
    the definition executed last is the one the UI ends up calling.

    Args:
        question: Prompt sent to every model.
        model1, model2, model3, model4: Four distinct model identifiers.
        temperature: Sampling temperature forwarded to `call_model`.
        top_p: Nucleus-sampling value forwarded to `call_model`.
        progress: Gradio progress tracker, updated before each model call.

    Returns:
        A 7-tuple: four markdown panels (one per model), a status message,
        the raw response strings, and the model identifiers. On a validation
        failure the first panel carries the error and both lists are empty.
    """
    
    # Validate inputs (tolerates None as well as blank text)
    if not (question or "").strip():
        return ("‚ùå Please enter a question", "", "", "", 
                "‚ö†Ô∏è No question provided", [], [])
    
    models = [model1, model2, model3, model4]
    # An unselected model (None/"") would pass the uniqueness test alone;
    # reject it here rather than spending a request on an invalid model.
    if not all(models) or len(set(models)) != 4:
        return ("‚ùå Please select 4 different models", "", "", "",
                "‚ö†Ô∏è Duplicate models selected", [], [])
    
    # Generate responses
    responses = []
    response_texts = []
    
    for i, model in enumerate(models, 1):
        # Simplified progress - only show percentage
        progress((i-1)/4)
        response = call_model(model, question, temperature, top_p)
        responses.append(response)
        response_texts.append(f"### ü§ñ Model {i}: {model}\n\n{response}")
    
    # call_model reports failures as text starting with one of these markers;
    # do not announce success when any call failed.
    failed = sum(r.startswith(("‚ùå", "‚ö†Ô∏è", "üí≥")) for r in responses)
    
    if failed:
        status_msg = (
            f"‚ö†Ô∏è **{failed} of {len(models)} models failed.** "
            "Review the errors and regenerate before analyzing."
        )
    else:
        status_msg = "‚úÖ **Responses generated!** Click 'Step 2' to analyze."
    
    return (*response_texts, status_msg, responses, models)


def _parse_analysis_json(raw: str) -> Dict:
    """Parse the evaluator's reply into a dict, tolerating markdown fences or prose around the JSON object."""
    text = (raw or "").strip()
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # Models often wrap JSON in ```json fences or add a sentence around
        # it; retry on the outermost {...} span before giving up.
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            raise
        parsed = json.loads(text[start:end + 1])
    if not isinstance(parsed, dict):
        raise ValueError("analysis reply is not a JSON object")
    return parsed


def analyze_responses_step(
    responses: List[str],
    models: List[str],
    objective: str,
    progress=gr.Progress()
) -> Tuple[str, str, str, Dict]:
    """Step 2: Analyze responses for consensus and metrics.

    Sends the four responses to an evaluator model, asks for a JSON report,
    and renders it as consensus markdown plus an HTML metrics table.

    Args:
        responses: Raw response strings produced by step 1 (exactly four).
        models: Model identifiers, parallel to `responses`.
        objective: What an ideal answer should achieve; included in the prompt.
        progress: Gradio progress tracker.

    Returns:
        (consensus markdown, metrics HTML, status message, analysis dict).
        If the evaluator call or JSON parsing fails, the dict contains an
        "error" entry with placeholder values and the status reports it.
    """
    
    if (not responses or len(responses) != 4
            or not models or len(models) != len(responses)):
        return ("", "", "‚ö†Ô∏è Please generate responses first!", {})
    
    # Check ONLY for actual error messages (starting with emoji)
    if any(r.startswith(("‚ùå", "‚ö†Ô∏è", "üí≥")) for r in responses):
        return ("", "", "‚ö†Ô∏è Some models failed. Please regenerate responses.", {})
    
    # Simplified progress
    progress(0.3)
    
    # Built outside the prompt f-string: backslashes inside f-string
    # expressions are a SyntaxError before Python 3.12.
    responses_block = ''.join(
        f"\n\nModel {i} ({model}):\n{resp}"
        for i, (model, resp) in enumerate(zip(models, responses), 1)
    )
    
    # Create analysis prompt
    analysis_prompt = f"""You are an expert AI evaluator. Analyze these 4 model responses.

Objective: {objective}

Responses:
{responses_block}

Provide analysis in JSON format:
{{
    "consensus": ["point 1", "point 2", "point 3"],
    "coherence_score": "1-10 with brief explanation",
    "repetition_simplicity": "assessment of redundancy and complexity",
    "model_metrics": [
        {{
            "model": "Model 1",
            "hallucination": "yes/no/NA",
            "imprecise": "yes/no/NA",
            "off_topic": "yes/no/NA",
            "subjective": "yes/no/NA",
            "overly_enthusiastic": "yes/no/NA",
            "tone": "description"
        }}
    ]
}}

Return ONLY valid JSON."""
    
    analysis_failed = False
    try:
        progress(0.6)
        analysis = client.chat.completions.create(
            model="anthropic/claude-3.5-sonnet",
            messages=[{"role": "user", "content": analysis_prompt}],
            temperature=0.3,
            max_tokens=1500
        )
        analysis_data = _parse_analysis_json(analysis.choices[0].message.content)
    except Exception as e:
        # UI boundary: keep the app responsive and hand back placeholders.
        analysis_failed = True
        analysis_data = {
            "error": f"Analysis failed: {str(e)}",
            "consensus": ["Unable to analyze"],
            "coherence_score": "N/A",
            "repetition_simplicity": "N/A",
            "model_metrics": []
        }
    
    # Format consensus (a bare string is treated as a single point rather
    # than being enumerated character by character)
    consensus = analysis_data.get('consensus', [])
    if isinstance(consensus, str):
        consensus = [consensus]
    elif not isinstance(consensus, (list, tuple)):
        consensus = []
    consensus_text = "### üéØ Consensus Points\n\n"
    consensus_text += ''.join(f"{i}. {point}\n" for i, point in enumerate(consensus, 1))
    consensus_text += f"\n**Coherence Score:** {analysis_data.get('coherence_score', 'N/A')}\n"
    consensus_text += f"\n**Repetition/Simplicity:** {analysis_data.get('repetition_simplicity', 'N/A')}"
    
    # Format metrics table
    metrics = analysis_data.get('model_metrics', [])
    if not isinstance(metrics, list):
        metrics = []
    metrics_html = format_metrics_table(metrics)
    
    if analysis_failed:
        # Do not claim success when the evaluator call or parsing failed.
        status_msg = f"‚ö†Ô∏è **{analysis_data['error'][:200]}** Try Step 2 again."
    else:
        status_msg = "‚úÖ **Analysis complete!** Click 'Step 3' for winner."
    
    return (consensus_text, metrics_html, status_msg, analysis_data)


def determine_winner_step(
    responses: List[str],
    models: List[str],
    analysis_data: Dict,
    objective: str,
    progress=gr.Progress()
) -> Tuple[str, str]:
    """Step 3: Determine the winning model.

    Asks a judge model to pick a winner from the four responses, given the
    objective and the consensus/coherence summary from step 2.
    NOTE: this notebook defines `determine_winner_step` in more than one
    cell; the definition executed last is the one the UI ends up calling.

    Args:
        responses: Raw response strings from step 1.
        models: Model identifiers, parallel to `responses`.
        analysis_data: Analysis dict produced by step 2.
        objective: What an ideal answer should achieve.
        progress: Gradio progress tracker.

    Returns:
        (verdict markdown, status message). Failures of the judge call are
        reported inside the verdict text and reflected in the status.
    """
    
    if not responses or not analysis_data:
        return ("", "‚ö†Ô∏è Please complete Steps 1 and 2 first!")
    
    # Simplified progress
    progress(0.5)
    
    # Built outside the prompt f-string: backslashes inside f-string
    # expressions are a SyntaxError before Python 3.12. zip() also avoids an
    # IndexError if the two lists ever differ in length.
    responses_block = ''.join(
        f"\n\n{model}:\n{resp}" for model, resp in zip(models, responses)
    )
    
    # The consensus comes from model-generated JSON, so items are not
    # guaranteed to be strings; str() keeps the join from raising TypeError.
    consensus = analysis_data.get('consensus', [])
    if isinstance(consensus, (list, tuple)):
        consensus_summary = ', '.join(str(point) for point in consensus)
    else:
        consensus_summary = str(consensus)
    
    verdict_prompt = f"""You are an expert judge evaluating LLM responses.

Objective: {objective}

Responses:
{responses_block}

Analysis Summary:
- Consensus: {consensus_summary}
- Coherence: {analysis_data.get('coherence_score', 'N/A')}

Evaluate each model based on:
1. **Objective fulfillment**: Did it meet the stated objective?
2. **Clarity**: Clear and well-structured?
3. **Consensus alignment**: Aligns with common ground?
4. **Structure**: Well-organized?

Declare the WINNER and provide brief justification (3-4 sentences)."""
    
    try:
        verdict = client.chat.completions.create(
            model="anthropic/claude-3.5-sonnet",
            messages=[{"role": "user", "content": verdict_prompt}],
            temperature=0.5,
            max_tokens=500
        )
        content = verdict.choices[0].message.content
        if content is None or not content.strip():
            raise ValueError("the judge model returned an empty reply")
    except Exception as e:
        # UI boundary: show the failure instead of crashing the handler.
        verdict_text = f"### üèÜ Final Verdict\n\nUnable to determine verdict: {str(e)}"
        status_msg = "‚ö†Ô∏è **Could not determine a winner.** See the verdict section for details."
    else:
        verdict_text = f"### üèÜ Final Verdict\n\n{content.strip()}"
        status_msg = "‚úÖ **All done!** Winner determined."
    
    return (verdict_text, status_msg)

In [7]:
def determine_winner_step(
    responses: List[str],
    models: List[str],
    analysis_data: Dict,
    objective: str,
    progress=gr.Progress()
) -> Tuple[str, str]:
    """Step 3: Determine the winning model.

    Variant that attaches description text to its progress updates. Asks a
    judge model to pick a winner from the four responses, given the objective
    and the consensus/coherence summary from step 2.
    NOTE: this notebook defines `determine_winner_step` in more than one
    cell; the definition executed last is the one the UI ends up calling.

    Args:
        responses: Raw response strings from step 1.
        models: Model identifiers, parallel to `responses`.
        analysis_data: Analysis dict produced by step 2.
        objective: What an ideal answer should achieve.
        progress: Gradio progress tracker.

    Returns:
        (verdict markdown, status message). Failures of the judge call are
        reported inside the verdict text and reflected in the status.
    """
    
    if not responses or not analysis_data:
        return ("", "‚ö†Ô∏è Please generate and analyze responses first!")
    
    progress(0.5, desc="üèÜ Evaluating models to determine winner...")
    
    # Built outside the prompt f-string: backslashes inside f-string
    # expressions are a SyntaxError before Python 3.12. zip() also avoids an
    # IndexError if the two lists ever differ in length.
    responses_block = ''.join(
        f"\n\n{model}:\n{resp}" for model, resp in zip(models, responses)
    )
    
    # The consensus comes from model-generated JSON, so items are not
    # guaranteed to be strings; str() keeps the join from raising TypeError.
    consensus = analysis_data.get('consensus', [])
    if isinstance(consensus, (list, tuple)):
        consensus_summary = ', '.join(str(point) for point in consensus)
    else:
        consensus_summary = str(consensus)
    
    verdict_prompt = f"""You are an expert judge evaluating LLM responses.

Objective: {objective}

Responses:
{responses_block}

Analysis Summary:
- Consensus: {consensus_summary}
- Coherence: {analysis_data.get('coherence_score', 'N/A')}

Evaluate each model based on:
1. **Objective fulfillment**: Did it meet the stated objective?
2. **Clarity**: Clear and well-structured?
3. **Consensus alignment**: Aligns with common ground?
4. **Structure**: Well-organized?

Declare the WINNER and provide brief justification (3-4 sentences)."""
    
    try:
        verdict = client.chat.completions.create(
            model="anthropic/claude-3.5-sonnet",
            messages=[{"role": "user", "content": verdict_prompt}],
            temperature=0.5,
            max_tokens=500
        )
        content = verdict.choices[0].message.content
        if content is None or not content.strip():
            raise ValueError("the judge model returned an empty reply")
    except Exception as e:
        # UI boundary: show the failure instead of crashing the handler, and
        # do not label the progress bar or status as a success.
        verdict_text = f"### üèÜ Final Verdict\n\nUnable to determine verdict: {str(e)}"
        progress(1.0)
        status_msg = "‚ö†Ô∏è **Could not determine a winner.** See the verdict section for details."
    else:
        verdict_text = f"### üèÜ Final Verdict\n\n{content.strip()}"
        progress(1.0, desc="‚úÖ Winner determined!")
        status_msg = "‚úÖ **Winner determined!** Analysis complete."
    
    return (verdict_text, status_msg)

In [8]:
def format_metrics_table(metrics: List[Dict]) -> str:
    """Format model metrics as an HTML table.

    The metric values originate from model-generated JSON, so every value is
    converted to text and HTML-escaped before being placed in a cell.

    Args:
        metrics: One dict per model with the keys 'model', 'hallucination',
            'imprecise', 'off_topic', 'subjective', 'overly_enthusiastic'
            and 'tone'. Missing keys render as 'NA' / 'N/A'; entries that
            are not dicts are skipped.

    Returns:
        An HTML `<table>` string, or a short `<p>` placeholder when there are
        no metrics.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    if not metrics:
        return "<p>No metrics available</p>"

    def cell(entry: Dict, key: str, default: str) -> str:
        # str() first: the JSON may carry booleans or numbers, not only text.
        return escape(str(entry.get(key, default)))

    parts = ["""<table style='width:100%; border-collapse: collapse; margin-top: 20px;'>
    <tr style='background-color: #f0f0f0;'>
        <th style='border: 1px solid #ddd; padding: 8px;'>Model</th>
        <th style='border: 1px solid #ddd; padding: 8px;'>Hallucination</th>
        <th style='border: 1px solid #ddd; padding: 8px;'>Imprecise</th>
        <th style='border: 1px solid #ddd; padding: 8px;'>Off-Topic</th>
        <th style='border: 1px solid #ddd; padding: 8px;'>Subjective</th>
        <th style='border: 1px solid #ddd; padding: 8px;'>Overly Enthusiastic</th>
        <th style='border: 1px solid #ddd; padding: 8px;'>Tone</th>
    </tr>"""]

    for metric in metrics:
        if not isinstance(metric, dict):
            # Malformed entry in the model's JSON; skip instead of crashing.
            continue
        parts.append(f"""<tr>
        <td style='border: 1px solid #ddd; padding: 8px;'><strong>{cell(metric, 'model', 'N/A')}</strong></td>
        <td style='border: 1px solid #ddd; padding: 8px; text-align: center;'>{cell(metric, 'hallucination', 'NA')}</td>
        <td style='border: 1px solid #ddd; padding: 8px; text-align: center;'>{cell(metric, 'imprecise', 'NA')}</td>
        <td style='border: 1px solid #ddd; padding: 8px; text-align: center;'>{cell(metric, 'off_topic', 'NA')}</td>
        <td style='border: 1px solid #ddd; padding: 8px; text-align: center;'>{cell(metric, 'subjective', 'NA')}</td>
        <td style='border: 1px solid #ddd; padding: 8px; text-align: center;'>{cell(metric, 'overly_enthusiastic', 'NA')}</td>
        <td style='border: 1px solid #ddd; padding: 8px;'>{cell(metric, 'tone', 'N/A')}</td>
    </tr>""")

    parts.append("</table>")
    return "".join(parts)

In [9]:
# Create Gradio interface with 3-step workflow
# Layout, top to bottom: intro text, inputs (question/objective/models on the
# left, sampling parameters and examples on the right), the three step
# buttons, a shared status line, the four response panels, and the analysis
# and verdict outputs. Event handlers are wired at the end of the block.
with gr.Blocks(title="commonGround - Multi-Model Consensus", theme=gr.themes.Soft()) as demo:
    
    gr.Markdown("""
    # ü§ù commonGround
    ### Multi-Model Consensus Analyzer
    
    Compare responses from 4 different LLMs through a **3-step workflow**:
    1. üöÄ Generate responses from selected models
    2. üìä Analyze consensus and quality metrics  
    3. üèÜ Determine the winning model
    """)
    
    # === INPUTS SECTION ===
    with gr.Row():
        with gr.Column(scale=2):
            # Prefilled with the first example question
            question_input = gr.Textbox(
                label="‚ùì Your Question",
                placeholder="Enter your question here...",
                lines=3,
                value=EXAMPLE_QUESTIONS[0]
            )
            
            # Passed to the analysis and verdict steps (not to the models
            # that answer the question)
            objective_input = gr.Textbox(
                label="üéØ Objective (What should the ideal response achieve?)",
                placeholder="E.g., Provide a comprehensive explanation with examples...",
                lines=2,
                value="Provide an accurate, clear, and comprehensive response with practical examples"
            )
            
            gr.Markdown("### ü§ñ Select 4 Different Models")
            
            # Defaults are chosen by position in AVAILABLE_MODELS
            # (indices 0, 2, 3, 6), so they depend on that list's order.
            with gr.Row():
                model1 = gr.Dropdown(
                    choices=AVAILABLE_MODELS,
                    value=AVAILABLE_MODELS[0],
                    label="Model 1"
                )
                model2 = gr.Dropdown(
                    choices=AVAILABLE_MODELS,
                    value=AVAILABLE_MODELS[2],
                    label="Model 2"
                )
            
            with gr.Row():
                model3 = gr.Dropdown(
                    choices=AVAILABLE_MODELS,
                    value=AVAILABLE_MODELS[3],
                    label="Model 3"
                )
                model4 = gr.Dropdown(
                    choices=AVAILABLE_MODELS,
                    value=AVAILABLE_MODELS[6],
                    label="Model 4"
                )
        
        with gr.Column(scale=1):
            gr.Markdown("### ‚öôÔ∏è Parameters")
            
            # NOTE: the slider goes up to 2.0, but call_model caps the value
            # it sends at 1.0, so settings above 1.0 behave like 1.0.
            temperature = gr.Slider(
                minimum=0.0,
                maximum=2.0,
                value=0.7,
                step=0.1,
                label="Temperature",
                info="Higher = more creative"
            )
            
            top_p = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=0.9,
                step=0.05,
                label="Top P",
                info="Nucleus sampling"
            )
            
            gr.Markdown("### üìö Examples")
            # Each example is a one-element row that fills question_input
            gr.Examples(
                examples=[[q] for q in EXAMPLE_QUESTIONS],
                inputs=[question_input],
                label="Try these:"
            )
    
    # === 3 ACTION BUTTONS ===
    gr.Markdown("---")
    gr.Markdown("### üéØ Workflow Steps")
    
    with gr.Row():
        btn_generate = gr.Button(
            "üöÄ Step 1: Generate Responses", 
            variant="primary", 
            size="lg",
            scale=1
        )
        btn_analyze = gr.Button(
            "üìä Step 2: Analyze & Evaluate", 
            variant="secondary", 
            size="lg",
            scale=1
        )
        btn_verdict = gr.Button(
            "üèÜ Step 3: Determine Winner", 
            variant="secondary", 
            size="lg",
            scale=1
        )
    
    # Status message (shared: all three step handlers write to it)
    status_msg = gr.Markdown(
        "üí° **Start by clicking 'Generate Responses' to begin the analysis**",
        elem_classes="status-message"
    )
    
    # === HIDDEN STATE VARIABLES ===
    # These carry results from one step to the next: step 1 fills the first
    # two, step 2 fills the third, and steps 2 and 3 read them back.
    responses_state = gr.State([])  # Store responses
    models_state = gr.State([])  # Store model names
    analysis_state = gr.State({})  # Store analysis data
    
    # === OUTPUTS SECTION ===
    gr.Markdown("---")
    gr.Markdown("## üìä Model Responses")
    
    # Four response panels in a 2x2 grid
    with gr.Row():
        response1 = gr.Markdown()
        response2 = gr.Markdown()
    
    with gr.Row():
        response3 = gr.Markdown()
        response4 = gr.Markdown()
    
    gr.Markdown("---")
    gr.Markdown("## üîç Analysis Results")
    
    consensus_output = gr.Markdown()
    
    gr.Markdown("### üìã Quality Metrics Table")
    metrics_output = gr.HTML()
    
    verdict_output = gr.Markdown()
    
    # === BUTTON CONNECTIONS ===
    # The order of each `outputs` list must match the order of the tuple
    # returned by the corresponding handler.
    
    # Step 1: Generate Responses
    btn_generate.click(
        fn=generate_responses,
        inputs=[
            question_input, model1, model2, model3, model4,
            temperature, top_p
        ],
        outputs=[
            response1, response2, response3, response4,
            status_msg, responses_state, models_state
        ]
    )
    
    # Step 2: Analyze
    btn_analyze.click(
        fn=analyze_responses_step,
        inputs=[responses_state, models_state, objective_input],
        outputs=[consensus_output, metrics_output, status_msg, analysis_state]
    )
    
    # Step 3: Determine Winner
    btn_verdict.click(
        fn=determine_winner_step,
        inputs=[responses_state, models_state, analysis_state, objective_input],
        outputs=[verdict_output, status_msg]
    )
    
    gr.Markdown("""
    ---
    **Note:** This tool uses OpenRouter to access multiple LLM providers. Make sure your API key is set in the `.env` file.
    
    **Workflow Tips:**
    - You can review responses before analyzing (saves tokens!)
    - Each step builds on the previous one
    - Watch the progress indicators for real-time feedback
    """)

# Start the server only when this file runs as the main module
if __name__ == "__main__":
    # share=True additionally publishes a temporary public gradio.live URL,
    # and server_name="0.0.0.0" serves on that address instead of localhost.
    # NOTE(review): the app spends the server-side OpenRouter key on every
    # request, so anyone holding the public link can trigger API calls.
    launch_options = {"share": True, "server_name": "0.0.0.0"}
    demo.launch(**launch_options)

* Running on local URL:  http://0.0.0.0:7860
* Running on public URL: https://64ad9fa86867d92381.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
