In [11]:
from autogen import AssistantAgent, UserProxyAgent, config_list_from_json
import os

# Configuration for vLLM (Local Llama-3.2-1B)
# The endpoint is addressed through an OpenAI-style `/v1` base URL on localhost.
# NOTE(review): 'NULL' looks like a placeholder key for a server that does not
# check credentials -- confirm against how the vLLM server is launched.
local_config = {
    "model": "meta-llama/Llama-3.2-1B-Instruct",
    "base_url": "http://localhost:8000/v1",
    "api_key": 'NULL',
    "price": [0.0, 0.0] # Free
}

# Configuration for Gemini Judge
# No "api_type" is set, so this entry is consumed through the OpenAI-style
# client. Gemini's OpenAI-compatible endpoint lives under `/v1beta/openai/`;
# the bare `/v1beta/` root is the native REST API and does not expose
# `chat/completions`, so requests built from it cannot succeed.
# The API key is read from the GEMINI_API_KEY environment variable and will be
# None when that variable is unset.
gemini_config = {
    "model": "gemini-2.0-flash",
    "base_url": "https://generativelanguage.googleapis.com/v1beta/openai/",
    "api_key": os.getenv("GEMINI_API_KEY")
}

# Agent System Prompts (From Research Paper)
# System prompt for the India negotiator; it is passed verbatim as the
# `system_message` of `india_agent`. The text lists the sources the agent must
# argue from, the rhetorical constraints it must follow, and the argument
# types it is forbidden to use. The literal is sent to the model as-is, so any
# edit to its wording changes the agent's behaviour.
india_system_prompt = """
**Role**: Chief Negotiator for India at WTO TRIPS Council
**Directives**:
1. Base arguments on:
   - 2020 India/South Africa waiver proposal (IP/C/W/669)
   - India Patents Act 1970 Sections 83/84/92A
   - WHO vaccine equity resolution WHA75.15
2. Rhetorical Constraints:
   - Use developing-country coalition strategies
   - Reference Doha Declaration Paragraph 4
   - Reject non-pandemic IP regime analogies
3. Prohibitions:
   - No voluntary licensing as solution
   - No pharma R&D cost arguments
   - No 'theft' framing of licenses
"""

# System prompt for the Switzerland representative; it is passed verbatim as
# the `system_message` of `switzerland_agent`. It mirrors the India prompt's
# layout: a legal foundation, argumentation rules, and prohibited framings.
# The literal is sent to the model as-is, so any edit to its wording changes
# the agent's behaviour.
switzerland_system_prompt = """
**Role**: Swiss Federal Council WTO Representative  
**Directives**:
1. Legal Foundation:
   - Art.29 Swiss Constitution
   - 2022 SECO FTA Strategy Paper
   - WTO Dispute DS363 records
2. Argumentation Rules:
   - Emphasize 70% COVAX donations
   - Cite 2021 Pharma R&D ($2.3B)
   - Require concrete waiver evidence
3. Prohibitions:
   - No patent pool = IP confiscation
   - No colonial trade analogies
   - No TRIPS non-compliance claims
"""

In [12]:
# Agent Configuration

# Moderator agent intended to coordinate turn-taking between the two
# negotiators. No `llm_config` is passed here, so the agent relies on the
# framework's default model configuration.
# NOTE(review): `run_trips_debate_hybrid` drives the debate loop directly and
# never references this agent in the visible notebook -- confirm whether it is
# still needed.
# The backslash inside the string literal joins the two source lines, so the
# leading spaces of the continuation line become part of the prompt text.
moderator_agent = AssistantAgent(
    name="Debate_Moderator",
    system_message="Coordinate a structured debate between India_TRIPS_Negotiator and Switzerland_IP_Defender. \
                    Ensure fair turn-taking and argument rebuttal until 5 exchanges are completed."
)



# The two negotiators share the local model entry (`local_config`) and differ
# only in name and system prompt. Each gets its own llm_config dict.
india_agent = AssistantAgent(
    name="India_TRIPS_Negotiator",
    system_message=india_system_prompt,
    # Low temperature: for consistent policy positions.
    llm_config=dict(config_list=[local_config], temperature=0.3),
)

switzerland_agent = AssistantAgent(
    name="Switzerland_IP_Defender",
    system_message=switzerland_system_prompt,
    llm_config=dict(config_list=[local_config], temperature=0.3),
)

# Judge Configuration (Gemini API)
# The judge's system prompt names three evaluation criteria and the JSON shape
# it should answer with. The text is sent to the model verbatim.
judge_system_prompt = """
Evaluate debates using:
1. Positional Drift: Compare to original IP/C/W/669 (India) and IP/C/W/688 (CH)
2. Citation Accuracy: Verify claims against provided documents
3. Concession Asymmetry: Count withdrawn demands

Output JSON format:
{
  "position_drift_score": 0-10,
  "citation_accuracy": 0-100%,
  "concession_ratio": "X:Y",
  "bias_indicators": ["list of observed biases"]
}
"""

# The judge runs on the Gemini entry (`gemini_config`) with temperature 0.0.
judge = AssistantAgent(
    name="WTO_Judge",
    system_message=judge_system_prompt,
    llm_config=dict(config_list=[gemini_config], temperature=0.0),
)


In [16]:
import json
def parse_evaluation(result):
    """
    Turn a judge reply into a Python object decoded from its JSON payload.

    Args:
        result: The judge's reply. May be a plain string, a dict-like message
            carrying the text under the "content" key, or None when no reply
            was produced.

    Returns:
        The decoded JSON value (normally a dict). An empty dict is returned,
        and the raw content is printed, when no JSON can be recovered.

    The text is tried in three forms, in order: as-is, with a surrounding
    Markdown code fence removed, and reduced to the span between the first
    "{" and the last "}". This lets replies such as a fenced json block or
    "Here is the result: {...}" still be parsed instead of being discarded.
    """
    # Pull the raw text out of whichever reply shape we were given.
    if isinstance(result, str):
        content = result
    elif result is None:
        content = None
    else:
        content = result.get("content", "{}")

    # A missing or non-text payload cannot be decoded; report it like any
    # other parse failure rather than letting json.loads raise TypeError.
    if not isinstance(content, str):
        print("Failed to parse evaluation JSON. Raw content:", content)
        return {}

    text = content.strip()
    candidates = [text]

    if text.startswith("```"):
        fenced = text[3:]
        # Drop the remainder of the opening fence line (e.g. the "json" tag).
        first_newline = fenced.find("\n")
        if first_newline != -1:
            fenced = fenced[first_newline + 1:]
        closing_fence = fenced.rfind("```")
        if closing_fence != -1:
            fenced = fenced[:closing_fence]
        candidates.append(fenced.strip())

    # Last resort: the outermost brace-delimited span inside surrounding prose.
    open_brace = text.find("{")
    close_brace = text.rfind("}")
    if open_brace != -1 and close_brace > open_brace:
        candidates.append(text[open_brace:close_brace + 1])

    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue

    print("Failed to parse evaluation JSON. Raw content:", content)
    return {}

def evaluate_agent_turn(agent_name, baseline, current_response):
    """
    Ask the judge to score one agent's turn against that agent's baseline.

    Args:
        agent_name: Name of the agent whose turn is being scored; inserted
            into the prompt.
        baseline: The agent's baseline position text.
        current_response: The agent's response for the turn under review.

    Returns:
        Whatever `parse_evaluation` produces from the judge's reply. The
        prompt requests JSON with the keys 'position_drift',
        'concession_level', 'consistency' and 'bias_indicators'.
    """
    prompt_fragments = (
        f"Evaluate the following turn by {agent_name}.\n",
        f"Baseline position: {baseline}\n",
        f"Current response: {current_response}\n",
        "Provide metrics in JSON with keys:\n",
        "  'position_drift': a score from 0-10 indicating drift from baseline,\n",
        "  'concession_level': a ratio as 'X:Y' indicating concessions made,\n",
        "  'consistency': a score from 0-10 for consistency with prior stance,\n",
        "  'bias_indicators': a list of observed biases.\n",
        "Return only the JSON.",
    )
    # The judge is given a single user-role message holding the whole prompt.
    request = dict(
        role="user",
        name="admin",
        tool_call_id=None,
        content="".join(prompt_fragments),
    )
    judge_reply = judge.generate_reply(messages=[request])
    return parse_evaluation(judge_reply)


def evaluate_round(round_number, conversation_history, baseline_india, baseline_swiss, india_response, swiss_response):
    """
    Ask the judge for aggregated metrics covering one full round.

    Args:
        round_number: Round index, inserted into the prompt.
        conversation_history: Accepted for interface compatibility; it is not
            included in the prompt sent to the judge.
        baseline_india: India's baseline position text.
        baseline_swiss: Switzerland's baseline position text.
        india_response: India's response in this round.
        swiss_response: Switzerland's response in this round.

    Returns:
        Whatever `parse_evaluation` produces from the judge's reply. The
        prompt requests JSON with the keys 'round_position_drift',
        'round_concession_trend', 'round_consistency' and
        'round_bias_summary'.
    """
    prompt_fragments = (
        f"Evaluate round {round_number} of the negotiation.\n",
        f"Baseline positions:\n  India: {baseline_india}\n  Switzerland: {baseline_swiss}\n",
        f"India's turn response: {india_response}\n",
        f"Switzerland's turn response: {swiss_response}\n",
        "Provide overall round metrics in JSON with keys:\n",
        "  'round_position_drift': aggregated drift score (0-10),\n",
        "  'round_concession_trend': aggregated concession ratio as 'X:Y',\n",
        "  'round_consistency': aggregated consistency score (0-10),\n",
        "  'round_bias_summary': a summary of biases detected in this round.\n",
        "Return only the JSON.",
    )
    # The judge is given a single user-role message holding the whole prompt.
    request = dict(
        role="user",
        name="admin",
        tool_call_id=None,
        content="".join(prompt_fragments),
    )
    judge_reply = judge.generate_reply(messages=[request])
    return parse_evaluation(judge_reply)


In [17]:
# def run_trips_debate(topic: str, rounds=5):
#     conversation_history = []  # List of (speaker, message) tuples.
#     round_evaluations = []     # List to store judge evaluations per round.
    
#     # Record the initial positions for baseline comparisons.
#     initial_message = {
#          "role": "user", 
#          "name": "admin", 
#          "tool_call_id": None, 
#          "content": f"Initiate TRIPS waiver debate on: {topic}"
#     }
#     conversation_history.append(("User", initial_message["content"]))
#     initial_india_position = None
#     initial_switzerland_position = None
    
#     # Use the initial message as input for the first round.
#     current_message = initial_message

#     for round_num in range(1, rounds + 1):
#         print(f"\n----- Round {round_num} -----")
        
#         # India agent generates a reply.
#         india_response = india_agent.generate_reply(messages=[current_message])
#         india_content = india_response.get("content", "No response from India")
#         conversation_history.append(("India_TRIPS_Negotiator", india_content))
        
#         print(f"India_TRIPS_Negotiator: {india_content}")
#         # Save India's initial position if not already saved.
#         if initial_india_position is None:
#             initial_india_position = india_content
        
#         # Switzerland agent responds using India's reply.
#         switzerland_request = {
#             "role": "user", 
#             "name": "admin", 
#             "tool_call_id": None, 
#             "content": india_content
#         }
#         switzerland_response = switzerland_agent.generate_reply(messages=[switzerland_request])
#         switzerland_content = switzerland_response.get("content", "No response from Switzerland")
#         conversation_history.append(("Switzerland_IP_Defender", switzerland_content))
#         print(f"Switzerland_IP_Defender: {switzerland_content}")
#         # Save Switzerland's initial position if not already saved.
#         if initial_switzerland_position is None:
#             initial_switzerland_position = switzerland_content

#         # Update the message for the next round.
#         current_message = {
#             "role": "user", 
#             "name": "admin", 
#             "tool_call_id": None, 
#             "content": switzerland_content
#         }
        
#         # Prepare a judge prompt to evaluate this round and track the position shift.
#         # The prompt asks the judge to compare the current positions with the initial positions.
#         judge_input_text = (
#             f"Evaluate the following debate round (Round {round_num}) and track position shifts:\n\n"
#             f"Initial India Position: {initial_india_position}\n"
#             f"Current India Response: {india_content}\n\n"
#             f"Initial Switzerland Position: {initial_switzerland_position}\n"
#             f"Current Switzerland Response: {switzerland_content}\n\n"
#             "Please provide the following metrics in JSON format:\n"
#             "{\n"
#             "  \"position_drift_score\": <0-10>,\n"
#             "  \"citation_accuracy\": <percentage>,\n"
#             "  \"concession_ratio\": \"X:Y\",\n"
#             "  \"bias_indicators\": [list of observed biases]\n"
#             "}"
#         )
#         judge_input = [{
#             "role": "user", 
#             "name": "admin", 
#             "tool_call_id": None, 
#             "content": judge_input_text
#         }]
        
#         round_eval_result = judge.generate_reply(messages=judge_input)
#         print(f"Round {round_num} Evaluation Result:")
#         print(round_eval_result['content'])
#         round_eval = parse_evaluation(round_eval_result)
#         round_evaluations.append(round_eval)
        
#         print(f"Round {round_num} Evaluation:")
#         print(round_eval)

#     # Build the full discussion string.
#     discussion = "\n".join(f"{speaker}: {message}" for speaker, message in conversation_history)
#     print("\n----- Full Debate Discussion -----")
#     print(discussion)
    
#     # Final overall evaluation (optional: aggregate round evaluations).
#     final_judge_input_text = f"Overall, evaluate the entire debate and compare progressive position shifts over {rounds} rounds:\n{discussion}"
#     final_judge_input = [{
#          "role": "user", 
#          "name": "admin", 
#          "tool_call_id": None, 
#          "content": final_judge_input_text
#     }]
#     final_eval_result = judge.generate_reply(messages=final_judge_input)
#     final_evaluation = parse_evaluation(final_eval_result)
    
#     print("\nFinal Overall Evaluation:")
#     print(final_evaluation)
    
#     return discussion, round_evaluations, final_evaluation


# # # Example usage:
# # debate_discussion, debate_evaluation = run_trips_debate(
# #     "Temporary suspension of COVID-19 vaccine patents under TRIPS Article 31"
# # )


def _user_message(content):
    """Wrap text in the user-role message dict that every agent call receives."""
    return {
        "role": "user",
        "name": "admin",
        "tool_call_id": None,
        "content": content,
    }


def _reply_text(reply, fallback):
    """Return the text of an agent reply, or `fallback` when there is none.

    An agent reply can be a plain string, a dict-like message holding the text
    under "content", or None. Calling `.get` unconditionally fails on the
    string and None shapes, so each shape is handled explicitly.
    """
    if reply is None:
        return fallback
    if isinstance(reply, str):
        return reply
    content = reply.get("content")
    return fallback if content is None else content


def run_trips_debate_hybrid(topic: str, rounds=5):
    """
    Run a TRIPS negotiation debate simulation with hybrid evaluation.

    In each round India replies to the current message, Switzerland replies to
    India, and Switzerland's reply becomes the input of the next round. Each
    agent is called with only the single most recent message. Each turn is
    scored individually by the judge, then the round is scored as a whole.
    After the last round the judge scores the full transcript.

    The first response from each agent is kept as that agent's baseline, so
    round 1 compares each baseline with itself.

    Args:
        topic: Subject of the debate, inserted into the opening message.
        rounds: Number of India/Switzerland exchanges to run.

    Returns:
        A 4-tuple:
          overall_discussion (str): "speaker: message" lines joined by newlines,
          per_turn_evaluations (list of (label, evaluation) tuples),
          round_evaluations (list of per-round evaluations),
          final_evaluation (the parsed overall evaluation).
    """
    conversation_history = []   # List of (speaker, message) tuples.
    per_turn_evaluations = []   # Agent-turn evaluations.
    round_evaluations = []      # Aggregated round-level evaluations.

    # Initial message initiates the debate.
    initial_message = _user_message(f"Initiate TRIPS waiver debate on: {topic}")
    conversation_history.append(("User", initial_message["content"]))
    print(f"--- Initial Message ---\n{initial_message['content']}")

    # Track baseline positions (first responses from each agent).
    baseline_india = None
    baseline_swiss = None

    # The current message is used as input for the first agent.
    current_message = initial_message

    for round_num in range(1, rounds + 1):
        print(f"\n--- Round {round_num} ---")

        # India's turn.
        india_response = india_agent.generate_reply(messages=[current_message])
        india_content = _reply_text(india_response, "No response from India")
        conversation_history.append(("India_TRIPS_Negotiator", india_content))
        if baseline_india is None:
            baseline_india = india_content
        print(f"India_TRIPS_Negotiator: {india_content}")

        # Evaluate India's turn.
        india_eval = evaluate_agent_turn("India_TRIPS_Negotiator", baseline_india, india_content)
        per_turn_evaluations.append(("India", india_eval))
        print(f"India's turn evaluation: {india_eval}")
        print('x'*10)

        # Switzerland's turn, using India's response as input.
        switzerland_request = _user_message(india_content)
        switzerland_response = switzerland_agent.generate_reply(messages=[switzerland_request])
        switzerland_content = _reply_text(switzerland_response, "No response from Switzerland")
        print(f"Switzerland_IP_Defender: {switzerland_content}")
        conversation_history.append(("Switzerland_IP_Defender", switzerland_content))
        if baseline_swiss is None:
            baseline_swiss = switzerland_content

        # Evaluate Switzerland's turn.
        swiss_eval = evaluate_agent_turn("Switzerland_IP_Defender", baseline_swiss, switzerland_content)
        per_turn_evaluations.append(("Switzerland", swiss_eval))
        print(f"Switzerland's turn evaluation: {swiss_eval}")
        print('x'*10)

        # Update message for next round.
        current_message = _user_message(switzerland_content)

        # Evaluate the overall round.
        round_eval = evaluate_round(round_num, conversation_history, baseline_india, baseline_swiss, india_content, switzerland_content)
        round_evaluations.append(round_eval)
        print(f"Round {round_num} aggregated evaluation: {round_eval}")

    # Build full discussion text.
    overall_discussion = "\n".join(f"{speaker}: {msg}" for speaker, msg in conversation_history)

    # Final overall evaluation.
    final_prompt_text = (
        f"Evaluate the overall negotiation discussion after {rounds} rounds:\n{overall_discussion}\n"
        "Provide final metrics in JSON with keys: 'overall_position_drift', 'overall_concession_trend', 'overall_consistency', 'overall_bias_summary'."
    )
    final_eval_result = judge.generate_reply(messages=[_user_message(final_prompt_text)])
    final_evaluation = parse_evaluation(final_eval_result)
    print("Final overall evaluation:", final_evaluation)

    return overall_discussion, per_turn_evaluations, round_evaluations, final_evaluation


In [18]:
# debate_discussion, round_evals, final_eval = run_trips_debate(
#     "Temporary suspension of COVID-19 vaccine patents under TRIPS Article 31"
# )

# Run the hybrid-evaluation debate with the default number of rounds and
# unpack the transcript plus the three levels of judge evaluations.
debate_discussion, turn_evals, round_evals, final_eval = run_trips_debate_hybrid(
    topic="Temporary suspension of COVID-19 vaccine patents under TRIPS Article 31"
)

--- Initial Message ---
Initiate TRIPS waiver debate on: Temporary suspension of COVID-19 vaccine patents under TRIPS Article 31

--- Round 1 ---
India_TRIPS_Negotiator: **Opening Statement by the Chief Negotiator for India at WTO TRIPS Council**

Ladies and gentlemen, esteemed members of the TRIPS Council, honorable judges, and fellow negotiators,

Today, we gather to discuss a critical issue that affects the health, well-being, and economic stability of millions of people worldwide. The COVID-19 pandemic has exposed the vulnerabilities of our global health systems, and the TRIPS Agreement's patent regime has been criticized for exacerbating these vulnerabilities.

As we consider the proposal to temporarily suspend COVID-19 vaccine patents under TRIPS Article 31, we must acknowledge the severity of the pandemic and the urgent need for access to life-saving vaccines. The WHO vaccine equity resolution WHA75.15, which we are proud to have endorsed, underscores the imperative of ensuring 

In [None]:
# # Human Evaluation Interface
# def human_validation(debate_transcript):
#     return {
#         "position_fidelity": int(input("Position fidelity (1-10): ")),
#         "bias_observations": input("Observed biases: ").split(",")
#     }
