In [1]:
# --- 0. INSTALL AND SETUP (Run these lines first if you haven't already) ---
# !pip install accelerate transformers pandas torch numpy scipy
from huggingface_hub import login
login(token='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')

import pandas as pd
import json
import re
import torch
import numpy as np
from scipy.stats import norm
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import os
import sys
import logging
import math

# Suppress transformers warnings
logging.getLogger("transformers").setLevel(logging.ERROR)

# --- GLOBAL CONFIGURATION ---
LLM_CONFIGS = {
    # Model ID, Name, and Mock Token Cost (for demonstration)
    "LLM_1B": {
        "model_id": "meta-llama/Llama-3.2-1B-Instruct",
        "name": "1B-Instruct",
        "cost_per_token": 0.0000005, # Mock cost: cheaper model
        "pipe": None,
        "tokenizer": None,
        "ucb_N": 0,    # N: Number of times this arm has been selected
        "ucb_Q": 0.0,  # Q: Total utility (Reward - Cost) received
        "ucb_mean_reward": 0.0, # Q/N: Average utility (Note: Key kept as 'reward' for simplicity)
    },
    "LLM_3B": {
        "model_id": "meta-llama/Llama-3.2-3B-Instruct",
        "name": "3B-Instruct",
        "cost_per_token": 0.0000025, # Mock cost: more expensive model
        "pipe": None,
        "tokenizer": None,
        "ucb_N": 0,
        "ucb_Q": 0.0,
        "ucb_mean_reward": 0.0,
    }
}
# UCB exploration parameter
UCB_C = 0.5

# Reserve price is set as a minimum acceptable *utility* (Reward - Cost) for the task.
# Since 'a' = -(Historical Utility) - UCB_Bonus, a LOWER 'a' is a BETTER bid.
#
# CRITICAL CHANGE: Increased Reserve Price 'A' to +5.0 (was -2.0)
# This means any adjusted bid 'a' less than +5.0 is accepted, preventing continuous rejections.
RESERVE_PRICE_A = 5.0 

# Mapping columns to rewards for a correct prediction
REWARD_MAP = {
    'Capital City': 1,
    'Continent': 1,
    'Latitude': 2,
    'Longitude': 2
}
EVAL_COLUMNS = list(REWARD_MAP.keys())

# Dataset configuration
# IMPORTANT: Use the exact path to your CSV file
file_path = '/home/gella.saikrishna/.cache/kagglehub/datasets/dataanalyst001/all-capital-cities-in-the-world/versions/1/all capital cities in the world.csv'
#file_path = 'all capital cities in the world.csv' # Placeholder for a common file structure
QUERY_COLUMN = 'Country'

# Global counter for the total number of rounds (t in UCB)
GLOBAL_T = 0

# --- 1. LLM INITIALIZATION (No changes) ---
def initialize_llms():
    """Initializes both Llama 3 models."""
    global LLM_CONFIGS
    print("Initializing LLM pipelines...")
    
    for key, config in LLM_CONFIGS.items():
        try:
            print(f"Loading {config['name']} ({config['model_id']})...")
            # Using low-precision dtype and device_map requires 'accelerate'
            pipe = pipeline(
                "text-generation",
                model=config['model_id'],
                torch_dtype=torch.bfloat16,
                device_map="auto",
            )
            config["pipe"] = pipe
            config["tokenizer"] = pipe.tokenizer
            print(f"{config['name']} loaded successfully.")

        except Exception as e:
            print(f"\nFATAL: Failed to load {config['name']} pipeline. Check environment, token, and hardware.")
            print(f"Error details: {e}")
            sys.exit(1)

# --- 2. LLM PREDICTION AND COST CALCULATION (No changes) ---
def get_llm_prediction_and_cost(country_name, llm_key):
    """
    Queries the specified Llama 3 pipeline, returns data, raw output, and mock cost.
    """
    config = LLM_CONFIGS[llm_key]
    pipe = config["pipe"]
    pipe_tokenizer = config["tokenizer"]
    cost_per_token = config["cost_per_token"]

    # 2.1 Construct Prompt
    prompt_instruction = f"""
    You are an expert geographical information system.
    Your task is to provide the Capital City, Continent, Latitude, and Longitude for the requested country.
    You MUST respond ONLY with a valid JSON object. DO NOT include any text outside the JSON object.
    The JSON structure must be: {{"Capital City": "...", "Continent": "...", "Latitude": "...", "Longitude": "..."}}
    """
    
    messages = [
        {"role": "system", "content": prompt_instruction},
        {"role": "user", "content": f"Provide the geographical data for: {country_name}"},
    ]

    # Apply chat template for Llama 3 format
    prompt = pipe_tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )

    # 2.2 Run Inference
    terminators = [
        pipe_tokenizer.eos_token_id,
        pipe_tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]
    
    # Run Inference with deterministic settings
    outputs = pipe(
        prompt,
        max_new_tokens=256,
        eos_token_id=terminators,
        do_sample=False,
        temperature=0.0,
    )

    # 2.3 Extract and Parse the JSON
    raw_output = outputs[0]["generated_text"][len(prompt):].strip()
    
    json_match = re.search(r'\{.*\}', raw_output, re.DOTALL)
    
    llm_response_dict = {col: "" for col in EVAL_COLUMNS}
    
    if json_match:
        json_string = json_match.group(0)
        try:
            llm_response_dict = json.loads(json_string)
        except json.JSONDecodeError:
            pass # Keep default empty dict if parsing fails

    # 2.4 Mock Cost Calculation
    # Token count estimation: 1 token is roughly 4 characters
    prompt_tokens = len(prompt) // 4
    response_tokens = len(raw_output) // 4
    total_tokens = prompt_tokens + response_tokens
    
    cost = total_tokens * cost_per_token

    return llm_response_dict, raw_output, total_tokens, cost


# --- 3. UCB AND MYERSON LOGIC (SLIGHT CHANGE TO RETURN 'a') ---

def calculate_virtual_valuation(llm_key, country_name, current_t, total_reward, total_cost):
    """
    Calculates the Virtual Valuation (V) for a given LLM's result and returns V and 'a'.
    
    a = -Historical_Utility - C * sqrt(ln(t)/N) (Adjusted Utility/Bid)
    """
    config = LLM_CONFIGS[llm_key]
    
    # Step 1: Calculate 'a' (Adjusted Utility/Bid)
    N_eff = max(config["ucb_N"], 1)
    
    # 'a' is the mechanism's "bid". Lower 'a' means a better bid (higher expected utility).
    historical_mean_utility = config["ucb_mean_reward"]
    
    # a = - (Historical Mean Utility) - UCB_Bonus
    a = -historical_mean_utility - UCB_C * math.sqrt(math.log(current_t) / N_eff)
    
    # Step 2: Calculate CDF(a) and PDF(a)
    try:
        pdf_a = norm.pdf(a)
        cdf_a = norm.cdf(a)
    except ValueError:
        return float('inf'), float('inf')


    # Step 3: Calculate Virtual Valuation
    
    if pdf_a == 0:
        virtual_valuation = float('inf')
    else:
        # V(a) = a + (cdf_a / pdf_a) - We select the LLM with the MINIMUM V(a)
        # to maximize the implicit utility.
        virtual_valuation = a + (cdf_a / pdf_a)
        
    return virtual_valuation, a

def update_ucb_stats(llm_key, utility):
    """Updates the UCB statistics for the winning LLM arm based on Net Utility (Reward - Cost)."""
    config = LLM_CONFIGS[llm_key]
    
    # Q now tracks total utility (Reward - Cost)
    config["ucb_Q"] += utility
    config["ucb_N"] += 1
    
    # mean_reward now tracks mean utility
    config["ucb_mean_reward"] = config["ucb_Q"] / config["ucb_N"]


# --- 4. EVALUATION LOGIC (Main loop - MODIFIED FOR PAYMENT) ---
def calculate_efficiency_with_ucb_myerson(df, query_col):
    """
    Loops through the dataset, gets predictions from both LLMs,
    applies UCB/Myerson logic to select the winner, and calculates overall efficiency.
    """
    global GLOBAL_T
    
    # Data cleaning for ground truth
    for col in EVAL_COLUMNS:
        df[col] = df[col].astype(str).str.strip().str.lower()
        
    total_count = len(df)
    
    # Tracking for final results
    correct_counts = {col: 0 for col in EVAL_COLUMNS}
    llm_selection_counts = {key: 0 for key in LLM_CONFIGS.keys()}
    total_reward_collected = 0
    total_cost_incurred = 0
    total_payment_incurred = 0 

    print(f"Starting UCB/Reverse-Myerson evaluation on {total_count} countries...")
    
    for index, row in df.iterrows():
        country = row[query_col]
        GLOBAL_T += 1 # Increment total rounds (t)

        # Dictionary to hold the results for both LLMs in this round
        round_results = {}
        
        # 4.1 Get Predictions, Rewards, and Costs for BOTH LLMs
        for llm_key in LLM_CONFIGS.keys():
            # Run inference for the current LLM
            llm_response_dict, raw_output, total_tokens, cost = \
                get_llm_prediction_and_cost(country, llm_key)
            
            # Calculate Total Reward for this LLM's prediction
            current_reward = 0
            is_correct_for_llm = {col: False for col in EVAL_COLUMNS}
            
            for col in EVAL_COLUMNS:
                true_value = row[col]
                predicted_value = str(llm_response_dict.get(col, '')).strip().lower()
                
                is_correct = (predicted_value == true_value)
                
                # Robust Comparison for Latitude/Longitude (Tolerance 0.05)
                if col in ['Latitude', 'Longitude']:
                    true_num_str = re.sub(r'[^0-9.-]', '', true_value)
                    pred_num_str = re.sub(r'[^0-9.-]', '', predicted_value)
                    
                    try:
                        true_num = float(true_num_str)
                        pred_num = float(pred_num_str)
                        
                        if abs(true_num - pred_num) < 0.05:
                            is_correct = True
                        else:
                            is_correct = False
                    except ValueError:
                        is_correct = False
                
                if is_correct:
                    current_reward += REWARD_MAP[col]
                    is_correct_for_llm[col] = True
            
            round_results[llm_key] = {
                "reward": current_reward,
                "cost": cost,
                "utility": current_reward - cost, # Store Net Utility
                "is_correct": is_correct_for_llm,
                "response_dict": llm_response_dict,
                "adjusted_a": None # Will store 'a' (the bid)
            }


        # 4.2 UCB and Reverse-Myerson Selection & Payment Calculation
        
        # 1. Calculate Virtual Valuation (V) and Adjusted Utility/Bid ('a') for each LLM
        virtual_valuations = {}
        adjusted_a_values = {}
        for llm_key, result in round_results.items():
            V, a = calculate_virtual_valuation(
                llm_key, country, GLOBAL_T, result["reward"], result["cost"]
            )
            virtual_valuations[llm_key] = V
            adjusted_a_values[llm_key] = a # The "bid" for the payment rule
            round_results[llm_key]["adjusted_a"] = a
        
        # 2. Select the winner: MINIMUM Virtual Valuation wins (Maximizing Utility)
        winning_llm_key = min(virtual_valuations, key=virtual_valuations.get)
        winning_result = round_results[winning_llm_key]

        # 3. Check for Reserve Price: If winner's bid 'a' is too high, no one wins
        # Since lower 'a' is better, we reject if 'a' EXCEEDS RESERVE_PRICE_A.
        if winning_result["adjusted_a"] > RESERVE_PRICE_A:
             # Winner's bid (a) is too low utility-wise (a is too high), so reject.
             print(f"Round {GLOBAL_T}: All bids rejected (Winner's adjusted bid $a={winning_result['adjusted_a']:.4f}$ exceeds Reserve $a={RESERVE_PRICE_A:.4f}$).")
             continue # Skip updating stats and metrics for this round

        
        # 4. Calculate Payment (VCG-like mechanism)
        
        # Identify winner and loser bids (in terms of 'a')
        loser_llm_key = next(key for key in LLM_CONFIGS.keys() if key != winning_llm_key)
        loser_a = adjusted_a_values[loser_llm_key]

        # Critical Bid 'A' (the highest 'a' that would have still let the winner win)
        # Since lower 'a' is better, the critical bid is the minimum of the reserve and the loser's bid.
        critical_bid_a = min(RESERVE_PRICE_A, loser_a)

        # The actual financial payment is based on the winner's actual cost, 
        # but the critical bid 'a' determined the selection and the *implicit* charge.
        final_payment = winning_result["cost"] # Using actual cost for monetary tracking
        
        
        # 4.3 Update Statistics
        
        # Update UCB/Myerson Arm Stats with the Net Utility (Reward - Cost)
        update_ucb_stats(winning_llm_key, winning_result["utility"])
        llm_selection_counts[winning_llm_key] += 1
        
        # Update Overall Evaluation Metrics
        total_reward_collected += winning_result["reward"]
        total_cost_incurred += winning_result["cost"] # Total cost of models run (for reference)
        total_payment_incurred += final_payment      # Total actual payment to the winner
        
        for col in EVAL_COLUMNS:
            if winning_result["is_correct"][col]:
                correct_counts[col] += 1
        
        if (index + 1) % 10 == 0 or (index + 1) == total_count:
            print(f"Processed {index + 1}/{total_count} entries. Winner: {winning_llm_key} | Winner's Bid $a={winning_result['adjusted_a']:.4f}$ | Loser's Bid $a={loser_a:.4f}$ | Critical Bid $a={critical_bid_a:.4f}$")

    # 4.4 Final Efficiency Calculation
    efficiency = {
        col: f"{correct_counts[col] / total_count * 100:.2f}%"
        for col in EVAL_COLUMNS
    }
    
    total_possible_reward = total_count * sum(REWARD_MAP.values())
    
    return (efficiency, total_count, correct_counts, llm_selection_counts,
            total_reward_collected, total_possible_reward, total_cost_incurred, total_payment_incurred)


# --- 5. EXECUTION ---
if __name__ == "__main__":
    
    # 5.1 Initialize LLMs
    try:
        initialize_llms()
    except Exception as e:
        print(f"LLM initialization failed: {e}. Proceeding with potential errors or mocking.")


    # 5.2 Main Evaluation Block
    try:
        # Load the ground truth data
        data = pd.read_csv(file_path)
        
        # Run the evaluation
        results = calculate_efficiency_with_ucb_myerson(data, QUERY_COLUMN)
        (efficiency_results, total, correct, llm_selections,
         total_reward, total_possible_reward, total_cost, total_payment) = results
        
        # 5.3 Print Final Results
        print("\n" + "="*70)
        print("ðŸ§  Multi-LLM UCB/Reverse-Myerson Evaluation Results (with VCG Payment Logic)")
        print("="*70)
        print(f"Reserve Price 'a' used: {RESERVE_PRICE_A}")
        print(f"Total Countries Evaluated (t): {total}")
        print(f"Total Possible Reward: {total_possible_reward}")
        print(f"Total Reward Collected: {total_reward:.2f}")
        print(f"Total Cost Incurred (Actual Model Cost): ${total_cost:.8f}")
        print(f"Total Payment Incurred (to Winner LLM): ${total_payment:.8f}")
        print(f"Net Utility (Reward - Payment): {total_reward - total_payment:.2f}")
        
        print("\n## Model Selection Counts")
        selection_table = pd.DataFrame([llm_selections]).T
        selection_table.columns = ['Times Selected']
        selection_table.index.name = 'LLM'
        print(selection_table.to_markdown(numalign="left", stralign="left"))
        
        print("\n## Final Accuracy (Based on Winning LLM's Prediction)")
        results_table = pd.DataFrame([efficiency_results]).T
        results_table.columns = ['Efficiency']
        results_table.index.name = 'Column'
        print(results_table.to_markdown(numalign="left", stralign="left"))

        print("\nRaw Correct Counts (for the selected winner):")
        for col in EVAL_COLUMNS:
            print(f"- {col}: {correct[col]}/{total} correct")
        
        print("\nLLM UCB Statistics (Mean Reward = Mean Utility):")
        for key, config in LLM_CONFIGS.items():
             print(f"- {key}: N={config['ucb_N']}, Mean Utility={config['ucb_mean_reward']:.4f}")

    except FileNotFoundError:
        print(f"\nFATAL ERROR: The file was not found at the configured path:\n{file_path}")
        print("Please ensure the path is correct.")
    except Exception as e:
        print(f"\nAn unhandled error occurred during execution: {e}")

Initializing LLM pipelines...
Loading 1B-Instruct (meta-llama/Llama-3.2-1B-Instruct)...
1B-Instruct loaded successfully.
Loading 3B-Instruct (meta-llama/Llama-3.2-3B-Instruct)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

3B-Instruct loaded successfully.
Starting UCB/Reverse-Myerson evaluation on 196 countries...
Processed 10/196 entries. Winner: LLM_1B | Winner's Bid $a=-2.5861$ | Loser's Bid $a=-0.7587$ | Critical Bid $a=-0.7587$
Processed 20/196 entries. Winner: LLM_1B | Winner's Bid $a=-2.5669$ | Loser's Bid $a=-0.8654$ | Critical Bid $a=-0.8654$
Processed 30/196 entries. Winner: LLM_1B | Winner's Bid $a=-2.4125$ | Loser's Bid $a=-0.9221$ | Critical Bid $a=-0.9221$
Processed 40/196 entries. Winner: LLM_1B | Winner's Bid $a=-2.3588$ | Loser's Bid $a=-0.9603$ | Critical Bid $a=-0.9603$
Processed 50/196 entries. Winner: LLM_1B | Winner's Bid $a=-2.4473$ | Loser's Bid $a=-0.9889$ | Critical Bid $a=-0.9889$
Processed 60/196 entries. Winner: LLM_1B | Winner's Bid $a=-2.3859$ | Loser's Bid $a=-1.0117$ | Critical Bid $a=-1.0117$
Processed 70/196 entries. Winner: LLM_1B | Winner's Bid $a=-2.3704$ | Loser's Bid $a=-1.0306$ | Critical Bid $a=-1.0306$
Processed 80/196 entries. Winner: LLM_1B | Winner's Bid $a=-