In [None]:
import re
import csv

import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.metrics import accuracy_score

# =============================================================================
# DATA LOADING
# =============================================================================

def load_data(csv_file_path):
    """Load a CSV file into a list of row dicts.

    Args:
        csv_file_path: Path to a UTF-8 encoded CSV file whose first row is
            the header.

    Returns:
        A list with one dict per data row, keyed by the header names.
    """
    # newline='' hands line-ending handling to the csv module, so quoted
    # fields that contain embedded newlines are read back unchanged.
    with open(csv_file_path, 'r', encoding='utf-8', newline='') as f:
        return list(csv.DictReader(f))

def load_prompt_template(prompt_file):
    """Read the prompt template text from a UTF-8 file.

    Returns:
        The file contents with leading and trailing whitespace removed.
    """
    with open(prompt_file, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return raw_text.strip()

# =============================================================================
# MODEL MANAGEMENT
# =============================================================================

def load_model(model_name):
    """Load the tokenizer and causal language model for ``model_name``.

    With CUDA available the model is loaded in float16 with
    ``device_map="auto"``; otherwise in float32 with no device map. A
    tokenizer that has no pad token gets its EOS token as the pad token.

    Returns:
        ``(tokenizer, model)`` on success, or ``(None, None)`` if anything
        raised while loading (the error is printed, not re-raised).
    """
    print(f"Loading {model_name}...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Pick precision and placement together based on CUDA availability.
        if torch.cuda.is_available():
            dtype, placement = torch.float16, "auto"
        else:
            dtype, placement = torch.float32, None

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=dtype,
            device_map=placement
        )

        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        print(f"✓ Loaded {model_name}")
        return tokenizer, model
    except Exception as e:
        # Best effort: the caller skips models that fail to load.
        print(f"✗ Failed to load {model_name}: {e}")
        return None, None

def cleanup_model():
    """Release PyTorch's cached CUDA memory; does nothing without CUDA."""
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

# =============================================================================
# PREDICTION
# =============================================================================

def create_prompt(intervention, cq, template):
    """Fill the template's ``{intervention}`` and ``{cq}`` placeholders."""
    fields = {"intervention": intervention, "cq": cq}
    return template.format(**fields)

def generate_prediction(prompt, tokenizer, model):
    """Generate a model response for ``prompt`` and classify it.

    Args:
        prompt: Fully rendered user prompt text.
        tokenizer: Tokenizer paired with ``model`` (as returned by
            ``load_model``).
        model: Causal language model used for generation.

    Returns:
        The result of ``classify_response`` applied to the cleaned-up
        generated text.

    Note:
        Generation samples (``do_sample=True``), so repeated calls on the same
        prompt may return different predictions unless the caller seeds torch.
    """
    # Format with chat template
    if tokenizer.chat_template:
        messages = [{"role": "user", "content": prompt}]
        # add_generation_prompt=True appends the assistant-turn header so the
        # model answers the user instead of continuing the user's message.
        formatted_input = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        # The rendered template already contains the special tokens it needs;
        # adding them again at tokenization time would duplicate them.
        add_special_tokens = False
    else:
        formatted_input = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
        add_special_tokens = True

    # Tokenize and move the tensors to wherever the model actually lives
    # (with device_map="auto" that is not necessarily the default "cuda").
    inputs = tokenizer(
        formatted_input,
        return_tensors="pt",
        add_special_tokens=add_special_tokens
    )
    inputs = inputs.to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            # Pad and EOS tokens may be identical, so the mask cannot be
            # inferred from the ids; pass it explicitly.
            attention_mask=inputs["attention_mask"],
            max_new_tokens=100,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id
        )

    # Decode only the newly generated tokens (everything after the prompt).
    response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
    # Strip leftover role/template markers from the decoded text.
    response = re.sub(r'<\|assistant\|>|</think>|header|system|user|assistant|subject', "", response)
    response = response.strip()

    return classify_response(response)

def classify_response(response):
    """Map a raw model response to a binary 'good'/'bad' label.

    The label is decided by whichever of the keywords "good", "bad" or
    "invalid" occurs first in the response (case-insensitive substring
    match), so an answer such as "Bad. A good question would ..." is
    classified by the verdict the model stated first. "invalid" is reported
    as 'bad'.

    Args:
        response: Cleaned-up text generated by the model.

    Returns:
        'good' or 'bad' when a keyword is found; otherwise the response is
        returned unchanged so unclassifiable outputs remain inspectable.
    """
    first_keyword = re.search(r'good|bad|invalid', response.lower())
    if first_keyword is None:
        return response  # Return raw response if no clear classification
    return 'good' if first_keyword.group() == 'good' else 'bad'

# =============================================================================
# EVALUATION
# =============================================================================

def evaluate_single_item(item, tokenizer, model, template):
    """Run one intervention/CQ item through the model.

    Builds the prompt from the item's ``intervention`` and ``cq`` fields,
    generates a prediction, and returns a flat record that carries the item's
    identifying fields, its ``label`` (as ``ground_truth``) and the prediction.
    """
    filled_prompt = create_prompt(item['intervention'], item['cq'], template)
    predicted_label = generate_prediction(filled_prompt, tokenizer, model)

    # Copy the identifying fields first so the CSV column order is stable.
    record = {}
    for key in ('intervention_id', 'cq_id', 'intervention', 'cq'):
        record[key] = item[key]
    record['ground_truth'] = item['label']
    record['prediction'] = predicted_label
    return record

def run_evaluation(data_file, prompt_file, output_file='results.csv', models=None):
    """Run the complete evaluation over one or more models.

    Args:
        data_file: CSV file with the items to evaluate.
        prompt_file: Text file containing the prompt template.
        output_file: CSV path that receives the accumulated results. It is
            rewritten every 10 items, after each model, and at the end.
        models: Optional iterable of Hugging Face model names. When omitted,
            the three default small instruct models are evaluated.

    Returns:
        A list of result dicts (one per item per successfully loaded model),
        each extended with a ``model`` key.
    """
    # Load data and template
    data = load_data(data_file)
    template = load_prompt_template(prompt_file)

    if models is None:
        models = [
            "Qwen/Qwen2.5-1.5B-Instruct",
            "Qwen/Qwen3-1.7B",
            "tiiuae/Falcon3-1B-Instruct"
        ]

    all_results = []

    for model_name in models:
        print(f"\n{'='*50}")
        print(f"Evaluating with: {model_name}")
        print(f"{'='*50}")

        # Load model; a model that fails to load is skipped, not fatal.
        tokenizer, model = load_model(model_name)
        if tokenizer is None or model is None:
            continue

        try:
            # Evaluate each item
            for i, item in enumerate(data):
                print(f"Processing {i+1}/{len(data)}: {item['cq_id']}")

                result = evaluate_single_item(item, tokenizer, model, template)
                result['model'] = model_name
                all_results.append(result)

                # Save progress every 10 items
                if (i + 1) % 10 == 0:
                    save_results(all_results, output_file)
                    print(f"Progress saved at {i+1} items")
        finally:
            # Runs on failure too: free the model before the next one loads
            # and persist whatever has been collected so far, so an error
            # mid-run does not lose the items since the last checkpoint.
            del tokenizer, model
            cleanup_model()
            save_results(all_results, output_file)

        print(f"Completed {model_name}")

    # Final save
    save_results(all_results, output_file)
    print(f"\nEvaluation complete. Results saved to {output_file}")
    return all_results

# =============================================================================
# RESULTS MANAGEMENT
# =============================================================================

def save_results(results, filename):
    """Write the result records to ``filename`` as CSV, without an index column."""
    pd.DataFrame(results).to_csv(filename, index=False)

def calculate_accuracy(results_file='results.csv'):
    """Print the accuracy of each model found in a long-format results CSV.

    Accuracy is the fraction of rows whose prediction equals the ground
    truth. When a model's predictions are (at least 90%) binary 'good'/'bad'
    labels, three-way ground truth labels are first collapsed to the same
    binary scheme (Useful -> good, Invalid/Unhelpful -> bad); without this
    the binary predictions could never match the raw labels. Missing
    predictions count as incorrect.

    Args:
        results_file: CSV with ``model``, ``ground_truth`` and ``prediction``
            columns.

    Returns:
        The loaded results DataFrame.
    """
    df = pd.read_csv(results_file)

    print(f"\nAccuracy Results:")
    print("-" * 40)

    binary_mapping = {'Useful': 'good', 'Invalid': 'bad', 'Unhelpful': 'bad'}

    for model in df['model'].unique():
        model_data = df[df['model'] == model]
        predictions = model_data['prediction']
        ground_truth = model_data['ground_truth']

        # Same binary-mode detection rule as the agreement computation.
        answered = predictions.dropna()
        if len(answered) and answered.isin(['good', 'bad']).mean() >= 0.9:
            # Labels outside the mapping (e.g. already binary) are kept as-is.
            ground_truth = ground_truth.map(
                lambda label: binary_mapping.get(label, label)
            )

        # Element-wise equality treats NaN predictions as mismatches instead
        # of failing on mixed string/float values.
        accuracy = (predictions == ground_truth).mean()
        print(f"{model}: {accuracy:.4f}")

    return df


In [None]:
# Evaluate every configured model on the intervention/CQ pairs. No output path
# is passed, so results go to the default 'results.csv' (relative to the
# current working directory) and are also returned as a list of dicts.
results = run_evaluation('/content/extracted_intervention_cq_pairs.csv', "/content/binary_prompt.txt")


Evaluating with: Qwen/Qwen2.5-1.5B-Instruct
Loading Qwen/Qwen2.5-1.5B-Instruct...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


✓ Loaded Qwen/Qwen2.5-1.5B-Instruct
Processing 1/100: secretcurse__137_LLM_us2016reddit_D_meta-llama_Meta-Llama-3-70B-Instruct_5_S
Processing 2/100: JL_3_LLM_moral_maze_schemes_D_meta-llama_Meta-Llama-3-70B-Instruct_7_L
Processing 3/100: CLINTON_225_LLM_US2016_D_meta-llama_Meta-Llama-3-70B-Instruct_5_L
Processing 4/100: MT_14_LLM_moral_maze_schemes_D_meta-llama_Meta-Llama-3-70B-Instruct_3_S
Processing 5/100: TRUMP_140_1_T__3
Processing 6/100: PeanutAllergy_232_T__0
Processing 7/100: TRUMP_240_2_LLM_US2016_D_meta-llama_Meta-Llama-3-70B-Instruct_2_S
Processing 8/100: CLINTON_244_2_LLM_US2016_D_meta-llama_Meta-Llama-3-70B-Instruct_6_S
Processing 9/100: howie_238_LLM_rrd_D_meta-llama_Meta-Llama-3-70B-Instruct_0_L
Processing 10/100: Javier_84_LLM_rrd_D_meta-llama_Meta-Llama-3-70B-Instruct_4_L
Progress saved at 10 items
Processing 11/100: TRUMP_9_LLM_US2016_D_meta-llama_Meta-Llama-3-70B-Instruct_1_S
Processing 12/100: CLINTON_199_1_LLM_US2016_D_meta-llama_Meta-Llama-3-70B-Instruct_5_L
Proces

tokenizer_config.json:   0%|          | 0.00/9.73k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/622M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

✓ Loaded Qwen/Qwen3-1.7B
Processing 1/100: secretcurse__137_LLM_us2016reddit_D_meta-llama_Meta-Llama-3-70B-Instruct_5_S
Processing 2/100: JL_3_LLM_moral_maze_schemes_D_meta-llama_Meta-Llama-3-70B-Instruct_7_L
Processing 3/100: CLINTON_225_LLM_US2016_D_meta-llama_Meta-Llama-3-70B-Instruct_5_L
Processing 4/100: MT_14_LLM_moral_maze_schemes_D_meta-llama_Meta-Llama-3-70B-Instruct_3_S
Processing 5/100: TRUMP_140_1_T__3
Processing 6/100: PeanutAllergy_232_T__0
Processing 7/100: TRUMP_240_2_LLM_US2016_D_meta-llama_Meta-Llama-3-70B-Instruct_2_S
Processing 8/100: CLINTON_244_2_LLM_US2016_D_meta-llama_Meta-Llama-3-70B-Instruct_6_S
Processing 9/100: howie_238_LLM_rrd_D_meta-llama_Meta-Llama-3-70B-Instruct_0_L
Processing 10/100: Javier_84_LLM_rrd_D_meta-llama_Meta-Llama-3-70B-Instruct_4_L
Progress saved at 10 items
Processing 11/100: TRUMP_9_LLM_US2016_D_meta-llama_Meta-Llama-3-70B-Instruct_1_S
Processing 12/100: CLINTON_199_1_LLM_US2016_D_meta-llama_Meta-Llama-3-70B-Instruct_5_L
Processing 13/100

tokenizer_config.json:   0%|          | 0.00/365k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.78M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/656 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.34G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

✓ Loaded tiiuae/Falcon3-1B-Instruct
Processing 1/100: secretcurse__137_LLM_us2016reddit_D_meta-llama_Meta-Llama-3-70B-Instruct_5_S
Processing 2/100: JL_3_LLM_moral_maze_schemes_D_meta-llama_Meta-Llama-3-70B-Instruct_7_L
Processing 3/100: CLINTON_225_LLM_US2016_D_meta-llama_Meta-Llama-3-70B-Instruct_5_L
Processing 4/100: MT_14_LLM_moral_maze_schemes_D_meta-llama_Meta-Llama-3-70B-Instruct_3_S
Processing 5/100: TRUMP_140_1_T__3
Processing 6/100: PeanutAllergy_232_T__0
Processing 7/100: TRUMP_240_2_LLM_US2016_D_meta-llama_Meta-Llama-3-70B-Instruct_2_S
Processing 8/100: CLINTON_244_2_LLM_US2016_D_meta-llama_Meta-Llama-3-70B-Instruct_6_S
Processing 9/100: howie_238_LLM_rrd_D_meta-llama_Meta-Llama-3-70B-Instruct_0_L
Processing 10/100: Javier_84_LLM_rrd_D_meta-llama_Meta-Llama-3-70B-Instruct_4_L
Progress saved at 10 items
Processing 11/100: TRUMP_9_LLM_US2016_D_meta-llama_Meta-Llama-3-70B-Instruct_1_S
Processing 12/100: CLINTON_199_1_LLM_US2016_D_meta-llama_Meta-Llama-3-70B-Instruct_5_L
Proces

In [None]:
import pandas as pd

def reformat_results(input_file, output_file='results_wide.csv'):
    """Pivot long-format results into one row per item with a column per model.

    Args:
        input_file: Long-format results CSV (one row per item per model).
        output_file: Destination CSV for the wide table.

    Returns:
        The wide DataFrame: the identifying columns followed by one
        prediction column per model.
    """
    long_df = pd.read_csv(input_file)

    # Columns that identify an item; everything else is spread by model.
    id_columns = ['intervention_id', 'cq_id', 'intervention', 'cq', 'ground_truth']
    pivoted = long_df.pivot_table(
        index=id_columns,
        columns='model',
        values='prediction',
        aggfunc='first'
    )
    wide_df = pivoted.reset_index()

    # Drop the leftover 'model' label on the column axis.
    wide_df.columns.name = None

    wide_df.to_csv(output_file, index=False)
    print(f"Reformatted results saved to {output_file}")
    return wide_df



In [None]:
from collections import Counter
def add_majority_vote(input_file, output_file='results_with_majority.csv'):
    """Add a ``majority_vote`` column to a wide-format results CSV.

    For each row the non-missing model predictions are counted. The most
    frequent prediction becomes the majority vote only if it occurs more than
    once AND no other prediction ties with it; otherwise the row is marked
    "NOT DECIDED". Rows with no predictions at all get ``None``.

    Args:
        input_file: Wide-format CSV (identifying columns plus one column per
            model).
        output_file: Destination CSV for the table with the added column.

    Returns:
        The DataFrame including the ``majority_vote`` column.
    """
    wide_df = pd.read_csv(input_file)

    # Get model columns (exclude base info columns). A 'majority_vote' column
    # left over from an earlier run is not a model and must not vote.
    base_cols = ['intervention_id', 'cq_id', 'intervention', 'cq', 'ground_truth']
    model_cols = [
        col for col in wide_df.columns
        if col not in base_cols and col != 'majority_vote'
    ]

    # Calculate majority vote for each row
    majority_votes = []
    for _, row in wide_df.iterrows():
        predictions = [row[col] for col in model_cols if pd.notna(row[col])]
        if predictions:
            ranked = Counter(predictions).most_common(2)
            top_label, top_count = ranked[0]
            # A runner-up with the same count means there is no majority
            # (e.g. a 2-2 split between four models).
            is_tied = len(ranked) > 1 and ranked[1][1] == top_count
            if top_count > 1 and not is_tied:
                majority_vote = top_label
            else:
                majority_vote = "NOT DECIDED"
        else:
            majority_vote = None
        majority_votes.append(majority_vote)

    # Add majority vote column
    wide_df['majority_vote'] = majority_votes

    wide_df.to_csv(output_file, index=False)
    print(f"Results with majority vote saved to {output_file}")
    return wide_df

In [None]:
def calculate_agreement(csv_file):
    """Print each prediction column's agreement with the ground truth.

    Every column other than the identifying/base columns is treated as a
    prediction column. A column is handled in binary mode when at least 90%
    of its non-missing values are 'good' or 'bad'; the ground truth is then
    collapsed to the same scheme (Useful -> good, Invalid/Unhelpful -> bad)
    before comparing. Ground truth values outside that mapping (for example
    labels that are already 'good'/'bad') are compared unchanged. Otherwise
    predictions are compared directly against the raw ground truth.

    Agreement is the fraction of all rows where the values are equal, so
    missing predictions count as disagreement.

    Args:
        csv_file: Wide-format results CSV.
    """
    df = pd.read_csv(csv_file)
    base_cols = ['intervention_id', 'cq_id', 'intervention', 'cq', 'ground_truth']
    model_cols = [col for col in df.columns if col not in base_cols]

    # Binary view of the ground truth; identical for every column, so it is
    # computed once. Unmapped labels pass through instead of becoming NaN.
    mapping = {'Useful': 'good', 'Invalid': 'bad', 'Unhelpful': 'bad'}
    ground_truth_binary = df['ground_truth'].map(
        lambda label: mapping.get(label, label)
    )

    for col in model_cols:

        # Check if this model's predictions are in binary format
        col_values = df[col].dropna()
        binary_values = col_values.isin(['good', 'bad'])
        is_binary = binary_values.mean() >= 0.9

        if is_binary:
            agreement = (df[col] == ground_truth_binary).mean()
        else:
            # Direct comparison for 3-way data
            agreement = (df[col] == df['ground_truth']).mean()

        print(f"{col}: {agreement:.3f}")

In [None]:
# Post-evaluation pipeline
# 1) build a tidy wide table: one row per cq_id, one prediction column per model
# 2) add a column holding the majority vote across the models
# 3) compute agreement with the ground truth, per model and for the majority vote

In [None]:
# Each step reads the file written by the previous one. The first two calls
# write to their default output names, which are relative to the current
# working directory, while the following step reads from /content — so this
# cell assumes the working directory is /content.
reformat_results("/content/results.csv")
add_majority_vote("/content/results_wide.csv")
calculate_agreement("/content/results_with_majority.csv")

Reformatted results saved to results_wide.csv
Results with majority vote saved to results_with_majority.csv
Qwen/Qwen2.5-1.5B-Instruct: 0.380
Qwen/Qwen3-1.7B: 0.400
tiiuae/Falcon3-1B-Instruct: 0.720
majority_vote: 0.460
