In [None]:
import re
import csv

import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.metrics import accuracy_score

# =============================================================================
# DATA LOADING
# =============================================================================

def load_data(csv_file_path):
    """Load rows from a CSV file into a list of dicts.

    Args:
        csv_file_path: Path to a UTF-8 encoded CSV file with a header row.

    Returns:
        A list with one dict per data row, keyed by the header names.
    """
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires; otherwise line breaks embedded inside quoted
    # fields (e.g. multi-line intervention text) get silently rewritten.
    with open(csv_file_path, 'r', encoding='utf-8', newline='') as f:
        return list(csv.DictReader(f))

def load_prompt_template(prompt_file):
    """Read the prompt template file and return its text.

    Leading and trailing whitespace is removed from the returned string.
    """
    with open(prompt_file, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return raw_text.strip()

# =============================================================================
# MODEL MANAGEMENT
# =============================================================================

def load_model(model_name):
    """Load the tokenizer and causal-LM weights for ``model_name``.

    Half precision and automatic device placement are requested when CUDA
    is available; otherwise float32 with no device map is used. A tokenizer
    without a pad token gets its EOS token as the pad token.

    Returns:
        ``(tokenizer, model)`` on success, or ``(None, None)`` when anything
        goes wrong while loading (the error is printed, not raised).
    """
    print(f"Loading {model_name}...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)

        has_gpu = torch.cuda.is_available()
        load_kwargs = {
            "torch_dtype": torch.float16 if has_gpu else torch.float32,
            "device_map": "auto" if has_gpu else None,
        }
        model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

        # Fall back to EOS for padding when the tokenizer defines no pad token.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        print(f"✓ Loaded {model_name}")
        return tokenizer, model
    except Exception as e:
        # Best-effort: report the failure and let the caller skip this model.
        print(f"✗ Failed to load {model_name}: {e}")
        return None, None

def cleanup_model():
    """Release cached CUDA memory; does nothing when CUDA is unavailable."""
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

# =============================================================================
# PREDICTION
# =============================================================================

def create_prompt(intervention, cq, template):
    """Fill the ``{intervention}`` and ``{cq}`` placeholders of ``template``."""
    placeholders = {"intervention": intervention, "cq": cq}
    return template.format(**placeholders)

def generate_prediction(prompt, tokenizer, model):
    """Generate a model response for ``prompt`` and classify it.

    Args:
        prompt: Fully formatted user prompt text.
        tokenizer: Tokenizer returned by ``load_model``.
        model: Causal LM returned by ``load_model``.

    Returns:
        The result of ``classify_response`` applied to the cleaned-up
        generated text.
    """
    # Format with chat template
    uses_chat_template = bool(tokenizer.chat_template)
    if uses_chat_template:
        messages = [{"role": "user", "content": prompt}]
        # add_generation_prompt=True appends the assistant-turn header so the
        # model answers the user instead of continuing the user message.
        formatted_input = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
    else:
        formatted_input = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"

    # Templated text already carries its special tokens; adding them again
    # at tokenization time would duplicate e.g. a BOS token.
    inputs = tokenizer(
        formatted_input,
        return_tensors="pt",
        add_special_tokens=not uses_chat_template,
    )
    # Follow the model's actual device rather than assuming "cuda".
    inputs = inputs.to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # Generate (sampling is enabled, so outputs vary between runs).
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            # Pass the mask explicitly: pad and EOS may share an id, in which
            # case generate() cannot infer it reliably.
            attention_mask=inputs["attention_mask"],
            max_new_tokens=100,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id
        )

    # Decode only the newly generated tokens, then strip leftover role markers.
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    response = re.sub(r'<\|assistant\|>|</think>|header|system|user|assistant|subject', "", response)
    response = response.strip()

    return classify_response(response)

def classify_response(response):
    """Classify a raw model response as Useful / Unhelpful / Invalid.

    Matching is case-insensitive. When several labels occur in the text, the
    one mentioned first wins, so an answer such as
    "Unhelpful. It is not useful." is classified as 'Unhelpful' rather than
    being overridden by a label that only appears in the explanation.

    Args:
        response: The model's text output.

    Returns:
        'Useful', 'Unhelpful' or 'Invalid' when a label is found; otherwise
        the raw ``response`` unchanged.
    """
    response_lower = response.lower()

    chosen_label = None
    chosen_position = None
    for label in ('Useful', 'Unhelpful', 'Invalid'):
        position = response_lower.find(label.lower())
        if position == -1:
            continue
        if chosen_position is None or position < chosen_position:
            chosen_label, chosen_position = label, position

    if chosen_label is None:
        return response  # Return raw response if no clear classification
    return chosen_label

# =============================================================================
# EVALUATION
# =============================================================================

def evaluate_single_item(item, tokenizer, model, template):
    """Run one intervention / critical-question pair through the model.

    Returns a dict holding the item's identifying fields, its ground-truth
    label (read from ``item['label']``) and the model's prediction.
    """
    intervention_text = item['intervention']
    question_text = item['cq']
    prediction = generate_prediction(
        create_prompt(intervention_text, question_text, template),
        tokenizer,
        model,
    )

    record = {key: item[key] for key in ('intervention_id', 'cq_id')}
    record['intervention'] = intervention_text
    record['cq'] = question_text
    record['ground_truth'] = item['label']
    record['prediction'] = prediction
    return record

def run_evaluation(data_file, prompt_file, output_file='results.csv', models=None):
    """Run the complete evaluation over every model and every data item.

    Args:
        data_file: CSV file with the items to evaluate (see ``load_data``).
        prompt_file: Text file holding the prompt template.
        output_file: CSV file that receives the accumulated results.
        models: Optional iterable of Hugging Face model names. When omitted,
            the three default small instruct models are evaluated.

    Returns:
        A list of result dicts (one per item per successfully loaded model),
        each extended with a ``'model'`` key.
    """
    # Load data and template
    data = load_data(data_file)
    template = load_prompt_template(prompt_file)

    if models is None:
        models = (
            "Qwen/Qwen2.5-1.5B-Instruct",
            "Qwen/Qwen3-1.7B",
            "tiiuae/Falcon3-1B-Instruct",
        )

    all_results = []
    total_items = len(data)

    for model_name in models:
        print(f"\n{'='*50}")
        print(f"Evaluating with: {model_name}")
        print(f"{'='*50}")

        # Load model; a model that fails to load is skipped, not fatal.
        tokenizer, model = load_model(model_name)
        if tokenizer is None or model is None:
            continue

        try:
            # Evaluate each item
            for position, item in enumerate(data, start=1):
                print(f"Processing {position}/{total_items}: {item['cq_id']}")

                result = evaluate_single_item(item, tokenizer, model, template)
                result['model'] = model_name
                all_results.append(result)

                # Save progress every 10 items
                if position % 10 == 0:
                    save_results(all_results, output_file)
                    print(f"Progress saved at {position} items")
        finally:
            # Even if an item raised, persist what was collected so far and
            # release the model before the error propagates.
            save_results(all_results, output_file)
            del tokenizer, model
            cleanup_model()

        print(f"Completed {model_name}")

    # Final save
    save_results(all_results, output_file)
    print(f"\nEvaluation complete. Results saved to {output_file}")
    return all_results

# =============================================================================
# RESULTS MANAGEMENT
# =============================================================================

def save_results(results, filename):
    """Write the result records to ``filename`` as CSV, without an index column."""
    pd.DataFrame(results).to_csv(filename, index=False)

def calculate_accuracy(results_file='results.csv'):
    """Print the accuracy of each model found in a results CSV.

    Args:
        results_file: CSV produced by ``save_results``; must contain the
            ``model``, ``ground_truth`` and ``prediction`` columns.

    Returns:
        The results DataFrame exactly as read from ``results_file``.
    """
    df = pd.read_csv(results_file)

    print("\nAccuracy Results:")
    print("-" * 40)

    for model in df['model'].unique():
        model_data = df[df['model'] == model]
        # After the CSV round-trip, empty or "NA"-like predictions come back
        # as float NaN. Compare plain strings so the metric sees a single
        # label type and a missing prediction simply counts as a miss.
        y_true = model_data['ground_truth'].astype(str)
        y_pred = model_data['prediction'].fillna('').astype(str)
        accuracy = accuracy_score(y_true, y_pred)
        print(f"{model}: {accuracy:.4f}")

    return df



In [None]:
import time

# Time the whole pipeline: evaluate the dataset with run_evaluation, then
# print per-model accuracy from the saved results file.
start_time = time.time()
results = run_evaluation('/content/extracted_intervention_cq_pairs.csv', '/content/scheme_classic_prompt.txt', 'results.csv')
accuracy_df = calculate_accuracy('results.csv')
end_time = time.time()

# Elapsed wall-clock time covers both the evaluation and the accuracy report.
execution_time = end_time - start_time
print(f"Execution time: {execution_time:.2f} seconds")


Evaluating with: Qwen/Qwen2.5-1.5B-Instruct
Loading Qwen/Qwen2.5-1.5B-Instruct...
✓ Loaded Qwen/Qwen2.5-1.5B-Instruct
Processing 1/100: secretcurse__137_LLM_us2016reddit_D_meta-llama_Meta-Llama-3-70B-Instruct_5_S
Processing 2/100: JL_3_LLM_moral_maze_schemes_D_meta-llama_Meta-Llama-3-70B-Instruct_7_L
Processing 3/100: CLINTON_225_LLM_US2016_D_meta-llama_Meta-Llama-3-70B-Instruct_5_L
Processing 4/100: MT_14_LLM_moral_maze_schemes_D_meta-llama_Meta-Llama-3-70B-Instruct_3_S
Processing 5/100: TRUMP_140_1_T__3
Processing 6/100: PeanutAllergy_232_T__0
Processing 7/100: TRUMP_240_2_LLM_US2016_D_meta-llama_Meta-Llama-3-70B-Instruct_2_S
Processing 8/100: CLINTON_244_2_LLM_US2016_D_meta-llama_Meta-Llama-3-70B-Instruct_6_S
Processing 9/100: howie_238_LLM_rrd_D_meta-llama_Meta-Llama-3-70B-Instruct_0_L
Processing 10/100: Javier_84_LLM_rrd_D_meta-llama_Meta-Llama-3-70B-Instruct_4_L
Progress saved at 10 items
Processing 11/100: TRUMP_9_LLM_US2016_D_meta-llama_Meta-Llama-3-70B-Instruct_1_S
Processing

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✓ Loaded Qwen/Qwen3-1.7B
Processing 1/100: secretcurse__137_LLM_us2016reddit_D_meta-llama_Meta-Llama-3-70B-Instruct_5_S
Processing 2/100: JL_3_LLM_moral_maze_schemes_D_meta-llama_Meta-Llama-3-70B-Instruct_7_L
Processing 3/100: CLINTON_225_LLM_US2016_D_meta-llama_Meta-Llama-3-70B-Instruct_5_L
Processing 4/100: MT_14_LLM_moral_maze_schemes_D_meta-llama_Meta-Llama-3-70B-Instruct_3_S
Processing 5/100: TRUMP_140_1_T__3
Processing 6/100: PeanutAllergy_232_T__0
Processing 7/100: TRUMP_240_2_LLM_US2016_D_meta-llama_Meta-Llama-3-70B-Instruct_2_S
Processing 8/100: CLINTON_244_2_LLM_US2016_D_meta-llama_Meta-Llama-3-70B-Instruct_6_S
Processing 9/100: howie_238_LLM_rrd_D_meta-llama_Meta-Llama-3-70B-Instruct_0_L
Processing 10/100: Javier_84_LLM_rrd_D_meta-llama_Meta-Llama-3-70B-Instruct_4_L
Progress saved at 10 items
Processing 11/100: TRUMP_9_LLM_US2016_D_meta-llama_Meta-Llama-3-70B-Instruct_1_S
Processing 12/100: CLINTON_199_1_LLM_US2016_D_meta-llama_Meta-Llama-3-70B-Instruct_5_L
Processing 13/100

In [None]:
import pandas as pd

def reformat_results(input_file, output_file='results_wide.csv'):
    """Pivot long-format results into one row per item, one column per model.

    Reads ``input_file``, spreads each model's prediction into its own
    column, writes the table to ``output_file`` and returns it.
    """
    long_df = pd.read_csv(input_file)

    # Columns that identify an item; everything else is spread by model.
    key_columns = ['intervention_id', 'cq_id', 'intervention', 'cq', 'ground_truth']
    pivoted = long_df.pivot_table(
        index=key_columns,
        columns='model',
        values='prediction',
        aggfunc='first',
    )
    wide_df = pivoted.reset_index()

    # Drop the leftover 'model' label on the column axis.
    wide_df.columns.name = None

    wide_df.to_csv(output_file, index=False)
    print(f"Reformatted results saved to {output_file}")
    return wide_df

# Pivot the long-format results into the default wide-format output file.
reformatted = reformat_results('results.csv')

Reformatted results saved to results_wide.csv


In [None]:
from collections import Counter
def add_majority_vote(input_file, output_file='results_with_majority.csv'):
    """Append a ``majority_vote`` column to a wide-format results CSV.

    Every column that is not one of the base item columns is treated as a
    model's prediction. For each row the most common non-missing prediction
    is recorded (``None`` when the row has no predictions at all). The
    table is written to ``output_file`` and returned.
    """
    wide_df = pd.read_csv(input_file)

    # Anything outside the base item columns counts as a model column.
    item_columns = ['intervention_id', 'cq_id', 'intervention', 'cq', 'ground_truth']
    model_cols = [name for name in wide_df.columns if name not in item_columns]

    def winning_label(row):
        """Return the most frequent non-missing prediction in ``row``."""
        ballots = [row[name] for name in model_cols if pd.notna(row[name])]
        if not ballots:
            return None
        return Counter(ballots).most_common(1)[0][0]

    wide_df['majority_vote'] = [winning_label(row) for _, row in wide_df.iterrows()]

    wide_df.to_csv(output_file, index=False)
    print(f"Results with majority vote saved to {output_file}")
    return wide_df

In [None]:
# Add the majority-vote column to the wide results; output goes to the default file.
add_majority_vote("/content/results_wide.csv")

Results with majority vote saved to results_with_majority.csv


Unnamed: 0,intervention_id,cq_id,intervention,cq,ground_truth,Qwen/Qwen2.5-1.5B-Instruct,Qwen/Qwen3-1.7B,tiiuae/Falcon3-1B-Instruct,majority_vote
0,17th_knight__247,17th_knight__247_LLM_us2016reddit_D_meta-llama...,"17th: ""They should get the coverage\nThey are ...","What is the author's definition of ""they are a...",Useful,Useful,Unhelpful,Unhelpful,Unhelpful
1,Antanagoge_104,Antanagoge_104_LLM_rrd_D_meta-llama_Meta-Llama...,"Antanagoge: ""The airline industry can not be l...",How significant are the psychological effects ...,Useful,Useful,Unhelpful,Useful,Useful
2,Bill_106,Bill_106_LLM_rrd_D_meta-llama_Meta-Llama-3-70B...,"Bill: ""I run a retail business.\nIf I changed ...",Has Bill explored alternative explanations for...,Useful,Unhelpful,Unhelpful,Useful,Unhelpful
3,CLINTON_103,CLINTON_103_T__1,"CLINTON: ""I think you 've seen another example...","If so, can the practical inconsistency between...",Unhelpful,Unhelpful,Unhelpful,Useful,Unhelpful
4,CLINTON_103,CLINTON_103_T__3,"CLINTON: ""I think you 've seen another example...",Is it possible for the particular case of Trum...,Unhelpful,Unhelpful,Unhelpful,Unhelpful,Unhelpful
...,...,...,...,...,...,...,...,...,...
95,smg_72,smg_72_LLM_rrd_D_meta-llama_Meta-Llama-3-70B-I...,"smg: ""It has become abundantly clear that comm...","How would these regulations ""level the playing...",Useful,Unhelpful,Unhelpful,Useful,Unhelpful
96,smg_72,smg_72_LLM_rrd_D_meta-llama_Meta-Llama-3-70B-I...,"smg: ""It has become abundantly clear that comm...",How would these regulations affect the airline...,Useful,Unhelpful,Unhelpful,Unhelpful,Unhelpful
97,smg_72,smg_72_LLM_rrd_D_meta-llama_Meta-Llama-3-70B-I...,"smg: ""It has become abundantly clear that comm...",Are there any alternative solutions to the pro...,Unhelpful,Unhelpful,Unhelpful,Unhelpful,Unhelpful
98,travellots_133_1,travellots_133_1_LLM_rrd_D_meta-llama_Meta-Lla...,"travellots: ""There should be no discrimination...",Are there any differences between purchasing a...,Useful,Useful,Unhelpful,Useful,Useful
