# Eva-4B-V2 Inference for EvasionBench

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/IIIIQIIII/EvasionBench/blob/main/scripts/eva4b_inference.ipynb)

This notebook demonstrates how to use the Eva-4B-V2 model to detect evasive answers in earnings call Q&A sessions.

- **Model**: [FutureMa/Eva-4B-V2](https://huggingface.co/FutureMa/Eva-4B-V2)
- **Dataset**: [FutureMa/EvasionBench](https://huggingface.co/datasets/FutureMa/EvasionBench)
- **GitHub**: [IIIIQIIII/EvasionBench](https://github.com/IIIIQIIII/EvasionBench)

## 1. Install Dependencies

In [None]:
!pip install -q transformers datasets accelerate

## 2. Import Libraries

In [None]:
import json
import re
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

## 3. Define Prompt Template

In [None]:
# Instruction prompt filled in per Q&A pair via str.format(). The doubled
# braces around the JSON example are format escapes that render as literal
# "{" and "}"; only {question} and {answer} are substituted.
PROMPT_TEMPLATE = """You are a financial analyst. Your task is to Detect Evasive Answers in Financial Q&A

Question: {question}
Answer: {answer}

Response format:
```json
{{"label": "direct|intermediate|fully_evasive"}}
```

Answer in ```json content, no other text"""

## 4. Load Model

In [None]:
# Hugging Face Hub identifier of the checkpoint to load.
model_name = "FutureMa/Eva-4B-V2"

print(f"Loading model: {model_name}")
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Load the weights in bfloat16 and let the library choose device placement
# (device_map="auto") instead of moving the model manually.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
print(f"Model loaded on device: {model.device}")

## 5. Load EvasionBench Dataset

In [None]:
# Fetch the "train" split of the benchmark from the Hugging Face Hub.
dataset = load_dataset("FutureMa/EvasionBench", split="train")
print(f"Dataset loaded: {len(dataset)} samples")
print(f"Columns: {dataset.column_names}")

# Sample subset for demo: shuffle with a fixed seed, then keep the first
# NUM_SAMPLES rows, so the demo runs on a small, seeded selection.
NUM_SAMPLES = 5
samples = dataset.shuffle(seed=42).select(range(NUM_SAMPLES))
print(f"Selected {NUM_SAMPLES} samples for inference")

## 6. Define Helper Functions

In [None]:
def parse_response(response: str) -> dict:
    """Extract the label JSON object from raw model output.

    Two patterns are tried in priority order: a JSON object wrapped in a
    ```json fenced block, then any brace-delimited span without nested
    braces that contains the text "label". The first candidate that decodes
    as JSON is returned; if none does, ``{"label": "unknown"}`` is returned.
    """
    # Each entry: (regex, regex flags, capture group holding the JSON text).
    attempts = (
        (r'```json\s*(\{.*?\})\s*```', re.DOTALL, 1),
        (r'\{[^{}]*"label"[^{}]*\}', 0, 0),
    )

    for pattern, flags, group in attempts:
        found = re.search(pattern, response, flags)
        if found is None:
            continue
        try:
            return json.loads(found.group(group))
        except json.JSONDecodeError:
            # Candidate was not valid JSON; fall through to the next pattern.
            continue

    return {"label": "unknown"}


def predict(question: str, answer: str) -> tuple:
    """Run inference on a single Q&A pair.

    Args:
        question: Question text inserted into PROMPT_TEMPLATE.
        answer: Answer text inserted into PROMPT_TEMPLATE.

    Returns:
        A ``(parsed, raw)`` tuple: ``parsed`` is the dict produced by
        ``parse_response`` and ``raw`` is the decoded text of the newly
        generated tokens only (the prompt is excluded).
    """
    prompt = PROMPT_TEMPLATE.format(question=question, answer=answer)

    messages = [{"role": "user", "content": prompt}]
    # enable_thinking is forwarded to the model's chat template.
    # NOTE(review): presumably this switches off a reasoning/"thinking"
    # preamble in the template - confirm against the model card.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )

    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    with torch.no_grad():
        # Greedy decoding (do_sample=False). No temperature is passed:
        # it only applies when sampling, so combining it with
        # do_sample=False has no effect on the output.
        outputs = model.generate(
            **inputs,
            max_new_tokens=64,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the generated tokens: generate() returns prompt +
    # continuation, so skip the first input_ids-length positions.
    prompt_length = inputs["input_ids"].shape[1]
    generated_tokens = outputs[0][prompt_length:]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    return parse_response(response), response

## 7. Run Inference

In [None]:
print("=" * 60)
print(f"Running inference on {len(samples)} samples")
print("=" * 60)
print()

# Running tally of exact label matches and the per-sample records that the
# summary step reads afterwards.
correct = 0
results = []

for i, sample in enumerate(samples):
    uid = sample["uid"]
    question = sample["question"]
    answer = sample["answer"]
    # Reference label the prediction is compared against.
    gold_label = sample["eva4b_label"]
    
    # Truncate for display (the full text is still what gets sent to the
    # model). The conditional expression applies to the whole concatenation:
    # text of 100 characters or fewer is shown unchanged, without "...".
    q_display = question[:100] + "..." if len(question) > 100 else question
    a_display = answer[:100] + "..." if len(answer) > 100 else answer
    
    print(f"Sample {i+1}/{len(samples)}")
    print(f"  UID: {uid[:16]}...")
    print(f"  Question: {q_display}")
    print(f"  Answer: {a_display}")
    
    # raw_response is the undecoded-to-JSON model text; only the parsed
    # result is used below.
    result, raw_response = predict(question, answer)
    # Parsed JSON without a "label" key is treated the same as a parse failure.
    pred_label = result.get("label", "unknown")
    
    # Exact string match against the reference label.
    is_correct = pred_label == gold_label
    correct += int(is_correct)
    
    status = "✓" if is_correct else "✗"
    print(f"  Gold: {gold_label} | Pred: {pred_label} {status}")
    print()
    
    results.append({
        "uid": uid,
        "gold": gold_label,
        "pred": pred_label,
        "correct": is_correct
    })

## 8. Summary

In [None]:
# Percentage of samples whose predicted label matched the reference label.
# Guard against an empty sample set (e.g. NUM_SAMPLES = 0), which would
# otherwise raise ZeroDivisionError.
accuracy = correct / len(samples) * 100 if len(samples) else 0.0

print("=" * 60)
print(f"Results: {correct}/{len(samples)} correct ({accuracy:.1f}% accuracy)")
print("=" * 60)

# Display results as table (the bare `df` expression on the last line is
# what makes the notebook render it).
import pandas as pd
df = pd.DataFrame(results)
df