# SST-2 Instruction Learning 多模型评估

## 1. 环境配置与数据加载

In [None]:
import os
import ssl
import urllib3
import re
import gc

import pandas as pd

from datasets import load_dataset
from sklearn.metrics import accuracy_score
import transformers
import torch

# Report the installed PyTorch version and whether a CUDA device is usable.
print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Only query the device name when CUDA is present; index 0 is the first device.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Load the SST-2 dataset (the "sst2" configuration of "glue") and keep only
# its validation split, which is what every model below is scored on.
dataset = load_dataset("glue", "sst2")
val_dataset = dataset["validation"]
# Print the number of validation samples and the first record as a sanity check.
print(f"验证集样本数: {len(val_dataset)}")
print(val_dataset[0])


In [None]:
# Configuration
# Number of prompts sent to the generation pipeline per call; the same value
# is also passed to the pipeline as its `batch_size`.
BATCH_SIZE = 32

# Prompt template. The single `{}` placeholder is filled with the review
# sentence via `str.format`, and the text ends with the "### Response:" marker
# that `extract_prediction` later uses to locate the model's answer.
# NOTE(review): the instruction reads "either 0 and 1" (probably meant
# "0 or 1"). It is left untouched because editing the prompt would change
# model outputs and make results incomparable with earlier runs.
prompt_style = """Below is an instruction that describes a task, paired with
an input that provides further context. Write a response that appropriately
completes the request.

### Instruction:
Analyze the given text from an online review and determine the sentiment
polarity. Return a single number of either 0 and 1, with 0 being negative
and 1 being the positive sentiment.

### Input:
{}

### Response:
"""

def extract_prediction(text):
    """Extract the predicted label (0 or 1) from a model's generated text.

    Only the text after the last "### Response:" marker is inspected (the
    whole text when the marker is absent). The first standalone ``0`` or
    ``1`` wins; failing that, the words "negative" / "positive" are looked
    up case-insensitively, with "negative" taking precedence.

    Returns:
        0 or 1 when a label could be recovered, otherwise -1.
    """
    # rpartition yields the full text as the tail when the marker is missing.
    answer = text.rpartition("### Response:")[2]

    digit = re.search(r'\b([01])\b', answer)
    if digit is not None:
        return int(digit.group(1))

    # Fall back to sentiment words; order matters ("negative" is checked first).
    lowered = answer.lower()
    for keyword, label in (("negative", 0), ("positive", 1)):
        if keyword in lowered:
            return label

    return -1

def evaluate_model(model_id, model_name):
    """Evaluate a text-generation model on the SST-2 validation split.

    Every validation sentence is wrapped in ``prompt_style``, generated on in
    batches of ``BATCH_SIZE``, and the label is parsed from the output with
    ``extract_prediction``. Outputs that cannot be parsed yield ``-1``, which
    never equals a true label (assumed to be 0/1) and so counts as an error.

    Side effects: prints progress, writes the per-sample predictions to
    ``sst2_{model_name}.csv`` in the working directory, and releases the
    model and cached GPU memory before returning (also when generation fails).

    Args:
        model_id: Model identifier handed to ``transformers.pipeline``.
        model_name: Short name used to build the output CSV file name.

    Returns:
        Accuracy over the whole validation split, as computed by
        ``accuracy_score``.
    """
    print(f"加载模型: {model_id}")

    pipeline = transformers.pipeline(
        "text-generation",
        model=model_id,
        max_new_tokens=64,
        model_kwargs={"torch_dtype": torch.bfloat16},
        device_map="auto",
        batch_size=BATCH_SIZE,
    )

    # How often (in samples) to print a running accuracy.
    report_every = 200

    try:
        # Batched generation needs a pad token; reuse EOS and pad on the left
        # so that generated tokens directly follow each prompt.
        pipeline.tokenizer.pad_token_id = pipeline.tokenizer.eos_token_id
        pipeline.tokenizer.padding_side = "left"

        predictions = []
        idx_list = list(val_dataset['idx'])
        true_labels = list(val_dataset['label'])
        prompts = [prompt_style.format(item['sentence']) for item in val_dataset]
        total = len(prompts)

        print(f"开始评估，共 {total} 个样本...")

        next_report = report_every
        for i in range(0, total, BATCH_SIZE):
            responses = pipeline(prompts[i:i + BATCH_SIZE])
            # Each response is a list of candidates; take the first one's text.
            predictions.extend(
                extract_prediction(response[0]["generated_text"])
                for response in responses
            )

            # Report whenever a multiple of `report_every` has been crossed and
            # after the final batch. Testing `(i + BATCH_SIZE) % 200 == 0`
            # instead would only fire where the batch size and 200 share a
            # multiple (every 800 samples for BATCH_SIZE = 32).
            done = len(predictions)
            if done >= next_report or done >= total:
                current_acc = accuracy_score(true_labels[:done], predictions)
                print(f"进度: {done}/{len(val_dataset)}，当前准确率: {current_acc:.4f}")
                next_report = (done // report_every + 1) * report_every
    finally:
        # Free the model and cached GPU memory even if generation failed, so
        # the next model evaluated in this session can be loaded.
        del pipeline
        gc.collect()
        torch.cuda.empty_cache()

    accuracy = accuracy_score(true_labels, predictions)

    # Save the predictions; quoting=3 is csv.QUOTE_NONE (fields are integers).
    result_output = pd.DataFrame(data={"idx": idx_list, "prediction": predictions})
    csv_name = f"sst2_{model_name}.csv"
    result_output.to_csv(csv_name, index=False, quoting=3)
    print(f"预测结果已保存到 {csv_name}")

    print(f"\n最终准确率: {accuracy:.4f} ({accuracy*100:.2f}%)")
    return accuracy

# Collects results across the cells below: maps each model's display name to
# the accuracy returned by evaluate_model.
all_results = {}

## 2. Qwen 系列

In [None]:
# Qwen2.5-0.5B: evaluate the instruct checkpoint and record its accuracy.
# The second argument is the short name used for the predictions CSV file.
acc = evaluate_model("Qwen/Qwen2.5-0.5B-Instruct", "Qwen2.5-0.5B")
all_results["Qwen2.5-0.5B"] = acc

In [None]:
# Qwen2.5-1.5B: evaluate the instruct checkpoint and record its accuracy.
# The second argument is the short name used for the predictions CSV file.
acc = evaluate_model("Qwen/Qwen2.5-1.5B-Instruct", "Qwen2.5-1.5B")
all_results["Qwen2.5-1.5B"] = acc

In [None]:
# Qwen2.5-3B: evaluate the instruct checkpoint and record its accuracy.
# The second argument is the short name used for the predictions CSV file.
acc = evaluate_model("Qwen/Qwen2.5-3B-Instruct", "Qwen2.5-3B")
all_results["Qwen2.5-3B"] = acc

In [None]:
# Qwen2.5-7B: evaluate the instruct checkpoint and record its accuracy.
# The second argument is the short name used for the predictions CSV file.
acc = evaluate_model("Qwen/Qwen2.5-7B-Instruct", "Qwen2.5-7B")
all_results["Qwen2.5-7B"] = acc

## 3. Llama 系列

In [None]:
# Llama-3.2-1B: evaluate the instruct checkpoint and record its accuracy.
# The second argument is the short name used for the predictions CSV file.
acc = evaluate_model("meta-llama/Llama-3.2-1B-Instruct", "Llama-3.2-1B")
all_results["Llama-3.2-1B"] = acc

In [None]:
# Llama-3.2-3B: evaluate the instruct checkpoint and record its accuracy.
# The second argument is the short name used for the predictions CSV file.
acc = evaluate_model("meta-llama/Llama-3.2-3B-Instruct", "Llama-3.2-3B")
all_results["Llama-3.2-3B"] = acc

In [None]:
# Llama-3.1-8B: evaluate the instruct checkpoint and record its accuracy.
# The second argument is the short name used for the predictions CSV file.
acc = evaluate_model("meta-llama/Llama-3.1-8B-Instruct", "Llama-3.1-8B")
all_results["Llama-3.1-8B"] = acc

## 4. Gemma 系列

In [None]:
# Gemma-2-2B: evaluate the "google/gemma-2-2b-it" checkpoint and record its accuracy.
# The second argument is the short name used for the predictions CSV file.
acc = evaluate_model("google/gemma-2-2b-it", "Gemma-2-2B")
all_results["Gemma-2-2B"] = acc

In [None]:
# Gemma-2-9B: evaluate the "google/gemma-2-9b-it" checkpoint and record its accuracy.
# The second argument is the short name used for the predictions CSV file.
acc = evaluate_model("google/gemma-2-9b-it", "Gemma-2-9B")
all_results["Gemma-2-9B"] = acc

## 5. Phi-4

In [None]:
# Phi-4: evaluate the "microsoft/phi-4" checkpoint and record its accuracy.
# The second argument is the short name used for the predictions CSV file.
acc = evaluate_model("microsoft/phi-4", "Phi-4")
all_results["Phi-4"] = acc

## 6. Mistral 系列

In [None]:
# Mistral-7B: evaluate the instruct v0.3 checkpoint and record its accuracy.
# The second argument is the short name used for the predictions CSV file.
acc = evaluate_model("mistralai/Mistral-7B-Instruct-v0.3", "Mistral-7B")
all_results["Mistral-7B"] = acc

## 7. 结果汇总

In [None]:
# Print every collected result, best accuracy first.
print("=" * 50)
print("SST-2 Instruction Learning Results")
print("=" * 50)
print(f"{'Model':<25} {'Accuracy':<15}")
print("-" * 40)
for name, acc in sorted(all_results.items(), key=lambda x: x[1], reverse=True):
    print(f"{name:<25} {acc:.4f} ({acc*100:.2f}%)")
print("=" * 50)

# Save the summary. The columns are named explicitly so the frame always has
# an "accuracy" column; building it from an empty list of records would make
# sort_values raise KeyError when no model has been evaluated yet.
summary_df = pd.DataFrame(
    list(all_results.items()), columns=["model", "accuracy"]
).sort_values("accuracy", ascending=False)
summary_df.to_csv("sst2_instruction_learning_summary.csv", index=False)
print("\n汇总已保存到 sst2_instruction_learning_summary.csv")