安装opencompass：Kaggle上已经为我们准备好了其他常用包，只需安装opencompass用于评测即可。如果不在Kaggle上运行，则还需要安装其他必要包。

In [1]:


import argparse
from dataclasses import dataclass, field
from typing import Optional, List, Dict
import sys
import torch
from transformers import TrainingArguments, HfArgumentParser, Trainer, AutoTokenizer, AutoModelForCausalLM
import datasets
import os
# set visible gpu
os.environ["CUDA_VISIBLE_DEVICES"] = "4"


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import textwrap

# 在需要打印的地方
def pretty_print(text, width=80):
    print("\n".join(textwrap.wrap(text, width=width)))


# 指令微调

In [3]:
# Define the arguments required for the main program.
# NOTE: You can customize any arguments you need to pass in.
@dataclass
class ModelArguments:
    """Arguments for model
    """
    model_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "The path to the LLM to fine-tune or its name on the Hugging Face Hub."
        }
    )
    torch_dtype: Optional[str] = field(
        default=None,
        metadata={
            "help": (
                "Override the default `torch.dtype` and load the model under this dtype."
            ),
            "choices": ["bfloat16", "float16", "float32"],
        },
    )
    # TODO: add your model arguments here
    pass


@dataclass
class DataArguments:
    """Arguments for data
    """
    dataset_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "The path to the fine-tuning dataset or its name on the Hugging Face Hub."
        }
    )
    # TODO: add your data arguments here
    max_length: int = field(
        default=512,
        metadata={
            "help": "The max length of tokenized data."
        }
    )
    skip_too_long: Optional[bool] = field(
        default = False,
        metadata = {
            "help" : "whether to skip those longer than max length of tokenized data"
        }
    )

In [4]:
def loading(plm_model_path,sft_model_path,sft_model_path_2):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    parser = HfArgumentParser(dataclass_types=[ModelArguments, DataArguments, TrainingArguments])
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    
    dataset = datasets.load_dataset(path='csv', data_files=data_args.dataset_path)
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_args.model_name_or_path)
    
    model_plm = AutoModelForCausalLM.from_pretrained(
            pretrained_model_name_or_path=plm_model_path, 
            torch_dtype=model_args.torch_dtype,
            trust_remote_code=True,  # Qwen模型需要这个参数
            device_map="auto",  # 可选，用于自动处理模型加载到设备
            use_cache=False
        )
    model_sft = AutoModelForCausalLM.from_pretrained(
            pretrained_model_name_or_path=sft_model_path, 
            torch_dtype=model_args.torch_dtype,
            trust_remote_code=True,  # Qwen模型需要这个参数
            device_map="auto",  # 可选，用于自动处理模型加载到设备
            use_cache=False
        )
    model_sft_2 = AutoModelForCausalLM.from_pretrained(
            pretrained_model_name_or_path=sft_model_path_2, 
            torch_dtype=model_args.torch_dtype,
            trust_remote_code=True,  # Qwen模型需要这个参数
            device_map="auto",  # 可选，用于自动处理模型加载到设备
            use_cache=False
        )       
    return dataset,model_plm.to(device),model_sft.to(device),model_sft_2.to(device),tokenizer
    
def test(dataset,model_plm,model_sft,model_sft_2,tokenizer,your_input=2):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if type(your_input)==int:
        sample = dataset['train'][your_input]
        output_text = sample["output"]
        
        text = "instruction: " + sample["instruction"] if sample["instruction"] else ""
        text += "\n input: " + sample["input"] if sample["input"] else ""
        
        print("Text: ", text)
        print("GT: ", output_text)
        print("===")
        # 不进行 padding，只进行截断
        inputs = tokenizer(text, return_tensors="pt")
    elif type(your_input)==str:
        inputs=tokenizer(your_input, return_tensors="pt")

    
    attention_mask = inputs["attention_mask"]
    pad_token_id = tokenizer.pad_token_id
    
    # # 查看生成的输入 IDs
    # print("Input IDs:", inputs['input_ids'])
    
    # # 查看生成的 attention_mask，不会被padding
    # print("Attention Mask:", inputs['attention_mask'])
    
    # # 查看 pad_token_id
    # print("Pad Token ID:", tokenizer.pad_token_id)
    inputs['input_ids'] = inputs['input_ids'].to(device)
    attention_mask = attention_mask.to(device)
    max_tokens = 256
    generate_ids = model_plm.generate(inputs['input_ids'], attention_mask=attention_mask, pad_token_id=pad_token_id, max_new_tokens=max_tokens)
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs['input_ids'], generate_ids)]
    generated_text_plm = tokenizer.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    
    pretty_print(f"plm:{generated_text_plm}\n")
    print(".....")
    generate_ids = model_sft.generate(inputs['input_ids'], attention_mask=attention_mask, pad_token_id=pad_token_id, max_new_tokens=max_tokens)
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs['input_ids'], generate_ids)]
    generated_text_sft = tokenizer.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    
    pretty_print(f"sft:{generated_text_sft}\n")
    print(".....")
    generate_ids = model_sft_2.generate(inputs['input_ids'], attention_mask=attention_mask, pad_token_id=pad_token_id, max_new_tokens=max_tokens)
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs['input_ids'], generate_ids)]
    generated_text_sft_2 = tokenizer.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    pretty_print(f"sft_2:{generated_text_sft_2}\n")
    print("==========")
    return generated_text_plm,generated_text_sft,generated_text_sft_2

In [5]:
# Pass your training arguments.
# NOTE [IMPORTANT!!!] DO NOT FORGET TO PASS PROPER ARGUMENTS TO SAVE YOUR CHECKPOINTS!!!
sys.argv = [
    "notebook", 
    # "--arg1", "value1",
    # "--arg2", "value2",
    # ...
    ### cjy
    "--model_name_or_path", "/ssd/xiaxinyuan/code/CS3602_NLP_Final_Project/output/1430/checkpoint-50000",
    "--dataset_path", "/home/xiaxinyuan/.cache/kagglehub/datasets/thedevastator/alpaca-language-instruction-training/versions/2/train.csv",
    "--torch_dtype", "bfloat16", #see Qwen2.5-0.5B/config.json?
    "--output_dir", "output/1227/", # --output_dir 参数在 TrainingArguments 中有
    "--remove_unused_columns", "False", #ValueError: No columns in the dataset match the model's forward method signature. The following columns have been ignored: [output, instruction, input]. Please check the dataset and model. You may need to set `remove_unused_columns=False` in `TrainingArguments`
    "--max_length", "512",
    ### xxy
    "--skip_too_long","True",
    "--learning_rate","1e-5",
    "--lr_scheduler_type","cosine",
    "--optim", "adamw_hf",
    "--warmup_ratio", "0.03",
    "--weight_decay", "0.003",
    ### xxy
    ### cjy
    "--per_device_train_batch_size", "4",  # 设置训练的 batch size
    "--per_device_eval_batch_size", "4", 
    "--save_steps", "1000",
    "--save_total_limit", "3",
    "--num_train_epochs", "3",  # 通常3-5个epoch即可收敛，长时间训练可能会过拟合 
    "--bf16","True",  # 开启混合精度加速
]
dataset,model_plm,model_sft,model_sft_2,tokenizer=loading(plm_model_path="/home/xiaxinyuan/.cache/kagglehub/models/qwen-lm/qwen2.5/transformers/0.5b/1/", # 修改为你的模型路径
                                    sft_model_path="/ssd/xiaxinyuan/code/CS3602_NLP_Final_Project/output/1430/checkpoint-50000",
                                    sft_model_path_2="/ssd/xiaxinyuan/code/CS3602_NLP_Final_Project/output/0.5B/checkpoint-50000") # 修改为你的模型路径

In [49]:
instruction_set = [
    "college econometrics\n \
    Which of the following statements concerning the regression population and sample is FALSE? \
    \nA. The population is the total collection of all items of interest \
    \nB. The population can be infinite \
    \nC. In theory, the sample could be larger than the population \
    \nD. A random sample is one where each individual item from the population is equally likely to be drawn. \
    \nC. In theory, the sample could be larger than the population",
]



In [54]:
# instruction_set = [
#     "Which of the following statements concerning the regression population and sample is FALSE?\n A. The population is the total collection of all items of interest\n B. The population can be infinite\n C. In theory, the sample could be larger than the population\n D. A random sample is one where each individual item from the population is equally likely to be drawn."
# ]
import pandas as pd
def get_template(instruction,choice,field,ans,mode = 'infer'):
    template_set = [
        # f"HUMAN: There is a single choice question about {field}. \nQ:{instruction}: {choice} ",
        # f"HUMAN: There is a single choice question about {field}. \nQ:{instruction}: {choice}\nBOT: ",
        # f"HUMAN: There is a single choice question about college physics. Q:{instruction}: {choice}\nBOT: ",
        # f"HUMAN: There is a single choice question about college physics. \nQ:{instruction}: {choice}\nBOT: ",
        # f"HUMAN: There is a single choice question about college mathematics. Q:{instruction}: {choice}\nBOT: ",
        # f"HUMAN: There is a single choice question about {field}.\nQ:{instruction}: {choice}\nLet's think step by step. A:",
        # f"HUMAN: There is a single choice question about {field}.\nQ:{instruction}: {choice}\nLet's think step by step. BOT: A:",
        # f"HUMAN: There is a single choice question about {field}.\nQ: {instruction}:\n{choice}\nLet's think step by step. BOT:",
        # f"HUMAN: There is a single choice question about {field}.\nQ: {instruction}:\n{choice}\nA:",
        f"HUMAN: There is a single choice question about {field}.\nQuestion: {instruction}. \n{choice}\n Why the answer is {ans}? \n Think step by step. BOT: ",
    ]
    return template_set
for instruction in instruction_set: 
    ins = instruction.split("\n")
    area = ins[0]
    instruction = ins[1]
    choice = ins[2:6]
    ans=ins[6]
    # print(area)
    formatted_text = '\n'.join(choice.strip() for choice in choice)
    final_instructions = get_template(instruction,formatted_text,area,ans)
    for final_instruction in final_instructions:
        # pretty_print(final_instruction)
        print(final_instruction)
        plm_ans, sft1_ans, sft2_ans = test(dataset=dataset,
            model_plm=model_plm,
            model_sft=model_sft,
            model_sft_2=model_sft_2,
            tokenizer=tokenizer,
            your_input=final_instruction) # 如果your_input是数字，则是被理解dataset中的index，即问alpaca中的第your_input个问题；如果是字符串，则是输入的文本
        # save template set and answer in a row in excel
        # df = pd.DataFrame({'template': [final_instruction], 'plm_ans': [plm_ans], 'sft1_ans': [sft1_ans], 'sft2_ans': [sft2_ans]})
        # df.to_excel('case_study.xlsx', index=False)


HUMAN: There is a single choice question about college econometrics.
Question:      Which of the following statements concerning the regression population and sample is FALSE?     . 
A. The population is the total collection of all items of interest
B. The population can be infinite
C. In theory, the sample could be larger than the population
D. A random sample is one where each individual item from the population is equally likely to be drawn.
 Why the answer is C. In theory, the sample could be larger than the population? 
 Think step by step. BOT: 
plm:1. The population is the total collection of all items of interest. 2. In
theory, the sample could be larger than the population. 3. A random sample is
one where each individual item from the population is equally likely to be
drawn. 4. The population can be infinite. 5. The population is the total
collection of all items of interest. 6. The population can be infinite. 7. In
theory, the sample could be larger than the population. 8. A

In [8]:
for instruction in instruction_set: 
    ins = instruction.split("\n")
    choices = ins[1:]
    formatted_text = ' '.join(choice.strip() for choice in choices)
    print(formatted_text)

The primary source of the Sun’s energy is a series of thermonuclear reactions in which the energy produced is c^2 times the mass difference between A. two hydrogen atoms and one helium atom B. four hydrogen atoms and one helium atom C. six hydrogen atoms and two helium atoms D. three helium atoms and one carbon atom 


In [9]:

instruction = "Write a humerous joke"
input_text = None
print(f"instruction: {instruction}\ninput: {input_text}")

instruction: Write a humerous joke
input: None


In [10]:
model_sft

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((

In [11]:
model_sft_2


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2SdpaAttention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
          (rotary_emb): Qwen2RotaryEmbedding()
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((

In [12]:
from opencompass.models import HuggingFaceCausalLM
from opencompass.datasets import (
    MMluPPL, 
    HellaswagCleanPPL, 
    WinograndePPL, 
    ARCePPL, 
    ARCcCleanPPL, 
    SuperGLUEBoolQPPL
)
from opencompass.runners import LocalRunner
from opencompass.summarizers import ExampleSummarizer

from opencompass.models import HuggingFaceCausalLM
from opencompass.datasets.mmlu import MMLU  # 修正导入路径
from opencompass.datasets.hellaswag import HellaSwag
from opencompass.datasets.winogrande import Winogrande
from opencompass.datasets.arc import ARC_e, ARC_c
from opencompass.runners import LocalRunner
from opencompass.summarizers import ExampleSummarizer

def train():
    # 模型配置
    model_cfg = dict(
        type=HuggingFaceCausalLM,
        path="output/0.5B/checkpoint-50000",
        tokenizer_kwargs={
            'padding_side': 'left',
            'truncation': 'left'
        },
        max_seq_len=2048,
        batch_size=4,
        model_kwargs={'device_map': 'auto'},
    )

    # 数据集配置
    dataset_cfgs = [
        dict(
            type=MMLU,
            path='mmlu',
            name='mmlu',
            ppl_eval=True  # 使用困惑度评估
        ),
        dict(
            type=HellaSwag,
            path='hellaswag',
            name='hellaswag',
            ppl_eval=True
        ),
        dict(
            type=Winogrande,
            path='winogrande',
            name='winogrande',
            ppl_eval=True
        ),
        dict(
            type=ARC_e,
            path='arc_e',
            name='arc_e',
            ppl_eval=True
        ),
        dict(
            type=ARC_c,
            path='arc_c',
            name='arc_c',
            ppl_eval=True
        ),
        dict(
            type=BoolQ,
            path='boolq',
            name='boolq',
            ppl_eval=True,
            few_shot=True
        ),
    ]

    # 评测配置
    eval_cfg = dict(
        work_dir="/ssd/xiaxinyuan/code/CS3602_NLP_Final_Project/evals/plm",
        summarizer=dict(type=ExampleSummarizer),
        debug=True
    )

    # 创建评测实例
    from opencompass import Evaluator
    evaluator = Evaluator(
        model_cfg=model_cfg,
        dataset_cfgs=dataset_cfgs,
        eval_cfg=eval_cfg
    )

    # 运行评测
    results = evaluator.run()

train()

  _warn_about_config_migration()


ImportError: cannot import name 'MMluPPL' from 'opencompass.datasets' (/home/xiaxinyuan/.local/lib/python3.10/site-packages/opencompass/datasets/__init__.py)