一、导入相关包

In [1]:
from vllm import LLM, SamplingParams
from datasets import Dataset
from typing import Dict
import warnings
import random
import pandas as pd
import torch
from tqdm import tqdm

pd.set_option('future.no_silent_downcasting', True)

二、加载数据集

In [2]:
# 读取文件
train_path = '../data/train.ft.txt'
test_path = '../data/test.ft.txt'

def read_review(path: str) -> pd.DataFrame:
    # 定义一个空列表来存储标签和评论
    reviews_list = []
    # 打开并读取文件内容
    with open(path, 'r') as file:
        lines = file.readlines()
        for line in tqdm(lines):
            # 分割标签和评论
            label, review = line.strip().split(' ', 1)
            reviews_list.append((label, review))
    # 将列表转换为DataFrame
    df = pd.DataFrame(reviews_list, columns=['Label', 'Review'])
    # 更换label的类型
    df['Label'] = df['Label'].replace({'__label__1': '1', '__label__2': '2'})
    return df

train_df = read_review(train_path)
test_df = read_review(test_path)
train_df

100%|██████████| 350443/350443 [00:00<00:00, 829800.88it/s]
100%|██████████| 400000/400000 [00:00<00:00, 1041851.50it/s]


Unnamed: 0,Label,Review
0,2,Stuning even for the non-gamer: This sound tra...
1,2,The best soundtrack ever to anything.: I'm rea...
2,2,Amazing!: This soundtrack is my favorite music...
3,2,Excellent Soundtrack: I truly like this soundt...
4,2,"Remember, Pull Your Jaw Off The Floor After He..."
...,...,...
350438,1,Oracle SQL needs Help: This book waltzes throu...
350439,1,Unusual writing style: This book chronicles th...
350440,1,"Junk.: They didn't give ""no stars"" as an optio..."
350441,1,WORTHLESS!!!: This book only presents a sequen...


In [3]:
# 文件转换
train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

nums = 10000
mini_test_ds = Dataset.from_pandas(test_df.iloc[:nums, :])

train_ds

Dataset({
    features: ['Label', 'Review'],
    num_rows: 350443
})

三、数据集预处理（加入prompt）

In [4]:
def get_prompt(origin: Dict[str, str], train_ds: Dataset, max_length: int=1024, shot_num: int=3):
    # 随机筛选上下文示例
    shot_list = random.choices(train_ds, k=shot_num)
    # 根据上下文示例生成输入模板
    input_content = "Please classify the following text into Label 1 or Label 2. " + \
                "And 2 represents positive emotions and 1 represents negative emotions. \n"
    for shot in shot_list:
        input_content += "\n    Text: "
        input_content += shot['Review']
        input_content += "\n    Label: "
        input_content += shot['Label']
        input_content += "\n"
    input_content += "\n    Text: "
    input_content += origin['Review']
    input_content += "\n    Label: "
    if len(input_content) > max_length:
        warnings.warn('Got too long prompt!')
    return {
        "prompt": input_content,
        "label": origin['Label'],
    }

In [5]:
# 固定随机数种子
random.seed(42)

test_prompt = mini_test_ds.map(get_prompt, remove_columns=mini_test_ds.column_names, 
                               fn_kwargs={'train_ds': train_ds})

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]



In [6]:
# 生成数据集
prompts = test_prompt[:]['prompt']

四、加载模型，利用vllm加速推理

In [7]:
path = '../model/LLM-Research/Meta-Llama-3-8B'

# sampling_params = SamplingParams(temperature=0.5, top_p=0.95)
sampling_params = SamplingParams(temperature=0, max_tokens=128) # 贪婪采样
llm = LLM(model=path, tokenizer=path)

INFO 06-05 18:27:29 llm_engine.py:75] Initializing an LLM engine (v0.4.0) with config: model='../model/LLM-Research/Meta-Llama-3-8B', tokenizer='../model/LLM-Research/Meta-Llama-3-8B', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 06-05 18:27:29 selector.py:16] Using FlashAttention backend.
INFO 06-05 18:27:36 model_runner.py:104] Loading model weights took 14.9595 GB
INFO 06-05 18:27:37 gpu_executor.py:94] # GPU blocks: 2316, # CPU blocks: 2048
INFO 06-05 18:27:39 model_runner.py:791] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 06-05 18:27:39 model_runner.py:795] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 06-05 18:27:43 model_runner.py:867] Graph capturing finished in 4 secs.


In [8]:
outputs = llm.generate(prompts, sampling_params)

Processed prompts: 100%|██████████| 10000/10000 [19:27<00:00,  8.57it/s]


五、结果评估

In [9]:
# 统计准确率
acc_num = 0
bad_num = 0
bad_output = []
perfect_num = 0

for output, actual in zip(outputs, test_prompt[:nums]['label']):
    # prompt = output.prompt
    generated_text = output.outputs[0].text
    if generated_text[0] == actual:
        acc_num += 1
    if generated_text[0] not in ['1', '2']:
        bad_num += 1
        bad_output.append(output)
    if generated_text in ['1', '2']:
        perfect_num += 1

print(f"准确率: {acc_num / nums:.2%}")
print(f"非正确格式的输出数: {bad_num}")
print(f"完全正确格式的输出数: {perfect_num}")

准确率: 94.11%
非正确格式的输出数: 0
完全正确格式的输出数: 0
