In [None]:
# 使用训练好的Llama3.1模型生成元特征

In [None]:
# 加载原始模型

from unsloth import FastLanguageModel
import torch
# Maximum sequence length, in tokens, requested from the model loader.
max_seq_length = 5000

# Load the base Llama 3.1 8B Instruct model and its tokenizer from the local
# model directory with 4-bit loading enabled. dtype=None presumably leaves the
# dtype choice to the loader -- confirm against the Unsloth documentation.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="model/LLM-Research/Meta-Llama-3___1-8B-Instruct",
    max_seq_length=max_seq_length,
    dtype=None,
    load_in_4bit=True,
    device_map="auto",
)

In [None]:
#加载lora适配器
from peft import PeftModel
# Directory holding the fine-tuned LoRA adapter weights.
lora_path = 'save/Llama3.1-ensemble-v2'

# `model` is rebound to the wrapped model, so re-running this cell would wrap an
# already wrapped model a second time. Only attach the adapter when `model` is
# still the plain base model, which makes the cell safe to execute repeatedly.
if not isinstance(model, PeftModel):
    model = PeftModel.from_pretrained(model, lora_path)

In [None]:
# Switch the adapter-wrapped model into Unsloth's inference mode before generating.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

In [None]:
#加载测试集
from datasets import load_dataset
# Read the fold-5 CSV as a single 'train' split and drop the "text" and "label"
# columns right away; the cells below only read "code", "report" and "category".
test_dataset = load_dataset(
    'csv',
    data_files='data/dataset_5fold_1/dataset_fold_5.csv',
    split='train',
).remove_columns(["text", "label"])
print(test_dataset)

In [None]:
MAX_LENGTH = 4500  # Prompt token budget (Llama 3.1 itself supports a 128k context)

# Instruction header shared by every prompt.
# NOTE(review): the text says "fourteen categories" although eight are listed. It is
# kept byte-for-byte because the LoRA adapter was presumably fine-tuned on this exact
# wording -- confirm against the training code before changing it.
_PROMPT_HEADER = """You are a developer of the GCC compiler. Your job is to categorize bug reports. You are given a snippet of code that triggers the bug and a description of the bug.
The bug reports are categorized as follows:'code-simplification-optimization-defects','control-flow-optimization-defects','data-flow-analysis-optimization-defects','infrastructure-defects','interprocedural-optimization-defects','memory-optimization-defects','numerical-analysis-optimization-defects','vectorization-defects'.
You must select one of the above fourteen categories to output as the category of bug report.
"""


def _render_prompt(code, report):
    """Return the classification prompt with ``code`` and ``report`` filled in."""
    return (
        f"{_PROMPT_HEADER}"
        f"### Code Snippet:\n{code}\n"
        f"### Bug Description:\n{report}\n"
        "### Response:\n"
    )


def truncate(data_point, max_length=None, tok=None):
    """Build the classification prompt for one sample, shortened to a token budget.

    The prompt is returned unchanged when it already fits. Otherwise the code
    snippet and/or the bug description are cut (from the end) so that their
    combined token count fits in what is left after the fixed template text.

    Args:
        data_point: Mapping with "code" and "report" entries.
        max_length: Token budget for the whole prompt. Defaults to ``MAX_LENGTH``.
        tok: Tokenizer used for measuring and decoding. Defaults to the
            notebook-level ``tokenizer``. It must be callable on a string
            (accepting ``add_special_tokens`` / ``truncation`` keywords), return
            a mapping with an ``'input_ids'`` list, and provide ``decode``.

    Returns:
        The prompt string. Because truncated fragments are decoded back to text,
        re-tokenizing the result may differ from the budget by a few tokens.
    """
    if max_length is None:
        max_length = MAX_LENGTH
    if tok is None:
        tok = tokenizer

    # str() mirrors what the f-string template renders (e.g. a missing value
    # becomes "None") and keeps the tokenizer calls below from failing on non-strings.
    code = str(data_point["code"])
    report = str(data_point["report"])

    full_prompt = _render_prompt(code, report)
    if len(tok(full_prompt)['input_ids']) <= max_length:
        return full_prompt

    # Tokenize the fragments WITHOUT special tokens: otherwise a begin-of-text
    # marker is counted per fragment and, after decode(), ends up as literal
    # text in the middle of the prompt.
    code_ids = tok(code, add_special_tokens=False, truncation=False)['input_ids']
    report_ids = tok(report, add_special_tokens=False, truncation=False)['input_ids']
    code_len, report_len = len(code_ids), len(report_ids)

    # Tokens consumed by the template itself, including the newline that follows
    # each (empty) field and any special tokens the tokenizer adds to a prompt.
    overhead = len(tok(_render_prompt("", ""))['input_ids'])
    budget = max(0, max_length - overhead)

    if code_len + report_len > budget:
        if min(code_len, report_len) > budget:
            # Even the shorter field does not fit on its own. The previous
            # "budget - other_len" slice bound became negative here and removed
            # almost nothing, so split the budget evenly instead.
            keep_code = budget // 2
            keep_report = budget - keep_code
        elif code_len > report_len:
            # Keep the (shorter) report whole and cut the code snippet.
            keep_code, keep_report = budget - report_len, report_len
        else:
            # Keep the code snippet whole and cut the report (also used on ties).
            keep_code, keep_report = code_len, budget - code_len

        if keep_code < code_len:
            code = tok.decode(code_ids[:keep_code])
        if keep_report < report_len:
            report = tok.decode(report_ids[:keep_report])

    return _render_prompt(code, report)

In [None]:
# 定义函数：提取生成文本中的分类结果

import re
def extract_predicted_label(response_text):
    """Pull the predicted category out of the decoded model output.

    Looks for the first "### Response:" line and captures the lowercase,
    hyphenated token ending in "-defects" that follows it (leading whitespace
    is skipped).

    Args:
        response_text: Decoded text containing the prompt and the generated answer.

    Returns:
        The captured label, or "unknown" when no such label follows the marker.
    """
    found = re.search(r'### Response:\n\s*([a-z-]+-defects)', response_text)
    return found.group(1) if found else "unknown"

In [None]:
# 评估三次，取最好的一次结果
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
import torch

# Evaluation setup. best_accuracy starts below any achievable score so the first
# round is always recorded, even if every round scores exactly 0.0 (with 0.0 as the
# start value and a strict ">" comparison, best_predictions would stay empty).
best_accuracy = -1.0
best_predictions = []
num_evaluations = 3

# NOTE(review): the kept round is selected by accuracy on this same test fold, so
# the reported "best" numbers are optimistic -- confirm this is acceptable for the
# downstream stacking step. Rounds can only differ if generation is stochastic;
# no seed or sampling settings are fixed here.
model.eval()  # loop-invariant, so set once before the rounds

for evaluation_round in range(num_evaluations):
    print(f"Starting evaluation {evaluation_round + 1}/{num_evaluations}")
    true_labels = []
    predicted_labels = []

    with torch.no_grad():
        for sample in tqdm(test_dataset, desc=f"Evaluating Round {evaluation_round + 1}"):
            # Release cached GPU memory between samples to limit memory growth.
            torch.cuda.empty_cache()

            # Build the (possibly shortened) prompt and fetch the reference label.
            prompt = truncate(sample)
            true_label = sample["category"]
            # Cap the input at the sequence length the model was loaded with
            # instead of repeating the literal 5000.
            model_input = tokenizer(
                prompt, return_tensors="pt", truncation=True, max_length=max_seq_length
            ).to("cuda")

            # Generate, decode prompt + continuation, and parse the label that
            # follows the "### Response:" marker.
            model_output = tokenizer.decode(
                model.generate(**model_input, max_new_tokens=30)[0],
                skip_special_tokens=True,
            )
            response = extract_predicted_label(model_output)

            true_labels.append(true_label)
            predicted_labels.append(response)

    # Metrics for this round.
    accuracy = accuracy_score(true_labels, predicted_labels)
    macro_f1 = f1_score(true_labels, predicted_labels, average='macro')
    micro_f1 = f1_score(true_labels, predicted_labels, average='micro')
    weighted_f1 = f1_score(true_labels, predicted_labels, average='weighted')

    print(f"Round {evaluation_round + 1} - Accuracy: {accuracy:.4f}")
    print(f"Round {evaluation_round + 1} - Macro F1 Score: {macro_f1:.4f}")
    print(f"Round {evaluation_round + 1} - Micro F1 Score: {micro_f1:.4f}")
    print(f"Round {evaluation_round + 1} - Weighted F1 Score: {weighted_f1:.4f}")

    # Remember the predictions of the most accurate round seen so far.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_predictions = predicted_labels
        print(f"New best accuracy found: {best_accuracy:.4f}")

# Summary after all rounds.
print("Evaluation completed.")
print(f"Best Accuracy: {best_accuracy:.4f}")

In [None]:
# Collect the distinct labels present in the best round's predictions and display
# them, which makes any value outside the expected category names easy to spot.
unique_labels = set(best_predictions)
unique_labels

In [None]:
# Count how many predictions are 'unknown', the value extract_predicted_label
# returns when no category could be parsed from the model output.
others = 'unknown'
count = best_predictions.count(others)
count

In [None]:
# Locate the first 'unknown' prediction and print it next to the reference label.
# NOTE(review): list.index raises ValueError when there is no 'unknown' entry, so
# this cell only runs when at least one prediction failed to parse.
index = best_predictions.index(others)
print(best_predictions[index])
print(true_labels[index])

In [None]:
# Replace every 'unknown' prediction with a fallback that does not look at the
# reference labels. Copying the true label into the prediction would leak the
# target into the saved meta-feature, and patching a single index leaves any
# further 'unknown' entries in place.
from collections import Counter

known_predictions = [label for label in best_predictions if label != others]
if known_predictions:
    # Fallback: the label the model predicted most often in this round.
    fallback_label = Counter(known_predictions).most_common(1)[0][0]
    best_predictions = [
        fallback_label if label == others else label for label in best_predictions
    ]

# Number of 'unknown' entries still present (0 unless every prediction was 'unknown').
best_predictions.count(others)

In [None]:
import pandas as pd
from pathlib import Path

# Mapping from category name to the integer id stored in the meta-feature file.
label_mapping = {
    'code-simplification-optimization-defects': 0,
    'control-flow-optimization-defects': 1,
    'data-flow-analysis-optimization-defects': 2,
    'infrastructure-defects': 3,
    'interprocedural-optimization-defects': 4,
    'memory-optimization-defects': 5,
    'numerical-analysis-optimization-defects': 6,
    'vectorization-defects': 7
}

# Fail before writing anything, and name every offending value, if a prediction
# has no integer id (for example 'unknown' or a label the model invented).
unmapped_labels = sorted(set(best_predictions) - set(label_mapping))
if unmapped_labels:
    raise KeyError(f"Predictions contain labels missing from label_mapping: {unmapped_labels}")

# Convert the label strings to their integer ids.
predicted_labels_int = [label_mapping[label] for label in best_predictions]

# Save as a one-column meta-feature CSV, creating the output directory first so a
# missing folder does not discard the evaluation results.
output_path = Path("data/ensemble/stacking/llama_predictions_v2-3.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
test_predictions_df = pd.DataFrame({"llama_prediction": predicted_labels_int})
test_predictions_df.to_csv(output_path, index=False)
print("\n预测结果已保存")