In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Location of the pretrained weights; the tokenizer and the classifier are
# both loaded from the same place.
model_name = r'model/RTA'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The classification head is configured for eight output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=8,
)

In [None]:
# Display the model architecture (the cell's last expression is rendered as output)
model

In [None]:
# Count the parameters: only tensors with requires_grad=True are included,
# so frozen weights (if any) do not contribute to the total.
num_params = sum(
    tensor.numel()
    for tensor in filter(lambda t: t.requires_grad, model.parameters())
)
print(f"Model has {num_params} trainable parameters.")

In [None]:
from datasets import load_dataset

# Load the dataset: two CSV files exposed as the 'train' and 'test' splits.
dataset = load_dataset(
    'csv',
    data_files={
        'train': 'data/train_dataset_bert.csv',
        'test': 'data/test_dataset_bert.csv',
    },
)

In [None]:
def preprocess_data(examples):
    """Tokenize the ``'text'`` field of a batch of examples.

    Sequences are truncated to at most 512 tokens and padded to exactly
    512 tokens (``padding='max_length'``).

    Args:
        examples: Mapping with a ``'text'`` entry; this function is used as
            the callback of ``Dataset.map(..., batched=True)``.

    Returns:
        The result of calling the module-level ``tokenizer`` on that text.
    """
    texts = examples['text']
    encoded = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=512,
    )
    return encoded

# Tokenize both splits in batched mode; 'train' is processed first, then 'test'.
encoded_train_dataset, encoded_test_dataset = (
    dataset[split].map(preprocess_data, batched=True)
    for split in ('train', 'test')
)

In [None]:
# Sanity check: show the first test example after tokenization
encoded_test_dataset[0]

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

# Training configuration
training_args = TrainingArguments(
    # Output
    output_dir='output/RTA-v3',          # directory where results are saved
    report_to="none",
    # Optimisation
    num_train_epochs=20,                 # number of training epochs
    per_device_train_batch_size=32,      # batch size used for training
    per_device_eval_batch_size=32,       # batch size used for evaluation
    learning_rate=1e-5,                  # learning rate
    weight_decay=0.1,                    # weight decay
    lr_scheduler_type="linear",
    warmup_steps=500,                    # warm-up steps
    # Step-based logging / evaluation / checkpointing cadence
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=200,
)

# Build the Trainer. The test split is used as the evaluation set during
# training, and batches are assembled by DataCollatorWithPadding.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    train_dataset=encoded_train_dataset,   # training split
    eval_dataset=encoded_test_dataset,     # test split
)

In [None]:
# Start training
trainer.train()

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

# Load the fine-tuned classifier from a saved checkpoint.
# NOTE(review): this path points into output/RTA-20241129, whereas the
# training configuration in this notebook writes to output/RTA-v3 — confirm
# that this is the checkpoint you intend to evaluate.
checkpoint_path = "./output/RTA-20241129/checkpoint-1000"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Evaluation-only configuration.
training_args = TrainingArguments(
    output_dir='./results',
    per_device_eval_batch_size=32,
    do_train=False,  # training disabled
    do_eval=True,    # evaluation enabled
)

trainer = Trainer(
    model=model,
    args=training_args,
)

# Run prediction on the test split.
predictions, labels, _ = trainer.predict(encoded_test_dataset)
if labels is None:
    # Without ground-truth labels none of the metrics below can be computed;
    # fail here with a clear message instead of deep inside scikit-learn.
    raise ValueError(
        "trainer.predict() returned no label_ids; the test dataset must "
        "contain a label column to compute the classification report."
    )

# Predicted class = index of the largest logit for each example.
predicted_classes = np.argmax(predictions, axis=1)


# Human-readable class names, listed in label-id order (index i <-> label id i).
class_names = ['code-simplification-optimization-defects',
    'control-flow-optimization-defects',
    'data-flow-analysis-optimization-defects',
    'infrastructure-defects',
    'interprocedural-optimization-defects',
    'memory-optimization-defects',
    'numerical-analysis-optimization-defects',
    'vectorization-defects']

# Pin the full label set explicitly. Without `labels=`, scikit-learn derives
# the classes from the data, so a class missing from both the test labels and
# the predictions makes classification_report raise (length mismatch with
# target_names) and makes the confusion matrix smaller than 8x8.
class_ids = list(range(len(class_names)))

# Build the classification report (replace class_names with your own labels).
report = classification_report(
    labels,
    predicted_classes,
    labels=class_ids,
    target_names=class_names,
    digits=4,
)
print("分类报告：")
print(report)

# Build the confusion matrix; rows/columns follow class_ids and therefore
# line up with class_names.
conf_matrix = confusion_matrix(labels, predicted_classes, labels=class_ids)

In [None]:
# Show the test record at index 10 from the loaded (pre-tokenization) dataset
dataset['test'][10]