In [3]:
# 加载 SST-2 数据集
from datasets import load_dataset

dataset = load_dataset("glue", "sst2")
# 查看训练集
print(dataset)

README.md: 0.00B [00:00, ?B/s]

sst2/train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

sst2/validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

sst2/test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})


In [4]:
print(dataset["train"][:1])
print(dataset["test"][:1])
print(dataset["validation"][:1])

{'sentence': ['hide new secretions from the parental units '], 'label': [0], 'idx': [0]}
{'sentence': ['uneasy mishmash of styles and genres .'], 'label': [-1], 'idx': [0]}
{'sentence': ["it 's a charming and often affecting journey . "], 'label': [1], 'idx': [0]}


In [5]:
print(dataset["test"]["label"][:10]) #test中的标签全部是-1 不是真实标签

[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1]


In [6]:
import os
import sys
import logging

import numpy as np
import pandas as pd

import datasets
import evaluate
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification
from transformers import Trainer,TrainingArguments

from datasets import load_dataset
from sklearn.metrics import accuracy_score

dataset = load_dataset("glue", "sst2")

train_dataset = dataset["train"]
test_dataset = dataset["test"]
val_dataset = dataset["validation"]

if __name__ == "__main__":
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info(r"running %s" % ''.join(sys.argv))

    #数据处理

    model_id = "microsoft/deberta-base"
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    def preprocess_function(examples):
        return tokenizer(examples["sentence"], truncation=True)

    
    tokenized_train = train_dataset.map(preprocess_function, batched=True)
    tokenized_val = val_dataset.map(preprocess_function, batched=True)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    logger.info("Train size: %d, Validation size: %d" % (len(tokenized_train), len(tokenized_val)))


    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return {"accuracy": accuracy_score(labels, predictions)}

2025-12-31 06:28:56.411626: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767162536.576527      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767162536.627184      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767162537.030799      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767162537.030849      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767162537.030855      55 computation_placer.cc:177] computation placer alr

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/872 [00:00<?, ? examples/s]

INFO:colab_kernel_launcher.py:Train size: 67349, Validation size: 872


In [7]:
sample_sizes = [16, 64, 256, 1024, None] #None在下面判断 表示全部
results = {}

for i in sample_sizes:
    sample_name = "%d样本" % i if i else "全部样本"

    if i:
        train_subset = tokenized_train.shuffle(seed=42).select(range(i))
        logger.info("使用%d个样本进行训练" % i)
    else:
        train_subset = tokenized_train
        logger.info("使用全部样本进行训练")


    model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)

    num_epochs = 3 # 3个epoch
    train_batch_size = 32 # 统一batch size

    training_args = TrainingArguments(
        output_dir='./results/deberta_%s' % sample_name,
        num_train_epochs=num_epochs,
        per_device_train_batch_size=train_batch_size,
        per_device_eval_batch_size=4,
        warmup_ratio=0.1, # 学习率预热比例,训练开始时学习率从 0 逐渐升到设定值,warmup_ratio=0.1 表示前 10% 的训练步数用于预热
        weight_decay=0.01, # 权重衰减（L2 正则化），防止模型过拟合。new_weight = old_weight - lr * gradient - lr * weight_decay * old_weight
        learning_rate=2e-5,
        logging_dir='./logs/deberta_%s' % sample_name,
        logging_steps=1,
        save_strategy="no",
        eval_strategy="epoch",

        # 防止 Kaggle 多进程死锁
        dataloader_num_workers=0,
        dataloader_pin_memory=False,
        report_to="none",
    )

    trainer = Trainer(
        model = model,
        args = training_args,
        train_dataset = train_subset,
        eval_dataset = tokenized_val,
        data_collator = data_collator,
        compute_metrics = compute_metrics,
    )

    trainer.train()

    # 在验证集上预测并保存结果
    prediction_outputs = trainer.predict(tokenized_val)
    predictions = np.argmax(prediction_outputs.predictions, axis=-1)
        
    result_output = pd.DataFrame(data={"idx": val_dataset["idx"], "prediction": predictions})
    csv_name = "sst2_deberta_finetune_%s.csv" % sample_name
    result_output.to_csv(csv_name, index=False, quoting=3)
    logger.info("Predictions saved to %s" % csv_name)

    
    eval_results = trainer.evaluate()
    accuracy = eval_results["eval_accuracy"] #提取准确率
    results[sample_name] = accuracy

    
    logger.info("使用%s样本的准确率accuracy: %.4f" % (i if i else '全部', accuracy))

INFO:colab_kernel_launcher.py:使用16个样本进行训练


pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/559M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,0.7221,0.695025,0.490826
2,0.7131,0.694405,0.490826
3,0.7305,0.694141,0.490826


INFO:colab_kernel_launcher.py:Predictions saved to sst2_deberta_finetune_16样本.csv


INFO:colab_kernel_launcher.py:使用16样本的准确率accuracy: 0.4908
INFO:colab_kernel_launcher.py:使用64个样本进行训练
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7602,0.719408,0.490826
2,0.695,0.70416,0.498853
3,0.6441,0.692486,0.551606


INFO:colab_kernel_launcher.py:Predictions saved to sst2_deberta_finetune_64样本.csv


INFO:colab_kernel_launcher.py:使用64样本的准确率accuracy: 0.5516
INFO:colab_kernel_launcher.py:使用256个样本进行训练
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6913,0.687089,0.530963
2,0.8963,0.692244,0.540138
3,0.6234,0.67334,0.599771


INFO:colab_kernel_launcher.py:Predictions saved to sst2_deberta_finetune_256样本.csv


INFO:colab_kernel_launcher.py:使用256样本的准确率accuracy: 0.5998
INFO:colab_kernel_launcher.py:使用1024个样本进行训练
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.595,0.613895,0.771789
2,0.4794,0.27967,0.902523
3,0.2073,0.266065,0.90711


INFO:colab_kernel_launcher.py:Predictions saved to sst2_deberta_finetune_1024样本.csv


INFO:colab_kernel_launcher.py:使用1024样本的准确率accuracy: 0.9071
INFO:colab_kernel_launcher.py:使用全部样本进行训练
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1951,0.165337,0.948394
2,0.0194,0.192209,0.950688
3,0.0108,0.202214,0.955275


INFO:colab_kernel_launcher.py:Predictions saved to sst2_deberta_finetune_全部样本.csv


INFO:colab_kernel_launcher.py:使用全部样本的准确率accuracy: 0.9553
