In [1]:
import json
from datasets import Dataset

def load_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)  # 直接解析整个JSON数组
    return data

def get_datasets(train_path, dev_path, test_path):
    train_data = load_json(train_path)
    dev_data = load_json(dev_path)
    test_data = load_json(test_path)

    train_dataset = Dataset.from_list(train_data)
    dev_dataset = Dataset.from_list(dev_data)
    test_dataset = Dataset.from_list(test_data)

    return train_dataset, dev_dataset, test_dataset


In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

def get_model_and_tokenizer(model_name='hfl/chinese-macbert-base'):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    return tokenizer, model


In [3]:
import os
import torch
from transformers import Trainer, TrainingArguments
from sklearn.metrics import f1_score
import numpy as np
# from data_loader import get_datasets
# from model import get_model_and_tokenizer

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    macro_f1 = f1_score(labels, preds, average='macro')
    return {'macro_f1': macro_f1}

def main():
    # 自动检测并使用GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"当前使用的设备: {device}")

    logs_dir = os.path.abspath('./logs')
    results_dir = os.path.abspath('./results')
    model_dir = os.path.abspath('./best_model')

    os.makedirs(logs_dir, exist_ok=True)
    os.makedirs(results_dir, exist_ok=True)
    os.makedirs(model_dir, exist_ok=True)

    train_dataset, dev_dataset, _ = get_datasets(
        '/kaggle/input/knowledge-data/data/train.json',
        '/kaggle/input/knowledge-data/data/dev.json',
        '/kaggle/input/knowledge-data/data/test.json'
    )

    tokenizer, model = get_model_and_tokenizer()
    model.to(device)  # 模型移动到GPU

    def preprocess_function(examples):
        return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

    tokenized_train = train_dataset.map(preprocess_function, batched=True)
    tokenized_dev = dev_dataset.map(preprocess_function, batched=True)

    training_args = TrainingArguments(
        output_dir=results_dir,
        eval_strategy='epoch',
        save_strategy='epoch',
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir=logs_dir,
        logging_steps=10,
        load_best_model_at_end=True,
        metric_for_best_model='macro_f1',
        greater_is_better=True,
        report_to="tensorboard",  # 让TensorBoard正常工作
        save_total_limit=2,  # 最多保留2个模型
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_dev,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    print("开始训练...")
    trainer.train()
    print("训练完成，保存模型中...")

    trainer.save_model(model_dir)
    print(f"模型已保存到: {model_dir}")

if __name__ == "__main__":
    main()


2025-05-25 04:15:58.962582: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748146559.175681      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748146559.230417      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


当前使用的设备: cuda


tokenizer_config.json:   0%|          | 0.00/19.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-macbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/32400 [00:00<?, ? examples/s]

Map:   0%|          | 0/2800 [00:00<?, ? examples/s]

  trainer = Trainer(


开始训练...


Epoch,Training Loss,Validation Loss,Macro F1
1,0.0003,1.915368,0.679208
2,0.0,1.064008,0.847486
3,0.0,1.530207,0.786939


训练完成，保存模型中...
模型已保存到: /kaggle/working/best_model


In [8]:
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm

def main():
    # 判断是否有可用的GPU
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # 加载数据和模型
    test_data = load_json('/kaggle/input/knowledge-data/data/test.json')  # 请确保文件路径正确
    tokenizer = AutoTokenizer.from_pretrained('./best_model')
    model = AutoModelForSequenceClassification.from_pretrained('./best_model')
    
    # 将模型转移到GPU
    model.to(device)
    model.eval()

    results = []
    for item in tqdm(test_data):
        # 将输入数据转移到GPU
        inputs = tokenizer(item['text'], return_tensors='pt', truncation=True, padding=True, max_length=512)
        inputs = {key: value.to(device) for key, value in inputs.items()}
        
        with torch.no_grad():
            # 在GPU上进行推理
            outputs = model(**inputs)
            logits = outputs.logits
            predicted_label = torch.argmax(logits, dim=1).item()
        
        results.append({
            'id': item['id'],
            'text': item['text'],
            'label': predicted_label
        })

    # 将结果保存到文件
    with open('submission.json', 'w', encoding='utf-8') as f:
        for result in results:
            f.write(json.dumps(result, ensure_ascii=False) + '\n')

if __name__ == "__main__":
    main()


100%|██████████| 11000/11000 [03:16<00:00, 55.88it/s]


In [13]:
import json

# 加载 submission.json
submission = {}
with open('submission.json', 'r', encoding='utf-8') as f:
    for line in f:
        item = json.loads(line)
        submission[item['id']] = item['label']

# 加载 test_with_label.json
with open('/kaggle/input/knowledge-data/data/test_with_label.json', 'r', encoding='utf-8') as f:
    test_data = json.load(f)

# 进行比对
correct = 0
total = 0

for item in test_data:
    id_ = item['id']
    true_label = item['label']
    pred_label = submission.get(id_, None)
    
    if pred_label is not None:
        if pred_label == true_label:
            correct += 1
        total += 1

accuracy = correct / total if total > 0 else 0
print(f"总样本数: {total}")
print(f"预测正确数: {correct}")
print(f"准确率: {accuracy:.4f}")


总样本数: 11000
预测正确数: 8949
准确率: 0.8135


In [9]:
import os
import zipfile
import datetime

def file2zip(packagePath, zipPath):
    '''
  :param packagePath: 文件夹路径
  :param zipPath: 压缩包路径
  :return:
  '''
    zip = zipfile.ZipFile(zipPath, 'w', zipfile.ZIP_DEFLATED)
    for path, dirNames, fileNames in os.walk(packagePath):
        fpath = path.replace(packagePath, '')
        for name in fileNames:
            fullName = os.path.join(path, name)
            name = fpath + '\\' + name
            zip.write(fullName, name)
    zip.close()


if __name__ == "__main__":
    # 文件夹路径
    packagePath = '/kaggle/working/'
    zipPath = '/kaggle/working/output.zip'
    if os.path.exists(zipPath):
        os.remove(zipPath)
    file2zip(packagePath, zipPath)
    print("打包完成")
    print(datetime.datetime.utcnow())


打包完成
2025-05-25 06:18:52.940135


In [14]:
import os
os.chdir('/kaggle/working')
print(os.getcwd())
print(os.listdir("/kaggle/working"))
from IPython.display import FileLink
FileLink('output.zip')


#下面就会有文件的下载链接，直接点击下载即可


/kaggle/working
['output.zip', 'logs', 'results', 'submission.json', '.virtual_documents', 'best_model']
