In [1]:
import collections
import logging
import os
import os
import sys
from dataclasses import dataclass, field
from typing import Optional, Union

import numpy as np
import numpy as np
import pandas as pd
import torch
import transformers
from datasets import Dataset
from datasets import load_dataset
from tqdm.auto import tqdm
from transformers import (
    AutoConfig,
    AutoModelForMultipleChoice,
    AutoTokenizer,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import AutoTokenizer
from transformers import default_data_collator
from transformers.tokenization_utils_base import PaddingStrategy, PreTrainedTokenizerBase
from transformers.trainer_utils import get_last_checkpoint, is_main_process

In [2]:

# Read the raw train and test spreadsheets (openpyxl engine) into DataFrames.
train_df, test_df = (
    pd.read_excel(xlsx_path, engine='openpyxl')
    for xlsx_path in (
        'data/好大夫在线_非标准化疾病诉求的简单分诊数据集/train.xlsx',
        'data/好大夫在线_非标准化疾病诉求的简单分诊数据集/test.xlsx',
    )
)

In [3]:
# Build the model input by concatenating the free-text fields row by row.
# NOTE: `str(series)` must not be used here. It renders the WHOLE column as a
# single string (the truncated printed repr, including index and dtype footer)
# and that one string is then broadcast to every row, so each example would
# get the same text. The conversion has to be element-wise.
for df in [train_df, test_df]:
    df['text'] = (
        # fillna('') keeps missing cells from turning into the literal "nan".
        df['title'].fillna('').astype(str)
        + df['conditionDesc'].fillna('').astype(str)
        + df['hopeHelp'].fillna('').astype(str)
        + df['diseaseName'].fillna('').astype(str)
    )

In [4]:
# Carve three consecutive 100-row slices out of train_df and write each one to
# its own CSV (train / validation / test) without the index column.
for first_row, last_row, csv_path in (
    (0, 100, "data/好大夫在线_非标准化疾病诉求的简单分诊数据集/new_train.csv"),
    (100, 200, "data/好大夫在线_非标准化疾病诉求的简单分诊数据集/new_valid.csv"),
    (200, 300, "data/好大夫在线_非标准化疾病诉求的简单分诊数据集/new_test.csv"),
):
    train_df[first_row:last_row].to_csv(csv_path, index=False)

In [5]:
# Map each split name to the CSV file written for it.
data_files = {
    "train": 'data/好大夫在线_非标准化疾病诉求的简单分诊数据集/new_train.csv',
    "validation": 'data/好大夫在线_非标准化疾病诉求的简单分诊数据集/new_valid.csv',
    "test": 'data/好大夫在线_非标准化疾病诉求的简单分诊数据集/new_test.csv',
}

# Load all three splits through the generic 'csv' dataset builder.
extension = 'csv'
datasets = load_dataset(extension, data_files=data_files)

Using custom data configuration default-8317c29feb5bba8d


Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /Users/derbysofti81/.cache/huggingface/datasets/csv/default-8317c29feb5bba8d/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff...


                            

Dataset csv downloaded and prepared to /Users/derbysofti81/.cache/huggingface/datasets/csv/default-8317c29feb5bba8d/0.0.0/9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff. Subsequent calls will reuse this data.




In [6]:
# Display the loaded splits with their column names and row counts.
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'age', 'diseaseName', 'conditionDesc', 'title', 'hopeHelp', 'label', 'text'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['id', 'age', 'diseaseName', 'conditionDesc', 'title', 'hopeHelp', 'label', 'text'],
        num_rows: 100
    })
    test: Dataset({
        features: ['id', 'age', 'diseaseName', 'conditionDesc', 'title', 'hopeHelp', 'label', 'text'],
        num_rows: 100
    })
})

In [17]:
# model_checkpoint = "bert-base-chinese"
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
# model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

# https://medium.com/data-folks-indonesia/modeling-using-hugging-face-transformers-f956bccf7ccd

# Config, tokenizer and weights must all come from the same checkpoint.
model_checkpoint = "bert-base-chinese"

# The classification head is freshly initialised, so seed the RNGs first to
# make the starting weights reproducible across kernel restarts.
set_seed(42)

config = AutoConfig.from_pretrained(
    model_checkpoint,
    # NOTE(review): must equal the number of distinct values in the 'label'
    # column -- confirm against the data.
    num_labels=10,
    finetuning_task='haodaifu',
)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# `from_config` only builds the architecture with random weights; it does NOT
# load the checkpoint. `from_pretrained` loads the pretrained encoder weights
# and applies `config` (label count) to the new classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    config=config,
)

loading configuration file https://huggingface.co/bert-base-chinese/resolve/main/config.json from cache at /Users/derbysofti81/.cache/huggingface/transformers/6cc404ca8136bc87bae0fb24f2259904943d776a6c5ddc26598bbdc319476f42.0f9bcd8314d841c06633e7b92b04509f1802c16796ee67b0f1177065739e24ae
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "finetuning_task": "haodaifu",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "L

In [8]:
context_name = "text"
max_seq_length = 512
# True: pad every example to max_seq_length at tokenization time (the behaviour
# this notebook has always had). False: leave examples unpadded so that padding
# is left to whatever collates the batches.
pad_to_max_length = True


def preprocess_function(examples):
    """Tokenize the text column of a batch of examples.

    Args:
        examples: mapping from column name to that column's values for the
            batch; only the ``context_name`` column is read.

    Returns:
        A plain dict holding every field the tokenizer produced.
    """
    # Build the input format the model expects.
    sentences = examples[context_name]
    # Tokenize, truncating anything longer than max_seq_length.
    tokenized_examples = tokenizer(
        sentences,
        truncation=True,
        max_length=max_seq_length,
        padding="max_length" if pad_to_max_length else False,
    )
    # Hand back a plain dict copy of the tokenizer output.
    return dict(tokenized_examples.items())

In [9]:
# Run the tokenization over every split, in batches, using two worker processes.
tokenized_datasets = datasets.map(
    preprocess_function,
    num_proc=2,
    batched=True,
    # load_from_cache_file=not data_args.overwrite_cache,
)

 #0: 100%|██████████| 1/1 [00:00<00:00, 11.28ba/s]
 #1: 100%|██████████| 1/1 [00:00<00:00, 10.79ba/s]
 #0: 100%|██████████| 1/1 [00:00<00:00,  6.44ba/s]
 #1: 100%|██████████| 1/1 [00:00<00:00,  5.94ba/s]
 #0: 100%|██████████| 1/1 [00:00<00:00, 11.05ba/s]
 #1: 100%|██████████| 1/1 [00:00<00:00, 11.65ba/s]


In [10]:
# Display the tokenized splits; the tokenizer's output columns are now present.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['age', 'attention_mask', 'conditionDesc', 'diseaseName', 'hopeHelp', 'id', 'input_ids', 'label', 'text', 'title', 'token_type_ids'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['age', 'attention_mask', 'conditionDesc', 'diseaseName', 'hopeHelp', 'id', 'input_ids', 'label', 'text', 'title', 'token_type_ids'],
        num_rows: 100
    })
    test: Dataset({
        features: ['age', 'attention_mask', 'conditionDesc', 'diseaseName', 'hopeHelp', 'id', 'input_ids', 'label', 'text', 'title', 'token_type_ids'],
        num_rows: 100
    })
})

In [20]:
@dataclass
class DataCollatorForSentClassification:
    """Collate tokenized sentence-classification examples into a tensor batch.

    Attributes:
        tokenizer: tokenizer whose ``pad`` method is used to pad the batch.
        padding: padding strategy forwarded to ``tokenizer.pad``.
        max_length: optional maximum length forwarded to ``tokenizer.pad``.
        pad_to_multiple_of: optional multiple forwarded to ``tokenizer.pad``.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Pad ``features`` and return a dict of tensors.

        Args:
            features: non-empty list of per-example mappings. The label is
                read from the ``"label"`` key, or ``"labels"`` if the first
                example has no ``"label"``.

        Returns:
            Dict of padded tensors, plus an int64 ``"labels"`` tensor when the
            examples carry a label.
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        has_labels = label_name in features[0].keys()
        batch_size = len(features)

        # Read the labels without popping them: the caller's dicts must stay
        # intact so the same features can be collated more than once.
        labels = [feature[label_name] for feature in features] if has_labels else None
        unlabeled_features = [
            {key: value for key, value in feature.items() if key != label_name}
            for feature in features
        ]

        # padding
        batch = self.tokenizer.pad(
            unlabeled_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        # Force every tensor to (batch_size, -1); presumably a no-op for the
        # 2-D tensors produced above -- kept to preserve the original output.
        batch = {k: v.view(batch_size, -1) for k, v in batch.items()}
        # Add back labels (skipped for unlabeled data, e.g. at prediction time)
        if labels is not None:
            batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [21]:
# Batch collator built on the notebook's tokenizer, with pad_to_multiple_of=8.
data_collator = DataCollatorForSentClassification(
    tokenizer=tokenizer,
    pad_to_multiple_of=8,
)

In [22]:
# dcsc = DataCollatorForSentClassification(tokenizer=tokenizer, pad_to_multiple_of=8)
# dcsc(tokenized_datasets["train"])

In [23]:
# Per-device batch size shared by training and evaluation.
batch_size = 8

# Hyper-parameters for the run; outputs go under saved_model/haodaifu and
# evaluation is scheduled once per epoch.
training_args = TrainingArguments(
    output_dir="saved_model/haodaifu",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [24]:
# Metric
def compute_metrics(eval_predictions):
    """Compute accuracy from a ``(predictions, label_ids)`` pair.

    The predicted class of each row is the argmax over axis 1 of the
    predictions; accuracy is the mean of the element-wise matches with
    ``label_ids``, returned as a Python float under the key ``"accuracy"``.
    """
    scores, gold_labels = eval_predictions
    predicted_classes = np.argmax(scores, axis=1)
    matches = (predicted_classes == gold_labels).astype(np.float32)
    accuracy = matches.mean().item()
    return {"accuracy": accuracy}

# Initialize our Trainer
# The validation split is only handed over when evaluation is enabled in the
# training arguments.
eval_split = tokenized_datasets["validation"] if training_args.do_eval else None

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=eval_split,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run the fine-tuning loop.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text, id, title, hopeHelp, age, diseaseName, conditionDesc.
***** Running training *****
  Num examples = 100
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 39

[A