In [4]:
import datasets
from datasets import Dataset, DatasetInfo, Features, Value, ClassLabel, Sequence
import json

valid_labels = [
    "关注点_教材_少儿教育",
    "下一步行动",
    "关注点_上课形式_少儿教育",
    "客户问题",
    "异议_考虑一下",
    "关注点_师资_少儿教育",
    "同意加微信",
    "关注点_课程数量_少儿教育",
    "客户确认日期与时间",
    "异议_下次再说",
    "51talk_理念渗透",
    "关注点_上课内容_少儿教育",
    "销售提及加微信",
    "关注点_价格"
]

class NERDataset(Dataset):
    def _info(self) -> DatasetInfo:
        return DatasetInfo(
            features=Features(
                {
                    "text": Value("string"),
                    "label": Sequence(
                        feature={
                            "entity_type": ClassLabel(num_classes = 14, names=valid_labels),
                            "start": Value("int32"),
                            "end": Value("int32"),
                        }
                    ),
                }
            ),
            supervised_keys=None,
        )

    def _split_generators(self, dl_manager):
        data_file = r"D:\projects\BERT-NER-Pytorch\datasets\yiliang\train.json"
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": data_file},
            )
        ]

    def _generate_examples(self, filepath):
        with open(filepath, encoding="utf-8") as f:
            for line in f:
                data = json.loads(line)
                text = data["text"]
                labels = []
                for entity_type, entity_list in data["label"].items():
                    for entity, spans in entity_list.items():
                        for start, end in spans:
                            labels.append({"entity_type": entity_type, "start": start, "end": end})
                yield data["id"], {"text": text, "label": labels}

In [5]:
dataset = NERDataset()

TypeError: __init__() missing 1 required positional argument: 'arrow_table'

In [3]:
from datasets import load_dataset

dataset = load_dataset("ner_dataset.py")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


TypeError: 'NoneType' object is not callable

In [7]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')

def encode_examples(examples):
    # 对文本进行分词
    encoding = tokenizer(examples['text'], truncation=True, padding="max_length", max_length=128)
    labels = [[-100] * 128 for _ in range(len(examples['text']))]  # 初始化标签，用 -100 填充（忽略计算损失的索引）
    
    # 将标签转换为适合模型的格式
    for idx, (doc_labels, text) in enumerate(zip(examples['label'], examples['text'])):
        doc_encodings = tokenizer(text, truncation=True, padding="max_length", max_length=128)
        for entity, positions in doc_labels.items():
            for pos_list in positions.values():
                for start, end in pos_list:
                    start_pos = doc_encodings.char_to_token(start)
                    end_pos = doc_encodings.char_to_token(end - 1)
                    if start_pos is not None and end_pos is not None:
                        labels[idx][start_pos] = 1  # 例如，使用 1 来标注实体的开始位置
                        labels[idx][start_pos + 1:end_pos + 1] = 2  # 使用 2 来标注实体内部的位置

    encoding['labels'] = labels
    return encoding

# 对数据集应用预处理
dataset = dataset.map(encode_examples, batched=True)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map:   0%|          | 0/7545 [00:27<?, ? examples/s]


AttributeError: 'NoneType' object has no attribute 'values'