In [1]:
# install requirements: adaseq
!pip install transformers seqeval modelscope
!pip install adaseq --ignore-requires-python --no-deps

Looking in indexes: https://mirrors.aliyun.com/pypi/simple
Collecting seqeval
  Downloading https://mirrors.aliyun.com/pypi/packages/9d/2d/233c79d5b4e5ab1dbf111242299153f3caddddbb691219f363ad55ce783d/seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting modelscope
  Downloading https://mirrors.aliyun.com/pypi/packages/3c/33/553d775dd38932af489f6b9a2a192c60a099632494f9e9cc7e2e86ac8980/modelscope-1.4.2-py3-none-any.whl (4.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting oss2
  Downloading https://mirrors.aliyun.com/pypi/packages/86/e7/017f8a5948d70e130815eebd14c25dfad916bd574590f2d7e046f19eee3f/oss2-2.17.0.tar.gz (259 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m259.5/259.5 kB[0m [31m13.5 MB/s[0m eta

In [20]:
# Dataset directory; the trailing slash matters because file names are
# appended to this value with plain string concatenation.
path = "/mnt/workspace/downloads/109339/"

In [21]:
# process test file to conll format
# Each non-blank input line must be `<guid>\u0001<text>`. Every character of
# <text> is written as one row `<char> O` (placeholder label), and an empty
# line is written after each sentence.
with open(path + 'final_test.txt', 'r', encoding='utf8') as fin, \
     open(path + 'test.conll', 'w', encoding='utf8') as fout:
    for line_no, line in enumerate(fin, start=1):
        record = line.strip()
        if not record:
            # A blank line (e.g. a trailing newline at EOF) used to crash the
            # tuple unpacking below; skip it instead.
            continue
        fields = record.split('\u0001')
        if len(fields) != 2:
            # Same exception type as the failed unpacking raised before, but
            # with enough context to locate the offending line.
            raise ValueError(
                f'final_test.txt line {line_no}: expected 2 fields separated '
                f'by \\u0001, got {len(fields)}'
            )
        # guid is not written to the CoNLL file; only the text is used here.
        guid, text = fields
        for token in text:
            print(token, 'O', sep=' ', file=fout)
        print('', file=fout)

In [30]:
# prepare training configuration
from modelscope.utils.config import Config

# YAML text of the experiment definition; it is parsed into a Config object
# right below. The dataset paths and the experiment directory are spelled out
# literally inside this text.
config_yaml = """
experiment:
  exp_dir: experiments/
  exp_name: transformer_crf
  seed: 42

task: named-entity-recognition

dataset:
  data_file:
    train: /mnt/workspace/downloads/109339/train.txt
    valid: /mnt/workspace/downloads/109339/dev.txt
    test: /mnt/workspace/downloads/109339/test.conll
  data_type: conll

preprocessor:
  type: sequence-labeling-preprocessor
  max_length: 80

data_collator: SequenceLabelingDataCollatorWithPadding

model:
  type: sequence-labeling-model
  embedder:
    model_name_or_path: damo/nlp_raner_named-entity-recognition_chinese-base-news
  dropout: 0.15
  use_crf: true

train:
  max_epochs: 30
  dataloader:
    batch_size_per_gpu: 16
  optimizer:
    type: AdamW
    lr: 5.0e-5
    param_groups:
      - regex: crf
        lr: 5.0e-1
  lr_scheduler:
    type: StepLR
    step_size: 2 
    gamma: 0.8
  hooks:
    - type: TensorboardHook

evaluation:
  dataloader:
    batch_size_per_gpu: 128
  metrics:
    - type: ner-metric
    - type: ner-dumper
      model_type: sequence_labeling
      dump_format: conll
"""

config = Config.from_string(config_yaml, file_format='.yaml')

In [ ]:
# initialize a trainer
import os
from adaseq.commands.train import build_trainer_from_partial_objects

# Directory passed to the trainer as its work_dir; created here if missing.
work_dir = 'experiments/transformer_crf'
os.makedirs(work_dir, exist_ok=True)

# Keyword arguments forwarded to the trainer builder alongside `config`.
trainer_options = {
    'work_dir': work_dir,
    'seed': 42,
    'device': 'cuda:0',
}
trainer = build_trainer_from_partial_objects(config, **trainer_options)

# do training
trainer.train()

# do testing
trainer.test()

2023-04-01 12:34:16,593 - INFO - adaseq.data.dataset_manager - Will use a custom loading script: /opt/conda/lib/python3.10/site-packages/adaseq/data/dataset_builders/named_entity_recognition_dataset_builder.py


100%|██████████| 3/3 [00:00<00:00, 439.47it/s]

2023-04-01 12:34:16,710 - INFO - adaseq.data.dataset_manager - First sample in train set: {'id': '0', 'tokens': ['浙', '江', '杭', '州', '市', '江', '干', '区', '九', '堡', '镇', '三', '村', '村', '一', '区'], 'spans': [{'start': 0, 'end': 2, 'type': 'prov'}, {'start': 2, 'end': 5, 'type': 'city'}, {'start': 5, 'end': 8, 'type': 'district'}, {'start': 8, 'end': 11, 'type': 'town'}, {'start': 11, 'end': 14, 'type': 'community'}, {'start': 14, 'end': 16, 'type': 'poi'}], 'mask': [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]}



Counting labels by count_span_labels: 100%|██████████| 8856/8856 [00:05<00:00, 1550.12ex/s]
Counting labels by count_span_labels: 100%|██████████| 1970/1970 [00:01<00:00, 1956.01ex/s]


2023-04-01 12:34:23,894 - INFO - adaseq.data.preprocessors.sequence_labeling_preprocessor - label_to_id: {'O': 0, 'B-assist': 1, 'I-assist': 2, 'E-assist': 3, 'S-assist': 4, 'B-cellno': 5, 'I-cellno': 6, 'E-cellno': 7, 'S-cellno': 8, 'B-city': 9, 'I-city': 10, 'E-city': 11, 'S-city': 12, 'B-community': 13, 'I-community': 14, 'E-community': 15, 'S-community': 16, 'B-devzone': 17, 'I-devzone': 18, 'E-devzone': 19, 'S-devzone': 20, 'B-distance': 21, 'I-distance': 22, 'E-distance': 23, 'S-distance': 24, 'B-district': 25, 'I-district': 26, 'E-district': 27, 'S-district': 28, 'B-floorno': 29, 'I-floorno': 30, 'E-floorno': 31, 'S-floorno': 32, 'B-houseno': 33, 'I-houseno': 34, 'E-houseno': 35, 'S-houseno': 36, 'B-intersection': 37, 'I-intersection': 38, 'E-intersection': 39, 'S-intersection': 40, 'B-poi': 41, 'I-poi': 42, 'E-poi': 43, 'S-poi': 44, 'B-prov': 45, 'I-prov': 46, 'E-prov': 47, 'S-prov': 48, 'B-road': 49, 'I-road': 50, 'E-road': 51, 'S-road': 52, 'B-roadno': 53, 'I-roadno': 54, 'E-

Some weights of the model checkpoint at /root/.cache/modelscope/hub/damo/nlp_raner_named-entity-recognition_chinese-base-news were not used when initializing BertModel: ['encoder.encoder.layer.9.attention.output.LayerNorm.bias', 'encoder.embeddings.word_embeddings.weight', 'encoder.encoder.layer.7.attention.output.dense.bias', 'encoder.encoder.layer.5.attention.output.dense.bias', 'encoder.encoder.layer.6.attention.self.key.weight', 'encoder.encoder.layer.7.attention.self.key.weight', 'encoder.encoder.layer.10.attention.self.value.bias', 'encoder.encoder.layer.2.attention.self.key.weight', 'encoder.embeddings.position_embeddings.weight', 'encoder.encoder.layer.8.attention.self.query.weight', 'encoder.encoder.layer.6.attention.self.value.bias', 'encoder.encoder.layer.0.output.dense.weight', 'encoder.encoder.layer.2.output.LayerNorm.weight', 'encoder.encoder.layer.7.attention.output.LayerNorm.weight', 'encoder.encoder.layer.2.attention.output.dense.weight', 'encoder.encoder.layer.4.atten

** build_dataset error log: 'sequence-labeling-model is not in the custom_datasets registry group named-entity-recognition. Please make sure the correct version of ModelScope library is used.'
** build_dataset error log: 'sequence-labeling-model is not in the custom_datasets registry group named-entity-recognition. Please make sure the correct version of ModelScope library is used.'
2023-04-01 12:34:46,602 - INFO - modelscope - {
    "data_collator": "SequenceLabelingDataCollatorWithPadding",
    "dataset": {
        "data_file": {
            "test": "/mnt/workspace/downloads/109339/test.conll",
            "train": "/mnt/workspace/downloads/109339/train.txt",
            "valid": "/mnt/workspace/downloads/109339/dev.txt"
        },
        "data_type": "conll"
    },
    "evaluation": {
        "dataloader": {
            "batch_size_per_gpu": 128,
            "workers_per_gpu": 0,
            "shuffle": false
        },
        "metrics": [
            {
                "type": "ner

In [ ]:
# convert the CoNLL-style prediction file into the submission format
# Input: one tab-separated row per token (first column = token, last column =
# label), sentences separated by an empty line.
# Output: one line per sentence, `<guid>\u0001<text>\u0001<space-joined labels>`.
import os

exp_dir = 'experiments/transformer_crf'
output_file = 'outputs/baseline4.pred.txt'

# open(..., 'w') does not create missing parent directories, so make sure the
# output directory exists before writing.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

with open(f'{exp_dir}/pred.txt', 'r', encoding='utf8') as fin, \
     open(output_file, 'w', encoding='utf8') as fout:
    # NOTE(review): guids are regenerated as 1, 2, 3, ... in sentence order and
    # empty sentences are skipped; this assumes the original test guids are
    # exactly that sequence -- confirm against final_test.txt.
    guid = 1
    tokens = []
    labels = []
    for line in fin:
        # Iterating a file never yields '', so a bare newline is the only
        # sentence separator that needs checking.
        if line == '\n':
            if tokens:
                print(guid, ''.join(tokens), ' '.join(labels), sep='\u0001', file=fout)
                guid += 1
                tokens = []
                labels = []
        else:
            splits = line.split('\t')
            tokens.append(splits[0])
            # The last column carries the line terminator; strip it off.
            labels.append(splits[-1].rstrip())
    # Flush the final sentence when the file does not end with an empty line.
    if tokens:
        print(guid, ''.join(tokens), ' '.join(labels), sep='\u0001', file=fout)