# mount google drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# set workspace

In [3]:
!nvidia-smi

Fri Jul  8 13:54:33 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
import os
import sys
os.chdir('/content/drive/MyDrive/code/siumaai/examples/cmeee')
root_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(os.getcwd())))), 'siumaai')
sys.path.append(root_path)

# install packages

In [5]:
!pip install transformers pytorch-lightning

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 32.8 MB/s 
[?25hCollecting pytorch-lightning
  Downloading pytorch_lightning-1.6.4-py3-none-any.whl (585 kB)
[K     |████████████████████████████████| 585 kB 58.9 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 55.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 11.4 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 56.5 MB/s 
Collecti

In [6]:
import json
import torch
from dataclasses import asdict
from torch.utils.data import random_split, DataLoader
from siumaai.features.ner.bio import convert_logits_to_examples, convert_crf_logits_to_examples
from torch.utils.data.dataloader import default_collate
from siumaai.features.ner.bio import BIOForNerDataset
from siumaai.features.ner import EntityExample, NerExample
from transformers import BertTokenizerFast
import pytorch_lightning as pl
from siumaai.pl_models.ner import CrfNer
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
from transformers import AutoConfig
from siumaai.models import MODEL_CLS_MAP


# Config

In [7]:
MAX_SEQ_LENGTH=128
PRETRAIN_MODEL_PATH='bert-base-chinese'
LABEL_PATH = '/content/drive/MyDrive/datasets/CMeEE/labels.txt'
DATA_PATH = '/content/drive/MyDrive/datasets/CMeEE/data.json'
BATCH_SIZE = 100
TEST_BATCH_SIZE = 1000
pl.seed_everything(2)

Global seed set to 2


2

# load label

In [8]:
ID_TO_LABEL_MAP = {}
LABEL_TO_ID_MAP = {}
with open(LABEL_PATH, encoding='utf-8')as f:
    index = 0
    for line in f:
        label = line.strip()
        if label and label != 'O':
            ID_TO_LABEL_MAP[index] = f'B-{label}'
            LABEL_TO_ID_MAP[f'B-{label}'] = index
            index += 1

            ID_TO_LABEL_MAP[index] = f'I-{label}'
            LABEL_TO_ID_MAP[f'I-{label}'] = index
            index += 1

        elif label and label == 'O':
            ID_TO_LABEL_MAP[index] = label
            LABEL_TO_ID_MAP[label] = index
            index += 1

ID_TO_LABEL_MAP[len(ID_TO_LABEL_MAP)] = '[PAD]'
LABEL_TO_ID_MAP['[PAD]'] = len(LABEL_TO_ID_MAP)

NUM_LABELS = len(ID_TO_LABEL_MAP)
PAD_ID = LABEL_TO_ID_MAP['[PAD]']
# PAD_ID = -100

print(f'id_to_label_map: {ID_TO_LABEL_MAP}')
print(f'label_to_id_map: {LABEL_TO_ID_MAP}')


id_to_label_map: {0: 'B-equ', 1: 'I-equ', 2: 'B-dru', 3: 'I-dru', 4: 'B-bod', 5: 'I-bod', 6: 'B-dis', 7: 'I-dis', 8: 'B-pro', 9: 'I-pro', 10: 'B-dep', 11: 'I-dep', 12: 'B-sym', 13: 'I-sym', 14: 'B-ite', 15: 'I-ite', 16: 'B-mic', 17: 'I-mic', 18: '[PAD]'}
label_to_id_map: {'B-equ': 0, 'I-equ': 1, 'B-dru': 2, 'I-dru': 3, 'B-bod': 4, 'I-bod': 5, 'B-dis': 6, 'I-dis': 7, 'B-pro': 8, 'I-pro': 9, 'B-dep': 10, 'I-dep': 11, 'B-sym': 12, 'I-sym': 13, 'B-ite': 14, 'I-ite': 15, 'B-mic': 16, 'I-mic': 17, '[PAD]': 18}


# load data

In [10]:
with open(DATA_PATH, encoding='utf-8') as f:
    example_list = [
        NerExample(
            text=data['text'],
            words=list(data['text']),
            entities=[
                EntityExample(
                    start_idx=entity['start_idx'],
                    end_idx=entity['end_idx'],
                    entity=entity['entity'],
                    type=entity['type']
                )
                for entity in data.get('entities', [])
            ]
        )
        for data in json.load(f)
    ]

train_example_size = int(len(example_list) * 0.8)
val_example_size = int(len(example_list) * 0.1)
test_example_size = len(example_list) - train_example_size - val_example_size
train_example_list, val_example_list, test_example_list = random_split(
        example_list, [train_example_size, val_example_size, test_example_size])
print(f'train: {len(train_example_list)}, val: {len(val_example_list)}, test: {len(test_example_list)}')

train: 16000, val: 2000, test: 2000


# load tokenizer

In [13]:
tokenizer = BertTokenizerFast.from_pretrained(PRETRAIN_MODEL_PATH)
tokenizer.add_special_tokens({'additional_special_tokens': [' ', '\n']})


2

# train

## 1. load train_data, val_data

In [17]:

train_dataset = BIOForNerDataset(train_example_list, tokenizer, LABEL_TO_ID_MAP, MAX_SEQ_LENGTH, pad_id=PAD_ID, check_tokenization=False)
val_dataset = BIOForNerDataset(val_example_list, tokenizer, LABEL_TO_ID_MAP, MAX_SEQ_LENGTH, pad_id=PAD_ID, check_tokenization=False)

print(f'train_dataset_size: {len(train_dataset)}')
print(f'val_dataset_size: {len(val_dataset)}')


def fit_collate_func(batch):
    return default_collate([
        {
            'input_ids': data.input_ids,
            'attention_mask': data.attention_mask,
            'token_type_ids': data.token_type_ids,
            'labels': data.labels
        }
        for data in batch
    ])

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=fit_collate_func)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=fit_collate_func)


KeyError: ignored

## 2. init model

In [15]:
config = AutoConfig.from_pretrained(
    PRETRAIN_MODEL_PATH, 
    return_dict=None)

model_cls = MODEL_CLS_MAP['crf_for_ner']
model_kwargs = {
    'pretrain_model_path': PRETRAIN_MODEL_PATH,
    'num_labels': NUM_LABELS,
    'dropout_rate': config.hidden_dropout_prob,
    'hidden_size': config.hidden_size,
    'vocab_len': len(tokenizer)
}
# model = Ner(
model = CrfNer(
        # crf_learning_rate=0.005248074602497723,
        # learning_rate=0.0005248074602497723,
        crf_learning_rate=3e-04,
        learning_rate=3e-05,
        adam_epsilon=1e-8,
        warmup_rate=0.1,
        weight_decay=0.1,
        model_cls=model_cls,
        **model_kwargs
        )


Downloading:   0%|          | 0.00/393M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## 3. init trainer

In [16]:
trainer = Trainer(
        gpus=1,
        max_epochs=10,
        weights_summary=None,
        logger=TensorBoardLogger('tensorboard_logs/crf'),
        callbacks=[
            EarlyStopping(
                monitor='val_loss',
                min_delta=0.1,
                patience=2,
                verbose=False,
                mode='min'),
            ModelCheckpoint(
                dirpath='ckpt/crf',
                filename='{epoch}-{val_loss:.2f}',
                monitor='val_loss',
                mode='min',
                verbose=True,
                save_top_k=1),
            LearningRateMonitor(logging_interval='step')])





  "Setting `Trainer(weights_summary=None)` is deprecated in v1.5 and will be removed"
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


## 4. tune initial learning_rate

In [None]:
# lr = trainer.tuner.lr_find(model, train_dataloader, val_dataloader, early_stop_threshold=None)
# print(lr.suggestion())
# model.hparams.learning_rate = lr.suggestion()

In [None]:
!ls /content/drive/MyDrive/code/siumaai/examples/cmeee/.lr_find_132f105a-6a28-459f-97ee-95cd388d56ed.ckpt

ls: cannot access '/content/drive/MyDrive/code/siumaai/examples/cmeee/.lr_find_132f105a-6a28-459f-97ee-95cd388d56ed.ckpt': No such file or directory


## 5. training

In [18]:
trainer.fit(model, train_dataloader, val_dataloader)

NameError: ignored

# test

## 1. load test data

In [None]:
test_dataset = BIOForNerDataset(test_example_list, tokenizer, LABEL_TO_ID_MAP, MAX_SEQ_LENGTH, pad_id=PAD_ID, check_tokenization=False)


## 2. load model

In [None]:
!ls ckpt/crf

'epoch=3-val_loss=20.86.ckpt'


In [None]:
model = CrfNer.load_from_checkpoint('ckpt/crf/epoch=3-val_loss=20.86-v1.ckpt')
model.to('cuda')
model.eval()


Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CrfNer(
  (model): CrfForNer(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(21130, 768)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): LayerNorm

## 3. testing

In [None]:
pred_example_list = []
crf_pred_example_list = []
start_index = 0
while start_index < len(test_dataset):
    if start_index + TEST_BATCH_SIZE < len(test_dataset):
        end_index  = start_index + TEST_BATCH_SIZE 
    else:
        end_index = len(test_dataset)

    feature_list = []
    batch = []
    for index in range(start_index, end_index):
        feature_list.append(test_dataset[index])
        batch.append({
            'input_ids': test_dataset[index].input_ids,
            'attention_mask': test_dataset[index].attention_mask,
            'token_type_ids': test_dataset[index].token_type_ids,
        })

    #crf_logits, logits, *_ = model(**default_collate(batch))
    batch = {k:v.to('cuda') for k,v in default_collate(batch).items()}
    with torch.no_grad():
        crf_logits, logits = model(**batch)
        crf_logits = crf_logits.detach().cpu()
        logits = logits.detach().cpu()
    pred_example_list.extend(convert_logits_to_examples(feature_list, logits, ID_TO_LABEL_MAP))
    crf_pred_example_list.extend(convert_crf_logits_to_examples(feature_list, crf_logits, ID_TO_LABEL_MAP))
    print(f'finish {start_index} -> {end_index}')
    start_index = end_index


finish 0 -> 1000
finish 1000 -> 2000


In [None]:
import torch
torch.cuda.empty_cache()

# metric

In [None]:
from siumaai.metrics.ner import calc_metric
metric = calc_metric(test_example_list, pred_example_list)
print(metric)

crf_metric = calc_metric(test_example_list, crf_pred_example_list)
print(crf_metric)


{'precision': 0.6266204595045566, 'recall': 0.5929074568860822, 'f1': 0.6092979719188768, 'tp': 4882, 'fp': 2909, 'fn': 3352}
{'precision': 0.6310106716886378, 'recall': 0.6103959193587564, 'f1': 0.6205321316130625, 'tp': 5026, 'fp': 2939, 'fn': 3208}


In [None]:
!cd  && ls


cmeee  msra_ner
