# mount google drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# set workspace

In [2]:
!nvidia-smi

Fri Apr 29 11:27:13 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   39C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
import os
import sys
os.chdir('/content/drive/MyDrive/code/siumaai/examples/cmeee')
root_path = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(os.getcwd())))), 'siumaai')
sys.path.append(root_path)

# install packages

In [4]:
!pip install transformers pytorch-lightning



In [5]:
import json 
from dataclasses import asdict
from torch.utils.data import random_split, DataLoader
from torch.utils.data.dataloader import default_collate
from siumaai.features.ner.global_pointer import GlobalPointerForNerDataset, convert_logits_to_examples
from siumaai.features.ner import EntityExample, NerExample
from transformers import BertTokenizerFast
import pytorch_lightning as pl
from siumaai.pl_models.ner import Ner
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
from transformers import AutoConfig
from siumaai.models import MODEL_CLS_MAP

# Config

In [6]:
MAX_SEQ_LENGTH=128
PRETRAIN_MODEL_PATH='bert-base-chinese'
LABEL_PATH = '/content/drive/MyDrive/datasets/CMeEE/labels.txt'
DATA_PATH = '/content/drive/MyDrive/datasets/CMeEE/data.json'
BATCH_SIZE = 100
TEST_BATCH_SIZE = 1000
pl.seed_everything(2)

Global seed set to 2


2

# load label

In [7]:
LABEL_TO_ID_MAP = {}
ID_TO_LABEL_MAP = {}
with open('/content/drive/MyDrive/datasets/CMeEE/labels.txt', encoding='utf8')as f:
    index = 0
    for line in f:
        label = line.strip()
        if label:
            LABEL_TO_ID_MAP[label] = index
            ID_TO_LABEL_MAP[index] = label
            index += 1

NUM_LABELS = len(LABEL_TO_ID_MAP)
PAD_ID = -100

print(f'id_to_label_map: {ID_TO_LABEL_MAP}')
print(f'label_to_id_map: {LABEL_TO_ID_MAP}')


id_to_label_map: {0: 'equ', 1: 'dru', 2: 'bod', 3: 'dis', 4: 'pro', 5: 'dep', 6: 'sym', 7: 'ite', 8: 'mic'}
label_to_id_map: {'equ': 0, 'dru': 1, 'bod': 2, 'dis': 3, 'pro': 4, 'dep': 5, 'sym': 6, 'ite': 7, 'mic': 8}


# load data

In [8]:
with open(DATA_PATH, encoding='utf-8') as f:
    example_list = [
        NerExample(
            text=data['text'],
            words=list(data['text']),
            entities=[
                EntityExample(
                    start_idx=entity['start_idx'],
                    end_idx=entity['end_idx'],
                    entity=entity['entity'],
                    type=entity['type']
                )
                for entity in data.get('entities', [])
            ]
        )
        for data in json.load(f)
    ]

train_example_size = int(len(example_list) * 0.8)
val_example_size = int(len(example_list) * 0.1)
test_example_size = len(example_list) - train_example_size - val_example_size
train_example_list, val_example_list, test_example_list = random_split(
        example_list, [train_example_size, val_example_size, test_example_size])
print(f'train: {len(train_example_list)}, val: {len(val_example_list)}, test: {len(test_example_list)}')

train: 16000, val: 2000, test: 2000


# load tokenizer

In [9]:
tokenizer = BertTokenizerFast.from_pretrained(PRETRAIN_MODEL_PATH)
tokenizer.add_special_tokens({'additional_special_tokens': [' ', '\n']})


2

# train

## 1. load train_data, val_data

In [10]:

train_dataset = GlobalPointerForNerDataset(train_example_list, tokenizer, LABEL_TO_ID_MAP, MAX_SEQ_LENGTH, pad_id=PAD_ID, check_tokenization=False, lazy_load=True)
val_dataset = GlobalPointerForNerDataset(val_example_list, tokenizer, LABEL_TO_ID_MAP, MAX_SEQ_LENGTH, pad_id=PAD_ID, check_tokenization=False, lazy_load=True)

print(f'train_dataset_size: {len(train_dataset)}')
print(f'val_dataset_size: {len(val_dataset)}')


def fit_collate_func(batch):
    return default_collate([
        {
            'input_ids': data.input_ids,
            'attention_mask': data.attention_mask,
            'token_type_ids': data.token_type_ids,
            'labels': data.labels,
            'criterion_mask': data.criterion_mask,
        }
        for data in batch
    ])

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=fit_collate_func)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=fit_collate_func)

train_dataset_size: 16000
val_dataset_size: 2000


## 2. init model

In [11]:
    config = AutoConfig.from_pretrained(
        PRETRAIN_MODEL_PATH, 
        return_dict=None)

    model_cls = MODEL_CLS_MAP['global_pointer_for_ner']
    model_kwargs = {
        'pretrain_model_path': PRETRAIN_MODEL_PATH,
        'inner_dim': 64,
        'hidden_size': config.hidden_size,
        'num_labels': NUM_LABELS,
        'vocab_len': len(tokenizer)
    }

    model = Ner(
            learning_rate=3e-5,
            adam_epsilon=1e-8,
            warmup_rate=0.1,
            weight_decay=0.1,
            model_cls=model_cls,
            **model_kwargs
            )

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## 3. init trainer

In [12]:
    trainer = Trainer(
            gpus=1,
            max_epochs=20,
            # max_epochs=10,
            weights_summary=None,
            logger=TensorBoardLogger('tensorboard_logs/global_pointer'),
            callbacks=[
                EarlyStopping(
                    monitor='val_loss',
                    min_delta=0.005,
                    patience=5,
                    verbose=False,
                    mode='min'),
                ModelCheckpoint(
                    dirpath='ckpt/global_pointer',
                    filename='{epoch}-{val_loss:.2f}',
                    monitor='val_loss',
                    mode='min',
                    verbose=True,
                    save_top_k=1),
                LearningRateMonitor(logging_interval='step')])



  "Setting `Trainer(weights_summary=None)` is deprecated in v1.5 and will be removed"
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


## 4. tune initial learning_rate

In [13]:
lr = trainer.tuner.lr_find(model, train_dataloader, val_dataloader, early_stop_threshold=None)
print(lr.suggestion())
# model.hparams.learning_rate = lr.suggestion()

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

Restoring states from the checkpoint path at /content/drive/MyDrive/code/siumaai/examples/cmeee/.lr_find_540c3459-b4c2-44c7-bab9-856a0eb3ce2b.ckpt


0.0003019951720402019


  "Be aware that when using `ckpt_path`,"


In [14]:
!ls -a /content/drive/MyDrive/code/siumaai/examples/cmeee/

ckpt		    tensorboard_logs		    train_global_pointer_ner.py
.ipynb_checkpoints  train_crf_ner.ipynb
preprocess.ipynb    train_global_pointer_ner.ipynb


## 5. training

In [None]:
trainer.fit(model, train_dataloader, val_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.


Sanity Checking: 0it [00:00, ?it/s]



Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 0, global step 160: 'val_loss' reached inf (best inf), saving model to '/content/drive/MyDrive/code/siumaai/examples/cmeee/ckpt/global_pointer/epoch=0-val_loss=nan.ckpt' as top 1


# test

## 1. load test data

In [None]:
test_dataset = BIOForNerDataset(test_example_list, tokenizer, LABEL_TO_ID_MAP, MAX_SEQ_LENGTH, pad_id=PAD_ID, check_tokenization=False)


NameError: ignored

## 2. load model

In [None]:
!ls ckpt/crf

In [None]:
model = CrfNer.load_from_checkpoint('ckpt/crf/epoch=3-val_loss=20.86-v1.ckpt')
model.to('cuda')
model.eval()


## 3. testing

In [None]:
pred_example_list = []
crf_pred_example_list = []
start_index = 0
while start_index < len(test_dataset):
    if start_index + TEST_BATCH_SIZE < len(test_dataset):
        end_index  = start_index + TEST_BATCH_SIZE 
    else:
        end_index = len(test_dataset)

    feature_list = []
    batch = []
    for index in range(start_index, end_index):
        feature_list.append(test_dataset[index])
        batch.append({
            'input_ids': test_dataset[index].input_ids,
            'attention_mask': test_dataset[index].attention_mask,
            'token_type_ids': test_dataset[index].token_type_ids,
        })

    #crf_logits, logits, *_ = model(**default_collate(batch))
    batch = {k:v.to('cuda') for k,v in default_collate(batch).items()}
    with torch.no_grad():
        crf_logits, logits = model(**batch)
        crf_logits = crf_logits.detach().cpu()
        logits = logits.detach().cpu()
    pred_example_list.extend(convert_logits_to_examples(feature_list, logits, ID_TO_LABEL_MAP))
    crf_pred_example_list.extend(convert_crf_logits_to_examples(feature_list, crf_logits, ID_TO_LABEL_MAP))
    print(f'finish {start_index} -> {end_index}')
    start_index = end_index


In [None]:
import torch
torch.cuda.empty_cache()

# metric

In [None]:
from siumaai.metrics.ner import calc_metric
metric = calc_metric(test_example_list, pred_example_list)
print(metric)

crf_metric = calc_metric(test_example_list, crf_pred_example_list)
print(crf_metric)


In [None]:
!cd  && ls
