## **Loads**

In [None]:
!pip install datasets
!pip install seqeval
!pip install transformers[torch]

In [2]:
# Mount Google Drive at /content/drive; the dataset, augmentation files and
# saved models used below are all read from / written to paths under this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets import load_metric

import torch
from torch.utils.data import Dataset

from sklearn.model_selection import train_test_split
from transformers import (
    BertTokenizerFast,
    DataCollatorForTokenClassification,
    BertForTokenClassification,
    TrainingArguments,
    Trainer,
    pipeline
    )

from seqeval.metrics import f1_score, classification_report

## **Data**

In [4]:
# Load the tab-separated annotation table; later cells use its `id`,
# `bio_token` and `bio_tag` columns.
bio_data = pd.read_csv('/content/drive/MyDrive/4 –∫—É—Ä—Å/DL_NLP/BIO_data.csv', sep='\t')

In [5]:
# Collapse the row-per-token table into one list of tokens and one list of
# tags per `id`; both Series share the same `id` index.
_rows_by_id = bio_data.groupby('id')
bio_tokens = _rows_by_id['bio_token'].agg(list)
bio_tags = _rows_by_id['bio_tag'].agg(list)

In [6]:
# Step 1: hold out 20% of the documents as the test split.
(train_bio_tokens, test_bio_tokens,
 train_bio_tags, test_bio_tags) = train_test_split(
    bio_tokens, bio_tags, test_size=0.2, random_state=42
)
# Step 2: carve 30% of the remaining documents out as the validation split.
(train_bio_tokens, val_bio_tokens,
 train_bio_tags, val_bio_tags) = train_test_split(
    train_bio_tokens, train_bio_tags, test_size=0.3, random_state=42
)

In [7]:
# Show the sizes of the train / validation / test splits, in that order.
tuple(split.shape for split in (train_bio_tokens, val_bio_tokens, test_bio_tokens))

((1403,), (602,), (502,))

## **Functions and classes**

In [8]:
def get_tokens_pairs(texts, tags, bio=True, tok=None):
    """Expand word-level annotations to word-piece level.

    Every word is split with the tokenizer and its tag is repeated so that the
    returned token and tag sequences have equal length. Each sequence is
    wrapped in '[CLS]' / '[SEP]', both tagged 'O'.

    Args:
        texts: iterable of documents, each an iterable of word strings.
        tags: iterable of tag sequences parallel to ``texts``.
        bio: if True keep the BIO scheme (a ``B-X`` word becomes ``B-X`` on its
            first piece and ``I-X`` on the remaining pieces); if False strip
            the two-character ``B-``/``I-`` prefix from every non-'O' tag.
        tok: object with a ``tokenize(word)`` method. When omitted, the
            notebook-level ``tokenizer`` global is used.

    Returns:
        ``(res_tokens, res_tags)``: two lists of lists, aligned item by item.
    """
    if tok is None:
        # Backward-compatible default: the tokenizer defined at notebook level.
        tok = tokenizer

    res_tokens = []
    res_tags = []

    for text, text_tags in zip(texts, tags):
        all_tokens = ['[CLS]']
        all_tags = ['O']

        for word, tag in zip(text, text_tags):
            pieces = tok.tokenize(word)
            if not pieces:
                # A word that yields no word-pieces contributes no token, so it
                # must contribute no tag either; otherwise the two sequences
                # end up with different lengths.
                continue
            all_tokens.extend(pieces)

            if bio:
                if tag.startswith('B-'):
                    all_tags.append(tag)
                    # Rewrite only the prefix; a blanket replace('B', 'I')
                    # would also alter any 'B' inside the entity name.
                    all_tags.extend(['I-' + tag[2:]] * (len(pieces) - 1))
                else:
                    all_tags.extend([tag] * len(pieces))
            elif tag == 'O':
                all_tags.extend([tag] * len(pieces))
            else:
                all_tags.extend([tag[2:]] * len(pieces))

        all_tokens.append('[SEP]')
        all_tags.append('O')

        res_tokens.append(all_tokens)
        res_tags.append(all_tags)

    return res_tokens, res_tags

In [9]:
# Label vocabulary: a label's position in this list is its integer class id.
_ner_labels = [
    'B-Drugname', 'B-Drugform', 'B-Drugclass', 'B-ADR', 'B-DI', 'B-Finding',
    'I-Drugname', 'I-Drugform', 'I-Drugclass', 'I-ADR', 'I-DI', 'I-Finding',
    'O',
]

tag2id = {label: position for position, label in enumerate(_ner_labels)}

# Inverse lookup, used to turn predicted class ids back into label strings.
id2tag = dict(enumerate(_ner_labels))

In [10]:
class NERDataset(torch.utils.data.Dataset):
    """Map-style dataset of pre-tokenized sequences for token classification.

    Args:
        tokens: indexable collection of word-piece sequences (lists of strings).
        labels: indexable collection of tag sequences parallel to ``tokens``.
        label2id: mapping from tag string to integer class id.
        tok: object with a ``convert_tokens_to_ids`` method. When omitted, the
            notebook-level ``tokenizer`` global is looked up at access time.
    """

    def __init__(self, tokens, labels, label2id, tok=None):
        self.tokens = tokens
        self.labels = labels
        self.label2id = label2id
        self.tok = tok

    def prepare_data(self, tokens, labels):
        """Convert one sequence of tokens and tags into two long tensors."""
        # Backward-compatible default: the tokenizer defined at notebook level.
        tok = self.tok if self.tok is not None else tokenizer
        # dtype is fixed explicitly so that an empty sequence still produces an
        # integer tensor (torch.tensor([]) would otherwise default to float).
        token_ids = torch.tensor(
            [tok.convert_tokens_to_ids(piece) for piece in tokens], dtype=torch.long
        )
        label_ids = torch.tensor(
            [self.label2id[label] for label in labels], dtype=torch.long
        )
        return token_ids, label_ids

    def __getitem__(self, idx):
        """Return the ``idx``-th example as ``{'input_ids', 'labels'}`` tensors."""
        token_ids, label_ids = self.prepare_data(self.tokens[idx], self.labels[idx])
        return {'input_ids': token_ids, 'labels': label_ids}

    def __len__(self):
        """Number of sequences in the dataset."""
        return len(self.tokens)

In [11]:
# NOTE(review): the warning printed by this cell says `trust_remote_code=True`
# becomes mandatory for `load_metric` in the next major `datasets` release.
metric = load_metric("seqeval")


def compute_metrics(eval_preds):
    """Compute entity-level seqeval scores for a ``(logits, labels)`` pair.

    Positions whose label id is -100 are dropped from both the references and
    the predictions before ids are mapped back to tag strings via ``id2tag``.

    Returns:
        dict with the overall ``precision``, ``recall``, ``f1`` and ``accuracy``.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Reference tags, one list per sequence.
    true_labels = []
    for label_row in labels:
        true_labels.append([id2tag[l] for l in label_row if l != -100])

    # Predicted tags at exactly the positions kept above.
    true_predictions = []
    for pred_row, label_row in zip(predictions, labels):
        kept = []
        for p, l in zip(pred_row, label_row):
            if l != -100:
                kept.append(id2tag[p])
        true_predictions.append(kept)

    scores = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {
        "precision": scores["overall_precision"],
        "recall": scores["overall_recall"],
        "f1": scores["overall_f1"],
        "accuracy": scores["overall_accuracy"],
    }
    return summary

  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

## **Rudr-bert**

### Train

In [13]:
# Checkpoint for the Rudr-bert experiments, plus its tokenizer and the padding collator.
# NOTE: `tokenizer` is a notebook-level global that get_tokens_pairs and
# NERDataset.prepare_data read by default, so this cell must run before they are used.
model_name = 'cimm-kzn/rudr-bert'
tokenizer = BertTokenizerFast.from_pretrained(model_name, model_max_length=512)
data_collator = DataCollatorForTokenClassification(tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/521 [00:00<?, ?B/s]

In [20]:
def train_model(model_name, augmentations_datapath, train_bio_tokens, train_bio_tags, val_bio_tokens, val_bio_tags, to_save):
    """Fine-tune a BERT token classifier and save it to Google Drive.

    Args:
        model_name: checkpoint name/path passed to ``from_pretrained``.
        augmentations_datapath: path to a tab-separated file with ``id``,
            ``bio_token`` and ``bio_tag`` columns whose documents are added to
            the training data; a falsy value (e.g. ``None``) disables this.
        train_bio_tokens, train_bio_tags: pandas Series of per-document word
            lists and tag lists used for training.
        val_bio_tokens, val_bio_tags: the same for per-epoch evaluation.
        to_save: relative name used for both the Trainer output directory and
            the directory of the final saved model.

    Returns:
        None. The trained model is written with ``trainer.save_model``.
    """
    tokenizer = BertTokenizerFast.from_pretrained(model_name, model_max_length=512)
    data_collator = DataCollatorForTokenClassification(tokenizer)

    if augmentations_datapath:
        augs_df = pd.read_csv(augmentations_datapath, sep='\t')
        augs_by_id = augs_df.groupby('id')
        # Series.append was removed in pandas 2.0; pd.concat is the supported
        # way to stack the augmented documents under the original ones.
        train_bio_tokens = pd.concat([train_bio_tokens, augs_by_id['bio_token'].agg(list)])
        train_bio_tags = pd.concat([train_bio_tags, augs_by_id['bio_tag'].agg(list)])

    # NOTE(review): called this way, get_tokens_pairs and NERDataset use the
    # notebook-level `tokenizer`, not the local one created above. The two
    # only agree when the global was built from the same `model_name`.
    train_tokens, train_tags = get_tokens_pairs(train_bio_tokens, train_bio_tags)
    val_tokens, val_tags = get_tokens_pairs(val_bio_tokens, val_bio_tags)

    train_dataset = NERDataset(train_tokens, train_tags, tag2id)
    val_dataset = NERDataset(val_tokens, val_tags, tag2id)

    model = BertForTokenClassification.from_pretrained(
        model_name,
        num_labels = len(tag2id),
        output_attentions = False,
        output_hidden_states = False
    )

    training_args = TrainingArguments(
        output_dir=f'/content/drive/MyDrive/4 –∫—É—Ä—Å/DL_NLP/output_dir/{to_save}',
        evaluation_strategy='epoch',
        learning_rate=1e-4,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        num_train_epochs=20,
        weight_decay=0.01,
        save_strategy='no'  # no intermediate checkpoints; only the final model is saved below
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    trainer.save_model(f'/content/drive/MyDrive/4 –∫—É—Ä—Å/DL_NLP/{to_save}')

In [None]:
# Augmentation files to train with, one model per entry. The position in this
# list is the numeric suffix of the saved model name; None means no augmentation.
aug_datapaths = [
    None, #rudr-bert0
    '/content/drive/MyDrive/4 –∫—É—Ä—Å/DL_NLP/augmentations/augmented_synonyms_bio.csv', #rudr-bert1
    '/content/drive/MyDrive/4 –∫—É—Ä—Å/DL_NLP/augmentations/augmented_bert_new_bio.csv', #rudr-bert2
    '/content/drive/MyDrive/4 –∫—É—Ä—Å/DL_NLP/augmentations/augmented_umls_bio.csv' #rudr-bert3
]

In [None]:
# Fine-tune one model per augmentation setting; the list index is appended to
# the checkpoint name to form the name the model is saved under.
for i, aug_path in enumerate(aug_datapaths):
    to_save = f'{model_name}{i}'
    train_model(model_name, aug_path, train_bio_tokens, train_bio_tags, val_bio_tokens, val_bio_tags, to_save)

### Test

In [14]:
# Word-piece tokens and aligned tags for the test split (each sequence is wrapped
# in [CLS]/[SEP]); the evaluation cells below use test_tags as the reference.
test_tokens, test_tags = get_tokens_pairs(test_bio_tokens, test_bio_tags)

In [16]:
# Model trained without augmentations (rudr-bert0).
classifier = pipeline('ner', model=f'/content/drive/MyDrive/4 –∫—É—Ä—Å/DL_NLP/cimm-kzn/rudr-bert0')

# Predicted tag for every item the pipeline returns; the class id is the part
# of the entity name after the last underscore.
# NOTE(review): assumes the pipeline splits the space-joined text into the same
# word-pieces as get_tokens_pairs, so predictions line up with test_tags.
pred_labels = [
    [id2tag[int(item['entity'].split('_')[-1])]
     for item in classifier(' '.join(test_bio_tokens.iloc[pos]))]
    for pos in tqdm(range(len(test_bio_tokens)))
]

# Reference tags without the leading [CLS] and trailing [SEP] positions.
true_labels = [test_tags[pos][1:-1] for pos in tqdm(range(len(test_tokens)))]

test_score = f1_score(true_labels, pred_labels)
print('f1_score =', test_score)

res_dict = classification_report(true_labels, pred_labels, output_dict=True)
pd.DataFrame(res_dict).transpose()

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 502/502 [01:29<00:00,  5.58it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 502/502 [00:00<00:00, 437378.61it/s]


f1_score = 0.7378640776699029


Unnamed: 0,precision,recall,f1-score,support
ADR,0.473988,0.525641,0.49848,156.0
DI,0.59375,0.741135,0.659306,282.0
Drugclass,0.921875,0.907692,0.914729,65.0
Drugform,0.933702,0.982558,0.957507,172.0
Drugname,0.878924,0.94686,0.911628,207.0
Finding,0.212121,0.142857,0.170732,49.0
micro avg,0.703704,0.77551,0.737864,931.0
macro avg,0.66906,0.707791,0.685397,931.0
weighted avg,0.702717,0.77551,0.73567,931.0


In [19]:
# Model trained with the synonym augmentations (rudr-bert1).
classifier = pipeline('ner', model=f'/content/drive/MyDrive/4 –∫—É—Ä—Å/DL_NLP/cimm-kzn/rudr-bert1')

# Predicted tag for every item the pipeline returns; the class id is the part
# of the entity name after the last underscore.
# NOTE(review): assumes the pipeline splits the space-joined text into the same
# word-pieces as get_tokens_pairs, so predictions line up with test_tags.
pred_labels = [
    [id2tag[int(item['entity'].split('_')[-1])]
     for item in classifier(' '.join(test_bio_tokens.iloc[pos]))]
    for pos in tqdm(range(len(test_bio_tokens)))
]

# Reference tags without the leading [CLS] and trailing [SEP] positions.
true_labels = [test_tags[pos][1:-1] for pos in tqdm(range(len(test_tokens)))]

test_score = f1_score(true_labels, pred_labels)
print('f1_score =', test_score)

res_dict = classification_report(true_labels, pred_labels, output_dict=True)
pd.DataFrame(res_dict).transpose()

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 502/502 [01:35<00:00,  5.25it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 502/502 [00:00<00:00, 236020.69it/s]


f1_score = 0.8319907940161105


Unnamed: 0,precision,recall,f1-score,support
ADR,0.785714,0.705128,0.743243,156.0
DI,0.874016,0.787234,0.828358,282.0
Drugclass,0.961538,0.769231,0.854701,65.0
Drugform,0.987261,0.901163,0.942249,172.0
Drugname,0.933735,0.748792,0.831099,207.0
Finding,0.815789,0.632653,0.712644,49.0
micro avg,0.895911,0.776584,0.831991,931.0
macro avg,0.893009,0.757367,0.818716,931.0
weighted avg,0.896466,0.776584,0.831496,931.0


In [20]:
# Model trained with the BERT-generated augmentations (rudr-bert2).
classifier = pipeline('ner', model=f'/content/drive/MyDrive/4 –∫—É—Ä—Å/DL_NLP/cimm-kzn/rudr-bert2')

# Predicted tag for every item the pipeline returns; the class id is the part
# of the entity name after the last underscore.
# NOTE(review): assumes the pipeline splits the space-joined text into the same
# word-pieces as get_tokens_pairs, so predictions line up with test_tags.
pred_labels = [
    [id2tag[int(item['entity'].split('_')[-1])]
     for item in classifier(' '.join(test_bio_tokens.iloc[pos]))]
    for pos in tqdm(range(len(test_bio_tokens)))
]

# Reference tags without the leading [CLS] and trailing [SEP] positions.
true_labels = [test_tags[pos][1:-1] for pos in tqdm(range(len(test_tokens)))]

test_score = f1_score(true_labels, pred_labels)
print('f1_score =', test_score)

res_dict = classification_report(true_labels, pred_labels, output_dict=True)
pd.DataFrame(res_dict).transpose()

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 502/502 [01:22<00:00,  6.10it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 502/502 [00:00<00:00, 598675.18it/s]


f1_score = 0.9491162292447778


Unnamed: 0,precision,recall,f1-score,support
ADR,0.915584,0.903846,0.909677,156.0
DI,0.894198,0.929078,0.911304,282.0
Drugclass,1.0,1.0,1.0,65.0
Drugform,1.0,0.994186,0.997085,172.0
Drugname,0.975962,0.980676,0.978313,207.0
Finding,0.977778,0.897959,0.93617,49.0
micro avg,0.946581,0.951665,0.949116,931.0
macro avg,0.960587,0.950958,0.955425,931.0
weighted avg,0.947293,0.951665,0.94928,931.0


In [21]:
# Model trained with the UMLS augmentations (rudr-bert3).
classifier = pipeline('ner', model=f'/content/drive/MyDrive/4 –∫—É—Ä—Å/DL_NLP/cimm-kzn/rudr-bert3')

# Predicted tag for every item the pipeline returns; the class id is the part
# of the entity name after the last underscore.
# NOTE(review): assumes the pipeline splits the space-joined text into the same
# word-pieces as get_tokens_pairs, so predictions line up with test_tags.
pred_labels = [
    [id2tag[int(item['entity'].split('_')[-1])]
     for item in classifier(' '.join(test_bio_tokens.iloc[pos]))]
    for pos in tqdm(range(len(test_bio_tokens)))
]

# Reference tags without the leading [CLS] and trailing [SEP] positions.
true_labels = [test_tags[pos][1:-1] for pos in tqdm(range(len(test_tokens)))]

test_score = f1_score(true_labels, pred_labels)
print('f1_score =', test_score)

res_dict = classification_report(true_labels, pred_labels, output_dict=True)
pd.DataFrame(res_dict).transpose()

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 502/502 [01:22<00:00,  6.05it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 502/502 [00:00<00:00, 443365.05it/s]


f1_score = 0.7703375760929718


Unnamed: 0,precision,recall,f1-score,support
ADR,0.586826,0.628205,0.606811,156.0
DI,0.755474,0.734043,0.744604,282.0
Drugclass,0.964286,0.830769,0.892562,65.0
Drugform,0.923077,0.906977,0.914956,172.0
Drugname,0.894444,0.777778,0.832041,207.0
Finding,0.666667,0.408163,0.506329,49.0
micro avg,0.794521,0.747583,0.770338,931.0
macro avg,0.798462,0.714322,0.749551,931.0
weighted avg,0.798983,0.747583,0.770218,931.0


## Other models without augmentations

### **Rubert-base-cased** (test micro-avg F1: 0.772)

In [None]:
# Switch the notebook-level checkpoint, tokenizer and collator to RuBERT.
# This rebinds the globals that the tokenization and dataset cells below rely on.
model_name = 'DeepPavlov/rubert-base-cased'
tokenizer = BertTokenizerFast.from_pretrained(model_name, model_max_length=512)
data_collator = DataCollatorForTokenClassification(tokenizer)

tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

In [None]:
# Re-tokenize the train and test splits with the tokenizer currently bound to
# the notebook-level `tokenizer` name.
train_tokens, train_tags = get_tokens_pairs(train_bio_tokens, train_bio_tags)
test_tokens, test_tags = get_tokens_pairs(test_bio_tokens, test_bio_tags)

In [None]:
# Wrap the word-piece sequences in datasets that yield input_ids / labels tensors.
train_dataset = NERDataset(train_tokens, train_tags, tag2id)
test_dataset = NERDataset(test_tokens, test_tags, tag2id)

In [None]:
# Load the checkpoint with a token-classification head sized to the label set
# (one output per entry of tag2id).
model = BertForTokenClassification.from_pretrained(
    model_name,
    num_labels = len(tag2id),
    output_attentions = False,
    output_hidden_states = False
)

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training configuration: 20 epochs, learning rate 1e-4, batch size 4, evaluation
# after every epoch, and no checkpoints written during training (save_strategy='no').
training_args = TrainingArguments(
    output_dir='rubert-base-cased',
    evaluation_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=20,
    weight_decay=0.01,
    save_strategy='no'
)

# NOTE(review): eval_dataset is the test split, so the per-epoch metrics are
# measured on the same data as the final report (train_model uses the validation split).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.3699,0.257787,0.57265,0.647691,0.607863,0.919094
2,0.2251,0.264908,0.59061,0.675618,0.630261,0.919178
3,0.1673,0.312931,0.585727,0.722879,0.647115,0.919849
4,0.1367,0.266811,0.653409,0.741139,0.694514,0.93546
5,0.1021,0.304516,0.76392,0.736842,0.750137,0.943601
6,0.0904,0.276706,0.692857,0.729323,0.710623,0.937138
7,0.0708,0.264892,0.677143,0.763695,0.717819,0.940831
8,0.0578,0.300946,0.72004,0.767991,0.743243,0.942342
9,0.048,0.379207,0.678116,0.742213,0.708718,0.933949
10,0.0368,0.313911,0.73057,0.75725,0.743671,0.943265


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=10040, training_loss=0.07110841814057522, metrics={'train_runtime': 1170.3643, 'train_samples_per_second': 34.263, 'train_steps_per_second': 8.579, 'total_flos': 766894461962136.0, 'train_loss': 0.07110841814057522, 'epoch': 20.0})

In [None]:
# Persist the fine-tuned model to Drive; the evaluation cell below loads it from this path.
trainer.save_model('/content/drive/MyDrive/rubert-base-cased')

In [None]:
# Evaluate the saved RuBERT model on the test split.
classifier = pipeline("ner", model='/content/drive/MyDrive/rubert-base-cased')

# Predicted tag for every item the pipeline returns; the class id is the part
# of the entity name after the last underscore.
# NOTE(review): assumes the pipeline splits the space-joined text into the same
# word-pieces as get_tokens_pairs, so predictions line up with test_tags.
pred_labels = [
    [id2tag[int(item['entity'].split('_')[-1])]
     for item in classifier(' '.join(test_bio_tokens.iloc[pos]))]
    for pos in tqdm(range(len(test_bio_tokens)))
]

# Reference tags without the leading [CLS] and trailing [SEP] positions.
true_labels = [test_tags[pos][1:-1] for pos in tqdm(range(len(test_tokens)))]

res_dict = classification_report(true_labels, pred_labels, output_dict=True)

pd.DataFrame(res_dict).transpose()

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 502/502 [01:10<00:00,  7.14it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 502/502 [00:00<00:00, 434490.43it/s]


Unnamed: 0,precision,recall,f1-score,support
ADR,0.515464,0.641026,0.571429,156.0
DI,0.714829,0.666667,0.689908,282.0
Drugclass,0.9375,0.923077,0.930233,65.0
Drugform,0.933702,0.982558,0.957507,172.0
Drugname,0.925234,0.956522,0.940618,207.0
Finding,0.333333,0.22449,0.268293,49.0
micro avg,0.765016,0.779807,0.77234,931.0
macro avg,0.726677,0.73239,0.726331,931.0
weighted avg,0.764108,0.779807,0.769825,931.0


### **Bert-base-uncased** (test micro-avg F1: 0.776)

In [None]:
# Switch the notebook-level checkpoint, tokenizer and collator to deepvk/bert-base-uncased.
# This rebinds the globals that the tokenization and dataset cells below rely on.
model_name = 'deepvk/bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(model_name, model_max_length=512)
data_collator = DataCollatorForTokenClassification(tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/332 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/449k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.02M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [None]:
# Lower-case every word of the train and test splits (presumably because the
# checkpoint selected above is an uncased model).
# NOTE: this rebinds train_bio_tokens / test_bio_tokens, so any cell executed
# afterwards sees lower-cased text until the split is recomputed from bio_tokens.
train_bio_tokens = train_bio_tokens.apply(lambda text: [str.lower(word) for word in text])
test_bio_tokens = test_bio_tokens.apply(lambda text: [str.lower(word) for word in text])

In [None]:
# Re-tokenize the (now lower-cased) train and test splits with the tokenizer
# currently bound to the notebook-level `tokenizer` name.
train_tokens, train_tags = get_tokens_pairs(train_bio_tokens, train_bio_tags)
test_tokens, test_tags = get_tokens_pairs(test_bio_tokens, test_bio_tags)

In [None]:
# Wrap the word-piece sequences in datasets that yield input_ids / labels tensors.
train_dataset = NERDataset(train_tokens, train_tags, tag2id)
test_dataset = NERDataset(test_tokens, test_tags, tag2id)

In [None]:
# Load the checkpoint with a token-classification head sized to the label set
# (one output per entry of tag2id).
model = BertForTokenClassification.from_pretrained(
    model_name,
    num_labels = len(tag2id),
    output_attentions = False,
    output_hidden_states = False
)

config.json:   0%|          | 0.00/669 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/455M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at deepvk/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training configuration: 20 epochs, learning rate 1e-4, batch size 4, evaluation
# after every epoch, and no checkpoints written during training (save_strategy='no').
training_args = TrainingArguments(
    output_dir='bert-base-uncased',
    evaluation_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=20,
    weight_decay=0.01,
    save_strategy='no'
)

# NOTE(review): eval_dataset is the test split, so the per-epoch metrics are
# measured on the same data as the final report (train_model uses the validation split).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.318,0.214645,0.635202,0.693878,0.663244,0.926197
2,0.1679,0.229773,0.607843,0.699248,0.65035,0.935145
3,0.1183,0.286915,0.667317,0.734694,0.699387,0.935473
4,0.0895,0.276422,0.712667,0.743287,0.727655,0.94278
5,0.0671,0.278952,0.734486,0.77551,0.754441,0.94631
6,0.0453,0.269418,0.696884,0.792696,0.741709,0.942123
7,0.0357,0.2718,0.703812,0.773362,0.73695,0.942944
8,0.0311,0.324048,0.667288,0.769066,0.714571,0.940399
9,0.0268,0.340779,0.690058,0.760473,0.723556,0.938347
10,0.0208,0.367037,0.701857,0.771214,0.734903,0.942698


TrainOutput(global_step=10040, training_loss=0.04869768169905204, metrics={'train_runtime': 922.2381, 'train_samples_per_second': 43.481, 'train_steps_per_second': 10.887, 'total_flos': 782634053255052.0, 'train_loss': 0.04869768169905204, 'epoch': 20.0})

In [None]:
# Persist the fine-tuned model to Drive; the evaluation cell below loads it from this path.
trainer.save_model('/content/drive/MyDrive/bert-base-uncased')

In [None]:
# Evaluate the saved bert-base-uncased model on the test split.
classifier = pipeline("ner", model='/content/drive/MyDrive/bert-base-uncased')

# Predicted tag for every item the pipeline returns; the class id is the part
# of the entity name after the last underscore.
# NOTE(review): assumes the pipeline splits the space-joined text into the same
# word-pieces as get_tokens_pairs, so predictions line up with test_tags.
pred_labels = [
    [id2tag[int(item['entity'].split('_')[-1])]
     for item in classifier(' '.join(test_bio_tokens.iloc[pos]))]
    for pos in tqdm(range(len(test_bio_tokens)))
]

# Reference tags without the leading [CLS] and trailing [SEP] positions.
true_labels = [test_tags[pos][1:-1] for pos in tqdm(range(len(test_tokens)))]

res_dict = classification_report(true_labels, pred_labels, output_dict=True)

pd.DataFrame(res_dict).transpose()

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 502/502 [01:06<00:00,  7.50it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 502/502 [00:00<00:00, 312673.09it/s]


Unnamed: 0,precision,recall,f1-score,support
ADR,0.531073,0.602564,0.564565,156.0
DI,0.71831,0.723404,0.720848,282.0
Drugclass,0.983871,0.938462,0.96063,65.0
Drugform,0.922222,0.965116,0.943182,172.0
Drugname,0.920561,0.951691,0.935867,207.0
Finding,0.257143,0.183673,0.214286,49.0
micro avg,0.767857,0.785177,0.776421,931.0
macro avg,0.722197,0.727485,0.72323,931.0
weighted avg,0.763846,0.785177,0.773624,931.0


### **Multilingual cased** (test micro-avg F1: 0.755)

In [None]:
# Switch the notebook-level checkpoint, tokenizer and collator to multilingual BERT (cased).
# This rebinds the globals that the tokenization and dataset cells below rely on.
model_name = 'bert-base-multilingual-cased'
tokenizer = BertTokenizerFast.from_pretrained(model_name, model_max_length=512)
data_collator = DataCollatorForTokenClassification(tokenizer)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [None]:
# Recompute the 80/20 split from the untouched bio_tokens / bio_tags, discarding the
# lower-casing applied for the uncased model. Same seed, so the test split is the same
# set of documents. NOTE: no validation split is carved out here, so train is the full 80%.
train_bio_tokens, test_bio_tokens, train_bio_tags, test_bio_tags = train_test_split(bio_tokens, bio_tags, test_size=0.2, random_state=42)

In [None]:
# Re-tokenize the train and test splits with the tokenizer currently bound to
# the notebook-level `tokenizer` name.
train_tokens, train_tags = get_tokens_pairs(train_bio_tokens, train_bio_tags)
test_tokens, test_tags = get_tokens_pairs(test_bio_tokens, test_bio_tags)

In [None]:
# Wrap the word-piece sequences in datasets that yield input_ids / labels tensors.
train_dataset = NERDataset(train_tokens, train_tags, tag2id)
test_dataset = NERDataset(test_tokens, test_tags, tag2id)

In [None]:
# Load the checkpoint with a token-classification head sized to the label set
# (one output per entry of tag2id).
model = BertForTokenClassification.from_pretrained(
    model_name,
    num_labels = len(tag2id),
    output_attentions = False,
    output_hidden_states = False
)

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training configuration: 20 epochs, learning rate 1e-4, batch size 4, evaluation
# after every epoch, and no checkpoints written during training (save_strategy='no').
training_args = TrainingArguments(
    output_dir='bert-base-multilingual-cased',
    evaluation_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=20,
    weight_decay=0.01,
    save_strategy='no'
)

# NOTE(review): eval_dataset is the test split, so the per-epoch metrics are
# measured on the same data as the final report (train_model uses the validation split).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.4755,0.301963,0.620172,0.620838,0.620505,0.90339
2,0.3004,0.319311,0.50282,0.670247,0.574586,0.899936
3,0.2438,0.344599,0.497026,0.628357,0.555028,0.906493
4,0.1956,0.315739,0.613412,0.668099,0.639589,0.916213
5,0.1665,0.327508,0.634409,0.6971,0.664278,0.921892
6,0.135,0.384424,0.648148,0.714286,0.679612,0.920019
7,0.1127,0.341017,0.628627,0.698174,0.661578,0.922302
8,0.1045,0.388768,0.682702,0.716434,0.699161,0.923298
9,0.0774,0.393647,0.669661,0.72073,0.694258,0.921424
10,0.0694,0.381185,0.719136,0.750806,0.73463,0.930617


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=10040, training_loss=0.10479781717151047, metrics={'train_runtime': 1225.4409, 'train_samples_per_second': 32.723, 'train_steps_per_second': 8.193, 'total_flos': 1101133395526620.0, 'train_loss': 0.10479781717151047, 'epoch': 20.0})

In [None]:
# Persist the fine-tuned model to Drive; the evaluation cell below loads it from this path.
trainer.save_model('/content/drive/MyDrive/bert-base-multilingual-cased')

In [None]:
# Evaluate the saved multilingual cased model on the test split.
classifier = pipeline("ner", model='/content/drive/MyDrive/bert-base-multilingual-cased')

# Predicted tag for every item the pipeline returns; the class id is the part
# of the entity name after the last underscore.
# NOTE(review): assumes the pipeline splits the space-joined text into the same
# word-pieces as get_tokens_pairs, so predictions line up with test_tags.
pred_labels = [
    [id2tag[int(item['entity'].split('_')[-1])]
     for item in classifier(' '.join(test_bio_tokens.iloc[pos]))]
    for pos in tqdm(range(len(test_bio_tokens)))
]

# Reference tags without the leading [CLS] and trailing [SEP] positions.
true_labels = [test_tags[pos][1:-1] for pos in tqdm(range(len(test_tokens)))]

res_dict = classification_report(true_labels, pred_labels, output_dict=True)

pd.DataFrame(res_dict).transpose()

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 502/502 [01:21<00:00,  6.18it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 502/502 [00:00<00:00, 662536.38it/s]


Unnamed: 0,precision,recall,f1-score,support
ADR,0.505882,0.551282,0.527607,156.0
DI,0.666667,0.70922,0.687285,282.0
Drugclass,0.938462,0.938462,0.938462,65.0
Drugform,0.942857,0.959302,0.951009,172.0
Drugname,0.908257,0.956522,0.931765,207.0
Finding,0.208333,0.204082,0.206186,49.0
micro avg,0.737705,0.773362,0.755113,931.0
macro avg,0.695076,0.719812,0.707052,931.0
weighted avg,0.73932,0.773362,0.755825,931.0


### **Multilingual uncased** (test micro-avg F1: 0.749)

In [None]:
# Switch the notebook-level checkpoint, tokenizer and collator to multilingual BERT (uncased).
# This rebinds the globals that the tokenization and dataset cells below rely on.
model_name = 'bert-base-multilingual-uncased'
tokenizer = BertTokenizerFast.from_pretrained(model_name, model_max_length=512)
data_collator = DataCollatorForTokenClassification(tokenizer)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [None]:
# Lower-case every word of the train and test splits (presumably because the
# checkpoint selected above is an uncased model).
# NOTE: this rebinds train_bio_tokens / test_bio_tokens, so any cell executed
# afterwards sees lower-cased text until the split is recomputed from bio_tokens.
train_bio_tokens = train_bio_tokens.apply(lambda text: [str.lower(word) for word in text])
test_bio_tokens = test_bio_tokens.apply(lambda text: [str.lower(word) for word in text])

In [None]:
# Re-tokenize the (now lower-cased) train and test splits with the tokenizer
# currently bound to the notebook-level `tokenizer` name.
train_tokens, train_tags = get_tokens_pairs(train_bio_tokens, train_bio_tags)
test_tokens, test_tags = get_tokens_pairs(test_bio_tokens, test_bio_tags)

In [None]:
# Wrap the word-piece sequences in datasets that yield input_ids / labels tensors.
train_dataset = NERDataset(train_tokens, train_tags, tag2id)
test_dataset = NERDataset(test_tokens, test_tags, tag2id)

In [None]:
# Load the checkpoint with a token-classification head sized to the label set
# (one output per entry of tag2id).
model = BertForTokenClassification.from_pretrained(
    model_name,
    num_labels = len(tag2id),
    output_attentions = False,
    output_hidden_states = False
)

model.safetensors:   0%|          | 0.00/672M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training configuration: 20 epochs, learning rate 1e-4, batch size 4, evaluation
# after every epoch, and no checkpoints written during training (save_strategy='no').
training_args = TrainingArguments(
    output_dir='bert-base-multilingual-uncased',
    evaluation_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=20,
    weight_decay=0.01,
    save_strategy='no'
)

# NOTE(review): eval_dataset is the test split, so the per-epoch metrics are
# measured on the same data as the final report (train_model uses the validation split).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.4494,0.283712,0.58977,0.606874,0.5982,0.90448
2,0.2683,0.303209,0.57156,0.664876,0.614697,0.903702
3,0.2026,0.288385,0.630069,0.684211,0.656025,0.922962
4,0.157,0.325723,0.603914,0.696026,0.646707,0.919553
5,0.1356,0.403227,0.667992,0.721805,0.693856,0.922483
6,0.1019,0.301879,0.602888,0.717508,0.655223,0.924457
7,0.0853,0.402646,0.587727,0.730397,0.651341,0.915725
8,0.0623,0.384781,0.682683,0.732546,0.706736,0.929422
9,0.0657,0.428424,0.693648,0.727175,0.710016,0.928225
10,0.0477,0.455774,0.670051,0.708915,0.688935,0.928465


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=10040, training_loss=0.08490340596962853, metrics={'train_runtime': 1167.906, 'train_samples_per_second': 34.335, 'train_steps_per_second': 8.597, 'total_flos': 1080203077096764.0, 'train_loss': 0.08490340596962853, 'epoch': 20.0})

In [None]:
# Persist the fine-tuned model to Drive; the evaluation cell below loads it from this path.
trainer.save_model('/content/drive/MyDrive/bert-base-multilingual-uncased')

In [None]:
# Evaluate the saved multilingual uncased model on the test split.
classifier = pipeline("ner", model='/content/drive/MyDrive/bert-base-multilingual-uncased')

# Predicted tag for every item the pipeline returns; the class id is the part
# of the entity name after the last underscore.
# NOTE(review): assumes the pipeline splits the space-joined text into the same
# word-pieces as get_tokens_pairs, so predictions line up with test_tags.
pred_labels = [
    [id2tag[int(item['entity'].split('_')[-1])]
     for item in classifier(' '.join(test_bio_tokens.iloc[pos]))]
    for pos in tqdm(range(len(test_bio_tokens)))
]

# Reference tags without the leading [CLS] and trailing [SEP] positions.
true_labels = [test_tags[pos][1:-1] for pos in tqdm(range(len(test_tokens)))]

res_dict = classification_report(true_labels, pred_labels, output_dict=True)

pd.DataFrame(res_dict).transpose()

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 502/502 [01:27<00:00,  5.72it/s]
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 502/502 [00:00<00:00, 554089.63it/s]


Unnamed: 0,precision,recall,f1-score,support
ADR,0.508876,0.551282,0.529231,156.0
DI,0.65529,0.680851,0.667826,282.0
Drugclass,0.936508,0.907692,0.921875,65.0
Drugform,0.932584,0.965116,0.948571,172.0
Drugname,0.895928,0.956522,0.925234,207.0
Finding,0.230769,0.183673,0.204545,49.0
micro avg,0.737279,0.762621,0.749736,931.0
macro avg,0.693326,0.707523,0.699547,931.0
weighted avg,0.73278,0.762621,0.747056,931.0
