In [1]:
!pip install transformers -q

In [2]:
!pip install seqeval -q

In [3]:
import json
import torch
import torch.nn.functional as F
from torch import nn
from transformers import (
    BertTokenizerFast,
    BertForTokenClassification,
    DataCollatorForTokenClassification
)
from torch.utils.data import DataLoader
from seqeval.metrics import f1_score, classification_report
from sklearn.metrics import f1_score as f1_sklearn, classification_report as classification_report_sklearn
from tqdm import tqdm

import pandas as pd
pd.set_option('max_colwidth', None)

import os

2024-03-24 16:25:27.451617: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-24 16:25:27.451739: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-24 16:25:27.608414: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
import warnings
warnings.filterwarnings("ignore")

## Get sentence classes

In [5]:
# !git init !git pull https://github.com/google-research/bert.git
# !git clone https://github.com/Andoree/med_project.git
# !cp med_project/multilabel_code/bert_preprocessing.py ./
# !cp med_project/multilabel_code/multilabel_bert.py ./

In [6]:
# !python /content/med_project/bert_multilabel/formatting/otzovik_reviews_formatting.py --reviews_dir="/content/drive/MyDrive/DL NLP/Project/annotation" --output_dir="/content/drive/MyDrive/DL NLP/Project/sentence_data_splitted" --n_splits=1
# !ls /content/data/rudrec_annotated/sentences

In [7]:
# pd.read_csv('/content/drive/MyDrive/DL NLP/Project/sentence_data_splitted/train.csv')

## Get tokens

In [9]:
sentence_datapath = '/kaggle/input/dl-project/sentence_data_splitted'
bio_datapath = '/kaggle/input/dl-project/BIO_data.csv'

In [13]:
bio_data = pd.read_csv(bio_datapath, sep='\t')
sent_train = pd.read_csv(f'{sentence_datapath}/train.csv')
sent_train['sentence_id'] = sent_train['sentence_id'].apply(lambda idx: idx-1)
sent_val = pd.read_csv(f'{sentence_datapath}/dev.csv')
sent_val['sentence_id'] = sent_val['sentence_id'].apply(lambda idx: idx-1)
sent_test = pd.read_csv(f'{sentence_datapath}/test.csv')
sent_test['sentence_id'] = sent_test['sentence_id'].apply(lambda idx: idx-1)

In [15]:
sent_train['id'] = sent_train.apply(lambda row: f"{row['review_id']}.tsv_{row['sentence_id']}", axis=1)
sent_val['id'] = sent_val.apply(lambda row: f"{row['review_id']}.tsv_{row['sentence_id']}", axis=1)
sent_test['id'] = sent_test.apply(lambda row: f"{row['review_id']}.tsv_{row['sentence_id']}", axis=1)

In [16]:
tags = bio_data['bio_tag'].tolist()
clean_tags = {}

for tag in tags:
    if tag.split('-')[-1] not in clean_tags:
        clean_tags[tag.split('-')[-1]] = 0
    clean_tags[tag.split('-')[-1]] += 1

In [17]:
clean_tags

{'O': 37481,
 'ADR': 1470,
 'Drugform': 840,
 'DI': 2286,
 'Drugname': 1155,
 'Drugclass': 329,
 'Finding': 350}

### Tokens preprocessing

In [20]:
train_bio = bio_data[bio_data.id.isin(sent_train.id)]
train_bio = train_bio.groupby('id')[['bio_token', 'bio_tag']].agg(list).reset_index()

val_bio = bio_data[bio_data.id.isin(sent_val.id)]
val_bio = val_bio.groupby('id')[['bio_token', 'bio_tag']].agg(list).reset_index()

test_bio = bio_data[bio_data.id.isin(sent_test.id)]
test_bio = test_bio.groupby('id')[['bio_token', 'bio_tag']].agg(list).reset_index()

In [22]:
train_bio = train_bio.merge(sent_train, on='id', how='inner')
val_bio = val_bio.merge(sent_val, on='id', how='inner')
test_bio = test_bio.merge(sent_test, on='id', how='inner')

In [36]:
train_bio.shape, val_bio.shape, test_bio.shape

((1322, 12), (175, 12), (170, 12))

In [38]:
class NERDataset(torch.utils.data.Dataset):
    def __init__(self, tokens, labels, tokenizer, sent_labels, label2id):
        self.tokens = tokens
        self.labels = labels
        self.sent_labels = sent_labels
        self.label2id = label2id
        self.tokenizer = tokenizer

    def prepare_data(self, tokens, labels):
        tokens = torch.tensor([self.tokenizer.convert_tokens_to_ids(text) for text in tokens])
        labels = torch.tensor([self.label2id[l] for l in labels])
        attention_mask = torch.tensor([0 for _ in labels])

        return tokens, labels

    def __getitem__(self, idx):
        tokens, labels = self.prepare_data(self.tokens[idx], self.labels[idx])
        return {'input_ids': tokens, 'labels': labels, 'sent_label': self.sent_labels[idx]}

    def __len__(self):
        return len(self.tokens)

In [39]:
def get_tokens_pairs(text, tags, tokenizer, to_pad=512):
    '''
    Функция делит слова на токены, подходящие модели,
    и изменяет набор тэгов, чтобы он соответствовал набору токенов
    '''

    all_tokens = ['[CLS]']
    all_labels = ['O']

    for (word, label) in zip(text, tags):
        tokens = tokenizer.tokenize(word)

        all_tokens.extend(tokens)

        if label.startswith('B'):
            all_labels.append(label)
            all_labels.extend([label.replace('B', 'I')]*(len(tokens)-1))
        else:
            all_labels.extend([label]*len(tokens))

    all_tokens.append('[SEP]')
    all_labels.append('O')

    return all_tokens[:512], all_labels[:512]

In [40]:
import os

file_path = '/kaggle/input/dl-project/'
label2id_file = 'label2id.json'
id2label_file = 'id2label.json'
if not os.path.exists(os.path.join(file_path, label2id_file)):
    tag_values = set(train_bio['bio_tag'].explode())
    label2id = {tag: i for i, tag in enumerate(tag_values)}
    id2label = {i: tag for tag, i in label2id.items()}

    with open(os.path.join(file_path, label2id_file), 'w') as f:
        json.dump(label2id, f)

    with open(os.path.join(file_path, id2label_file), 'w') as f:
        json.dump(id2label, f)

else:
    with open(os.path.join(file_path, label2id_file), 'r') as f:
        label2id = json.loads(f.read())
        label2id = {tag: int(idx) for tag, idx in label2id.items()}

    with open(os.path.join(file_path, id2label_file), 'r') as f:
        id2label = json.loads(f.read())
        id2label = {int(idx): tag for idx, tag in id2label.items()}

### Model class

In [47]:
class Ner2SentClassification(nn.Module):

    def __init__(self, bert, num_classes):
        super().__init__()
        self.bert = bert
        self.classification_head = nn.Sequential(
            nn.Linear(in_features=768, out_features=num_classes),
            nn.Sigmoid()
        )

    def forward(self, tokens):
        out = self.bert(**tokens)
        logits = out.logits

        hs = out.hidden_states[-1]
        hs_mean = hs.mean(axis=1)
        sent_class = self.classification_head(hs_mean)

        return logits, sent_class

### Model training

In [54]:
def train_model(model_name, augmentations_datapath, train_bio, val_bio):
    tokenizer = BertTokenizerFast.from_pretrained(model_name, model_max_length=512)
    data_collator = DataCollatorForTokenClassification(tokenizer)

    augs = ''
    if augmentations_datapath:
        augs = pd.read_csv(augmentations_datapath, sep='\t')
        augs_bio = augs.groupby('id')[['bio_token', 'bio_tag']].agg(list).reset_index()
        augs_bio = augs_bio.merge(sent_train, on='id', how='inner')
        train_bio = pd.concat([train_bio, augs_bio])
        augs = augmentations_datapath.split('/')[-1].split('.')[0]

    train_bio[["bio_token_full", "bio_tag_full"]] = train_bio.apply(lambda row: get_tokens_pairs(row.bio_token, row.bio_tag, tokenizer), axis='columns', result_type='expand')
    val_bio[["bio_token_full", "bio_tag_full"]] = val_bio.apply(lambda row: get_tokens_pairs(row.bio_token, row.bio_tag, tokenizer), axis='columns', result_type='expand')

    file_path = '/kaggle/input/dl-project/'
    label2id_file = 'label2id.json'
    id2label_file = 'id2label.json'
    with open(os.path.join(file_path, label2id_file), 'r') as f:
        label2id = json.loads(f.read())
        label2id = {tag: int(idx) for tag, idx in label2id.items()}

    with open(os.path.join(file_path, id2label_file), 'r') as f:
        id2label = json.loads(f.read())
        id2label = {int(idx): tag for idx, tag in id2label.items()}

    columns = ['EF', 'INF', 'ADR', 'DI', 'Finding']
    change_type_columns = {column: 'float64' for column in columns}
    train_bio = train_bio.astype(change_type_columns)
    val_bio = val_bio.astype(change_type_columns)

    train_bio['sent_labels'] = train_bio[columns].agg(list, axis=1)
    val_bio['sent_labels'] = val_bio[columns].agg(list, axis=1)

    train_dataset = NERDataset(train_bio['bio_token_full'].tolist(), train_bio['bio_tag_full'].tolist(), tokenizer, train_bio['sent_labels'].tolist(), label2id)
    val_dataset = NERDataset(val_bio['bio_token_full'], val_bio['bio_tag_full'], tokenizer, val_bio['sent_labels'], label2id)

    train_dataloader = DataLoader(train_dataset, batch_size=32, collate_fn=data_collator, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=32, collate_fn=data_collator)

    bert = BertForTokenClassification.from_pretrained(
        model_name,
        num_labels = len(label2id),
        output_attentions = False,
        output_hidden_states = True
    )
    model = Ner2SentClassification(bert = bert, num_classes = 5)

    optimizer = torch.optim.Adam([
        {"params": model.bert.parameters(), "lr": 1e-4},
    ], lr=1e-2)
    scheduler = torch.optim.lr_scheduler.PolynomialLR(optimizer, total_iters=100)

    criterion_sent = nn.BCEWithLogitsLoss()
    criterion_ner = nn.CrossEntropyLoss()
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    num_epochs = 40
    model = model.to(device)

    splitted_model_name = model_name.split('/')[-1].replace('-', '_')
    path_to_save = f'/kaggle/working/{splitted_model_name}_{augs}.pth'
    best_result = {
        'model_name': model_name,
        'path_to_save': path_to_save,
        'train_loss': 0,
        'val_loss': 0,
        'num_epochs': num_epochs ,
        'NER': {
            'epoch': 0,
            'score': 0
        },
        'Sent': {
            'epoch': 0,
            'score': 0
        }
    }

    print('Start training:', model_name)
    for epoch in tqdm(range(num_epochs)):
        model.train()
        all_train_losses = []
        for batch in train_dataloader:
            optimizer.zero_grad()

            input_values = {
                'input_ids': batch['input_ids'].to(device),
                'attention_mask': batch['attention_mask'].to(device)
            }
            labels = batch['labels'].to(device)
            sent_labels = batch['sent_label'].to(device)

            output = model(input_values)
            loss_ner = criterion_ner(
                output[0].flatten(end_dim=1),
                torch.where(input_values['attention_mask'].bool(), labels, -100).flatten(end_dim=1)
            )
            loss_sent = criterion_sent(output[1], sent_labels)
            loss = 2*loss_sent + loss_ner

            loss.backward()
            optimizer.step()

            all_train_losses.append(loss.item())

        print('train_loss:', sum(all_train_losses) / len(train_dataloader))
        best_result['train_loss'] = sum(all_train_losses) / len(train_dataloader)
        scheduler.step()

        model.eval()
        all_val_losses = []
        all_prediction = []
        all_true_answers = []
        all_true_sent = []
        all_pred_sent = []
        cnt_changes = 0
        for batch in val_dataloader:

            input_values = {
                'input_ids': batch['input_ids'].to(device),
                'attention_mask': batch['attention_mask'].to(device)
            }
            labels = batch['labels'].to(device)
            sent_labels = batch['sent_label'].to(device)

            output = model(input_values)
            loss_ner = criterion_ner(
                output[0].flatten(end_dim=1),
                torch.where(input_values['attention_mask'].bool(), labels, -100).flatten(end_dim=1)
            )
            loss_sent = criterion_sent(output[1], sent_labels)
            loss = loss_sent + loss_ner

            all_val_losses.append(loss.item())

            for sent_idx in range(len(output[0])):
                sent_true_tags = []
                sent_pred_tags = []
                for true_tag, pred_tag in zip(labels[sent_idx], output[0][sent_idx]):
                    if true_tag != -100:
                        pred_tag = torch.argmax(pred_tag).item()
                        sent_pred_tags.append(id2label[pred_tag])
                        sent_true_tags.append(id2label[true_tag.item()])

                all_prediction.append(sent_pred_tags)
                all_true_answers.append(sent_true_tags)

            sent_preds = torch.where(output[1] > 0.5, 1, 0).cpu()
            all_pred_sent.extend(sent_preds.cpu().tolist())
            all_true_sent.extend(sent_labels.cpu().tolist())

        val_results = {
            'f1_micro': f1_sklearn(all_true_sent, all_pred_sent, average='micro'),
            'f1_macro': f1_sklearn(all_true_sent, all_pred_sent, average='macro'),
            'f1_average': f1_sklearn(all_true_sent, all_pred_sent, average='samples'),
            'f1_all': f1_sklearn(all_true_sent, all_pred_sent, average=None)
        }

        ner_score = f1_score(all_true_answers, all_prediction)
        sent_score = val_results['f1_average']

        if (ner_score > best_result['NER']['score']) and (sent_score > best_result['Sent']['score']):
            best_result['NER']['score'] = ner_score
            best_result['NER']['epoch'] = epoch
            best_result['Sent']['score'] = sent_score
            best_result['Sent']['epoch'] = epoch

        elif ner_score > best_result['NER']['score']:
            best_result['NER']['score'] = ner_score
            best_result['NER']['epoch'] = epoch

        elif sent_score > best_result['Sent']['score']:
            best_result['Sent']['score'] = sent_score
            best_result['Sent']['epoch'] = epoch

        print('val_loss:', sum(all_val_losses) / len(val_dataloader))

        best_result['val_loss'] = sum(all_val_losses) / len(val_dataloader)

    report = pd.DataFrame(classification_report(all_true_answers, all_prediction, output_dict=True)).T
    report.to_csv(f'/kaggle/working/ner_{splitted_model_name}_{augs}.csv')

    report_sent = pd.DataFrame(classification_report_sklearn(all_true_sent, all_pred_sent, output_dict=True)).T
    report_sent.to_csv(f'/kaggle/working/sent_{splitted_model_name}_{augs}.csv')

    return best_result

In [None]:
model_name = 'cimm-kzn/rudr-bert'

aug_datapaths = [
    None,
    '/kaggle/input/dl-project/augmentations/augmented_synonyms_bio.csv',
    '/kaggle/input/dl-project/augmentations/augmented_bert_new_bio.csv',
    '/kaggle/input/dl-project/augmentations/augmented_umls_bio.csv'
]

results_pd = pd.DataFrame(columns=['model_name', 'augpath', 'path_to_save', 'train_loss', 'val_loss', 'num_epochs', 'ner_epoch', 'ner_score', 'sent_epoch', 'sent_score'])
i = 0
for augpath in aug_datapaths:
    i += 1
    best_result = train_model(model_name, augpath, train_bio, val_bio)
    result_to_save = {
        'model_name': model_name,
        'augpath': augpath,
        'path_to_save': best_result['path_to_save'],
        'train_loss': best_result['train_loss'],
        'val_loss': best_result['val_loss'],
        'num_epochs': 40,
        'ner_epoch': best_result['NER']['epoch'],
        'ner_score': best_result['NER']['score'],
        'sent_epoch': best_result['Sent']['epoch'],
        'sent_score': best_result['Sent']['score']
    }
    result_to_save = pd.DataFrame(result_to_save, index=[i])
    results_pd = pd.concat([results_pd, result_to_save])

    results_pd.to_csv(f'/kaggle/working/models_compare_change_val.csv')

Some weights of BertForTokenClassification were not initialized from the model checkpoint at cimm-kzn/rudr-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Start training: cimm-kzn/rudr-bert


  0%|          | 0/40 [00:00<?, ?it/s]

train_loss: 1.8252187294837756
{'model_name': 'cimm-kzn/rudr-bert', 'path_to_save': '/kaggle/working/rudr_bert_augmented_synonyms_bio.pth', 'train_loss': 1.8252187294837756, 'val_loss': 0, 'num_epochs': 40, 'NER': {'epoch': 0, 'score': 0.6350832266325224}, 'Sent': {'epoch': 0, 'score': 0.3419047619047619}}


  2%|▎         | 1/40 [00:51<33:44, 51.92s/it]

val_loss: 1.56754203637441
train_loss: 1.5307065034523988
{'model_name': 'cimm-kzn/rudr-bert', 'path_to_save': '/kaggle/working/rudr_bert_augmented_synonyms_bio.pth', 'train_loss': 1.5307065034523988, 'val_loss': 1.56754203637441, 'num_epochs': 40, 'NER': {'epoch': 1, 'score': 0.6552147239263804}, 'Sent': {'epoch': 1, 'score': 0.39523809523809517}}


  5%|▌         | 2/40 [01:43<32:46, 51.74s/it]

val_loss: 1.5519460240999858
train_loss: 1.459316074848175
{'model_name': 'cimm-kzn/rudr-bert', 'path_to_save': '/kaggle/working/rudr_bert_augmented_synonyms_bio.pth', 'train_loss': 1.459316074848175, 'val_loss': 1.5519460240999858, 'num_epochs': 40, 'NER': {'epoch': 2, 'score': 0.6948717948717947}, 'Sent': {'epoch': 2, 'score': 0.44285714285714284}}


  8%|▊         | 3/40 [02:34<31:32, 51.16s/it]

val_loss: 1.555938978989919
train_loss: 1.4182222286860149
{'model_name': 'cimm-kzn/rudr-bert', 'path_to_save': '/kaggle/working/rudr_bert_augmented_synonyms_bio.pth', 'train_loss': 1.4182222286860149, 'val_loss': 1.555938978989919, 'num_epochs': 40, 'NER': {'epoch': 2, 'score': 0.6948717948717947}, 'Sent': {'epoch': 3, 'score': 0.44476190476190464}}


 10%|█         | 4/40 [03:25<30:42, 51.18s/it]

val_loss: 1.5713372627894084
train_loss: 1.3976448682638316
{'model_name': 'cimm-kzn/rudr-bert', 'path_to_save': '/kaggle/working/rudr_bert_augmented_synonyms_bio.pth', 'train_loss': 1.3976448682638316, 'val_loss': 1.5713372627894084, 'num_epochs': 40, 'NER': {'epoch': 2, 'score': 0.6948717948717947}, 'Sent': {'epoch': 4, 'score': 0.45238095238095233}}


 12%|█▎        | 5/40 [04:15<29:40, 50.88s/it]

val_loss: 1.6417476932207744
train_loss: 1.3848321972749171


 15%|█▌        | 6/40 [05:04<28:31, 50.32s/it]

val_loss: 1.618311842282613
train_loss: 1.3706924716631572


 18%|█▊        | 7/40 [05:55<27:43, 50.40s/it]

val_loss: 1.6032909154891968
train_loss: 1.3687690710410094


 20%|██        | 8/40 [06:44<26:36, 49.89s/it]

val_loss: 1.6236187020937602
[5e-05, 0.005]
train_loss: 1.3519845773012211


 22%|██▎       | 9/40 [07:34<25:54, 50.15s/it]

val_loss: 1.6330347061157227


### Testing

In [None]:
model_name = 'cimm-kzn/rudr-bert'
splitted_model_name = model_name.split('/')[-1].replace('-', '_')
file_path = '/kaggle/input/dl-project'
label2id_file = 'label2id.json'
id2label_file = 'id2label.json'
with open(os.path.join(file_path, label2id_file), 'r') as f:
    label2id = json.loads(f.read())
    label2id = {tag: int(idx) for tag, idx in label2id.items()}

with open(os.path.join(file_path, id2label_file), 'r') as f:
    id2label = json.loads(f.read())
    id2label = {int(idx): tag for idx, tag in id2label.items()}

bert = BertForTokenClassification.from_pretrained(
        model_name,
        num_labels = len(label2id),
        output_attentions = False,
        output_hidden_states = True
    )
model = Ner2SentClassification(bert = bert, num_classes = 5)
model.load_state_dict(torch.load(f'/kaggle/working/rudr_bert_.pth'))

In [None]:
tokenizer = BertTokenizerFast.from_pretrained(model_name, model_max_length=512)
data_collator = DataCollatorForTokenClassification(tokenizer)

test_bio[["bio_token_full", "bio_tag_full"]] = test_bio.apply(lambda row: get_tokens_pairs(row.bio_token, row.bio_tag, tokenizer), axis='columns', result_type='expand')

file_path = '/kaggle/input/dl-project'
label2id_file = 'label2id.json'
id2label_file = 'id2label.json'
with open(os.path.join(file_path, label2id_file), 'r') as f:
    label2id = json.loads(f.read())
    label2id = {tag: int(idx) for tag, idx in label2id.items()}

with open(os.path.join(file_path, id2label_file), 'r') as f:
    id2label = json.loads(f.read())
    id2label = {int(idx): tag for idx, tag in id2label.items()}

columns = ['EF', 'INF', 'ADR', 'DI', 'Finding']
change_type_columns = {column: 'float64' for column in columns}
test_bio = test_bio.astype(change_type_columns)

test_bio['sent_labels'] = test_bio[columns].agg(list, axis=1)
test_dataset = NERDataset(test_bio['bio_token_full'], test_bio['bio_tag_full'], tokenizer, test_bio['sent_labels'], label2id)
test_dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=data_collator)

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

model.eval()
all_val_losses = []
all_prediction = []
all_true_answers = []
all_true_sent = []
all_pred_sent = []
for batch in test_dataloader:

    input_values = {
        'input_ids': batch['input_ids'].to(device),
        'attention_mask': batch['attention_mask'].to(device)
    }
    labels = batch['labels'].to(device)
    sent_labels = batch['sent_label'].to(device)

    output = model(input_values)

    for sent_idx in range(len(output[0])):
        sent_true_tags = []
        sent_pred_tags = []
        for true_tag, pred_tag in zip(labels[sent_idx], output[0][sent_idx]):
            if true_tag != -100:
                pred_tag = torch.argmax(pred_tag).item()
                sent_pred_tags.append(id2label[pred_tag])
                sent_true_tags.append(id2label[true_tag.item()])

        all_prediction.append(sent_pred_tags)
        all_true_answers.append(sent_true_tags)

    sent_preds = torch.where(output[1] > 0.5, 1, 0).cpu()
    all_pred_sent.extend(sent_preds.cpu().tolist())
    all_true_sent.extend(sent_labels.cpu().tolist())

val_results = {
    'f1_micro': f1_sklearn(all_true_sent, all_pred_sent, average='micro'),
    'f1_macro': f1_sklearn(all_true_sent, all_pred_sent, average='macro'),
    'f1_average': f1_sklearn(all_true_sent, all_pred_sent, average='samples'),
    'f1_all': f1_sklearn(all_true_sent, all_pred_sent, average=None)
}

ner_score = f1_score(all_true_answers, all_prediction)

In [None]:
val_results

In [None]:
ner_score

In [None]:
report = pd.DataFrame(classification_report(all_true_answers, all_prediction, output_dict=True)).T
report_sent = pd.DataFrame(classification_report_sklearn(all_true_sent, all_pred_sent, output_dict=True)).T

In [None]:
report

In [None]:
report_sent