In [None]:
# this notebook only predicts the CUI based on BERT model ie only if there are multiple candidates for a mention.
# Steps to run this notebook.
# 0. Switch to GPU first
# 1. Mount Gdrive with model and meta files using the GUI on left plane.
# 2. Upload a file "data_for_BERT.csv" that will be used by the BERT
# 3. RUN ALL cells
# 4. File called "file_with_prediction.csv" will be generated which can be used to verify\analyse result

In [1]:
%%capture
!pip install transformers

In [2]:
from google.colab import drive
drive.mount('/content/drive')
!cp "/content/drive/My Drive/meta.bin" meta.bin
!cp "/content/drive/My Drive/model.bin" model.bin

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import torch, joblib
model = torch.load("model.bin")
meta = joblib.load("meta.bin")
enc_label = meta['enc_label']

In [6]:
import tensorflow as tf
import torch
from transformers import BertTokenizer
from tqdm import tqdm
from sklearn import preprocessing

if torch.cuda.is_available():    
    device = torch.device("cuda")
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
    
class config:
    TRAIN_PATH = "./train"
    MAX_LEN = 64
    TOKENIZER = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    BATCH_SIZE = 32
    EPOCHS = 10

In [7]:
import numpy as np
import pandas as pd
from tqdm import tqdm
test_df = pd.read_csv("data_for_BERT.csv")

In [8]:
for i in test_df.index:
    try:
        test_df.at[i, 'prediction'] = eval(test_df['prediction'].loc[i])
    except:
        print("C")

In [9]:
input_ids = []
input_target_positions = []
labels = []
test_candidates = []
enc_label = enc_label
tokenizer = config.TOKENIZER
mask_token = tokenizer.tokenize("[MASK]")

for index in tqdm(test_df.index):
    row = test_df.loc[index]
    sentence = row['original_sentence']
    st = row['position_start']
    end = row['position_end']
    candidates = row['prediction']

    tokenized_pre = tokenizer.tokenize(sentence[:st])
    target_position = len(tokenized_pre)
    if target_position > config.MAX_LEN//2:
        tokenized_pre = tokenized_pre[-config.MAX_LEN//2:]
        target_position = len(tokenized_pre)

    tokenized_post = tokenizer.tokenize(sentence[end+1:])
    tokenized = tokenized_pre + (mask_token) + tokenized_post
    ids = tokenizer.convert_tokens_to_ids(tokenized)
    ids = ids[(len(ids)-config.MAX_LEN)//2+1 : (len(ids)+config.MAX_LEN)//2-1]
    
    input_ids.append(ids)
    input_target_positions.append(target_position)
    labels.append(row['cui'])
    test_candidates.append([le_dict.get(_, enc_label.transform(['CUI-less'])[0]) for _ in candidates])

labels = [le_dict.get(_, enc_label.transform(['CUI-less'])[0]) for _ in labels] #enc_label.transform(labels)


100%|██████████| 343/343 [00:06<00:00, 52.44it/s]


In [10]:
le_dict.get('C0019699', enc_label.transform(['CUI-less'])[0])

940

In [11]:
attention_masks = []
for sent in input_ids:
    att_mask = [int(token_id > 0) for token_id in sent]
    attention_masks.append(att_mask)

In [13]:
max_len = max([len(_) for _ in test_candidates])
for i in range(len(test_candidates)):
    test_candidates[i] = test_candidates[i] + [enc_label.transform(['CUI-less'])[0]] * (max_len - len(test_candidates[i]))
max_len = max([len(_) for _ in test_candidates])

25 [[940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940], [940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940], [206, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940], [315, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940], [940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940], [47, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940], [940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940], [123, 779, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 940, 94

In [15]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
input_ids = torch.tensor(input_ids)
attention_masks = torch.tensor(attention_masks)
labels = torch.tensor(labels)
input_target_positions = torch.tensor(input_target_positions)
test_candidates = torch.tensor(test_candidates)

test_data = TensorDataset(input_ids, attention_masks, labels, input_target_positions, test_candidates)
test_sampler = SequentialSampler(test_data)
prediction_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=config.BATCH_SIZE)

In [16]:
predictions , true_labels = [], []
model.eval()
for batch in prediction_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels, b_pos, b_candidates = batch
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    logits = np.argsort(logits, axis=1)
    preds = []
    for i in range(len(logits)):
        for _ in logits[i]:
            if _ in b_candidates[i]:
                pred = _
                break
            else:
                pred = logits[i][0]
        preds.append(pred)
    label_ids = b_labels.to('cpu').numpy()
    predictions.append(preds)
    true_labels.append(label_ids)

DONE.


In [17]:
from sklearn.metrics import accuracy_score
flat_predictions = [item for sublist in predictions for item in sublist]
flat_true_labels = [item for sublist in true_labels for item in sublist]

print("Accuracy: ", accuracy_score(flat_true_labels, flat_predictions))

Accuracy:  0.5889212827988338


In [18]:
test_df['BERT_prediction'] = enc_label.inverse_transform(flat_predictions)

In [24]:
test_df.to_csv("file_with_prediction.csv")
!cp file_with_prediction.csv "/content/drive/My Drive/file_with_prediction.csv"