## Evaluate models based on nearest-neighbor search

In [1]:
import os
import sys
import random

p = os.path.abspath('../')
sys.path.insert(1, p)

os.environ["TOKENIZERS_PARALLELISM"] = "true"
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"

import transformers
transformers.logging.set_verbosity_error()

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm

import matplotlib.pyplot as plt

from sklearn.neighbors import NearestNeighbors

from datasets import Dataset, load_dataset, Split

import seaborn as sns
sns.set_theme(color_codes=True)

from transformers import AutoModel, AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, Trainer, TrainingArguments
import evaluate

from src.contrastive_transformers.collators import TextCollator
from src.contrastive_transformers.datasets import AutoAugmentDataset
from src.contrastive_transformers.trainers import ContrastiveTrainer
from src.contrastive_transformers.losses import SupConLoss

from src.utils.utils import * 

# Reproducibility: feed the same seed to Python's, NumPy's and PyTorch's
# (CPU and CUDA) random number generators.
seed = 7631
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(seed)
torch.backends.cudnn.deterministic = True

# Number of texts sent through the model per forward pass by the encoding helpers.
batch_size = 64

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# IPython autoreload in mode 2: imported modules are reloaded before each cell
# executes, so edits to them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Datasets pulled from the Hugging Face hub.
smm4h20 = load_dataset('KevinSpaghetti/smm4h20')
cadec = load_dataset('KevinSpaghetti/cadec')

# Locally processed MedDRA tables.
meddra_ontology = pd.read_csv('./data/processed/meddra_ontology.csv')

llt_pt_examples = pd.read_csv('./data/processed/llt_pt_examples.csv')
all_llts = llt_pt_examples['term_LLT'].unique()

# Maps each `term_LLT` value to the `term_PT` value on the same row
# (for a repeated LLT the last row wins).
llt_to_pt_mapping = dict(llt_pt_examples[['term_LLT', 'term_PT']].itertuples(index=False))

# `all_pts` is bound exactly once, to the dataset of (term, label) rows that
# the rest of the notebook indexes by column name. It is no longer first bound
# to an array of ontology terms and then silently rebound to a different type.
all_pts = load_dataset('KevinSpaghetti/all_pts', split=Split.ALL)
pt_vocab = dict(zip(all_pts['term'], all_pts['label']))        # term  -> label
index_to_label = dict(zip(all_pts['label'], all_pts['term']))  # label -> term
print(len(pt_vocab))

Using custom data configuration KevinSpaghetti--smm4h20-9b0cef5ffb10261f
Found cached dataset parquet (/home/kevinds/.cache/huggingface/datasets/KevinSpaghetti___parquet/KevinSpaghetti--smm4h20-9b0cef5ffb10261f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

Using custom data configuration KevinSpaghetti--cadec-d97aca76af8be810
Found cached dataset parquet (/home/kevinds/.cache/huggingface/datasets/KevinSpaghetti___parquet/KevinSpaghetti--cadec-d97aca76af8be810/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

Using custom data configuration KevinSpaghetti--all_pts-28f171117b934745
Found cached dataset parquet (/home/kevinds/.cache/huggingface/datasets/KevinSpaghetti___parquet/KevinSpaghetti--all_pts-28f171117b934745/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


24571


In [65]:
# Pretrained encoder under evaluation; downloaded files are cached in ./cache/.
model = AutoModel.from_pretrained('bert-base-uncased', cache_dir='./cache/')

In [66]:
# Switch the model to evaluation mode and move it to the selected device.
model.eval()
model.to(device)
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)


def _pt_term_to_label(example):
    """Return the `pt_vocab` label for the example's `term_PT` value."""
    return {'label': pt_vocab[example['term_PT']]}


# Replace the `term_PT` column with its `label` in both CADEC splits.
test = cadec['test'].map(_pt_term_to_label, remove_columns=['term_PT'])
train = cadec['train'].map(_pt_term_to_label, remove_columns=['term_PT'])

# Partition the test set by comparing its labels with the training labels.
seen, unseen = get_seen_unseen_split(train, test, label_col='label')

# Filled in by the evaluation cell, one entry per evaluated split.
results = {}

Loading cached processed dataset at /home/kevinds/.cache/huggingface/datasets/KevinSpaghetti___parquet/KevinSpaghetti--cadec-d97aca76af8be810/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-f7fb3b2ba43ca60d.arrow
Loading cached processed dataset at /home/kevinds/.cache/huggingface/datasets/KevinSpaghetti___parquet/KevinSpaghetti--cadec-d97aca76af8be810/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-f3c1c5f1a96d4bee.arrow
Loading cached processed dataset at /home/kevinds/.cache/huggingface/datasets/KevinSpaghetti___parquet/KevinSpaghetti--cadec-d97aca76af8be810/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-4012229c18c65204.arrow
Loading cached processed dataset at /home/kevinds/.cache/huggingface/datasets/KevinSpaghetti___parquet/KevinSpaghetti--cadec-d97aca76af8be810/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-2d63e3a74735e8a8.arrow


In [67]:
def encode_classes(model, classes, num_classes, label_col='label', encode_col='term'):
    """Embed every class text and accumulate the embeddings per class label.

    Relies on the notebook-level `tokenizer`, `batch_size` and `device`.

    Args:
        model: encoder exposing `config.hidden_size` and whose output has a
            `pooler_output` attribute.
        classes: sliceable collection where `classes[a:b][column]` yields the
            column values of that slice (the notebook passes the `all_pts`
            dataset).
        num_classes: number of rows in the returned matrix; every value in
            `label_col` is used as a row index into it.
        label_col: column holding the integer class label of each text.
        encode_col: column holding the text to embed.

    Returns:
        A CPU tensor of shape `(num_classes, model.config.hidden_size)` whose
        row `i` is the sum of the pooled embeddings of all texts labelled `i`
        (zeros for labels that never occur).
    """
    class_encodings = torch.zeros((num_classes, model.config.hidden_size), device=device)
    with torch.no_grad():
        # range() has a length, so tqdm infers the true number of batches. The
        # previous floor-divided `total` was one short whenever the last batch
        # was partial.
        for batch_start in tqdm(range(0, len(classes), batch_size)):
            batch_end = min(len(classes), batch_start + batch_size)
            batch = classes[batch_start:batch_end]
            # truncation=True keeps every row at exactly max_length tokens.
            # Without it a single over-long text makes the batch ragged and the
            # tensor conversion fails.
            model_inputs = tokenizer(batch[encode_col],
                                     padding='max_length',
                                     truncation=True,
                                     max_length=32,
                                     return_tensors='pt').to(device)
            # Only the pooled output is used, so hidden states are not requested.
            embedding = model(**model_inputs).pooler_output
            labels = torch.as_tensor(list(batch[label_col]), dtype=torch.long, device=device)
            # index_add_ accumulates even when a label repeats inside one batch.
            # `tensor[labels] += x` keeps only one contribution per repeated label.
            class_encodings.index_add_(0, labels, embedding.to(class_encodings.dtype))

    # Synchronous copy: the result is consumed immediately by the caller, so an
    # unsynchronised non_blocking device-to-host transfer would be unsafe.
    return class_encodings.cpu()

In [68]:
def encode_samples(model, samples):
    """Embed a list of texts with the model's pooled output.

    Relies on the notebook-level `tokenizer`, `batch_size` and `device`.

    Args:
        model: encoder exposing `config.hidden_size` and whose output has a
            `pooler_output` attribute.
        samples: sliceable sequence of strings; `samples[a:b]` is passed to the
            tokenizer as one batch.

    Returns:
        A tensor of shape `(len(samples), model.config.hidden_size)` living on
        `device`; row `i` is the pooled embedding of `samples[i]`.
    """
    sample_encodings = torch.zeros((len(samples), model.config.hidden_size), device=device)
    with torch.no_grad():
        # tqdm infers the number of batches from len(range(...)). The previous
        # floor-divided `total` under-counted when the last batch was partial.
        for batch_start in tqdm(range(0, len(samples), batch_size)):
            batch_end = min(len(samples), batch_start + batch_size)
            # truncation=True caps over-long texts at max_length so the padded
            # batch is always rectangular.
            model_inputs = tokenizer(samples[batch_start:batch_end],
                                     padding='max_length',
                                     truncation=True,
                                     max_length=32,
                                     return_tensors='pt').to(device)
            # Only the pooled output is used, so hidden states are not requested.
            sample_encodings[batch_start:batch_end] = model(**model_inputs).pooler_output
    return sample_encodings

In [69]:
# One embedding row per label in `pt_vocab`, built from the `term` column of `all_pts`.
class_encodings = encode_classes(
    model, all_pts, len(pt_vocab), label_col='label', encode_col='term'
)

384it [00:10, 35.23it/s]                                        


In [70]:
accuracyk = evaluate.load('KevinSpaghetti/accuracyk')

# Brute-force cosine index over the class embeddings. Row i of
# `class_encodings` belongs to label i, so the neighbour indices returned by
# kneighbors() can be compared directly with the reference labels.
embedding_index = NearestNeighbors(n_neighbors=5,
                                   metric='cosine',
                                   algorithm='brute',
                                   n_jobs=-1).fit(class_encodings.to('cpu'))

# Every split goes through the same encode -> retrieve -> score steps, so the
# protocol is written once and applied to each named split in turn.
evaluation_splits = {'complete': test, 'seen': seen, 'unseen': unseen}
for split_name, split in evaluation_splits.items():
    encoded_samples = encode_samples(model, split['ade'])
    _, predictions = embedding_index.kneighbors(encoded_samples.cpu())
    references = split['label']
    results[split_name] = {
        # top-1 uses only the closest class, kept as a single-column 2-D array
        'top1': accuracyk.compute(predictions=predictions[:, :1], references=references)['accuracy'],
        # top-5 uses all five retrieved neighbours
        'top5': accuracyk.compute(predictions=predictions, references=references)['accuracy'],
    }

18it [00:00, 39.30it/s]                                         
17it [00:00, 39.30it/s]                                         
1it [00:00, 83.58it/s]


In [71]:
# Display the top-1 / top-5 accuracy stored for the complete, seen and unseen test splits.
results

{'complete': {'top1': 0.1462979482604817, 'top5': 0.14986619090098127},
 'seen': {'top1': 0.14681440443213298, 'top5': 0.1505078485687904},
 'unseen': {'top1': 0.13157894736842105, 'top5': 0.13157894736842105}}