# Introduction

In this notebook, we will explore some benchmarks for different models.

## Importing modules 

In [1]:
import numpy as np
import pandas as pd
import re
import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizer, PreTrainedTokenizerFast
from datasets import load_dataset
from typing import Any, Iterable, Callable


## Defining global variables

In [2]:
BENCHMARKS: dict[str, dict[str, Iterable[Any] | Callable]] = {}
'''
    Should be a dictionary with the following as entries:
    {
        'data': Iterable[Any],
        'evaluation': Callable[Any, PreTrainedTokenizer | PreTrainedTokenizerFast, TypeOfData]  -> This function will be called with the "data" value
    }
'''

DEVICE = 'cuda'
MODEL = 'NOVA-vision-language/GlorIA-1.3B'

## Defining auxiliary functions

In [3]:
# Soft Max
def soft_max(arr):
    """
    Return the softmax of `arr`, normalised over ALL of its elements.

    The largest value is subtracted before exponentiating. This leaves the result
    mathematically unchanged but keeps `exp()` from overflowing to `inf` (and the
    ratio from becoming `nan`) when the inputs are large, e.g. raw logits.

    Args:
        arr: tensor-like object exposing `.numel()`, `.max()`, `.exp()` and `.sum()`
            (e.g. a `torch.Tensor`).

    Returns:
        An object of the same shape as `arr` whose elements sum to 1.
        An empty input yields an empty output.
    """
    # `max()` is undefined on an empty tensor; the exponential of nothing is nothing
    if arr.numel() == 0:
        return arr.exp()
    # Computed once and reused for both numerator and denominator
    exps = (arr - arr.max()).exp()
    return exps / exps.sum()

def get_first_word(original_text: str, predicted_text: str):
    """
    Return the first word of `predicted_text` once `original_text` has been removed from it.

    Every occurrence of `original_text` is stripped out of `predicted_text`; the first run
    of word characters found in what is left is returned, or None when there is none.
    """
    continuation = predicted_text.replace(original_text, '')
    # Regex to find first word
    match = re.search(r'\b\w+\b', continuation)
    if match is None:
        return None
    return match.group()

def load_dataset_to_dataframe(*args, data_dir=None, dataset_types=('train', 'validation', 'test'), **kwargs):
    """
    Load a dataset with `load_dataset` and return its splits stacked in one DataFrame.

    Args:
        *args: positional arguments forwarded to `load_dataset`.
        data_dir: forwarded to `load_dataset` as `data_dir`.
        dataset_types: names of the splits to collect, in the order they are stacked.
            Splits that the loaded dataset does not contain are skipped.
        **kwargs: keyword arguments forwarded to `load_dataset`.

    Returns:
        The concatenation of the collected splits, with an extra 'Dataset Type'
        column holding the split each row came from. Row indices are kept as
        produced by each split (they are not reset).

    Raises:
        ValueError: if none of the requested splits exists in the loaded dataset.
    """
    ds = load_dataset(*args, data_dir=data_dir, **kwargs)
    frames = []
    for ds_type in dataset_types:
        # Not every dataset ships all three splits; skip the absent ones
        if ds_type not in ds:
            continue
        frame = ds[ds_type].to_pandas()
        frame['Dataset Type'] = ds_type
        frames.append(frame)
    if not frames:
        raise ValueError(
            f'None of the requested splits {list(dataset_types)} exist in the loaded dataset '
            f'(available: {list(ds)})'
        )
    return pd.concat(frames)

# 1. Loading the model

In [4]:
# GlorIA-1.3B: fetch the causal LM from its safetensors weights, then move it to DEVICE
model = AutoModelForCausalLM.from_pretrained(MODEL, use_safetensors=True)
model = model.to(DEVICE)
# Tokenizer published with the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# 2. Loading Benchmark Datasets


## CALAME-PT

In [6]:
# CALAME-PT: read both JSON-lines files into DataFrames, tagging each with its origin
df_handwritten = pd.read_json(
    "hf://datasets/NOVA-vision-language/calame-pt/calamept_handwritten_only.jsonl",
    lines=True,
).assign(Source='Handwritten')
df_generated = pd.read_json(
    "hf://datasets/NOVA-vision-language/calame-pt/calamept_gen_only.jsonl",
    lines=True,
).assign(Source='Generated')
# Only the three columns used by the benchmark are kept (the 'Source' tag is not among them)
calame_pt_df = pd.concat([df_handwritten, df_generated]).loc[:, ['id', 'sentence', 'last_word']]

# Defining the Benchmark function for CALAME-PT
def benchmark_calamept(
    model,
    tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
    dataset: list[dict[str, str]],
    model_kwargs = None,
    parallel = False
):
    """
    Run the CALAME-PT last-word prediction benchmark.

    For every record the model continues `sentence` by at most 5 tokens; the first word
    of that continuation is compared with `last_word`.

    Args:
        model: object exposing `generate(...)`; expected to already live on DEVICE.
        tokenizer: tokenizer matching `model`.
        dataset: records with the keys 'id', 'sentence' and 'last_word'.
        model_kwargs: keyword arguments forwarded to `model.generate`. When None, the
            greedy-decoding defaults below are used.
        parallel: when False, records are generated one at a time. When True, ALL
            sentences are sent through `generate` as a single left-padded batch
            (memory grows with the dataset size).

    Returns:
        dict with the benchmark name, 'accuracy', 'accurate_predictions',
        'wrong_predictions', the per-record 'benchmark_predictions' (keyed by record
        'id' in both modes) and the 'model' name. Counts are relative to the `dataset`
        that was passed in; an empty dataset gives an accuracy of 0.0.
    """
    if model_kwargs is None:
        model_kwargs = {
            'do_sample': False,
            'temperature': None,
            'pad_token_id': 50257, # This is the tokenizer.eos_token_id
            # 'output_scores': True,            # INCLUDE LATER to get the probability distribution and use that as a benchmark
            # 'return_dict_in_generate': True,
            'return_legacy_cache': False
        }
    # Materialised so it can be measured and, in batched mode, walked twice
    dataset = list(dataset)

    # NOTE: We could improve this benchmark by using the PROBABILITY Distribution of the actual token.
    benchmark_output = {}
    if not parallel:
        for data in tqdm.tqdm(dataset, desc='Running CALAME-PT Benchmark'):
            # Retrieving data from the dictionary
            n, predicted_text, correct_word = data['id'], data['sentence'], data['last_word']

            input_tokens = tokenizer.encode(predicted_text, return_tensors="pt").to(DEVICE)
            prediction = model.generate(
                input_tokens,
                max_length=input_tokens.size()[1] + 5,
                **model_kwargs
            )
            prediction = tokenizer.decode(prediction[0])
            predicted_word = get_first_word(predicted_text, prediction)
            benchmark_output[n] = {'text': predicted_text, 'prediction': predicted_word, 'correct_word': correct_word}
    elif dataset:
        sentences = [d['sentence'] for d in dataset]

        # A causal LM continues from the LAST position of each row, so the padding has to go
        # on the left. The tokenizer settings are restored afterwards.
        previous_padding_side = tokenizer.padding_side
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = 'left'
        try:
            encoded = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True).to(DEVICE)
        finally:
            tokenizer.padding_side = previous_padding_side

        # Every row shares this (padded) length, so the new tokens start right after it
        prompt_length = encoded['input_ids'].size()[1]
        predictions = model.generate(
            encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
            max_length=prompt_length + 5,
            **model_kwargs
        )
        for data, pred in zip(dataset, predictions):
            # Decode only the newly generated tokens of this row
            predicted_text = tokenizer.decode(pred[prompt_length:], skip_special_tokens=True, clean_up_tokenization_spaces=True)
            predicted_word = get_first_word(data['sentence'], predicted_text)
            benchmark_output[data['id']] = {'text': data['sentence'], 'prediction': predicted_word, 'correct_word': data['last_word']}

    accurate_preds = sum(ben['prediction'] == ben['correct_word'] for ben in benchmark_output.values())
    # Measured on the dataset actually evaluated, not on the global registry
    total_preds = len(dataset)
    return {
        'benchmark': 'CALAME-PT',
        'accuracy': accurate_preds / total_preds if total_preds else 0.0,
        'accurate_predictions': accurate_preds,
        'wrong_predictions': total_preds - accurate_preds,
        'benchmark_predictions': benchmark_output,
        'model': MODEL
    }

# Registering CALAME-PT in the benchmarks dictionary
BENCHMARKS.update({
    'CALAME-PT': {
        # Records handed to the evaluation function
        'data': calame_pt_df.to_dict('records'),
        # Same data, kept as a DataFrame
        'data-dataframe': calame_pt_df,
        # Should be a function that receives 3 args: model, tokenizer and dataset
        'evaluation': benchmark_calamept,
    }
})

## SUPERGLUE PT-PT

In [5]:
# Per-task registry for the SuperGLUE pt-PT benchmarks, filled in by the cells below
superglue_benchmarks = dict()

### Task 1. BoolQ

The Boolean Questions (BoolQ) task consists of determining whether the answer to a given question is true or false, based on a given passage.

In [6]:
# BoolQ pt-PT splits of PORTULAN/extraglue, stacked into a single DataFrame
superglue_boolq_df = load_dataset_to_dataframe(
    'PORTULAN/extraglue',
    data_dir='data/boolq_pt-PT',
)

def benchmark_task_boolq(
    model,
    tokenizer,
    dataset,
    model_kwargs = None
):
    """
    Run the SuperGLUE pt-PT BoolQ task.

    Each record is turned into a "passage / question / answer" prompt; the model continues
    it by at most 5 tokens and the first word of the continuation is read as the answer.

    Answer normalisation ('prediction_answer' in the output):
        '0' / '1'  -> kept as is
        'sim', 'verdade', 'verdadeiro', 'verdadeira' (any casing) -> '1'
        any other word -> '0'
        no word at all -> '-1'

    Args:
        model: object exposing `generate(...)`; expected to already live on DEVICE.
        tokenizer: tokenizer matching `model`.
        dataset: records with the keys 'idx', 'passage', 'question' and 'label'.
        model_kwargs: keyword arguments forwarded to `model.generate`. When None, the
            greedy-decoding defaults below are used.

    Returns:
        dict with the benchmark name, 'accuracy', 'accurate_predictions',
        'wrong_predictions', the list of per-record 'benchmark_predictions' and the
        'model' name. An empty dataset gives an accuracy of 0.0.
    """
    if model_kwargs is None:
        model_kwargs = {
            'do_sample': False,
            'temperature': None,
            'pad_token_id': 50257, # This is the tokenizer.eos_token_id
            # 'output_scores': True,            # INCLUDE LATER to get the probability distribution and use that as a benchmark
            # 'return_dict_in_generate': True,
            'return_legacy_cache': False
        }
    # Materialised so it can be measured after being iterated
    dataset = list(dataset)

    benchmark_output = []
    for data in tqdm.tqdm(dataset, desc='Running Superglue ptPT Benchamrk - Task BoolQ'):
        # Retrieving data from the dictionary
        idx, passage, question, correct_label = data['idx'], data['passage'], data['question'], data['label']

        # The legend matches the normalisation below ('sim' -> '1'), i.e. 1 means "true".
        # NOTE(review): assumes the dataset labels use 1 for a true answer — confirm against the dataset card.
        input_text = f'Passagem: {passage}\nPergunta: {question}\nResposta (0-Mentira, 1-Verdade):'

        input_tokens = tokenizer.encode(input_text, return_tensors="pt").to(DEVICE)
        prediction = model.generate(
            input_tokens,
            max_length=input_tokens.size()[1] + 5,
            **model_kwargs
        )
        prediction = tokenizer.decode(prediction[0])
        predicted_answer = get_first_word(input_text, prediction)
        if predicted_answer not in ('1', '0'):
            if predicted_answer is None:
                predicted_answer = '-1'
            elif predicted_answer.lower() in ('sim', 'verdade', 'verdadeiro', 'verdadeira'):
                predicted_answer = '1'
            else:
                predicted_answer = '0'
        benchmark_output.append({
            'idx': idx, 'input_text': input_text, 'prediction_answer': predicted_answer, 'correct_label': correct_label
        })

    # Predictions are strings while labels may be numbers, so both sides are compared as text.
    # The '-1' ("no answer") sentinel never counts as a hit, whatever the label is.
    accurate_preds = sum(
        ben['prediction_answer'] in ('0', '1') and ben['prediction_answer'] == str(ben['correct_label'])
        for ben in benchmark_output
    )
    total_preds = len(dataset)
    return {
        'benchmark': 'Superglue pt-PT: Task BoolQ',
        'accuracy': accurate_preds / total_preds if total_preds else 0.0,
        'accurate_predictions': accurate_preds,
        'wrong_predictions': total_preds - accurate_preds,
        'benchmark_predictions': benchmark_output,
        'model': MODEL
    }

# Registering the BoolQ task in the SuperGLUE registry
superglue_benchmarks.update({
    'BoolQ': {
        # Records handed to the evaluation function
        'data': superglue_boolq_df.to_dict('records'),
        # Same data, kept as a DataFrame
        'data-dataframe': superglue_boolq_df,
        # Should be a function that receives 3 args: model, tokenizer and dataset
        'evaluation': benchmark_task_boolq,
    }
})

In [7]:
# Pick the BoolQ entry out of the SuperGLUE registry ...
boolQ = superglue_benchmarks['BoolQ']

# ... and run its evaluation function on its own records (the result is the cell output)
boolQ['evaluation'](
    model,
    tokenizer,
    boolQ['data'],
)

Running Superglue ptPT Benchamrk - Task BoolQ:   0%|          | 0/15942 [00:00<?, ?it/s]

Running Superglue ptPT Benchamrk - Task BoolQ:   1%|          | 111/15942 [00:27<1:05:04,  4.05it/s]


KeyboardInterrupt: 

### Task 2. CB

The CommitmentBank (CB) task consists of ...

### Actual benchmark

In [None]:
# Stacking every pt-PT task directory of PORTULAN/extraglue into one Pandas DataFrame
superglue_ptpt_df = []
for task in (
    'axb_pt-PT',
    'axg_pt-PT',
    'boolq_pt-PT',      # BoolQ (Boolean Questions)
    'cb_pt-PT',         # CB (CommitmentBank)
    'copa_pt-PT',       # COPA (Choice of Plausible Alternatives)
    'mnli_matched_pt-PT',
    'mnli_mismatched_pt-PT',
    'mrpc_pt-PT',
    'multirc_pt-PT',    # MultiRC (Multi-Sentence Reading Comprehension)
    'qnli_pt-PT',
    'rte_pt-PT',
    'sst2_pt-PT',
    'stsb_pt-PT',
    'wnli_pt-PT',
):
    ds = load_dataset("PORTULAN/extraglue", data_dir=f'data/{task}', num_proc=5)
    # One frame per split the task actually has, each tagged with its split name
    tmp = []
    for dataset_type in ('train', 'validation', 'test'):
        if ds.get(dataset_type, None) is not None:
            tmp.append(ds[dataset_type].to_pandas().assign(**{'Dataset Type': dataset_type}))
    # Splits of one task are stacked under a fresh index and tagged with the task name
    superglue_ptpt_df.append(
        pd.concat(tmp).reset_index(drop=True).assign(**{'Task Name': task})
    )
superglue_ptpt_df = pd.concat(superglue_ptpt_df).reset_index(drop=True)
# 'Task Name' and 'Dataset Type' go first; the remaining columns follow in sorted order
superglue_ptpt_df = superglue_ptpt_df[
    ['Task Name', 'Dataset Type']
    + list(superglue_ptpt_df.columns.difference(['Task Name', 'Dataset Type']))
]

# Defining the Benchmark function for SuperGLUE PT-PT (placeholder: not implemented yet)
def benchmark_superglueptpt(
    model,
    tokenizer: "PreTrainedTokenizer | PreTrainedTokenizerFast",
    dataset: list[dict[str, str]],
    model_kwargs = None,
    parallel = False
):
    """
    Placeholder for the SuperGLUE pt-PT benchmark.

    The signature mirrors the other benchmark functions (model, tokenizer, dataset, plus
    optional generation kwargs and a `parallel` switch) so it can be registered in the
    benchmarks dictionary, but no evaluation is performed: every argument is ignored.

    Returns:
        None, always. Callers that collect benchmark results have to cope with that.
    """
    return None


# Registering SuperGLUE pt-PT in the benchmarks dictionary
BENCHMARKS.update({
    'SuperGLUE-PTPT': {
        # Records handed to the evaluation function
        'data': superglue_ptpt_df.to_dict('records'),
        # Same data, kept as a DataFrame
        'data-dataframe': superglue_ptpt_df,
        # Should be a function that receives 3 args: model, tokenizer and dataset
        'evaluation': benchmark_superglueptpt,
    }
})

## GLUE PT-PT

In [None]:
# Loading GLUE PTPT dataset onto a Pandas DataFrame
glue_ptpt_df = []
# Iterate through 4 different GLUE tasks (Read more here: https://openreview.net/pdf?id=rJ4km2R5t7)
for task in [
    "mrpc",     # MRPC - The Microsoft Research Paraphrase Corpus   (Similarity and Paraphrase Task)
    "stsb",     # STS-B - Semantic Textual Similarity Benchmark     (Similarity and Paraphrase Task)
    "rte",      # RTE - Recognizing Textual Entailment              (Inference Task)
    "wnli"      # WNLI - Winograd Schema Challenge                  (Inference Task)
]:
    # Same addressing scheme as the other PORTULAN/extraglue loads in this notebook:
    # one 'data/<task>_pt-PT' directory per task.
    ds = load_dataset("PORTULAN/extraglue", data_dir=f'data/{task}_pt-PT')
    # Transform datasets to pandas
    tmp = []
    for dataset_type in ['train', 'validation', 'test']:
        # A task may not ship every split; skip the absent ones
        if ds.get(dataset_type, None) is None: continue
        tmp.append(ds[dataset_type].to_pandas())
        tmp[-1]['Dataset Name'] = task
        tmp[-1]['Dataset Type'] = dataset_type
    glue_ptpt_df.append(pd.concat(tmp).reset_index(drop=True))
glue_ptpt_df = pd.concat(glue_ptpt_df).reset_index(drop=True)

# Defining the Benchmark function for GLUE PTPT
def benchmark_glueptpt(
    model,
    tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
    dataset: list[dict[str, str]],
    model_kwargs = None,
    parallel = False
):
    """
    Run a last-word prediction benchmark and report it as 'GLUE-PTPT'.

    For every record the model continues `sentence` by at most 5 tokens; the first word
    of that continuation is compared with `last_word`.

    NOTE(review): the record layout read here ('id', 'sentence', 'last_word') is the one
    of the CALAME-PT records. Whether the GLUE records provide these keys is not visible
    from this function — if they do not, the lookups below raise KeyError. TODO: replace
    with task-specific prompts and label comparison.

    Args:
        model: object exposing `generate(...)`; expected to already live on DEVICE.
        tokenizer: tokenizer matching `model`.
        dataset: records with the keys 'id', 'sentence' and 'last_word'.
        model_kwargs: keyword arguments forwarded to `model.generate`. When None, the
            greedy-decoding defaults below are used.
        parallel: when False, records are generated one at a time. When True, ALL
            sentences are sent through `generate` as a single left-padded batch
            (memory grows with the dataset size).

    Returns:
        dict with the benchmark name, 'accuracy', 'accurate_predictions',
        'wrong_predictions', the per-record 'benchmark_predictions' (keyed by record
        'id' in both modes) and the 'model' name. Counts are relative to the `dataset`
        that was passed in; an empty dataset gives an accuracy of 0.0.
    """
    if model_kwargs is None:
        model_kwargs = {
            'do_sample': False,
            'temperature': None,
            'pad_token_id': 50257, # This is the tokenizer.eos_token_id
            # 'output_scores': True,            # INCLUDE LATER to get the probability distribution and use that as a benchmark
            # 'return_dict_in_generate': True,
            'return_legacy_cache': False
        }
    # Materialised so it can be measured and, in batched mode, walked twice
    dataset = list(dataset)

    # NOTE: We could improve this benchmark by using the PROBABILITY Distribution of the actual token.
    benchmark_output = {}
    if not parallel:
        for data in tqdm.tqdm(dataset, desc='Running GLUE-PTPT Benchmark'):
            # Retrieving data from the dictionary
            n, predicted_text, correct_word = data['id'], data['sentence'], data['last_word']

            input_tokens = tokenizer.encode(predicted_text, return_tensors="pt").to(DEVICE)
            prediction = model.generate(
                input_tokens,
                max_length=input_tokens.size()[1] + 5,
                **model_kwargs
            )
            prediction = tokenizer.decode(prediction[0])
            predicted_word = get_first_word(predicted_text, prediction)
            benchmark_output[n] = {'text': predicted_text, 'prediction': predicted_word, 'correct_word': correct_word}
    elif dataset:
        sentences = [d['sentence'] for d in dataset]

        # A causal LM continues from the LAST position of each row, so the padding has to go
        # on the left. The tokenizer settings are restored afterwards.
        previous_padding_side = tokenizer.padding_side
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = 'left'
        try:
            encoded = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True).to(DEVICE)
        finally:
            tokenizer.padding_side = previous_padding_side

        # Every row shares this (padded) length, so the new tokens start right after it
        prompt_length = encoded['input_ids'].size()[1]
        predictions = model.generate(
            encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
            max_length=prompt_length + 5,
            **model_kwargs
        )
        for data, pred in zip(dataset, predictions):
            # Decode only the newly generated tokens of this row
            predicted_text = tokenizer.decode(pred[prompt_length:], skip_special_tokens=True, clean_up_tokenization_spaces=True)
            predicted_word = get_first_word(data['sentence'], predicted_text)
            benchmark_output[data['id']] = {'text': data['sentence'], 'prediction': predicted_word, 'correct_word': data['last_word']}

    accurate_preds = sum(ben['prediction'] == ben['correct_word'] for ben in benchmark_output.values())
    # Measured on the dataset actually evaluated, not on another benchmark's registry entry
    total_preds = len(dataset)
    return {
        'benchmark': 'GLUE-PTPT',
        'accuracy': accurate_preds / total_preds if total_preds else 0.0,
        'accurate_predictions': accurate_preds,
        'wrong_predictions': total_preds - accurate_preds,
        'benchmark_predictions': benchmark_output,
        'model': MODEL
    }


# Adding it onto benchmarks dictionary
BENCHMARKS['GLUE-PTPT'] =  {
    # Records handed to the evaluation function
    'data': glue_ptpt_df.to_dict('records'),
    # DataFrame view, under the same key the other registry entries use
    'data-dataframe': glue_ptpt_df,
    # Earlier name of the same DataFrame, kept so existing lookups keep working
    'data-pandas': glue_ptpt_df,
    'evaluation': benchmark_glueptpt,    # Should be a function that receives 3 args: model, tokenizer and dataset
}

# Perplexity

We will use ALL datasets to calculate perplexity

In [None]:
# Placeholder for the perplexity computation: `texts` is created empty and nothing is added to it yet.
texts = []
for benchmark_info in BENCHMARKS.values():
    # NOTE(review): bare expression — the records are looked up and then discarded.
    # TODO: decide which text field of each benchmark's records should be collected into `texts`.
    benchmark_info['data']

# 3. Running Benchmark

In [None]:
# Run every registered benchmark on its own records and tabulate the result dictionaries
results = []
for benchmark, benchmark_info in BENCHMARKS.items():
    result = benchmark_info['evaluation'](model, tokenizer, benchmark_info['data'])
    if result is None:
        # An evaluation that is still a placeholder returns nothing; a None entry is not a
        # valid row for the results table, so it is reported and left out.
        print(f'Skipping {benchmark}: its evaluation function returned no result')
        continue
    results.append(result)

pd.DataFrame(results)