# Introduction

In this notebook, we will explore some benchmarks for different models.

## Importing modules 

In [6]:
import re
import tqdm
import torch
import pickle
import numpy as np
import pandas as pd
from datasets import load_dataset
from typing import Any, Iterable, Callable
from transformers import AutoTokenizer, AutoModelForCausalLM, PreTrainedTokenizer, PreTrainedTokenizerFast

In [2]:
import sys
sys.path.insert(1, '../')
from src import BENCHMARKS
# Set two entries on the shared configuration mapping of the imported BENCHMARKS object.
BENCHMARKS.config['number_of_evaluations'] = 1
BENCHMARKS.config['parallel_batch_size'] = 16

# Per-model configuration, keyed by the model identifier that the run loop passes
# to `from_pretrained`. The run loop assigns the selected entry to
# `BENCHMARKS.config` before calling `BENCHMARKS.run`.
MODEL_CONFIGS = {
    # 'HuggingFaceTB/SmolLM2-135M': BENCHMARKS.config.copy(),
    # 'NOVA-vision-language/GlorIA-1.3B': BENCHMARKS.config.copy(),
    'PORTULAN/gervasio-7b-portuguese-ptpt-decoder': {
        'parallel_tasks': False,
        'parallel_batch_size': 1,
        # Unpacked into `AutoModelForCausalLM.from_pretrained` by the run loop.
        'model_kwargs': {
            'torch_dtype': torch.bfloat16,
        },
        # Presumably forwarded to `model.generate` by the benchmark runner -- TODO confirm in `src`.
        'generate_model_kwargs': {
            'do_sample': False,
            'top_p': None,
            'temperature': None,
            # NOTE(review): elsewhere in this notebook 50257 is described as the
            # tokenizer.eos_token_id used with the GlorIA model; confirm that the
            # same id is valid for this model's tokenizer.
            'pad_token_id': 50257,
            'return_legacy_cache': False,
            
        },
        'no_config_copy_benchmarks': [],
        # Template with `{MODEL}` and `{benchmark_name}` placeholders; presumably
        # formatted by the benchmark runner -- TODO confirm in `src`.
        'tqdm_description': '<{MODEL}> Running {benchmark_name} Benchmark',
        'number_of_evaluations': 1
    },
}

In [None]:
# Device every model is moved to before the benchmarks run.
DEVICE = 'cuda'
# Run the benchmark suite once per configured model.
for MODEL in MODEL_CONFIGS.keys():
    # Banner: a dashed line, the centred model name, and another dashed line (100 columns wide).
    print('{:-^100s}\n\n{: ^100s}\n\n{:-^100s}'.format('', f'Running benchmarks for model `{MODEL}`', ''))
    model = AutoModelForCausalLM.from_pretrained(MODEL, use_safetensors=True, **MODEL_CONFIGS[MODEL]['model_kwargs']).to(DEVICE)
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    # Replace the whole benchmark configuration with this model's entry before running.
    BENCHMARKS.config = MODEL_CONFIGS[MODEL]
    BENCHMARKS.run(model, tokenizer)
    # Freeing up model from GPU memory
    model.to('cpu')
    torch.cuda.empty_cache()
    # Drop the notebook-level reference so the model object itself can be collected.
    del(model)

In [7]:
# Fetch the results accumulated by the BENCHMARKS object.
res = BENCHMARKS.get_results()

# Persist the results so later cells (or later sessions) can reload them without re-running.
with open('benchmark_results.pkl', 'wb') as f:
    pickle.dump(res, f)

In [None]:
# Reload previously saved results.
# NOTE: unpickling executes arbitrary code; only load files produced by this notebook.
with open('benchmark_results.pkl', 'rb') as f:
    res = pickle.load(f)


In [None]:
# Build one table row per model: the model name plus one column per benchmark.
# Local names are deliberately distinct from `model` and `df`, which other cells
# of this notebook bind to the loaded model and to DataFrames.
result_rows = []
for model_name, model_results in res.items():
    row = {'Model': model_name}
    for benchmark_name, benchmark_result in model_results.items():
        row[benchmark_name] = benchmark_result['result']
    result_rows.append(row)
# Last expression of the cell: rendered as the cell output.
pd.DataFrame(result_rows)

In [None]:
# Display the `df` attribute of the first registered benchmark.
BENCHMARKS.benchmarks[0].df # Yes/No answer

In [None]:
# Display the `df` attribute of the second registered benchmark.
BENCHMARKS.benchmarks[1].df # Predict the last word

## Defining global variables

In [None]:
# Registry of benchmarks, keyed by benchmark name. Later cells add one entry per
# benchmark and also store 'data-dataframe' and 'Fertility' values in each entry.
# NOTE(review): this rebinds the name BENCHMARKS, which an earlier cell imports
# from `src`; after this cell runs, the earlier cells that call BENCHMARKS.run /
# BENCHMARKS.get_results will no longer work without re-importing.
BENCHMARKS: dict[str, dict[str, Iterable[Any] | Callable]] = {}
'''
    Should be a dictionary with the following as entries:
    {
        'data': Iterable[Any],
        'evaluation': Callable[Any, PreTrainedTokenizer | PreTrainedTokenizerFast, TypeOfData]  -> This function will be called with the "data" value
    }
'''

# Device that the model and the input tensors are moved to.
DEVICE = 'cuda'
# Model identifier loaded by the "Loading the model" cell and reported in benchmark results.
MODEL = 'NOVA-vision-language/GlorIA-1.3B'

## Defining auxiliary functions

In [None]:
# Soft Max
def soft_max(arr):
    """Return the softmax of `arr`, normalised over ALL of its elements.

    The maximum is subtracted before exponentiating so that large logits do
    not overflow to `inf` (which would turn the result into NaN). The shift
    cancels out mathematically, so finite inputs give the same values as
    `arr.exp() / arr.exp().sum()`.

    Args:
        arr: a torch tensor of logits (any shape).

    Returns:
        A tensor of the same shape whose elements sum to 1.
    """
    # `max()` is undefined on an empty tensor; an empty input yields an empty output.
    if arr.numel() == 0:
        return arr.exp()
    exps = (arr - arr.max()).exp()
    return exps / exps.sum()

def get_first_word(original_text: str, predicted_text: str):
    """Return the first word the model produced after the prompt.

    Only the FIRST occurrence of `original_text` is removed from
    `predicted_text`. Removing every occurrence would also delete pieces of
    the continuation whenever the prompt text happens to reappear in it
    (e.g. prompt "a" and output "a banana" used to yield "bnn").

    Args:
        original_text: the prompt that was given to the model.
        predicted_text: the decoded model output (normally prompt + continuation).

    Returns:
        The first run of word characters in the remaining text, or None when
        there is none.
    """
    continuation = predicted_text.replace(original_text, '', 1)
    # Regex to find first word
    first_word = re.search(r'\b\w+\b', continuation)
    return first_word.group() if first_word else None

def load_dataset_to_dataframe(*args, data_dir=None, dataset_types=('train', 'validation', 'test'), **kwargs):
    """Load a dataset with `load_dataset` and stack the requested splits into one DataFrame.

    Args:
        *args, **kwargs: forwarded unchanged to `load_dataset`.
        data_dir: forwarded to `load_dataset` as `data_dir`.
        dataset_types: names of the splits to include, in order. The default
            is an immutable tuple so it cannot be modified between calls.

    Returns:
        A DataFrame with the rows of every requested split, a 'Dataset Type'
        column naming the split each row came from, and a fresh 0..n-1 index
        (the per-split indices would otherwise repeat).

    Raises:
        KeyError: if a requested split is not present in the loaded dataset.
        ValueError: from `pd.concat` when `dataset_types` is empty.
    """
    ds = load_dataset(*args, data_dir=data_dir, **kwargs)
    frames = []
    for ds_type in dataset_types:
        frame = ds[ds_type].to_pandas()
        frame['Dataset Type'] = ds_type
        frames.append(frame)
    return pd.concat(frames, ignore_index=True)

# 1. Loading the model

In [None]:
# GlorIA-1.3B
# Load the causal language model named by MODEL (safetensors weights) and move it to DEVICE.
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    use_safetensors=True
).to(DEVICE)
# Matching tokenizer for the same model identifier.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
# Sanity check: show the `device` attribute of the loaded model.
model.device

# 2. Loading Benchmark Datasets


## CALAME-PT

In [None]:
# Loading CALAME-PT dataset onto a Pandas DataFrame
# The two JSON-lines files are read separately and tagged with a 'Source' column.
df_handwritten = pd.read_json("hf://datasets/NOVA-vision-language/calame-pt/calamept_handwritten_only.jsonl", lines=True)
df_handwritten['Source'] = 'Handwritten'
df_generated = pd.read_json("hf://datasets/NOVA-vision-language/calame-pt/calamept_gen_only.jsonl", lines=True)
df_generated['Source'] = 'Generated'
# Only 'id', 'sentence' and 'last_word' are kept, so the 'Source' column added above
# is dropped here. The original row indices of both frames are kept (no re-indexing).
# NOTE(review): the benchmark keys its per-example output by 'id'; confirm ids are
# unique across the two files, otherwise entries overwrite each other.
calame_pt_df = pd.concat([df_handwritten, df_generated])[['id', 'sentence', 'last_word']]

# Defining the Benchmark function for CALAME-PT
def benchmark_calamept(
    model,
    tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
    dataset: list[dict[str, str]],
    model_kwargs = None,
    parallel = False,
    parallel_group_size = 32
):
    """Run the CALAME-PT last-word prediction benchmark.

    For every example the model continues `sentence` by up to 5 tokens; the
    first word of the continuation is compared (exact match) with `last_word`.

    Args:
        model: causal language model exposing `generate`.
        tokenizer: tokenizer matching `model`.
        dataset: records with the keys 'id', 'sentence' and 'last_word'.
        model_kwargs: extra keyword arguments for `model.generate`. When None,
            greedy decoding with `pad_token_id=50257` is used (the previous
            default dictionary).
        parallel: when True, examples are generated in left-padded batches.
        parallel_group_size: number of examples per batch when `parallel` is True.

    Returns:
        A dict with the benchmark name, the accuracy over `dataset`, the counts
        of accurate/wrong predictions, the per-example outputs keyed by 'id',
        and the global MODEL name.

    Notes:
        * Only the newly generated tokens are decoded, so extracting the word
          does not depend on the decoded prompt matching `sentence` exactly.
        * Accuracy is computed over the `dataset` argument rather than the
          globally registered benchmark data, so subsets are scored correctly.
        * NOTE(review): outputs are keyed by 'id'; duplicated ids in `dataset`
          would overwrite each other and lower the reported accuracy.
    """
    if model_kwargs is None:
        model_kwargs = {
            'do_sample': False,
            'temperature': None,
            'pad_token_id': 50257, # This is the tokenizer.eos_token_id
            # 'output_scores': True,            # INCLUDE LATER to get the probability distribution and use that as a benchmark
            # 'return_dict_in_generate': True,
            'return_legacy_cache': False
        }

    # NOTE: We could improve this benchmark by using the PROBABILITY Distribution of the actual token.
    benchmark_output = {}

    def record(data, generated_ids):
        # Decode only the continuation; there is no prompt left to strip, hence ''.
        continuation = tokenizer.decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        benchmark_output[data['id']] = {
            'text': data['sentence'],
            'prediction': get_first_word('', continuation),
            'correct_word': data['last_word']
        }

    if not parallel:
        for data in tqdm.tqdm(dataset, desc='Running CALAME-PT Benchmark'):
            input_tokens = tokenizer.encode(data['sentence'], return_tensors="pt").to(DEVICE)
            prompt_length = input_tokens.size()[1]
            prediction = model.generate(
                input_tokens,
                max_length=prompt_length + 5,
                **model_kwargs
            )
            record(data, prediction[0][prompt_length:])
    else:
        # Batched generation needs a padding token; fall back to EOS when none is defined.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            tokenizer.pad_token_id = tokenizer.eos_token_id
        batch_starts = range(0, len(dataset), parallel_group_size)
        for start in tqdm.tqdm(batch_starts, desc='Running CALAME-PT Benchmark'):
            batch = dataset[start:start + parallel_group_size]
            # Left padding keeps every prompt adjacent to the tokens generated for it.
            tokens = tokenizer([d['sentence'] for d in batch], return_tensors='pt', padding=True, padding_side='left')
            input_ids = tokens['input_ids'].to(DEVICE)
            attention_mask = tokens['attention_mask'].to(DEVICE)
            prompt_length = input_ids.shape[1]
            predictions = model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_length=prompt_length + 5,  # token count, not character count
                **model_kwargs
            )
            for data, pred in zip(batch, predictions):
                record(data, pred[prompt_length:])
            # Release the batch tensors before the next batch is allocated.
            del input_ids, attention_mask, predictions

    accurate_preds = sum(ben['prediction'] == ben['correct_word'] for ben in benchmark_output.values())
    total = len(dataset)
    return {
        'benchmark': 'CALAME-PT',
        'accuracy': accurate_preds / total if total else 0.0,
        'accurate_predictions': accurate_preds,
        'wrong_predictions': total - accurate_preds,
        'benchmark_predictions': benchmark_output,
        'model': MODEL
    }

# Adding it onto benchmarks dictionary
BENCHMARKS['CALAME-PT'] =  {
    # Records (one dict per row) passed to the evaluation function as its `dataset` argument.
    'data': calame_pt_df.to_dict('records'),
    # Same data as a DataFrame; read and extended by the Perplexity & Fertility cell.
    'data-dataframe': calame_pt_df,
    'evaluation': benchmark_calamept,    # Should be a function that receives 3 args: model, tokenizer and dataset
}

## SUPERGLUE PT-PT

In [None]:
# NOTE(review): not referenced anywhere else in the visible notebook; the BoolQ
# task is registered directly in BENCHMARKS -- confirm whether this is still needed.
superglue_benchmarks = {}

### Task 1. BoolQ

The Boolean Questions (BoolQ) task consists of determining whether the answer to a given question is true or false, based on a given passage.

In [None]:
# Load the pt-PT BoolQ data from PORTULAN/extraglue; with the helper's defaults the
# train, validation and test splits are stacked into a single DataFrame.
# NOTE(review): if the test split is unlabeled, its rows can never be scored as
# correct by the benchmark -- confirm which splits should be evaluated.
superglue_boolq_df = load_dataset_to_dataframe('PORTULAN/extraglue', data_dir='data/boolq_pt-PT')

def benchmark_task_boolq(
    model,
    tokenizer:  PreTrainedTokenizer | PreTrainedTokenizerFast,
    dataset,
    model_kwargs = None,
    parallel = True,
    parallel_group_size = 32
):
    """Run the SuperGLUE pt-PT BoolQ benchmark.

    Each example is turned into a prompt (passage, question, answer legend) and
    the model generates up to 5 tokens; the first generated word is mapped to a
    label and compared with the example's 'label'.

    Args:
        model: causal language model exposing `generate`.
        tokenizer: tokenizer matching `model`.
        dataset: records with the keys 'passage', 'question', 'idx' and 'label'.
        model_kwargs: extra keyword arguments for `model.generate`. When None,
            greedy decoding with `pad_token_id=50257` is used (the previous
            default dictionary).
        parallel: when True, prompts are generated in left-padded batches.
        parallel_group_size: number of prompts per batch when `parallel` is True.

    Returns:
        A dict with the benchmark name, accuracy, accurate/wrong counts, the
        per-example outputs (in dataset order) and the global MODEL name.

    Notes:
        * The prompt legend now reads "0-Mentira, 1-Verdade" so that it agrees
          with the answer mapping below ('sim' -> '1').
          NOTE(review): this assumes label 1 means "true" in the dataset -- confirm.
        * Every example is generated, including the final partial batch, and
          `model_kwargs` is honoured in the batched path as well.
        * The answer is read from the newly generated tokens only, so it does
          not depend on the decoded prompt matching the input text exactly.
    """
    if model_kwargs is None:
        model_kwargs = {
            'do_sample': False,
            'temperature': None,
            'pad_token_id': 50257, # This is the tokenizer.eos_token_id
            # 'output_scores': True,            # INCLUDE LATER to get the probability distribution and use that as a benchmark
            # 'return_dict_in_generate': True,
            'return_legacy_cache': False
        }

    benchmark_output: list[dict[str, str]] = []
    # Prepare input texts
    input_texts = [
        f'Passagem: {data["passage"]}\nPergunta: {data["question"]}\nResposta (0-Mentira, 1-Verdade):'
        for data in dataset
    ]

    # One (full token sequence, prompt length) pair per example, kept on the CPU
    # so the generated sequences do not accumulate in GPU memory.
    generations = []
    if not parallel:
        # Predict one input_text at a time
        for input_text in tqdm.tqdm(input_texts, desc='Running Superglue ptPT Benchamrk - Task BoolQ'):
            input_tokens = tokenizer.encode(input_text, return_tensors="pt").to(DEVICE)
            prompt_length = input_tokens.size()[1]
            output = model.generate(
                input_tokens,
                max_length=prompt_length + 5,
                **model_kwargs
            )
            # `generate` returns a (1, seq_len) batch; keep the single 1-D sequence.
            generations.append((output[0].cpu(), prompt_length))
    else:
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
            tokenizer.pad_token_id = tokenizer.eos_token_id
        # Caller options apply here too; EOS is used as the padding id for batched generation.
        generate_kwargs = {**model_kwargs, 'pad_token_id': tokenizer.eos_token_id}
        # Process the prompts in groups (if there are too many tokens, GPU may not have enough memory).
        # Stepping through the list by `parallel_group_size` also covers the last, shorter group.
        batch_starts = range(0, len(input_texts), parallel_group_size)
        for start in tqdm.tqdm(batch_starts, desc='Running Superglue ptPT Benchmark - Task BoolQ'):
            tokens = tokenizer(input_texts[start:start + parallel_group_size], return_tensors='pt', padding=True, padding_side='left')
            token_inputs, attention_mask = tokens['input_ids'].to(DEVICE), tokens['attention_mask'].to(DEVICE)
            prompt_length = token_inputs.shape[1]
            outputs = model.generate(
                token_inputs,
                attention_mask = attention_mask,
                max_length = prompt_length + 5,  # Generate 5 additional tokens
                **generate_kwargs
            )
            generations.extend((sequence, prompt_length) for sequence in outputs.cpu())
            # Clearing GPU memory
            del token_inputs, attention_mask, outputs

    # Words (lower-cased) that are read as the positive label.
    affirmative_words = {'sim', 'verdade', 'verdadeiro'}

    # After obtaining predictions start the evaluation process
    for data, input_text, (sequence, prompt_length) in zip(dataset, input_texts, generations):
        # Full decoded text is kept for inspection; the answer comes from the continuation only.
        prediction = tokenizer.decode(sequence, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        continuation = tokenizer.decode(sequence[prompt_length:], skip_special_tokens=True, clean_up_tokenization_spaces=True)
        predicted_answer = get_first_word('', continuation)
        if predicted_answer not in ['1', '0']:
            if predicted_answer is None:
                predicted_answer = '-1'  # the model produced no word at all
            elif predicted_answer.lower() in affirmative_words:
                predicted_answer = '1'
            else:
                predicted_answer = '0'
        benchmark_output.append({
            'idx': data['idx'], 'input_text': input_text, 'prediction_text': prediction, 'prediction_label': predicted_answer, 'correct_label': data['label']
        })
    accurate_preds = sum(ben['prediction_label'].strip()[:1] == str(ben['correct_label']) for ben in benchmark_output)
    total = len(dataset)
    return {
        'benchmark': 'Superglue pt-PT: Task BoolQ',
        'accuracy': accurate_preds / total if total else 0.0,
        'accurate_predictions': accurate_preds,
        'wrong_predictions': total - accurate_preds,
        'benchmark_predictions': benchmark_output,
        'model': MODEL
    }
# Register the BoolQ task in the benchmarks dictionary.
BENCHMARKS['SuperGLUE-PTPT: Task BoolQ'] = {
    # Records (one dict per row) passed to the evaluation function as its `dataset` argument.
    'data': superglue_boolq_df.to_dict('records'),
    # Same data as a DataFrame; read and extended by the Perplexity & Fertility cell.
    'data-dataframe': superglue_boolq_df,
    'evaluation': benchmark_task_boolq,    # Should be a function that receives 3 args: model, tokenizer and dataset
}

### Task 2. CB
The CommitmentBank (CB) task consists of ...

### Actual benchmark

In [None]:
# # Loading GLUE PTPT dataset onto a Pandas DataFrame
# superglue_ptpt_df = []
# for task in [
#     'axb_pt-PT',
#     'axg_pt-PT',
#     'boolq_pt-PT',  # BoolQ (Boolean Questions) QA task where the goal is to determine whether a given question is true or false based on a given passage.
#     'cb_pt-PT',     # CB (CommitmentBank) CommitmentBank is a dataset of 1,000 sentences from the Wall Street Journal annotated with a commitment rating.
#     'copa_pt-PT',   # COPA (Choice of Plausible Alternatives) QA task where the goal is to select the most plausible alternative to a given premise.
#     'mnli_matched_pt-PT',
#     'mnli_mismatched_pt-PT',
#     'mrpc_pt-PT',
#     'multirc_pt-PT',    #  MultiRC (Multi-Sentence Reading Comprehension) QA task where the goal is to read a passage and answer multiple-choice questions about it.
#     'qnli_pt-PT',
#     'rte_pt-PT',
#     'sst2_pt-PT',
#     'stsb_pt-PT',
#     'wnli_pt-PT'
# ]:
#     ds = load_dataset("PORTULAN/extraglue", data_dir='data/{}'.format(task), num_proc=5)
#     tmp = []
#     for dataset_type in ['train', 'validation', 'test']:
#         if ds.get(dataset_type, None) is None: continue
#         tmp.append(ds[dataset_type].to_pandas())
#         tmp[-1]['Dataset Type'] = dataset_type
#     superglue_ptpt_df.append(pd.concat(tmp).reset_index(drop=True))
#     superglue_ptpt_df[-1]['Task Name'] = task
# superglue_ptpt_df = pd.concat(superglue_ptpt_df).reset_index(drop=True)
# superglue_ptpt_df = superglue_ptpt_df[['Task Name', 'Dataset Type'] + superglue_ptpt_df.columns.difference(['Task Name', 'Dataset Type']).tolist()]

# # Defining the Benchmark function for SuperGLUE PT-PT
# def benchmark_superglueptpt(
#     model,
#     tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
#     dataset: list[dict[str, str]],
#     model_kwargs = {
#         'do_sample': False,
#         'temperature': None,
#         'pad_token_id': 50257, # This is the tokenizer.eos_token_id
#         # 'output_scores': True,            # INCLUDE LATER to get the probability distribution and use that as a benchmark
#         # 'return_dict_in_generate': True,
#         'return_legacy_cache': False
#     },
#     parallel = False
# ):
#     return None


# # Adding it onto benchmarks dictionary
# BENCHMARKS['SuperGLUE-PTPT'] =  {
#     'data': superglue_ptpt_df.to_dict('records'),
#     'data-dataframe': superglue_ptpt_df,
#     'evaluation': benchmark_superglueptpt,    # Should be a function that receives 3 args: model, tokenizer and dataset
# }

## GLUE PT-PT

In [None]:
# # Loading GLUE PTPT dataset onto a Pandas DataFrame
# glue_ptpt_df = []
# # Iterate through 4 different GLUE tasks (Read more here: https://openreview.net/pdf?id=rJ4km2R5t7)
# for task in [
#     "mrpc",     # MRPC - The Microsoft Research Paraphrase Corpus   (Similarity and Paraphrase Task)
#     "stsb",     # STS-B - Semantic Textual Similarity Benchmark     (Similarity and Paraphrase Task)
#     "rte",      # RTE - Recognizing Textual Entailment              (Inference Task)
#     "wnli"      # WNLI - Winograd Schema Challenge                  (Inference Task)
# ]:
#     ds = load_dataset("PORTULAN/extraglue", task)
#     # Transform datasets to pandas
#     tmp = []
#     for dataset_type in ['train', 'validation', 'test']:
#         tmp.append(ds[dataset_type].to_pandas())
#         tmp[-1]['Dataset Name'] = task
#         tmp[-1]['Dataset Type'] = dataset_type
#     glue_ptpt_df.append(pd.concat(tmp).reset_index(drop=True))
# glue_ptpt_df = pd.concat(glue_ptpt_df).reset_index(drop=True)

# # Defining the Benchmark function for GLUE PTPT
# def benchmark_glueptpt(
#     model,
#     tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
#     dataset: list[dict[str, str]],
#     model_kwargs = {
#         'do_sample': False,
#         'temperature': None,
#         'pad_token_id': 50257, # This is the tokenizer.eos_token_id
#         # 'output_scores': True,            # INCLUDE LATER to get the probability distribution and use that as a benchmark
#         # 'return_dict_in_generate': True,
#         'return_legacy_cache': False
#     },
#     parallel = False
# ):
#     # NOTE: We could improve this benchmark by using the PROBABILITY Distribution of the actual token.
#     benchmark_output = {}
#     if not parallel:
#         for data in tqdm.tqdm(dataset, desc='Running CALAME-PT Benchmark'):
#             # Retrieving data from the dictionary
#             n, predicted_text, correct_word = data['id'], data['sentence'], data['last_word']

#             input_tokens = tokenizer.encode(predicted_text, return_tensors="pt").to(DEVICE)
#             prediction = model.generate(
#                 input_tokens,
#                 max_length=input_tokens.size()[1] + 5,
#                 **model_kwargs
#             )
#             prediction = tokenizer.decode(prediction[0])
#             predicted_word = get_first_word(predicted_text, prediction)
#             benchmark_output[n] = {'text': predicted_text, 'prediction': predicted_word, 'correct_word': correct_word}
#     else:
#         input_tokens = [d['sentence'] for d in dataset]
#         max_length = max(len(d) for d in input_tokens) + 5
#         input_tokens = tokenizer.encode(input_tokens, return_tensors='pt', padding=True, truncation=True)['input_ids'].to(DEVICE)
#         predictions = model.generate(
#             input_tokens,
#             max_length=max_length,
#             **model_kwargs
#         )
#         for n, pred in enumerate(predictions):
#             predicted_text = tokenizer.decode(pred[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
#             predicted_word = get_first_word(dataset[n]['sentence'], predicted_text)
#             benchmark_output[n] = {'text': dataset[n]['sentence'], 'prediction': predicted_word, 'correct_word': dataset[n]['last_word']}

#     accurate_preds = sum(ben['prediction'] == ben['correct_word'] for ben in benchmark_output.values())
#     return {
#         'benchmark': 'CALAME-PT',
#         'accuracy': accurate_preds / len(BENCHMARKS['CALAME-PT']['data']),
#         'accurate_predictions': accurate_preds,
#         'wrong_predictions': len(BENCHMARKS['CALAME-PT']['data']) - accurate_preds,
#         'benchmark_predictions': benchmark_output,
#         'model': MODEL
#     }


# # Adding it onto benchmarks dictionary
# BENCHMARKS['GLUE-PTPT'] =  {
#     'data': glue_ptpt_df.to_dict('records'),
#     'data-pandas': glue_ptpt_df,
#     'evaluation': benchmark_glueptpt,    # Should be a function that receives 3 args: model, tokenizer and dataset
# }

# Perplexity & Fertility

We will use ALL datasets to calculate perplexity and fertility.

In [None]:
def soft_max(arr):
    """Return the softmax of `arr`, normalised over ALL of its elements.

    The maximum is subtracted before exponentiating so that large logits do
    not overflow to `inf` (which would turn the result into NaN). The shift
    cancels out mathematically, so finite inputs give the same values as
    `arr.exp() / arr.exp().sum()`.

    Args:
        arr: a torch tensor of logits (any shape).

    Returns:
        A tensor of the same shape whose elements sum to 1.
    """
    # `max()` is undefined on an empty tensor; an empty input yields an empty output.
    if arr.numel() == 0:
        return arr.exp()
    exps = (arr - arr.max()).exp()
    return exps / exps.sum()

def real_likelihood(pred_dist, real_token):
    """Return the log-probability assigned to `real_token`.

    Computes `log(softmax(pred_dist))[0, real_token]` as
    `pred_dist[0, real_token] - logsumexp(pred_dist)`, which avoids the
    overflow/underflow of exponentiating first and taking the log afterwards.
    As before, the normalisation runs over all elements of `pred_dist`.

    Args:
        pred_dist: tensor of scores indexed as `[0, token_id]`.
        real_token: index (int or 0-dim tensor) of the token that actually occurred.

    Returns:
        A 0-dim tensor holding the log-likelihood of `real_token`.
    """
    # Accumulate the normaliser in float32 even when scores are half precision.
    log_normaliser = torch.logsumexp(pred_dist.flatten().float(), dim=0)
    return pred_dist[0, real_token] - log_normaliser

def perplexity(input_tokens, model) -> float:
    """Return the perplexity of `model` on one token sequence.

    The model is run once over the whole sequence; the logits at position
    `i` are the model's prediction for token `i + 1`, so a single forward
    pass yields every conditional log-likelihood (instead of one `generate`
    call per prefix). Perplexity is `exp` of the negative mean
    log-likelihood over the predicted tokens, i.e. tokens 1..N-1.

    Args:
        input_tokens: 1-D tensor of token ids without padding, on the same
            device as `model`.
        model: causal language model; calling it with a `(1, N)` tensor of ids
            must return an object with a `logits` attribute of shape
            `(1, N, vocabulary size)`.

    Returns:
        The perplexity as a Python float, or NaN when the sequence has fewer
        than two tokens (nothing to predict).
    """
    n_tokens = input_tokens.shape[0]
    if n_tokens < 2:
        return float('nan')
    with torch.no_grad():
        logits = model(input_tokens.reshape(1, n_tokens)).logits
    # Drop the last position: it predicts a token that lies beyond the sequence.
    log_probs = torch.log_softmax(logits[0, :-1].float(), dim=-1)
    targets = input_tokens[1:].to(log_probs.device).unsqueeze(-1)
    token_log_likelihoods = log_probs.gather(-1, targets).squeeze(-1)
    return torch.exp(-token_log_likelihoods.mean()).to('cpu').item()

def compute_perplexities(model, tokens, text: list[str], parallel_group_size=10) -> list[float]:
    """Compute the perplexity of `model` on each text.

    Args:
        model: causal language model passed on to `perplexity`.
        tokens: tokenizer output for `text` (with 'input_ids' and, normally,
            'attention_mask'). When None, `text` is tokenized here with the
            notebook-level `tokenizer`.
        text: the texts to score; only used when `tokens` is None.
        parallel_group_size: currently unused; kept so existing calls keep working.

    Returns:
        One perplexity per row of `tokens`, in order. Rows with fewer than two
        real tokens get NaN because there is nothing to predict.
    """
    if tokens is None:
        tokens = tokenizer(text, return_tensors="pt", padding=True)
    input_ids = tokens['input_ids'].to(DEVICE)
    if 'attention_mask' in tokens:
        attention_mask = tokens['attention_mask'].to(DEVICE).bool()
    else:
        # Without a mask every position is treated as a real token.
        attention_mask = torch.ones_like(input_ids).bool()

    perplexities = []
    rows = zip(input_ids, attention_mask)
    for tks, mask in tqdm.tqdm(rows, 'Computing Perplexity', total=input_ids.shape[0]):
        # Keep only real tokens so padding does not take part in the score.
        real_tokens = tks[mask]
        if real_tokens.shape[0] < 2:
            perplexities.append(float('nan'))
            continue
        perplexities.append(perplexity(real_tokens, model))
    return perplexities

def compute_fertilities(tokenizer, tokens, text:  list[str]) -> list[float]:
    word_counts = [len(re.findall('\w+', t)) for t in text]
    tokens_without_padding = [
        [token for token in tks if token not in (tokenizer.eos_token, tokenizer.pad_token)]
        for tks in tokens['input_ids'].tolist()
    ]
    fertilities = [len(tks) / n_words if n_words > 0 else None for tks, n_words in zip (tokens_without_padding, word_counts)]
    return fertilities

In [None]:
# Add pad_token to generate and tokenize everything in parallel
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# For every registered benchmark, add one FERTILITY[<column>] column per
# object-dtype column of its DataFrame and store an overall average on the entry.
for benchmark in tqdm.tqdm(list(BENCHMARKS.keys()), desc='Calculating Perplexity & Fertility for all benchmarks'):
    df: pd.DataFrame = BENCHMARKS[benchmark]['data-dataframe']
    perplexity_cols, fertility_cols = [], []
    # The column list is evaluated once, before the loop body adds new columns.
    # NOTE(review): every object-dtype column is included, which also covers
    # helper columns such as 'Dataset Type' -- confirm that is intended.
    for col in df.select_dtypes('object').columns:
        perplexity_cols.append(f'PERPLEXITY[{col}]')
        fertility_cols.append(f'FERTILITY[{col}]')
        tokens = tokenizer(df[col].tolist(), return_tensors='pt', padding=True)
        df[fertility_cols[-1]] = compute_fertilities(tokenizer, tokens, df[col].tolist())
        # tokens = tokens.to(DEVICE)
        # df[perplexity_cols[-1]] = compute_perplexities(model, tokens, df[col].tolist())
    BENCHMARKS[benchmark]['data-dataframe'] = df
    # BENCHMARKS[benchmark]['Perplexity'] = df[perplexity_cols].sum().sum() / (df[perplexity_cols].shape[1] * df.shape[0])
    # Sum of all fertility values divided by (number of fertility columns * number of rows).
    # NOTE(review): missing values are skipped by the sum but still counted in the
    # denominator, so texts without words pull the average down -- confirm.
    BENCHMARKS[benchmark]['Fertility'] = df[fertility_cols].sum().sum() / (df[fertility_cols].shape[1] * df.shape[0])

# 3. Running Benchmark

In [None]:
# Run every registered benchmark: call its 'evaluation' function with the
# model, the tokenizer and the benchmark's 'data' records.
results = []
for benchmark, benchmark_info in BENCHMARKS.items():
    results.append(benchmark_info['evaluation'](model, tokenizer, benchmark_info['data']))


# Saving the results in a pickle file to fetch them later
# NOTE(review): this is the same file name an earlier cell writes the
# `BENCHMARKS.get_results()` output to, so that file is overwritten here.
with open('benchmark_results.pkl', 'wb') as f: pickle.dump(results, f)

# Last expression of the cell: one row per benchmark result dict.
pd.DataFrame(results)

In [None]:

# One row per benchmark result dict returned by the evaluation functions.
df = pd.DataFrame(results)

# Reshape to one row per model with one accuracy column per benchmark.
df.pivot(index='model', columns='benchmark', values='accuracy').reset_index()