# WikiQA Evaluation

In [1]:
import os
import torch
import numpy as np
# Seed PyTorch's global RNG. NOTE: Python's `random` module (used by
# PromptDataset's shuffle) and NumPy are not seeded here.
torch.manual_seed(42)

# Use CUDA when it is available to this process, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)
print(torch.cuda.device_count())

from fairscale.nn.model_parallel import initialize_model_parallel
import torch.distributed as dist

master_port="12380"
# Set up distributed environment
def setup(rank, world_size):
    "Sets up the process group and configuration for PyTorch Distributed Data Parallelism"
    os.environ["MASTER_ADDR"] = "localhost"
    # os.environ["MASTER_PORT"] = "12355"
    # Prevent conflict with jupyter in case.
    os.environ["MASTER_PORT"] = str(master_port)
    # Initialize the process group
    if not dist.is_initialized():
        dist.init_process_group("gloo", rank=rank, world_size=world_size)
def cleanup():
    """Tear down the default process group if one is currently initialised.

    The call is guarded with ``dist.is_initialized()`` (the same check
    ``setup`` uses), so invoking it twice, or before ``setup`` has run, is a
    no-op instead of an error from ``destroy_process_group``.
    """
    if dist.is_initialized():
        dist.destroy_process_group()

# Single-process run: this kernel is rank 0 in a world of size 1.
setup(0, 1)
# Model-parallel size 1 (matches the "initializing model parallel with size 1" log line).
initialize_model_parallel(1)

cuda
1
> initializing model parallel with size 1
> initializing ddp with size 1
> initializing pipeline with size 1


In [2]:
from transformers import LlamaTokenizer
from torch.utils.data import Dataset
from datasets import load_dataset
from logging import getLogger
from typing import List
# getLogger() without a name returns the root logger.
logger = getLogger()

# Build the Llama tokenizer from the local tokenizer model file.
tokenizer = LlamaTokenizer("./tokenizers/tokenizer.model")
# Turn off the tokenizer's BOS/EOS flags.
# NOTE(review): presumably so prompts are encoded without automatic
# begin/end-of-sequence tokens -- confirm against the generator's expectations.
tokenizer.add_bos_token = False
tokenizer.add_eos_token = False

import random
class PromptDataset(Dataset):
    """Index-aligned prompts and answers, shuffled together once at construction.

    Prompts and answers are paired, the pairs are shuffled with the global
    ``random`` module (seed it beforehand for a reproducible order), and the
    result is stored as two parallel tuples.
    """

    def __init__(self, prompt_dataset, answer_dataset) -> None:
        """Pair, validate and shuffle the two sequences.

        Args:
            prompt_dataset: Iterable of prompts.
            answer_dataset: Iterable of answers, one per prompt.

        Raises:
            ValueError: If the two inputs do not contain the same number of
                items. (Previously the check ran after ``zip`` had already
                truncated to the shorter input, so it could never fire.)
        """
        super().__init__()
        prompts = list(prompt_dataset)
        answers = list(answer_dataset)

        # Validate BEFORE zipping: zip() silently drops the unmatched tail.
        if len(prompts) != len(answers):
            raise ValueError(
                "prompt_dataset and answer_dataset must have the same length "
                f"(got {len(prompts)} and {len(answers)})"
            )

        pairs = list(zip(prompts, answers))
        random.shuffle(pairs)

        if pairs:
            self.prompt_dataset, self.answer_dataset = zip(*pairs)
        else:
            # zip(*[]) yields nothing to unpack, so handle empty input explicitly.
            self.prompt_dataset, self.answer_dataset = (), ()

    def __len__(self):
        """Return the number of prompt/answer pairs."""
        return len(self.prompt_dataset)

    def __getitem__(self, idx):
        """Return the pair at ``idx`` as ``{"prompt": ..., "answer": ...}``."""
        return {
            "prompt": self.prompt_dataset[idx],
            "answer": self.answer_dataset[idx]
        }


class EvaluationDataset(Dataset):
    """Question/answer pairs filtered by question length.

    Each input record must provide ``"question"`` and ``"answer"`` keys. A
    record is kept only when its question, split on single spaces, has fewer
    than ``max_seq_len`` pieces.
    """

    def __init__(self, data, max_seq_len):
        """Collect the records of ``data`` that pass the length filter.

        Args:
            data: Iterable of mappings with ``"question"`` and ``"answer"``.
            max_seq_len: Exclusive upper bound on the space-separated piece
                count of a question.
        """
        self.input = data
        kept_pairs = []
        for record in self.input:
            # Read both fields up front, exactly like the filter-free path would.
            pair = (record["question"], record["answer"])
            if len(pair[0].split(" ")) < max_seq_len:
                kept_pairs.append(pair)
        self.prompt_dataset = [question for question, _ in kept_pairs]
        self.answer_dataset = [answer for _, answer in kept_pairs]
        assert len(self.prompt_dataset) == len(self.answer_dataset)

    def __len__(self):
        """Return how many records survived the filter."""
        return len(self.prompt_dataset)

    def __getitem__(self, idx):
        """Return ``{"prompt": ..., "answer": ...}`` for an index or a slice."""
        return dict(
            prompt=self.prompt_dataset[idx],
            answer=self.answer_dataset[idx],
        )

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:
import json

from eval.evaluations import eval_general
from typing import List, Optional

from generators.generator_with_runtime_copy import Llama, Dialog




def runnn(evaluate: bool = False):
    """Run the checkpointed model over the WikiQA test split.

    Args:
        evaluate: When False (the default) only inference is performed and
            ``(predicted_sequences, ground_truths)`` is returned. When True
            the predictions are additionally scored with
            ``eval_general.eval``, written to ``testQA.json``, and
            ``(evaluation_result, model.runtimes)`` is returned instead.

    Returns:
        A 2-tuple whose contents depend on ``evaluate`` (see above).
    """
    #ckpt_dir = "./checkpoints/baseline_slimpj_full_try2.pt"
    ckpt_dir = "./checkpoints/novel_proto_8_layer.pt"
    tokenizer_path = "./tokenizers/tokenizer.model"
    temperature: float = 0.6
    top_p: float = 0.9
    max_seq_len: int = 512
    batch_size: int = 32
    max_gen_len: Optional[int] = None

    def prediction(
        model,
        infer_dataset,
        temperature: float = 0.6,
        top_p: float = 0.9,
        max_seq_len: int = 512,
        batch_size: int = 32,
        max_gen_len: Optional[int] = None,
        idx=None,
    ):
        """Generate completions for every prompt in ``infer_dataset``, batch by batch.

        Returns the concatenated ``model.text_completion`` outputs and the
        matching answers. ``max_seq_len`` and ``idx`` are accepted but unused.
        """
        predicted_sequences = []
        ground_truths = []
        # Walk the WHOLE dataset; the stop value used to be `batch_size`,
        # which produced exactly one batch regardless of dataset size.
        for i, step in enumerate(range(0, len(infer_dataset), batch_size)):
            # Progress line every 10th batch (the old `step % 10` test was inverted).
            if i % 10 == 0:
                print(f"Batch: {i}, Samples: {step}")
            # Slicing the dataset yields {"prompt": [...], "answer": [...]}.
            batch = infer_dataset[step : step + batch_size]

            prompts: List[str] = batch["prompt"]
            answers = batch["answer"]
            ground_truths += answers

            predicted_sequence = model.text_completion(
                prompts,  # type: ignore
                max_gen_len=max_gen_len,
                temperature=temperature,
                top_p=top_p,
            )
            predicted_sequences += predicted_sequence
        return predicted_sequences, ground_truths

    def save_inference_results(
        evaluation_result: dict,
        sources_sequences: list,
        predicted_sequences: list,
        ground_truths: list,
    ):
        """Dump metrics, prompts, predictions and labels to ``testQA.json``."""
        # save as a json file
        df = {
            "eval": evaluation_result,
            "prompts": sources_sequences,
            "results": predicted_sequences,
            "labels": ground_truths,
        }
        with open("testQA.json", "w+", encoding="utf-8") as file:
            json.dump(df, file, ensure_ascii=False)

    model = Llama.build(
        ckpt_dir=ckpt_dir,
        tokenizer_path=tokenizer_path,
        max_seq_len=max_seq_len,
        max_batch_size=batch_size,
    )

    data = load_dataset("wiki_qa", split="test")
    # Selects every row; narrow the range to evaluate on a subset.
    data_split = data.select(range(0, len(data)))
    # Filter with the configured limit rather than a second hard-coded 512.
    infer_dataset = EvaluationDataset(data_split, max_seq_len)

    # Inference ! Forward the settings above instead of relying on
    # prediction()'s defaults happening to match them.
    predicted_sequences, ground_truths = prediction(
        model,
        infer_dataset,
        temperature=temperature,
        top_p=top_p,
        max_seq_len=max_seq_len,
        batch_size=batch_size,
        max_gen_len=max_gen_len,
    )
    if not evaluate:
        return predicted_sequences, ground_truths

    # Get Accuracy/ROUGE/BLEU/...
    # The evaluation result is stored in a dictionary. e.g. {"accuracy": .., "rouge-L": ..}
    evaluation_result = eval_general.eval(predicted_sequences, ground_truths)
    # if args.global_rank <= 0:  # only one process is running
    print("***** Saving inference results *****")
    save_inference_results(
        evaluation_result,
        infer_dataset[:]["prompt"],
        predicted_sequences,
        ground_truths,
    )

    return evaluation_result, model.runtimes


# result, runtime = runnn()

# mean = np.mean(runtime)

# low = np.percentile(runtime, 5)
# median = np.percentile(runtime, 50)
# high = np.percentile(runtime, 95)
# std = np.std(runtime)
# runtime_metrics = [mean, median, low, high, std]

# metrics = ["mean", "median", "low", "high", "std"]
# tables = {metric: {} for metric in metrics}
# print(tables)
# for key, val in result.items():
#     for attr, value in val.items():
#         tables[attr][key] = value

# count = 0
# for metric, values in tables.items():
#     print(f"Table for {metric}:")
#     res = ""
#     for v in values:
#         res += str(values[v]) + " "
#     res += str(runtime_metrics[count])
#     count += 1
#     print(res)
#     print()



In [7]:
# Summary statistics over `runtime`. NOTE(review): `runtime` is only assigned
# by the commented-out `result, runtime = runnn()` line, so this cell relies
# on kernel state from an earlier session.
mean, std = np.mean(runtime), np.std(runtime)

# 5th / 50th / 95th percentiles in a single call.
low, median, high = np.percentile(runtime, [5, 50, 95])
runtime_metrics = [mean, median, low, high, std]
mean, median, low, high, std

(5296310.842741208, 5341310.0, 4689967.3, 5562894.0, 3369310.761344379)

In [8]:
# One (initially empty) table per summary statistic.
metrics = ['mean', 'median', 'low', 'high', 'std']
tables = {}
for metric in metrics:
    tables[metric] = {}
print(tables)

# Pivot result[key][attr] into tables[attr][key].
for key, val in result.items():
    for attr, value in val.items():
        tables[attr][key] = value

# Print one space-separated row per statistic, runtime figure last.
count = 0
for metric, values in tables.items():
    print(f"Table for {metric}:")
    cells = [str(values[name]) for name in values]
    cells.append(str(runtime_metrics[count]))
    res = ' '.join(cells)
    count += 1
    print(res)
    print()

{'mean': {}, 'median': {}, 'low': {}, 'high': {}, 'std': {}}
Table for mean:
0.009170254200980296 0.0014920185621561208 0.08956978928953643 0.08650932804574835 3.258996212121221 5296310.842741208

Table for median:
0.009680374259169351 0.00157209915425711 0.09458063018927546 0.091370973407304 3.4393939393939394 5341310.0

Table for low:
0.009144539330610981 0.0013564956303575384 0.0913219874170982 0.08936381649239213 3.3267045454545454 4689967.3

Table for high:
0.010249341575857373 0.0018227574042409792 0.09797446146321216 0.09331478087729288 3.5615530303030303 5562894.0

Table for std:
0.01118627798610428 0.004564195435031823 0.06736035281845515 0.039505201758894055 2.3658068338622282 3369310.761344379



In [None]:
import pandas as pd
# Same per-statistic rows as the plain-text tables, each shown as a one-row DataFrame.
count = 0
for metric, values in tables.items():
    print(f"Table for {metric}:")
    # Metric values in insertion order, followed by the matching runtime figure.
    res = [values[name] for name in values] + [runtime_metrics[count]]
    count += 1
    df = pd.DataFrame([res])
    print(df.head())

In [4]:
# Run inference; called this way runnn() returns (predicted_sequences, ground_truths).
predict, gt = runnn()

Loaded in 1.44 seconds
