# Evaluation

## Instalation

In [1]:
%%capture
# Uninstall existing potentially conflicting libraries
!pip uninstall -y torchvision torchaudio

# Install the correct PyTorch version for CUDA 11.8
# (This is a common CUDA version on Colab/cloud instances. Adjust if your CUDA version is different)
!pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118

# Install the other necessary libraries, potentially upgrading dependencies
!pip install --upgrade vllm==0.7.1 evaluate==0.4.3 rouge_score==0.1.2 bitsandbytes==0.45.1

# Re-install bitsandbytes specifically for CUDA 11.8 if necessary
# Sometimes a specific bitsandbytes version is needed for compatibility
!pip install bitsandbytes==0.45.1 --index-url https://download.pytorch.org/whl/cu118

## Global Variables

In [2]:
dataset_name = (
    input(
        "Enter the name of the generated dataset."
    )
)
print(f"{dataset_name=}")
model_name = (
    input(
        "Enter the name of fine-tuned LLM."
    )
)
print(f"{model_name=}")

Enter the name of the generated dataset.Kacper098/summarization_task
dataset_name='Kacper098/summarization_task'
Enter the name of fine-tuned LLM.Kacper098/Meta-Llama-3.1-8B-Instruct-Assistant-Summarization
model_name='Kacper098/Meta-Llama-3.1-8B-Instruct-Assistant-Summarization'


In [3]:
import torch


def get_gpu_info() -> str | None:
    """Gets GPU device name if available.

    Returns:
        str | None: Name of the GPU device if available, None if no GPU is found.
    """
    if not torch.cuda.is_available():
        return None

    gpu_name = torch.cuda.get_device_properties(0).name

    return gpu_name


active_gpu_name = get_gpu_info()

print("GPU type:")
print(active_gpu_name)

GPU type:
NVIDIA L4


Depending on the type of GPU you are using, we pick a max evaluation sample number to avoid waiting too much to generate the answers required for evaluation.

In [4]:
if active_gpu_name and "T4" in active_gpu_name:
    max_evaluation_samples = 8
elif active_gpu_name and ("A100" in active_gpu_name or "L4" in active_gpu_name):
    max_evaluation_samples = 70
elif active_gpu_name:
    max_evaluation_samples = 8
else:
    raise ValueError("No Nvidia GPU found.")

print("--- Parameters ---")
print(f"{max_evaluation_samples=}")

--- Parameters ---
max_evaluation_samples=70


## Load Fine-tuned LLM

In [5]:
from vllm import LLM

llm = LLM(
    model=model_name,
    max_model_len=4096,
    dtype="float16",
    quantization="bitsandbytes",
    load_format="bitsandbytes",
)

INFO 05-10 17:41:45 __init__.py:183] Automatically detected platform cuda.


config.json:   0%|          | 0.00/924 [00:00<?, ?B/s]

INFO 05-10 17:42:03 config.py:526] This model supports multiple tasks: {'reward', 'classify', 'score', 'embed', 'generate'}. Defaulting to 'generate'.
INFO 05-10 17:42:04 llm_engine.py:232] Initializing a V0 LLM engine (v0.7.1) with config: model='Kacper098/Meta-Llama-3.1-8B-Instruct-Assistant-Summarization', speculative_config=None, tokenizer='Kacper098/Meta-Llama-3.1-8B-Instruct-Assistant-Summarization', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.BITSANDBYTES, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=bitsandbytes, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_mo

tokenizer_config.json:   0%|          | 0.00/55.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

INFO 05-10 17:42:06 cuda.py:235] Using Flash Attention backend.
INFO 05-10 17:42:07 model_runner.py:1111] Starting to load model Kacper098/Meta-Llama-3.1-8B-Instruct-Assistant-Summarization...
INFO 05-10 17:42:07 loader.py:1078] Loading weights with BitsAndBytes quantization.  May take a while ...
INFO 05-10 17:42:07 weight_utils.py:251] Using model weights format ['*.safetensors']


model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 05-10 17:43:33 model_runner.py:1116] Loading model weights took 5.3422 GB
INFO 05-10 17:43:36 worker.py:266] Memory profiling takes 2.24 seconds
INFO 05-10 17:43:36 worker.py:266] the current vLLM instance can use total_gpu_memory (22.16GiB) x gpu_memory_utilization (0.90) = 19.94GiB
INFO 05-10 17:43:36 worker.py:266] model weights take 5.34GiB; non_torch_memory takes 0.05GiB; PyTorch activation peak memory takes 1.20GiB; the rest of the memory reserved for KV Cache is 13.35GiB.
INFO 05-10 17:43:36 executor_base.py:108] # CUDA blocks: 6835, # CPU blocks: 2048
INFO 05-10 17:43:36 executor_base.py:113] Maximum concurrency for 4096 tokens per request: 26.70x
INFO 05-10 17:43:38 model_runner.py:1435] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_uti

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:37<00:00,  1.08s/it]

INFO 05-10 17:44:16 model_runner.py:1563] Graph capturing finished in 38 secs, took 0.73 GiB
INFO 05-10 17:44:16 llm_engine.py:429] init engine (profile, create kv cache, warmup model) took 42.72 seconds





## Prepare Input Samples

In [6]:
from datasets import load_dataset

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
You are a helpful assistant specialized in summarizing documents. Generate a concise TL;DR summary in markdown format having a maximum of 512 characters of the key findings from the provided documents, highlighting the most significant insights

### Input:
{}

### Response:
{}"""


def format_sample(sample: dict) -> str:
    return alpaca_prompt.format(sample["instruction"], "")

In [7]:
dataset = load_dataset(dataset_name, split="test")
dataset = dataset.select(range(max_evaluation_samples))

README.md:   0%|          | 0.00/529 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.49M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.54M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/748 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/94 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/94 [00:00<?, ? examples/s]

In [8]:
len(dataset)


70

In [9]:
dataset[0]["instruction"][:1000]

"[![Hugging Face's logo](/front/assets/huggingface_logo-noborder.svg) Hugging Face](/)\n\n  * [ Models](/models)\n  * [ Datasets](/datasets)\n  * [ Spaces](/spaces)\n  * [ Posts](/posts)\n  * [ Docs](/docs)\n  * [ Enterprise](/enterprise)\n  * [Pricing](/pricing)\n  * [Log In](/login)\n  * [Sign Up](/join)\n\n\n\nAmazon SageMaker documentation\n\nRun training on Amazon SageMaker\n\n# Amazon SageMaker\n\n🏡 View all docsAWS Trainium & InferentiaAccelerateAmazon SageMakerArgillaAutoTrainBitsandbytesChat UICompetitionsDataset viewerDatasetsDiffusersDistilabelEvaluateGradioHubHub Python LibraryHugging Face Generative AI Services (HUGS)Huggingface.jsInference API (serverless)Inference Endpoints (dedicated)LeaderboardsLightevalOptimumPEFTSafetensorsSentence TransformersTRLTasksText Embeddings InferenceText Generation InferenceTokenizersTransformersTransformers.jssmolagentstimm\n\nSearch documentation\n\n`Ctrl+K`\n\nmain EN [ 355](https://github.com/huggingface/hub-docs)\n\n[Hugging Face on Am

In [10]:
dataset[0]["answer"][:1000]

'```markdown\nTL;DR: This guide explains how to train 🤗 Transformers models on AWS SageMaker, managing outputs, using hyperparameters, and enabling checkpointing for efficient deployment.\n```'

In [11]:
dataset = dataset.map(lambda sample: {"prompt": format_sample(sample)})

Map:   0%|          | 0/70 [00:00<?, ? examples/s]

In [12]:
dataset[0]

{'instruction': '[![Hugging Face\'s logo](/front/assets/huggingface_logo-noborder.svg) Hugging Face](/)\n\n  * [ Models](/models)\n  * [ Datasets](/datasets)\n  * [ Spaces](/spaces)\n  * [ Posts](/posts)\n  * [ Docs](/docs)\n  * [ Enterprise](/enterprise)\n  * [Pricing](/pricing)\n  * [Log In](/login)\n  * [Sign Up](/join)\n\n\n\nAmazon SageMaker documentation\n\nRun training on Amazon SageMaker\n\n# Amazon SageMaker\n\n🏡 View all docsAWS Trainium & InferentiaAccelerateAmazon SageMakerArgillaAutoTrainBitsandbytesChat UICompetitionsDataset viewerDatasetsDiffusersDistilabelEvaluateGradioHubHub Python LibraryHugging Face Generative AI Services (HUGS)Huggingface.jsInference API (serverless)Inference Endpoints (dedicated)LeaderboardsLightevalOptimumPEFTSafetensorsSentence TransformersTRLTasksText Embeddings InferenceText Generation InferenceTokenizersTransformersTransformers.jssmolagentstimm\n\nSearch documentation\n\n`Ctrl+K`\n\nmain EN [ 355](https://github.com/huggingface/hub-docs)\n\n[H

In [13]:
dataset["prompt"][0]

'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nYou are a helpful assistant specialized in summarizing documents. Generate a concise TL;DR summary in markdown format having a maximum of 512 characters of the key findings from the provided documents, highlighting the most significant insights\n\n### Input:\n[![Hugging Face\'s logo](/front/assets/huggingface_logo-noborder.svg) Hugging Face](/)\n\n  * [ Models](/models)\n  * [ Datasets](/datasets)\n  * [ Spaces](/spaces)\n  * [ Posts](/posts)\n  * [ Docs](/docs)\n  * [ Enterprise](/enterprise)\n  * [Pricing](/pricing)\n  * [Log In](/login)\n  * [Sign Up](/join)\n\n\n\nAmazon SageMaker documentation\n\nRun training on Amazon SageMaker\n\n# Amazon SageMaker\n\n🏡 View all docsAWS Trainium & InferentiaAccelerateAmazon SageMakerArgillaAutoTrainBitsandbytesChat UICompetitionsDataset viewerDatasetsDiffusersDistilabe

## Generate Answers

In [14]:
from vllm import SamplingParams

sampling_params = SamplingParams(
    temperature=0.0, top_p=0.95, min_p=0.05, max_tokens=4096
)
predictions = llm.generate(dataset["prompt"], sampling_params)

Token indices sequence length is longer than the specified maximum sequence length for this model (307301 > 131072). Running this sequence through the model will result in indexing errors
Processed prompts:   0%|          | 0/70 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]



Processed prompts:   1%|▏         | 1/70 [00:02<02:56,  2.56s/it, est. speed input: 1823.58 toks/s, output: 0.00 toks/s]



Processed prompts:  16%|█▌        | 11/70 [00:04<00:18,  3.26it/s, est. speed input: 19754.21 toks/s, output: 0.00 toks/s]



Processed prompts:  17%|█▋        | 12/70 [00:05<00:23,  2.44it/s, est. speed input: 16776.35 toks/s, output: 0.00 toks/s]



Processed prompts:  21%|██▏       | 15/70 [00:06<00:19,  2.79it/s, est. speed input: 24701.68 toks/s, output: 0.00 toks/s]



Processed prompts:  27%|██▋       | 19/70 [00:08<00:21,  2.34it/s, est. speed input: 22577.23 toks/s, output: 0.00 toks/s]



Processed prompts:  31%|███▏      | 22/70 [00:10<00:25,  1.89it/s, est. speed input: 29110.84 toks/s, output: 0.00 toks/s]



Processed prompts:  51%|█████▏    | 36/70 [00:11<00:07,  4.25it/s, est. speed input: 39152.51 toks/s, output: 0.00 toks/s]



Processed prompts:  54%|█████▍    | 38/70 [00:13<00:09,  3.54it/s, est. speed input: 37169.15 toks/s, output: 0.00 toks/s]



Processed prompts:  67%|██████▋   | 47/70 [00:15<00:06,  3.70it/s, est. speed input: 59054.13 toks/s, output: 0.00 toks/s]



Processed prompts: 100%|██████████| 70/70 [02:05<00:00,  1.79s/it, est. speed input: 7869.97 toks/s, output: 35.80 toks/s]


In [15]:
predictions[0].outputs[0].text

''

In [16]:
answers = [prediction.outputs[0].text for prediction in predictions]
answers[0]

''

In [17]:
import evaluate
import numpy as np

rouge = evaluate.load("rouge")


def compute_metrics(predictions: list[str], references: list[str]):
    result = rouge.compute(
        predictions=predictions, references=references, use_stemmer=True
    )
    result["mean_len"] = np.mean([len(p) for p in predictions])

    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [18]:
references = dataset["answer"]


In [19]:
references[0]

'```markdown\nTL;DR: This guide explains how to train 🤗 Transformers models on AWS SageMaker, managing outputs, using hyperparameters, and enabling checkpointing for efficient deployment.\n```'

In [20]:
validation_metrics = compute_metrics(answers, references)
print(validation_metrics)

{'rouge1': 0.1122, 'rouge2': 0.0519, 'rougeL': 0.0946, 'rougeLsum': 0.095, 'mean_len': 275.0571}
