# Required libraries and dependencies

In [1]:
!pip install transformers[torch] accelerate datasets trl peft torch py7zr -q -U
!pip install -q -U bitsandbytes
!pip install torchvision -q -U
!pip install llama-recipes
!pip install wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.1/315.1 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.8/245.8 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.4/296.4 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m797.2/797.2 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.4/209.4 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import torch
import torchvision
import transformers

In [3]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import bitsandbytes as bnb
from pprint import pprint
from datasets import Dataset
import torch.nn as nn
import wandb
from datasets import load_dataset, concatenate_datasets
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          LlamaForCausalLM,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          logging)

In [None]:
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [4]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
"""
Disabling tokenizer parallelism can be helpful in scenarios where you encounter issues related to parallel processing, such as race conditions or excessive memory usage. It can also make debugging easier by ensuring that operations are performed sequentially.
"""
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [6]:
!nvidia-smi

Thu Jul 25 09:54:29 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P8               9W /  70W |      3MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Load dataset
Here we load Samsumg/samsum dataset.

In [7]:
dataset = load_dataset("samsum",trust_remote_code=True)
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.04k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [8]:
train_dataset_filtered = dataset['train'].filter(lambda example: example['id'] != 13828807)
dataset['train'] = train_dataset_filtered

print(f"Train dataset size: {len(dataset['train'])}")
print(f"Test dataset size: {len(dataset['test'])}")
print(f"Validation dataset size: {len(dataset['validation'])}")

Filter:   0%|          | 0/14732 [00:00<?, ? examples/s]

Train dataset size: 14732
Test dataset size: 819
Validation dataset size: 818


# Load model

In [9]:
from huggingface_hub import login

login('INSERT YOUR TOKEN')

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Load BitsAndBytesConfig to load a less precise version of the model to reduce memory usage

In [None]:
model_name = "meta-llama/Meta-Llama-3-8B"

compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",            # from QLoRA
    bnb_4bit_compute_dtype=compute_dtype,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device,
    torch_dtype=compute_dtype,
    quantization_config=bnb_config,
)

model.config.use_cache = False
model.config.pretraining_tp = 1

max_seq_length = 1024 #max model input length
tokenizer = AutoTokenizer.from_pretrained(model_name, max_seq_length=max_seq_length)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Understanding the maximum tokenized dialogue length and the maximum tokenized summary length

In [None]:
def get_max_length(dataset, text_field):
    concatenated_dataset = concatenate_datasets([dataset["train"], dataset["validation"], dataset["test"]])

    # Tokenize the dataset and get input_ids
    tokenized_dataset = concatenated_dataset.map(
        lambda x: tokenizer(x[text_field], truncation=True),
        batched=True,
        remove_columns=[text_field]
    )

    # Find the maximum length of the tokenized input_ids
    max_length = max(len(x) for x in tokenized_dataset["input_ids"])

    return max_length

# Get maximum lengths for source and target fields
max_source_length = get_max_length(dataset, "dialogue")
print(f"Max source length: {max_source_length}")

max_target_length = get_max_length(dataset, "summary")
print(f"Max target length: {max_target_length}")

## Testing base model

In [10]:
sample = dataset['test'][0]

eval_prompt = f"""
You are a chat dialogue summarizer. Below is a chat between two or more people. Each message is prefixed by the speaker's name followed by a colon (:).

Your task is to summarize the chat producing a concise summary, retaining the important points and key information. If a person sends an image, its description will be provided within curved brackets.

Chat: {sample['dialogue']}
-------------------
Summary:""".strip()

In [None]:
model_input = tokenizer(eval_prompt, return_tensors="pt",padding=True,truncation=True).to(device)

model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


You are a chat dialogue summarizer. Below is a chat between two or more people. Each message is prefixed by the speaker's name followed by a colon (:).

Your task is to summarize the chat producing a concise summary, retaining the important points and key information. If a person sends an image, its description will be provided within curved brackets.

Chat: Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye
-------------------
Summary: Hannah: Hey, do you have Betty's number? Amanda: Lemme check Hannah: <file_gif> Amanda: Sorry, can't find it. Amanda: Ask Larry Amanda: He called her last time we were at the park together Hannah: I don

We notice how the model doesn't understand the structure of a chat dialogue and is copying the dialogue directly.

# Fine tuning

Lora configuration for kbit training

In [None]:
from peft import get_peft_model, prepare_model_for_kbit_training, LoraConfig
from dataclasses import asdict
from llama_recipes.configs import lora_config as LORA_CONFIG

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj",],
)

model_peft = prepare_model_for_kbit_training(model)
model_peft = get_peft_model(model, peft_config)
model_peft.to(device)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear4bit(
  

In [None]:
model_peft.print_trainable_parameters()

trainable params: 167,772,160 || all params: 8,198,033,408 || trainable%: 2.0465


## Preparing data for fine tuning

Let's downsample the dataset since we do not have the capabilities to handle all of it.


In [11]:
print(dataset.keys())

dict_keys(['train', 'test', 'validation'])


In [12]:
train_size = 1500
validation_size = 300

df_train = dataset['train'].train_test_split(train_size=train_size, seed=42)['train']
df_validation = dataset['validation'].train_test_split(test_size=validation_size, seed=42)['test']

In [13]:
print("Downsampled train size:", len(df_train))
print("Downsampled validation size:", len(df_validation))

Downsampled train size: 1500
Downsampled validation size: 300


Let's insert a personalized prompt to make clearer the scope of the task to the model.

In [14]:
def generate_prompt(chat):
    return f"""
You are a chat dialogue summarizer. Below is a chat between two or more people. Each message is prefixed by the speaker's name followed by a colon (:).

Your task is to summarize the chat producing a concise summary, retaining the important points and key information. If a person sends an image, its description will be provided within curved brackets.

Chat:
{chat["dialogue"]}

-----------------------------------
Summary:
{chat["summary"]}
""".strip()

def generate_prompt_eval(chat):
    return f"""
You are a chat dialogue summarizer. Below is a chat between two or more people. Each message is prefixed by the speaker's name followed by a colon (:).

Your task is to summarize the chat producing a concise summary, retaining the important points and key information. If a person sends an image, its description will be provided within curved brackets.

Chat:
{chat["dialogue"]}

-----------------------------------
Summary:
""".strip()

Preprocess the data including the prompt

In [15]:
tokenized_train = [generate_prompt(chat) for chat in df_train]
tokenized_validation = [generate_prompt(chat) for chat in df_validation]

tokenized_train= pd.DataFrame(tokenized_train)
tokenized_train.columns = ['dialogue']
tokenized_train['summary'] = df_train['summary']
tokenized_validation = pd.DataFrame(tokenized_validation)
tokenized_validation.columns = ['dialogue']
tokenized_validation['summary'] = df_validation['summary']

train_dataset  = Dataset.from_pandas(pd.DataFrame(tokenized_train))
validation_dataset  = Dataset.from_pandas(pd.DataFrame(tokenized_validation))

In [16]:
pprint(train_dataset[0])

{'dialogue': 'You are a chat dialogue summarizer. Below is a chat between two '
             "or more people. Each message is prefixed by the speaker's name "
             'followed by a colon (:).\n'
             '\n'
             'Your task is to summarize the chat producing a concise summary, '
             'retaining the important points and key information. If a person '
             'sends an image, its description will be provided within curved '
             'brackets.\n'
             '\n'
             'Chat:\n'
             'Raymond: Can someone lend me a drill?\r\n'
             "Geoffrey: Sorry, mine's broken. \r\n"
             'Francis: You can borrow mine. I promised Molly to put up a new '
             'shelf in the kitchen :D\r\n'
             "Raymond: Haha, looks like I'll be doing you a favor?\r\n"
             "Francis: Definitely yes! Now I'm going to have a perfect excuse "
             ';)\r\n'
             'Geoffrey: Does anyone want to borrow my lawn mower?\n'


In [None]:
wandb.init(project="your_project")

[34m[1mwandb[0m: Currently logged in as: [33mbrigomarco11[0m ([33mbrigomarco11-university-of-padua[0m). Use [1m`wandb login --relogin`[0m to force relogin


## Trainer and TrainingArguments

In [None]:
output_dir="llama3-samsum"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=1,
    run_name = "last_run",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=50,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    report_to="wandb",
)

In [None]:
trainer = SFTTrainer(
    model=model_peft,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    peft_config=peft_config,
    tokenizer=tokenizer,
    dataset_text_field="dialogue",
    max_seq_length=max_seq_length,
    packing=False,
    dataset_kwargs={
        "add_special_tokens": False,
        "append_concat_token": False,
    }
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
trainer.train()

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
50,1.6455
100,1.3819
150,1.3165
200,1.3869
250,1.3366
300,1.3028
350,1.2777
400,1.3342
450,1.2636
500,1.3119


TrainOutput(global_step=750, training_loss=1.340212661743164, metrics={'train_runtime': 2320.9198, 'train_samples_per_second': 0.646, 'train_steps_per_second': 0.323, 'total_flos': 1.684786115764224e+16, 'train_loss': 1.340212661743164, 'epoch': 1.0})

Small test to see how it performs after fine tuning using appropriate generation config

In [None]:
chat = generate_prompt_eval(dataset['test'][0])

inputs = tokenizer(chat, return_tensors="pt")
output = trainer.model.generate(inputs.input_ids,
                                max_length=256,
                                min_length=50,
                                length_penalty=2.5,
                                num_beams=6,
                                repetition_penalty=2.5,
                                early_stopping=True,
                                no_repeat_ngram_size = 3
                                )
result = tokenizer.batch_decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


In [None]:
pprint(dataset['test'][0])

{'dialogue': "Hannah: Hey, do you have Betty's number?\n"
             'Amanda: Lemme check\n'
             'Hannah: <file_gif>\n'
             "Amanda: Sorry, can't find it.\n"
             'Amanda: Ask Larry\n'
             'Amanda: He called her last time we were at the park together\n'
             "Hannah: I don't know him well\n"
             'Hannah: <file_gif>\n'
             "Amanda: Don't be shy, he's very nice\n"
             'Hannah: If you say so..\n'
             "Hannah: I'd rather you texted him\n"
             'Amanda: Just text him 🙂\n'
             'Hannah: Urgh.. Alright\n'
             'Hannah: Bye\n'
             'Amanda: Bye bye',
 'id': '13862856',
 'summary': "Hannah needs Betty's number but Amanda doesn't have it. She needs "
            'to contact Larry.'}


In [None]:
pprint(result)

('You are a chat dialogue summarizer. Below is a chat between two or more '
 "people. Each message is prefixed by the speaker's name followed by a colon "
 '(:).\n'
 '\n'
 'Your task is to summarize the chat producing a concise summary, retaining '
 'the important points and key information. If a person sends an image, its '
 'description will be provided within curved brackets.\n'
 '\n'
 'Chat:\n'
 "Hannah: Hey, do you have Betty's number?\n"
 'Amanda: Lemme check\n'
 'Hannah: <file_gif>\n'
 "Amanda: Sorry, can't find it.\n"
 'Amanda: Ask Larry\n'
 'Amanda: He called her last time we were at the park together\n'
 "Hannah: I don't know him well\n"
 'Hannah: <file_gif>\n'
 "Amanda: Don't be shy, he's very nice\n"
 'Hannah: If you say so..\n'
 "Hannah: I'd rather you texted him\n"
 'Amanda: Just text him 🙂\n'
 'Hannah: Urgh.. Alright\n'
 'Hannah: Bye\n'
 'Amanda: Bye bye\n'
 '\n'
 '-----------------------------------\n'
 "Summary: Hannah asks Amanda for Betty's phone number, but Amanda d

Save the model to Huggingface

In [None]:
tokenizer.save_pretrained("llama3-samsum")
trainer.create_model_card(
    model_name="Llama3-samsum-QLora",
    language="en",
    tags=["summarization", "transformers", "llama3","Lora","QLora"],
    tasks="summarization"
)

# Push the results to the hub
trainer.push_to_hub("llama3-samsum")

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/671M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.50k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/MarcoBrigo11/trained_weigths/commit/a5b1844ddbeb9875706a127e1913d5d961feb38a', commit_message='llama3-samsum', commit_description='', oid='a5b1844ddbeb9875706a127e1913d5d961feb38a', pr_url=None, pr_revision=None, pr_num=None)

# Evaluation *(Work in progress)*

Using this function requires high computational resources.

In [None]:
!pip install evaluate
!pip install rouge_score

Let's evaluate the ROUGE with test set

In [18]:
import evaluate
rouge_metric = evaluate.load('rouge')

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [42]:
def evaluate_model(model, tokenizer, texts, references, batch_size=8):
    model.eval()
    all_predictions = []
    all_references = []

    for i in tqdm(range(0, len(texts), batch_size), desc="Evaluating", unit="batch"):
        batch_texts = texts[i:i+batch_size]
        batch_references = references[i:i+batch_size]

        print(f"Batch {i // batch_size + 1}/{len(texts) // batch_size + 1}")

        inputs = tokenizer(batch_texts, return_tensors="pt", max_length=max_seq_length,truncation=True,padding=True).to(device)
        with torch.no_grad():
            #print("Generating summaries...")
            summary_ids = model.generate(inputs.input_ids, max_length=512,
                                min_length=50,
                                length_penalty=2.5,
                                num_beams=6,
                                repetition_penalty=2.5,
                                early_stopping=True,
                                no_repeat_ngram_size = 3)
            #print("Summaries generated.")

        predictions = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]

        all_predictions.extend(predictions)
        all_references.extend(batch_references)

        #pprint(f"Predictions: {predictions}")
        #pprint(f"References: {batch_references}")

    result = rouge_metric.compute(predictions=all_predictions, references=all_references, use_stemmer=True)
    return result

In [20]:
test_size = 300

df_test = dataset['test'].train_test_split(test_size=test_size, seed=42)['test']

In [21]:
print(len(df_test))

300


In [22]:
tokenized_test = [generate_prompt_eval(chat) for chat in df_test]
tokenized_test = pd.DataFrame(tokenized_test)
tokenized_test.columns = ['dialogue']
tokenized_test['summary'] = df_test['summary']

test_dataset  = Dataset.from_pandas(pd.DataFrame(tokenized_test))

Loading the saved model and loading it with BitsAndBytesConfig

In [23]:
fine_tuned_model_id = "MarcoBrigo11/llama3-samsum"

compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

max_seq_length = 1024

model_fine_tuned = AutoModelForCausalLM.from_pretrained(fine_tuned_model_id,
                                                        device_map=device,
                                                        torch_dtype=compute_dtype,
                                                        quantization_config=bnb_config
                                                        )

model_fine_tuned.config.use_cache = False
model_fine_tuned.config.pretraining_tp = 1

tokenizer_fine_tuned = AutoTokenizer.from_pretrained(fine_tuned_model_id,max_seq_length=max_seq_length)
tokenizer_fine_tuned.pad_token_id = tokenizer_fine_tuned.eos_token_id
tokenizer_fine_tuned.padding_side = "left"

adapter_config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/671M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/335 [00:00<?, ?B/s]

In [43]:
rouge_scores_final = evaluate_model(model_fine_tuned, tokenizer_fine_tuned, test_dataset['dialogue'], test_dataset['summary'], batch_size=4)

Starting evaluation...


Evaluating:   0%|          | 0/75 [00:00<?, ?batch/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Batch 1/76
Generating summaries...


Evaluating:   1%|▏         | 1/75 [01:29<1:50:34, 89.66s/batch]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Summaries generated.
("Predictions: ['You are a chat dialogue summarizer. Below is a chat between "
 "two or more people. Each message is prefixed by the speaker\\'s name "
 'followed by a colon (:).\\n\\nYour task is to summarize the chat producing a '
 'concise summary, retaining the important points and key information. If a '
 'person sends an image, its description will be provided within curved '
 'brackets.\\n\\nChat:\\nClaire: <file_photo>\\r\\nKim: Looks '
 "delicious...\\r\\nLinda: No way... Look what I\\'m cooking right "
 'now:\\r\\nLinda: <file_photo>\\r\\nClaire: hahahaha \\r\\nKim: Curry dream '
 'team\\r\\nClaire: Enjoy your dinner '
 ':*\\n\\n-----------------------------------\\nSummary: Claire sent Kim a '
 'photo of her dinner. Kim commented that it looked delicious. Linda sent a '
 'picture of what she was cooking at the moment. Claire said '
 '"hahahaha".\\r\\n-----------------------------------\\n-----------------------------------\\nPhoto: '
 "Claire\\'s dinner\

Evaluating:   1%|▏         | 1/75 [01:40<2:03:44, 100.33s/batch]

An error occurred: CUDA out of memory. Tried to allocate 4.58 GiB. GPU 0 has a total capacity of 14.75 GiB of which 1.92 GiB is free. Process 9680 has 12.83 GiB memory in use. Of the allocated memory 9.81 GiB is allocated by PyTorch, and 2.89 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)





In [44]:
pd.DataFrame(rouge_scores_final, index=[fine_tuned_model_id])

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
MarcoBrigo11/llama3-samsum,0.132285,0.05055,0.096523,0.12074
