In [1]:
%pip install \
    transformers \
    datasets \
    evaluate \
    rouge_score\
    loralib \
    bitsandbytes \
    peft --quiet \
    -U bitsandbytes

[33m  DEPRECATION: Building 'rouge_score' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'rouge_score'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import torch
import gc
import huggingface_hub
import time
import random
import evaluate
import pandas as pd
import bitsandbytes as bnb
from getpass import getpass
from tqdm.notebook import tqdm

from datasets import (
    load_dataset,
    concatenate_datasets
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoConfig,
    TrainingArguments,
    Trainer,
    # DataCollatorForSeq2Seq,
    DataCollatorForLanguageModeling
)
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    TaskType,
    PeftModel
)

In [15]:
hf_token = getpass("Hugging Face: ")
huggingface_hub.login(hf_token)

Hugging Face:  ········


# load dataset


## CNN/DailyMail

In [4]:
dataset = load_dataset("abisee/cnn_dailymail", "3.0.0")

README.md: 0.00B [00:00, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

## base preprocess

In [6]:
dataset["train"] = dataset["train"].remove_columns(["id"])
dataset["validation"] = dataset["validation"].remove_columns(["id"])
dataset["test"] = dataset["test"].remove_columns(["id"])

dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights'],
        num_rows: 11490
    })
})

In [7]:
df_train = pd.DataFrame(dataset['train'])
df_validation = pd.DataFrame(dataset['validation'])
df_test = pd.DataFrame(dataset['test'])

df = pd.concat([df_train, df_validation, df_test])

df

Unnamed: 0,article,highlights
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa..."
3,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non..."
4,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical..."
...,...,...
11485,Telecom watchdogs are to stop a rip-off that a...,Operators are charging up to 20p a minute - ev...
11486,The chilling reenactment of how executions are...,Bali Nine ringleaders will face the firing squ...
11487,It is a week which has seen him in deep water ...,Hardy was convicted of domestic abuse against ...
11488,"Despite the hype surrounding its first watch, ...",Apple sold more than 61 million iPhones in the...


In [8]:
df.isnull().sum()

article       0
highlights    0
dtype: int64

In [9]:
combined = concatenate_datasets([dataset["validation"], dataset["test"]])

dataset["test"] = combined

del dataset["validation"]

dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights'],
        num_rows: 287113
    })
    test: Dataset({
        features: ['article', 'highlights'],
        num_rows: 24858
    })
})

# base model (Aya8b)


In [16]:
model_name = "CohereLabs/aya-expanse-8b"

bnbConfig = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map = "auto",
    quantization_config=bnbConfig
)

tokenizer_config.json:   0%|          | 0.00/8.64k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/12.8M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/634 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

## ICL


### one shot

In [17]:
prompt = f"""
You are a professional summarizer for news articles.
Your task is to write a concise and accurate summary of a given news article.
The summary must follow the same style, tone, and length as the example provided.
Always put the final summary after <<<SUMMARY>>> marker.

Example:
Original article:
{dataset['test']['article'][2]}

Target summary:
<<<SUMMARY>>> {dataset['test']['highlights'][2]}

Now, summarize the following news article in the same style:
Original article:
{dataset['train']['article'][2]}

Summary:
<<<SUMMARY>>>
"""

In [18]:
device = torch.device("cuda")
inputs = tokenizer(prompt, return_tensors='pt').to(device)

In [19]:
model = model.eval()
with torch.no_grad():
  output = tokenizer.decode(
      model.generate(
          inputs["input_ids"],
          max_new_tokens=200,
      )[0],
      skip_special_tokens=True
  )
print(output)


You are a professional summarizer for news articles.
Your task is to write a concise and accurate summary of a given news article.
The summary must follow the same style, tone, and length as the example provided.
Always put the final summary after <<<SUMMARY>>> marker.

Example:
Original article:
(CNN)French striker Bafetimbi Gomis, who has a history of fainting, said he is now "feeling well" after collapsing during Swansea's 3-2 loss at Tottenham in the Premier League on Wednesday. The worrying incident occurred in the first half at White Hart Lane -- after Tottenham scored in the seventh minute -- but the 29-year-old left the pitch conscious following about five minutes of treatment. The Guardian added that he was wearing an oxygen mask. Play was temporarily stopped before resuming. As the match progressed, Swansea tweeted that Gomis was "fine," with manager Garry Monk using the same word to describe Gomis' condition. Gomis spent the night in hospital as a precaution, Swansea said o

### Zero shot

In [20]:
prompt = f"""
You are a professional summarizer for news articles.
Your task is to write a concise and accurate summary of a given news article.
Always put the final summary after <<<SUMMARY>>> marker.

Original article:
{dataset['train']['article'][0]}

Summary:
<<<SUMMARY>>>
"""

inputs = tokenizer(
    prompt,
    return_tensors="pt",
    padding=True,
    truncation=True,
    # max_length=200
).to(device)

# removing prompt from output ------------------------------

input_token_length = inputs["input_ids"].shape[1]

full_output_ids = model.generate(
    inputs["input_ids"],
    max_new_tokens=300,
    do_sample=False  # deterministic
)[0]

output_only_ids = full_output_ids[input_token_length:]

#-----------------------------------------------------------

cleaned_output = tokenizer.decode(
    output_only_ids,
    skip_special_tokens=True
)

cleaned_output

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


'Harry Potter star Daniel Radcliffe turns 18 on Monday, gaining access to a reported £20 million ($41.1 million) fortune.\nDespite his growing fame and riches, Radcliffe says he is keeping his feet firmly on the ground.\nHe has no plans to fritter his cash away on fast cars, drink and celebrity parties.\nRadcliffe will reprise his role as the boy wizard in the last two "Harry Potter" films.\nHe has also filmed a TV movie called "My Boy Jack" and will appear in "December Boys."\nHe made his stage debut playing a tortured teenager in "Equus" earlier this year.\nHe is braced for even closer media scrutiny now that he\'s legally an adult.'

## ROUGE Evaluation

In [21]:
# n_test = len(dataset['test'])

model_pred = []

for i in tqdm(range(222), desc="Processing summarizations"):
  # index = random.randint(0, n_test)
  # source = dataset['train']['article'][index]
  # target = dataset['train']['highlights'][index]

  article = dataset['test']['article'][i]

  prompt = f"""
  You are a professional summarizer for news articles.
  Your task is to write a concise and accurate summary of a given news article.
  Always put the final summary after <<<SUMMARY>>> marker.

  Original article:
  {article}

  Summary:
  <<<SUMMARY>>>
  """

  inputs = tokenizer(
      prompt,
      return_tensors="pt",
      padding=True,
      truncation=True,
      # max_length=200
  ).to(device)

  input_token_length = inputs["input_ids"].shape[1]

  full_output_ids = model.generate(
      inputs["input_ids"],
      max_new_tokens=300,
      do_sample=False  # deterministic
  )[0]

  output_only_ids = full_output_ids[input_token_length:]

  cleaned_output = tokenizer.decode(
      output_only_ids,
      skip_special_tokens=True
  )

  model_pred.append(cleaned_output)

Processing summarizations:   0%|          | 0/222 [00:00<?, ?it/s]

In [32]:
df_test = pd.DataFrame(dataset['test'])

df_sample = df_test.head(222)
df_sample["Pred(model)"] = model_pred
df_sample.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample["Pred(model)"] = model_pred


Unnamed: 0,article,highlights,Pred(model)
0,"(CNN)Share, and your gift will be multiplied. ...",Zully Broussard decided to give a kidney to a ...,A woman's decision to donate one of her kidne...
1,"(CNN)On the 6th of April 1996, San Jose Clash ...",The 20th MLS season begins this weekend .\nLea...,- MLS has grown from 10 teams in 1996 to 20 i...
2,"(CNN)French striker Bafetimbi Gomis, who has a...",Bafetimbi Gomis collapses within 10 minutes of...,French striker Bafetimbi Gomis of Swansea Cit...
3,(CNN)It was an act of frustration perhaps more...,Rory McIlroy throws club into water at WGC Cad...,"- Rory McIlroy, the world's No 1. golfer, lau..."
4,(CNN)A Pennsylvania community is pulling toget...,"Cayman Naib, 13, hasn't been heard from since ...",A Pennsylvania community is searching for an ...


### eval results

In [33]:
rouge = evaluate.load("rouge")

results = rouge.compute(
    predictions=df_sample["Pred(model)"].tolist(),
    references=df_sample["highlights"].tolist(),
    use_stemmer=True
)

scores_df = pd.DataFrame({
    "Metric": ["ROUGE-1", "ROUGE-2", "ROUGE-L"],
    "Score": [
        results["rouge1"],
        results["rouge2"],
        results["rougeL"]
    ]
})

scores_df["Score (%)"] = (scores_df["Score"] * 100).round(2)

print(scores_df)

Downloading builder script: 0.00B [00:00, ?B/s]

    Metric     Score  Score (%)
0  ROUGE-1  0.329092      32.91
1  ROUGE-2  0.129679      12.97
2  ROUGE-L  0.225285      22.53


# PEFT model

In [24]:
def print_number_of_trainable_model_parameters(model):
    all_model_params = model.num_parameters()
    trainable_model_params = sum(param.numel() for param in model.parameters() if param.requires_grad)

    percentage_trainable = 100 * trainable_model_params / all_model_params if all_model_params > 0 else 0

    return (f"Trainable model parameters: {trainable_model_params}\n"
            f"All model parameters: {all_model_params}\n"
            f"Percentage of trainable model parameters: {percentage_trainable:.2f}%")

print(print_number_of_trainable_model_parameters(model))

Trainable model parameters: 1048711168
All model parameters: 8028033024
Percentage of trainable model parameters: 13.06%


In [25]:
def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

In [26]:
modules = find_all_linear_names(model)
modules

['gate_proj', 'q_proj', 'up_proj', 'down_proj', 'k_proj', 'v_proj', 'o_proj']

In [27]:
model = prepare_model_for_kbit_training(model)

In [28]:
peft_config = LoraConfig(
    r=16,
    lora_alpha=8,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
peft_model = get_peft_model(model, peft_config)
print(print_number_of_trainable_model_parameters(peft_model))

Trainable model parameters: 41943040
All model parameters: 8069976064
Percentage of trainable model parameters: 0.52%


In [29]:
def preprocess_fn(examples):
    prompts = [
        f"You are a professional summarizer for news articles.\n"
        f"Your task is to write a concise and accurate summary of a given news article.\n"
        f"Always put the final summary after <<<SUMMARY>>> marker.\n\n"
        f"Original article:\n"
        f"{src}\n\nSummary:\n<<<SUMMARY>>>"
        for src in examples["article"]
    ]
    targets = examples["highlights"]
    full_texts = [p + " " + t for p, t in zip(prompts, targets)]

    model_inputs = tokenizer(
        full_texts,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

    labels = model_inputs["input_ids"].clone()

    for i, prompt in enumerate(prompts):
        prompt_ids = tokenizer(prompt, truncation=True, max_length=128)["input_ids"]
        prompt_len = len(prompt_ids)
        labels[i][:prompt_len] = -100  # mask prompt from loss

    model_inputs["labels"] = labels
    return model_inputs


In [30]:
# dtrain = dataset["train"].shuffle(seed=65).select(range(5000))

In [34]:
tokenized_train = dataset["train"].map(
# tokenized_train = dtrain.map(
    preprocess_fn,
    batched=True,
    remove_columns=["article", "highlights"]
)

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

In [35]:
# dtest = dataset["test"].shuffle(seed=65).select(range(1000))

In [36]:
tokenized_eval = dataset["test"].map(
# tokenized_eval = dtest.map(
    preprocess_fn,
    batched=True,
    remove_columns=["article", "highlights"]
)

Map:   0%|          | 0/24858 [00:00<?, ? examples/s]

In [44]:
output_dir = f'./peft-cnn-summarization-{str(int(time.time()))}'
peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    gradient_accumulation_steps=2,
    warmup_steps =1000,
    logging_strategy="steps",
    lr_scheduler_type="cosine",
    # fp16=True,
    learning_rate=2e-5,
    logging_steps=100,
    eval_steps=1000,
    max_steps=3000,
    label_names=["labels"],
    log_level="info",
    report_to="none",
)

PyTorch: setting up devices
average_tokens_across_devices is True but world size is 1. Setting it to False automatically.


In [46]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    # padding="max_length",
    # max_length=300,
    mlm=False,
    return_tensors="pt"
    
)

In [47]:
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
    tokenizer=tokenizer
)

  peft_trainer = Trainer(
max_steps is given, it will override any value given in num_train_epochs


In [48]:
train_output = peft_trainer.train()

***** Running training *****
  Num examples = 287,113
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 3,000
  Number of trainable parameters = 41,943,040
  return fn(*args, **kwargs)


Step,Training Loss
100,2.4791
200,2.2281
300,1.6481
400,1.4452
500,1.4247
600,1.391
700,1.382
800,1.3871
900,1.3848
1000,1.3391


Saving model checkpoint to ./peft-cnn-summarization-1754841542/checkpoint-500
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--CohereLabs--aya-expanse-8b/snapshots/574bdb00b4dbbacae3d9666906045bafe5a5b44f/config.json
Model config CohereConfig {
  "architectures": [
    "CohereForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 5,
  "eos_token_id": 255001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "layer_norm_eps": 1e-05,
  "logit_scale": 0.125,
  "max_position_embeddings": 8192,
  "model_type": "cohere",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pad_token_id": 0,
  "rope_scaling": null,
  "rope_theta": 10000,
  "torch_dtype": "float16",
  "transformers_version": "4.55.0",
  "use_cache": true,
  "use_qk_norm": false,
  "vocab_size": 256000
}

chat template saved in ./peft-cnn-summarization-175484154

## save model

In [49]:
hf_token = getpass("Hugging Face: ")
huggingface_hub.login(hf_token)

Hugging Face:  ········


In [None]:
gc.collect()
torch.cuda.empty_cache()

In [50]:
peft_model_path = "./peft-cnn-summarization"
peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)


base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map = "auto",
    quantization_config=bnbConfig
)

peft_model = PeftModel.from_pretrained(base_model, peft_model_path)

merged_model = peft_model.merge_and_unload()

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--CohereLabs--aya-expanse-8b/snapshots/574bdb00b4dbbacae3d9666906045bafe5a5b44f/config.json
Model config CohereConfig {
  "architectures": [
    "CohereForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 5,
  "eos_token_id": 255001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "layer_norm_eps": 1e-05,
  "logit_scale": 0.125,
  "max_position_embeddings": 8192,
  "model_type": "cohere",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pad_token_id": 0,
  "rope_scaling": null,
  "rope_theta": 10000,
  "torch_dtype": "float16",
  "transformers_version": "4.55.0",
  "use_cache": true,
  "use_qk_norm": false,
  "vocab_size": 256000
}

chat template saved in ./peft-cnn-summarization/chat_template.jinja
tokenizer config file saved in ./peft-cnn-summarization/tokenizer_c

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing CohereForCausalLM.

All the weights of CohereForCausalLM were initialized from the model checkpoint at CohereLabs/aya-expanse-8b.
If your task is similar to the task the model of the checkpoint was trained on, you can already use CohereForCausalLM for predictions without further training.
loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--CohereLabs--aya-expanse-8b/snapshots/574bdb00b4dbbacae3d9666906045bafe5a5b44f/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 5,
  "eos_token_id": 255001,
  "pad_token_id": 0
}



In [51]:
full_model_path = "./peft-cnnDailyMail-summarization-aya-8b"
merged_model.save_pretrained(full_model_path)
tokenizer.save_pretrained(full_model_path)

Configuration saved in ./peft-cnnDailyMail-summarization-aya-8b/config.json
Configuration saved in ./peft-cnnDailyMail-summarization-aya-8b/generation_config.json
The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can find where each parameters has been saved in the index located at ./peft-cnnDailyMail-summarization-aya-8b/model.safetensors.index.json.
chat template saved in ./peft-cnnDailyMail-summarization-aya-8b/chat_template.jinja
tokenizer config file saved in ./peft-cnnDailyMail-summarization-aya-8b/tokenizer_config.json
Special tokens file saved in ./peft-cnnDailyMail-summarization-aya-8b/special_tokens_map.json


('./peft-cnnDailyMail-summarization-aya-8b/tokenizer_config.json',
 './peft-cnnDailyMail-summarization-aya-8b/special_tokens_map.json',
 './peft-cnnDailyMail-summarization-aya-8b/chat_template.jinja',
 './peft-cnnDailyMail-summarization-aya-8b/tokenizer.json')

In [52]:
merged_model.push_to_hub("MVesalA/cnnDailyMail-summarization-aya-8b-peft")
tokenizer.push_to_hub("MVesalA/cnnDailyMail-summarization-aya-8b-peft")

Configuration saved in /tmp/tmpvawqe07f/config.json
Configuration saved in /tmp/tmpvawqe07f/generation_config.json
The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can find where each parameters has been saved in the index located at /tmp/tmpvawqe07f/model.safetensors.index.json.
Uploading the following files to MVesalA/cnnDailyMail-summarization-aya-8b-peft: config.json,generation_config.json,model-00001-of-00002.safetensors,model-00002-of-00002.safetensors,model.safetensors.index.json,README.md


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...7f/model-00002-of-00002.safetensors:   0%|          |  688kB / 1.05GB            

  ...7f/model-00001-of-00002.safetensors:   0%|          | 12.9kB / 4.98GB            

README.md: 0.00B [00:00, ?B/s]

chat template saved in /tmp/tmpd0xhwabb/chat_template.jinja
tokenizer config file saved in /tmp/tmpd0xhwabb/tokenizer_config.json
Special tokens file saved in /tmp/tmpd0xhwabb/special_tokens_map.json
Uploading the following files to MVesalA/cnnDailyMail-summarization-aya-8b-peft: chat_template.jinja,tokenizer_config.json,special_tokens_map.json,tokenizer.json,README.md


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  /tmp/tmpd0xhwabb/tokenizer.json       :   8%|7         | 1.60MB / 20.1MB            

CommitInfo(commit_url='https://huggingface.co/MVesalA/cnnDailyMail-summarization-aya-8b-peft/commit/9283267935e1e25f3e0c14e8758881ad58193f65', commit_message='Upload tokenizer', commit_description='', oid='9283267935e1e25f3e0c14e8758881ad58193f65', pr_url=None, repo_url=RepoUrl('https://huggingface.co/MVesalA/cnnDailyMail-summarization-aya-8b-peft', endpoint='https://huggingface.co', repo_type='model', repo_id='MVesalA/cnnDailyMail-summarization-aya-8b-peft'), pr_revision=None, pr_num=None)

## ROUGE Evaluation

In [54]:
tokenizer = AutoTokenizer.from_pretrained("MVesalA/cnnDailyMail-summarization-aya-8b-peft")

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/20.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--MVesalA--cnnDailyMail-summarization-aya-8b-peft/snapshots/9283267935e1e25f3e0c14e8758881ad58193f65/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--MVesalA--cnnDailyMail-summarization-aya-8b-peft/snapshots/9283267935e1e25f3e0c14e8758881ad58193f65/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--MVesalA--cnnDailyMail-summarization-aya-8b-peft/snapshots/9283267935e1e25f3e0c14e8758881ad58193f65/tokenizer_config.json
loading file chat_template.jinja from cache at /root/.cache/huggingface/hub/models--MVesalA--cnnDailyMail-summarization-aya-8b-peft/snapshots/9283267935e1e25f3e0c14e8758881ad58193f65/chat_template.jinja
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [56]:
n_test = len(dataset['test'])

model_pred = []

for i in tqdm(range(222), desc="Processing summarizations"):
  # index = random.randint(0, n_test)
  # source = dataset['train']['article'][index]
  # target = dataset['train']['highlights'][index]

  article = dataset['test']['article'][i]

  prompt = f"""
  You are a professional summarizer for news articles.
  Your task is to write a concise and accurate summary of a given news article.
  Always put the final summary after <<<SUMMARY>>> marker.

  Original article:
  {article}

  Summary:
  <<<SUMMARY>>>
  """

  inputs = tokenizer(
      prompt,
      return_tensors="pt",
      padding=True,
      truncation=True,
      # max_length=200
  ).to(device)

  input_token_length = inputs["input_ids"].shape[1]

  full_output_ids = merged_model.generate(
      inputs["input_ids"],
      max_new_tokens=300,
      do_sample=False  # deterministic
  )[0]

  output_only_ids = full_output_ids[input_token_length:]

  cleaned_output = tokenizer.decode(
      output_only_ids,
      skip_special_tokens=True
  )

  model_pred.append(cleaned_output)

Processing summarizations:   0%|          | 0/222 [00:00<?, ?it/s]

In [57]:
df_sample = df_test.head(222)
df_sample["Pred(peft)"] = model_pred
df_sample.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sample["Pred(peft)"] = model_pred


Unnamed: 0,article,highlights,Pred(peft)
0,"(CNN)Share, and your gift will be multiplied. ...",Zully Broussard decided to give a kidney to a ...,A woman's decision to donate one of her kidne...
1,"(CNN)On the 6th of April 1996, San Jose Clash ...",The 20th MLS season begins this weekend .\nLea...,- MLS has grown from 10 teams in 1996 to 20 i...
2,"(CNN)French striker Bafetimbi Gomis, who has a...",Bafetimbi Gomis collapses within 10 minutes of...,French striker Bafetimbi Gomis of Swansea Cit...
3,(CNN)It was an act of frustration perhaps more...,Rory McIlroy throws club into water at WGC Cad...,"- Rory McIlroy, the world's No 1. golfer, lau..."
4,(CNN)A Pennsylvania community is pulling toget...,"Cayman Naib, 13, hasn't been heard from since ...",A Pennsylvania community is searching for an ...


### eval results

In [58]:
results = rouge.compute(
    predictions=df_sample["Pred(peft)"].tolist(),
    references=df_sample["highlights"].tolist(),
    use_stemmer=True
)

scores_df = pd.DataFrame({
    "Metric": ["ROUGE-1", "ROUGE-2", "ROUGE-L"],
    "Score": [
        results["rouge1"],
        results["rouge2"],
        results["rougeL"]
    ]
})

scores_df["Score (%)"] = (scores_df["Score"] * 100).round(2)

print(scores_df)

    Metric     Score  Score (%)
0  ROUGE-1  0.330633      33.06
1  ROUGE-2  0.131018      13.10
2  ROUGE-L  0.228428      22.84
