<a href="https://colab.research.google.com/github/Muhammad-junaid-mujtaba/llm-train/blob/main/llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from datasets import load_dataset


# Download (or reuse the cached copy of) the CNN/DailyMail corpus, config 3.0.0.
dataset = load_dataset("cnn_dailymail", '3.0.0')

# Show which splits exist and what columns / row counts each one has.
print("Dataset Structure:")
print(dataset)

# Preview the first three training rows; the article is cut to 500 characters
# to keep the output readable, the highlights are printed in full.
print("\nExample from the training set:")
train_split = dataset['train']
for position in range(3):
    sample = train_split[position]
    print(f"\nExample {position + 1}:")
    print(f"Article: {sample['article'][:500]}...")
    print(f"Highlights: {sample['highlights']}")


Dataset Structure:
DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

Example from the training set:

Example 1:
Article: LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as s...
Highlights: Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .
Youn

In [None]:
! pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:0

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset
from evaluate import load
import tensorflow as tf



# Single source of truth for the pretrained checkpoint shared by the
# tokenizer and the model, so the two can never drift apart.
BART_CHECKPOINT = 'facebook/bart-large-cnn'

# Tokenizer and summarization model loaded from the same checkpoint.
tokenizer = BartTokenizer.from_pretrained(BART_CHECKPOINT)
model = BartForConditionalGeneration.from_pretrained(BART_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
! pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
def preprocess_function(examples):
    """Tokenize a batch of articles and their reference summaries.

    Args:
        examples: A batch mapping with an ``'article'`` entry (source texts)
            and a ``'highlights'`` entry (target summaries).

    Returns:
        The tokenizer output for the articles (truncated to 1024 tokens) with
        an extra ``'labels'`` entry holding the token ids of the highlights
        (truncated to 128 tokens).
    """
    model_inputs = tokenizer(examples['article'], max_length=1024, truncation=True)

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager. The targets are tokenized in a separate call so they keep their
    # own, shorter max_length instead of inheriting the 1024 used for inputs.
    labels = tokenizer(text_target=examples['highlights'], max_length=128, truncation=True)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [None]:
# Run the tokenization over every split, one batch of rows at a time.
tokenizer_dataset = dataset.map(preprocess_function, batched=True)


def _split_to_tf(split_name, shuffle):
    """Convert one tokenized split into a TensorFlow dataset of 16-row batches."""
    return tokenizer_dataset[split_name].to_tf_dataset(
        columns=['input_ids', 'attention_mask'],
        label_cols=['labels'],
        shuffle=shuffle,
        batch_size=16,
    )


# Only the training stream is shuffled; validation keeps its original order.
training = _split_to_tf('train', True)
evaluation = _split_to_tf('validation', False)



Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, Seq2SeqTrainingArguments
from datasets import load_dataset
from evaluate import load
import tensorflow as tf

# Seq2SeqTrainingArguments is used in place of TrainingArguments because the
# configuration below includes the seq2seq-only option `predict_with_generate`.
_seq2seq_settings = {
    # where checkpoints and logs are written
    "output_dir": "./results",
    "logging_dir": './logs',
    "logging_steps": 10,
    "save_total_limit": 3,
    # training / evaluation schedule
    "evaluation_strategy": "epoch",
    "num_train_epochs": 3,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    # seq2seq-specific switch
    "predict_with_generate": True,
}
training_args = Seq2SeqTrainingArguments(**_seq2seq_settings)



In [None]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer

# Collates tokenized rows into PyTorch batches, padding inputs to 1024 tokens.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding='max_length',
    max_length=1024,
    # Pad labels with -100, the index the loss ignores. Padding them with the
    # tokenizer's pad id would make the loss count every padded position as a
    # real target token.
    label_pad_token_id=-100,
    return_tensors="pt",
)

# Step 7: Create Trainer Instance
# `training_args` is a Seq2SeqTrainingArguments with predict_with_generate=True;
# that option is only acted on by Seq2SeqTrainer, not by the base Trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenizer_dataset['train'],
    eval_dataset=tokenizer_dataset['validation'],
    data_collator=data_collator,
)

# Step 8: Train the Model
trainer.train()

In [None]:
# ROUGE metric object, loaded once and reused by compute_metrics below.
rouge = load("rouge")

def compute_metrics(predictions, labels=None):
    """Decode token ids and score the predictions against the references with ROUGE.

    Args:
        predictions: Batch of predicted token-id sequences. When ``labels`` is
            omitted, this must instead be a ``(predictions, labels)`` pair, so
            the function can also be called with a single argument.
        labels: Batch of reference token-id sequences, or ``None`` when the
            pair is passed as the first argument.

    Returns:
        dict mapping each ROUGE key to its score multiplied by 100.
    """
    if labels is None:
        # Single-argument form: unpack the (predictions, labels) pair.
        predictions, labels = predictions

    pad_id = tokenizer.pad_token_id

    def _restore_padding(batch):
        # -100 marks ignored positions and is not a real token id, so swap it
        # for the pad id before decoding.
        return [[pad_id if int(tok) == -100 else int(tok) for tok in seq] for seq in batch]

    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(_restore_padding(predictions), skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(_restore_padding(labels), skip_special_tokens=True)

    # Compute ROUGE scores
    scores = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    def _as_percent(value):
        # The metric may return plain floats or aggregate objects exposing
        # `.mid.fmeasure`; support both shapes.
        mid = getattr(value, 'mid', None)
        return float(value if mid is None else mid.fmeasure) * 100

    return {key: _as_percent(value) for key, value in scores.items()}

# Step 10: Inference - Generate Summary for a New Input

test_text = """
    Facebook’s plans to integrate WhatsApp, Instagram and Messenger have come under scrutiny from the UK’s
    antitrust regulator, which warned the move could stifle competition.
"""

# Preprocess and generate a summary
# Move the encoded inputs to the model's device: after training the model may
# sit on an accelerator while freshly tokenized tensors are created on the CPU.
inputs = tokenizer(test_text, max_length=1024, return_tensors="pt", truncation=True).to(model.device)
summary_ids = model.generate(
    inputs['input_ids'],
    attention_mask=inputs['attention_mask'],  # pass the mask explicitly alongside the ids
    num_beams=4,
    max_length=150,
    early_stopping=True,
)

# Decode and print the generated summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("\nGenerated Summary:", summary)