In [1]:
! pip install sentencepiece
! pip install rouge_score



In [2]:
import torch
import numpy as np
import datasets

from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)

# from tabulate import tabulate
import nltk
from datetime import datetime
import os

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
os.environ["trust_remote_code"] = "True"

In [5]:
WANDB_INTEGRATION = True
if WANDB_INTEGRATION:
    import wandb

    wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: ahmed-sakr79 (ahmed-sakr79-asu). Use `wandb login --relogin` to force relogin


In [6]:
# models = {
#     "bart-small": "facebook/bart-base",  # BART small equivalent
#     "t5-small": "t5-small",               # T5 small
#     "pegasus-small": "google/pegasus-xsum" # Pegasus small equivalent
# }

In [6]:
model_name = "pegasus-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
encoder_max_length = 512  
decoder_max_length = 128

In [8]:
data = datasets.load_dataset("cnn_dailymail", "3.0.0", split="train[:100000]")

In [9]:
def flatten(example):
    return {
        "document": example["article"],
        "summary": example["highlights"],
    }

def listToSamples(example):
    result = {"document": example["document"], "summary": example["summary"]}
    return result

In [10]:
dataset = data.map(flatten)

In [11]:
dataset = dataset.map(listToSamples)

In [12]:
train_data_txt, validation_data_txt = dataset.train_test_split(test_size=0.3).values()

In [13]:
def preprocess(batch, tokenizer, max_source_length, max_target_length):
    source, target = batch["document"], batch["summary"]
    source_tokenized = tokenizer(
        source, padding="max_length", truncation=True, max_length=max_source_length
    )
    target_tokenized = tokenizer(
        target, padding="max_length", truncation=True, max_length=max_target_length
    )

    batch = {k: v for k, v in source_tokenized.items()}
    # Ignore padding in the loss
    batch["labels"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in l]
        for l in target_tokenized["input_ids"]
    ]
    return batch

In [14]:
train_data = train_data_txt.map(
    lambda batch: preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    ),
    batched=True,
    remove_columns=train_data_txt.column_names,
)

Map:   0%|          | 0/70000 [00:00<?, ? examples/s]

In [15]:
validation_data = validation_data_txt.map(
    lambda batch: preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    ),
    batched=True,
    remove_columns=validation_data_txt.column_names,
)

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

In [16]:
nltk.download("punkt", quiet=True)

metric = datasets.load_metric("rouge")


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels

  metric = datasets.load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [17]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Extract a few results from ROUGE
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [26]:
training_args = Seq2SeqTrainingArguments(
    output_dir="results",
    num_train_epochs=1,  
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=4,  # demo
    per_device_eval_batch_size=4,
    # learning_rate=3e-05,
    warmup_steps=500,
    weight_decay=0.1,
    label_smoothing_factor=0.1,
    predict_with_generate=True,
    logging_dir="logs",
    logging_steps=50,
    save_total_limit=3,
    load_best_model_at_end=True,  # Load the best model at the end of training
    # metric_for_best_model="eval_loss",  # Use evaluation loss to determine the best model
    # greater_is_better=False,  # Lower evaluation loss indicates a better model
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",        # Save at the end of each epoch
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=validation_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],

)

In [19]:
if WANDB_INTEGRATION:
    wandb_run = wandb.init(
        project="text_summarizer",
        config={
            "per_device_train_batch_size": training_args.per_device_train_batch_size,
            "learning_rate": training_args.learning_rate,
            "dataset": "cnn_dailynews ",
        },
    )

    now = datetime.now()
    current_time = now.strftime("%H%M%S")
    wandb_run.name = "run_" + "_" + current_time

In [20]:
import gc

gc.collect()

torch.cuda.empty_cache()

In [66]:
#used for bart
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,3.343,3.201088,23.4154,10.3877,19.3834,21.8192,19.9942


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=4375, training_loss=3.398768709891183, metrics={'train_runtime': 5250.2268, 'train_samples_per_second': 13.333, 'train_steps_per_second': 0.833, 'total_flos': 2.13407760384e+16, 'train_loss': 3.398768709891183, 'epoch': 1.0})

In [63]:
#used for pegasus
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel
1,0.89,0.88,0.42,0.3,0.35


Non-default generation parameters: {'max_length': 64, 'num_beams': 8, 'length_penalty': 0.6, 'forced_eos_token_id': 1}
Non-default generation parameters: {'max_length': 64, 'num_beams': 8, 'length_penalty': 0.6, 'forced_eos_token_id': 1}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=12250, training_loss=3.244249363490513, metrics={'train_runtime': 21943.1858, 'train_samples_per_second': 2.233, 'train_steps_per_second': 0.558, 'total_flos': 7.0791878148096e+16, 'train_loss': 3.244249363490513, 'epoch': 1.0})

In [28]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [29]:
#used for t5
trainer.train()

  0%|          | 0/17500 [00:00<?, ?it/s]

{'loss': 3.5264, 'grad_norm': 1.9784258604049683, 'learning_rate': 5e-06, 'epoch': 0.0}
{'loss': 3.3818, 'grad_norm': 2.0811662673950195, 'learning_rate': 1e-05, 'epoch': 0.01}
{'loss': 3.4705, 'grad_norm': 1.8946020603179932, 'learning_rate': 1.5e-05, 'epoch': 0.01}
{'loss': 3.3589, 'grad_norm': 2.123340129852295, 'learning_rate': 2e-05, 'epoch': 0.01}
{'loss': 3.4446, 'grad_norm': 2.4568710327148438, 'learning_rate': 2.5e-05, 'epoch': 0.01}
{'loss': 3.3812, 'grad_norm': 2.0378899574279785, 'learning_rate': 3e-05, 'epoch': 0.02}
{'loss': 3.4396, 'grad_norm': 2.5617034435272217, 'learning_rate': 3.5e-05, 'epoch': 0.02}
{'loss': 3.4313, 'grad_norm': 2.099320411682129, 'learning_rate': 4e-05, 'epoch': 0.02}
{'loss': 3.4003, 'grad_norm': 2.774787425994873, 'learning_rate': 4.5e-05, 'epoch': 0.03}
{'loss': 3.4129, 'grad_norm': 2.1701996326446533, 'learning_rate': 5e-05, 'epoch': 0.03}
{'loss': 3.4743, 'grad_norm': 2.319622755050659, 'learning_rate': 4.985294117647059e-05, 'epoch': 0.03}
{'

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


{'train_runtime': 6976.1134, 'train_samples_per_second': 10.034, 'train_steps_per_second': 2.509, 'train_loss': 3.4348117667061944, 'epoch': 1.0}


TrainOutput(global_step=17500, training_loss=3.4348117667061944, metrics={'train_runtime': 6976.1134, 'train_samples_per_second': 10.034, 'train_steps_per_second': 2.509, 'total_flos': 9473926103040000.0, 'train_loss': 3.4348117667061944, 'epoch': 1.0})

In [30]:
if WANDB_INTEGRATION:
    wandb_run.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/gen_len,▁
eval/loss,▁
eval/rouge1,▁
eval/rouge2,▁
eval/rougeL,▁
eval/rougeLsum,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇██▁▁▂▂▃▃▃▄▄▄▅▅▅▆▆▇▇▇██

0,1
eval/gen_len,18.9564
eval/loss,3.24308
eval/rouge1,23.1447
eval/rouge2,9.6291
eval/rougeL,18.9967
eval/rougeLsum,21.5909
eval/runtime,2701.9733
eval/samples_per_second,11.103
eval/steps_per_second,2.776
total_flos,9473926103040000.0


In [31]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [70]:
model.push_to_hub("text_summarizer_bart")

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ahmeddsakrr/text_summarizer_bart/commit/bfb75d22e427c4650ac0255307b28704f20eacbe', commit_message='Upload BartForConditionalGeneration', commit_description='', oid='bfb75d22e427c4650ac0255307b28704f20eacbe', pr_url=None, pr_revision=None, pr_num=None)

In [67]:
model.push_to_hub("text_summarizer_pegasus")

Non-default generation parameters: {'max_length': 64, 'num_beams': 8, 'length_penalty': 0.6, 'forced_eos_token_id': 1}


model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ahmeddsakrr/text_summarizer_pegasus/commit/7e59cdbaa18832c1c2a4bd77a9a3339cef838f25', commit_message='Upload PegasusForConditionalGeneration', commit_description='', oid='7e59cdbaa18832c1c2a4bd77a9a3339cef838f25', pr_url=None, pr_revision=None, pr_num=None)

In [32]:
model.push_to_hub("text_summarizer_t5")

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ahmeddsakrr/text_summarizer_t5/commit/5a59969f4cecc6416a007b01f3910dee7f47a79f', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='5a59969f4cecc6416a007b01f3910dee7f47a79f', pr_url=None, pr_revision=None, pr_num=None)

In [33]:
from transformers import pipeline
# pipeline = pipeline("text2text-generation", model="ahmeddsakrr/text_summarizer_bart", tokenizer="facebook/bart-base", device="cuda:0", temperature=1)
pipeline = pipeline("text2text-generation", model="ahmeddsakrr/text_summarizer_t5", tokenizer="t5-small", device="cuda:0", temperature=1)

config.json:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/118 [00:00<?, ?B/s]

In [34]:
article = """ANKARA: Egyptian President Abdel Fattah al-Sisi arrived in Ankara on Wednesday for talks with Turkish counterpart Tayyip Erdogan, the first such presidential visit in 12 years, amid a warming of long-frozen relations between the regional powers.

Erdogan had travelled to Cairo in February, his first trip to Egypt since 2012, taking a major step toward rebuilding ties that were severely strained for a decade.

Relations between Ankara and Cairo collapsed in 2013 after Egypt's then-army chief Sisi led the ouster of the Muslim Brotherhood's Mohamed Mursi, a Turkish ally who had become Egypt's first democratically elected president the year before. Mursi visited Turkey as president in 2012.

"Turkey-Egypt relations will be reviewed in all their aspects, and possible joint steps in the coming period to further develop cooperation will be discussed," the Turkish presidency's communications office said late on Tuesday.

"There will be an exchange of views on current regional and global issues, especially the Israeli attacks on Gaza and the occupied Palestinian territories".

Erdogan met Sisi at Ankara airport before they left together in Erdogan's motorcade for the presidential palace for a welcome ceremony. They will also chair the first meeting of the Turkey-Egypt High-Level Strategic Cooperation Council.

A joint press conference is scheduled for 5pm (1400 GMT).

In a statement, Sisi said his visit - and that of Erdogan in February - showed Ankara and Cairo's common will to launch a new phase of friendship and cooperation.

Ties between the two countries began thawing in 2020 when Ankara launched a diplomatic charm offensive to ease tensions with its estranged regional rivals, including the United Arab Emirates, Saudi Arabia and Egypt.

Turkey and Egypt mutually reappointed ambassadors last year, and Ankara has said it would provide Cairo with armed drones. Erdogan said in Cairo that the countries wanted to boost annual trade by $5 billion to $15 billion in the short term.

Turkey's state-owned Anadolu news agency said the two countries would sign around 20 agreements during Sisi's visit to cooperate on energy, defence, tourism, health, culture and education. It said deepening cooperation on renewable energy and liquefied natural gas (LNG) was also planned.

Turkey, which has condemned Israel for its war against Hamas, has sent thousands of tonnes of aid to Egypt for Palestinians and praised Cairo's humanitarian efforts and role as negotiator in Gaza truce talks."""

In [35]:
pipeline(article)

Token indices sequence length is longer than the specified maximum sequence length for this model (564 > 512). Running this sequence through the model will result in indexing errors


[{'generated_text': 'Erdogan and Erdogan will meet in Ankara for talks . Turkey and Egypt have re'}]

In [36]:
prompt_2 = """That may help explain why Musk was willing to close X’s office in Brazil rather than comply with a judge’s orders to remove some accounts for sharing alleged misinformation and hate speech. Once Musk shuttered the office in August, X no longer had a legal representative in the country, which was part of a Supreme Court judge’s justification for banning the app last week."""

In [37]:
pipeline(prompt_2)

[{'generated_text': 'Musk shuttered X’s office in Brazil in August . X no longer'}]