In [3]:
!pip install -q transformers datasets evaluate rouge_score accelerate

In [5]:
!pip install protobuf==3.20.3



In [6]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

import os
print(os.listdir("/kaggle/input/samsum-dataset-text-summarization")) 

df_train = pd.read_csv("/kaggle/input/samsum-dataset-text-summarization/samsum-train.csv")
df_val = pd.read_csv("/kaggle/input/samsum-dataset-text-summarization/samsum-validation.csv")
df_test = pd.read_csv("/kaggle/input/samsum-dataset-text-summarization/samsum-test.csv")

train_dataset = Dataset.from_pandas(df_train)
val_dataset = Dataset.from_pandas(df_val)
test_dataset = Dataset.from_pandas(df_test)

dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

print(dataset)

['samsum-train.csv', 'samsum_dataset', 'samsum-test.csv', 'samsum-validation.csv']
DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
})


In [8]:
model_checkpoint = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

prefix = "summarize: "

def preprocess_function(examples):
    inputs = [prefix + str(doc or "") for doc in examples["dialogue"]]
    
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer([str(s or "") for s in examples["summary"]], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)
print("Preprocessing successful!")

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]



Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Preprocessing successful!


In [9]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
import numpy as np

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

training_args = Seq2SeqTrainingArguments(
    output_dir="./hoshiBmaTchi_model",
    eval_strategy="epoch",            
    learning_rate=2e-5,
    per_device_train_batch_size=4,   
    per_device_eval_batch_size=4,    
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    report_to="none"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

2025-12-14 15:29:11.612843: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765726151.630589     229 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765726151.635859     229 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,1.758,1.466857,0.4631,0.2228,0.3874,0.3873,17.2237
2,1.5697,1.430724,0.4703,0.2317,0.3951,0.3955,17.3435
3,1.5241,1.422379,0.4716,0.2327,0.3969,0.3973,17.379




TrainOutput(global_step=2763, training_loss=1.597115040688337, metrics={'train_runtime': 3708.9184, 'train_samples_per_second': 11.916, 'train_steps_per_second': 0.745, 'total_flos': 1.732187380174848e+16, 'train_loss': 1.597115040688337, 'epoch': 3.0})

In [10]:
eval_results = trainer.evaluate()

print(f"ROUGE-1: {eval_results['eval_rouge1']:.4f}")
print(f"ROUGE-2: {eval_results['eval_rouge2']:.4f}")
print(f"ROUGE-L: {eval_results['eval_rougeL']:.4f}")

ROUGE-1: 0.4716
ROUGE-2: 0.2327
ROUGE-L: 0.3969


In [12]:

model_to_test = trainer.model
tokenizer_to_test = tokenizer

sample_text = """
omg guys you won't believe what happened today at the cafe. 
I was ordering my usual iced latte and the barista totally spilled it all over the counter. 
I felt so bad for him, he looked so stressed! But then the manager came out and gave me a free croissant 
to say sorry. It was actually the best croissant I've ever had. 
Totally made my day better after that rough start!
"""

input_text = "summarize: " + sample_text
inputs = tokenizer_to_test(input_text, return_tensors="pt").to("cuda")

outputs = model_to_test.generate(inputs["input_ids"], max_new_tokens=50, num_beams=4, early_stopping=True)
summary = tokenizer_to_test.decode(outputs[0], skip_special_tokens=True)

print("Original Caption:")
print(sample_text)
print("-" * 30)
print("BapTion Summary:")
print(summary)

Original Caption:

omg guys you won't believe what happened today at the cafe. 
I was ordering my usual iced latte and the barista totally spilled it all over the counter. 
I felt so bad for him, he looked so stressed! But then the manager came out and gave me a free croissant 
to say sorry. It was actually the best croissant I've ever had. 
Totally made my day better after that rough start!

------------------------------
BapTion Summary:
The barista spilled her iced latte all over the counter. The manager gave her a free croissant.


In [14]:
import shutil

output_path = "./baption_final_model"
trainer.save_model(output_path)
tokenizer.save_pretrained(output_path)

shutil.make_archive("baption_model", 'zip', output_path)

print("Success! Go to the 'Output' section of Kaggle (right sidebar) and download 'baption_model.zip'.")

✅ Success! Go to the 'Output' section of Kaggle (right sidebar) and download 'baption_model.zip'.


In [15]:
from IPython.display import FileLink

FileLink(r'baption_model.zip')

In [19]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [20]:
repo_name = "Maungvee/baption-summarizer" 

trainer.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

print(f"Model uploaded to: https://huggingface.co/{repo_name}")

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Model uploaded to: https://huggingface.co/Maungvee/baption-summarizer
