In [1]:
import json
import shutil
from pathlib import Path

import torch
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

# Load JSON dataset
with open("/kaggle/input/notes-explanation-and-summarization-dataset/Notes Explanation and Summarization Preprocessed Dataset.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Convert to input-output pairs: explanation → summary
formatted_data = [{"input_text": "summarize: " + d["explanation"], "target_text": d["summary"]} for d in data]

# Split into training and evaluation
train_data, eval_data = train_test_split(formatted_data, test_size=0.1, random_state=42)

# Hugging Face DatasetDict
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "eval": Dataset.from_list(eval_data)
})

# Load tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-large")

# Tokenization function
def tokenize_function(examples):
    model_inputs = tokenizer(examples["input_text"], padding="max_length", truncation=True, max_length=512)
    labels = tokenizer(examples["target_text"], padding="max_length", truncation=True, max_length=128)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize the datasets
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Load T5 model
model = T5ForConditionalGeneration.from_pretrained("t5-large")

# Path to save model
model_path = "/kaggle/working/t5_summary_model"

# Training args
training_args = TrainingArguments(
    output_dir=model_path,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=3e-4,
    weight_decay=0.01,
    num_train_epochs=2,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    push_to_hub=False,
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["eval"],
)

# Train the model
trainer.train()

# Save final model
model.save_pretrained(model_path, safe_serialization=True)
tokenizer.save_pretrained(model_path)

# Optional: Remove unnecessary files (if they exist)
shutil.rmtree(f"{model_path}/checkpoint-*/optimizer.pt", ignore_errors=True)
shutil.rmtree(f"{model_path}/checkpoint-*/scheduler.pt", ignore_errors=True)
shutil.rmtree(f"{model_path}/checkpoint-*/rng_state.pth", ignore_errors=True)

# Save final checkpoint as main model
final_checkpoint = f"{model_path}/checkpoint-{trainer.state.global_step}"
shutil.move(final_checkpoint, model_path)

# Optional: Convert model to FP16 (if space matters and your device supports it)
model.half()
model.save_pretrained(model_path, safe_serialization=True)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/2688 [00:00<?, ? examples/s]

Map:   0%|          | 0/299 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.5583,0.574562
2,0.3614,0.513873


Error: Destination path '/kaggle/working/t5_summary_model/checkpoint-5376' already exists

In [5]:
# Convert to ZIP for download
!zip -r summary_model.zip /kaggle/working/t5_summary_model
print("Model saved and zipped for download!")

  adding: kaggle/working/t5_summary_model/ (stored 0%)
  adding: kaggle/working/t5_summary_model/generation_config.json (deflated 29%)
  adding: kaggle/working/t5_summary_model/model.safetensors (deflated 7%)
  adding: kaggle/working/t5_summary_model/tokenizer_config.json (deflated 94%)
  adding: kaggle/working/t5_summary_model/spiece.model (deflated 48%)
  adding: kaggle/working/t5_summary_model/special_tokens_map.json (deflated 85%)
  adding: kaggle/working/t5_summary_model/config.json (deflated 62%)
Model saved and zipped for download!


In [6]:
from IPython.display import FileLink
FileLink(r"summary_model.zip")

In [22]:
import json
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

In [24]:
# Load a few samples from the dataset
with open("Notes Explanation and Summarization Preprocessed Dataset.json", "r") as f:
    data = json.load(f)[:5]  # Just first 5 samples

# Load fine-tuned model and tokenizer
model_path = "t5_summary_model"
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

# Init metrics
smoothing_fn = SmoothingFunction().method4
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

bleu_scores = []
rouge1_f1s = []
rouge2_f1s = []
rougeL_f1s = []

# Evaluate
for i, item in enumerate(data):
    input_text = "summarize: " + item["explanation"]
    reference = item["summary"]

    input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True, max_length=512)
    summary_ids = model.generate(
        input_ids=input_ids,
        max_length=512,
        num_beams=8,
        no_repeat_ngram_size=3,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True
    )
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # BLEU
    ref_tokens = [reference.split()]
    gen_tokens = generated_summary.split()
    bleu = sentence_bleu(ref_tokens, gen_tokens, smoothing_function=smoothing_fn)
    bleu_scores.append(bleu)

    # ROUGE
    rouge = scorer.score(reference, generated_summary)
    rouge1_f1s.append(rouge['rouge1'].fmeasure)
    rouge2_f1s.append(rouge['rouge2'].fmeasure)
    rougeL_f1s.append(rouge['rougeL'].fmeasure)

    # Print
    print(f"--- Example {i+1} ---")
    print(f"Generated: {generated_summary}")
    print(f"Reference: {reference}")
    print(f"BLEU Score: {bleu:.4f}")
    print(f"ROUGE-1 F1: {rouge['rouge1'].fmeasure:.4f}")
    print(f"ROUGE-2 F1: {rouge['rouge2'].fmeasure:.4f}")
    print(f"ROUGE-L F1: {rouge['rougeL'].fmeasure:.4f}")
    print()

# Averages
print("==== AVERAGE METRICS ====")
print(f"Average BLEU Score: {sum(bleu_scores)/len(bleu_scores):.4f}")
print(f"Average ROUGE-1 F1: {sum(rouge1_f1s)/len(rouge1_f1s):.4f}")
print(f"Average ROUGE-2 F1: {sum(rouge2_f1s)/len(rouge2_f1s):.4f}")
print(f"Average ROUGE-L F1: {sum(rougeL_f1s)/len(rougeL_f1s):.4f}")

--- Example 1 ---
Generated: In summary, an Operating System is a crucial software component that manages the resources and interactions between the hardware and software components of a computer system. It provides abstraction, resource allocation, resource protection, and fault tolerance, allowing programs to interact with the hardware without knowing the underlying details. By understanding the key principles and functions of an OS, you can appreciate the importance of this critical component in modern computing.
Reference: In summary, an Operating System performs three main functions: process management, memory management, and I/O management. It provides abstraction, resource allocation, resource protection, and fault tolerance to ensure the efficient and reliable operation of computer systems. By understanding what operating systems do, we can appreciate the critical role they play in facilitating our interactions with computers and enabling us to accomplish tasks efficiently and 

* BLEU	0.4397	Moderate lexical overlap — good, but BLEU is stricter and can penalize paraphrasing.
* ROUGE-1 F1	0.7215	Strong unigram overlap — the model captures key concepts very well.
* ROUGE-2 F1	0.5713	Good bigram (phrase-level) overlap — the summaries reuse much of the reference's wording; note that ROUGE-2 measures overlap, not fluency.
* ROUGE-L F1	0.6316	Decent structure/order similarity — model preserves logical flow.