In [1]:
import json
import shutil
from datasets import Dataset, DatasetDict
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch

# Load JSON dataset
with open("/kaggle/input/notes-explanation-and-summarization-dataset/Notes Explanation and Summarization Preprocessed Dataset.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Convert to list of input-output pairs
formatted_data = [{"input_text": "explain: " + d["topic_name"], "target_text": d["explanation"]} for d in data]

# Split into training (90%) and validation (10%)
train_data, eval_data = train_test_split(formatted_data, test_size=0.1, random_state=42)

# Convert to Hugging Face Dataset format
dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "eval": Dataset.from_list(eval_data)
})

# Load tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-large")

# Tokenization function
def tokenize_function(examples):
    model_inputs = tokenizer(examples["input_text"], padding="max_length", truncation=True, max_length=512)
    labels = tokenizer(examples["target_text"], padding="max_length", truncation=True, max_length=512)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize datasets
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Load pre-trained T5 model (using T5-large for best performance)
model = T5ForConditionalGeneration.from_pretrained("t5-large")

# Define model save path
model_path = "/kaggle/working/t5_explanation_model"

# Training configurations
training_args = TrainingArguments(
    output_dir=model_path,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=3e-4,
    weight_decay=0.01,
    num_train_epochs=2,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    push_to_hub=False,
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["eval"],
)

# Train model
trainer.train()

# Save final model - Only essential files
model.save_pretrained(model_path, safe_serialization=True)
tokenizer.save_pretrained(model_path)

# Remove irrelevant files (optimizer, scheduler, and other unnecessary files)
shutil.rmtree(f"{model_path}/checkpoint-*/optimizer.pt", ignore_errors=True)
shutil.rmtree(f"{model_path}/checkpoint-*/scheduler.pt", ignore_errors=True)
shutil.rmtree(f"{model_path}/checkpoint-*/rng_state.pth", ignore_errors=True)

# Save the last checkpoint only (after training)
final_checkpoint = f"{model_path}/checkpoint-{trainer.state.global_step}"
shutil.move(final_checkpoint, model_path)

# Optionally, convert the model to fp16 for space efficiency (if desired)
model.half()  # Convert model to fp16

# Save the reduced model
model.save_pretrained(model_path, safe_serialization=True)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/2688 [00:00<?, ? examples/s]

Map:   0%|          | 0/299 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,1.5202,1.29018
2,1.1057,1.156026


Error: Destination path '/kaggle/working/t5_explanation_model/checkpoint-5376' already exists

In [33]:
# Convert to ZIP for download
!zip -r model.zip /kaggle/working/t5_explanation_model
print("Model saved and zipped for download!")

  adding: kaggle/working/t5_explanation_model/ (stored 0%)
  adding: kaggle/working/t5_explanation_model/config.json (deflated 62%)
  adding: kaggle/working/t5_explanation_model/tokenizer_config.json (deflated 94%)
  adding: kaggle/working/t5_explanation_model/model.safetensors (deflated 8%)
  adding: kaggle/working/t5_explanation_model/spiece.model (deflated 48%)
  adding: kaggle/working/t5_explanation_model/generation_config.json (deflated 29%)
  adding: kaggle/working/t5_explanation_model/special_tokens_map.json (deflated 85%)
Model saved and zipped for download!


In [34]:
from IPython.display import FileLink
FileLink(r"model.zip")

In [12]:
import json
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

In [14]:
# Load JSON dataset (first 5 samples only)
with open("Notes Explanation and Summarization Preprocessed Dataset.json", "r", encoding="utf-8") as f:
    data = json.load(f)[:5]

# Load fine-tuned explanation generation model
model_path = "t5_explanation_model"
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

# Initialize metrics
smoother = SmoothingFunction().method4
rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

bleu_scores = []
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

# Evaluation loop
for i, sample in enumerate(data):
    input_text = "explain: " + sample["topic_name"]
    reference = sample["explanation"]

    input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True, max_length=512)
    output_ids = model.generate(
        input_ids=input_ids,
        max_length=512,
        num_beams=8,
        no_repeat_ngram_size=3,
        repetition_penalty=2.0
    )
    generated = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Compute BLEU
    bleu = sentence_bleu([reference.split()], generated.split(), smoothing_function=smoother)
    bleu_scores.append(bleu)

    # Compute ROUGE
    rouge_scores = rouge.score(reference, generated)
    rouge1_scores.append(rouge_scores['rouge1'].fmeasure)
    rouge2_scores.append(rouge_scores['rouge2'].fmeasure)
    rougeL_scores.append(rouge_scores['rougeL'].fmeasure)

    # Print example output
    print(f"--- Example {i+1} ---")
    print(f"Input: {input_text}")
    print(f"Generated:\n{generated}\n")
    print(f"Reference:\n{reference}\n")
    print(f"BLEU: {bleu:.4f}")
    print(f"ROUGE-1 F1: {rouge_scores['rouge1'].fmeasure:.4f}")
    print(f"ROUGE-2 F1: {rouge_scores['rouge2'].fmeasure:.4f}")
    print(f"ROUGE-L F1: {rouge_scores['rougeL'].fmeasure:.4f}")
    print()

# Average scores
print("==== AVERAGE METRICS ====")
print(f"Average BLEU: {sum(bleu_scores)/len(bleu_scores):.4f}")
print(f"Average ROUGE-1 F1: {sum(rouge1_scores)/len(rouge1_scores):.4f}")
print(f"Average ROUGE-2 F1: {sum(rouge2_scores)/len(rouge2_scores):.4f}")
print(f"Average ROUGE-L F1: {sum(rougeL_scores)/len(rougeL_scores):.4f}")

--- Example 1 ---
Input: explain: What Operating Systems Do
Generated:
What Operating Systems Do: A Comprehensive Explanation An operating system (OS) is a software that manages computer hardware resources and provides common services to computer programs. It acts as an intermediary between the user and the computer hardware, controlling the allocation of system resources such as memory, CPU time, and input/output (I/O) devices. Key Principles: Resource Management: The OS manages the creation, execution, and termination of processes (programs) running on the computer. It allocates and deallocates system resources as needed, ensuring that each process has a unique identity and does not interfere with other processes. Process Management: When a process is created, it creates, runs, and terminates its execution. It ensures that multiple processes can run concurrently without interfering with each other's execution. Interrupt Handling: As the process is executed, the OS handles interrupts 

* BLEU	0.1492	Low lexical overlap — your generated explanations use different words/structure than the references.
* ROUGE-1 F1	0.5608	Moderate keyword overlap — your model captures some key terms and content.
* ROUGE-2 F1	0.2510	Low phrase overlap — model likely not matching multi-word expressions well.
* ROUGE-L F1	0.3298	Low structure similarity — flow/order of content differs from reference.