In [None]:
# Ensure necessary libraries are installed
!pip install transformers datasets pandas

# Import required libraries
from google.colab import files
from IPython.display import display # Import display
from datasets import Dataset
import pandas as pd
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
import os
import torch # Import torch

# --- Step 1: Upload the dataset (if needed) ---
# If you have already uploaded cleaned_samsum.csv to your Colab environment
# or mounted Google Drive where it resides, you can skip this upload step.
# Otherwise, uncomment the following lines to upload the file.
# print("Please upload the 'cleaned_samsum.csv' file.")
# uploaded = files.upload()

# --- Step 2: Load and prepare the data ---

# Define the path to your CSV file
csv_file_path = "cleaned_samsum.csv"

# Load the CSV into a Pandas DataFrame.
# df starts as None so the `if df is not None:` guard below never sees a
# stale or undefined name.
df = None

try:
    df = pd.read_csv(csv_file_path)
    print(f"Successfully loaded {csv_file_path}")
except FileNotFoundError:
    print(f"Error: The file '{csv_file_path}' was not found.")
    print("Please upload the file using the 'files.upload()' function or ensure it's in the correct path.")
    # Re-raise instead of calling exit(): in a notebook exit() shuts down the
    # kernel (losing all state), whereas an exception simply stops this cell
    # (and a "Run all") while keeping the session alive for a retry.
    raise

# Only proceed if df was successfully loaded
if df is not None:
    # Fail early, with a clear message, if the CSV lacks the expected columns
    required_columns = ['cleaned_text', 'cleaned_summary']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"'{csv_file_path}' is missing required column(s): {missing_columns}")

    # Keep only the cleaned columns and rename them
    df = df[required_columns].rename(columns={
        'cleaned_text': 'input',
        'cleaned_summary': 'target'
    })

    # Drop rows with a missing input or target. Empty strings written to a CSV
    # are read back as NaN, and such non-string values make tokenization fail.
    # The index is reset so the Dataset built below gets no extra index column.
    rows_before = len(df)
    df = df.dropna(subset=['input', 'target']).reset_index(drop=True)
    dropped_rows = rows_before - len(df)
    if dropped_rows:
        print(f"Dropped {dropped_rows} row(s) with missing 'input' or 'target' values.")

    print("DataFrame prepared with 'input' and 'target' columns.")
    print("First 5 rows of the prepared DataFrame:")
    display(df.head()) # Use display for better formatting in Colab

    # Convert the Pandas DataFrame to a Hugging Face Dataset
    dataset = Dataset.from_pandas(df)
    print(f"\nHugging Face Dataset created. Total samples: {len(dataset)}")

    # Split the dataset into training and testing sets (90% / 10%);
    # the fixed seed makes the split reproducible across runs
    dataset = dataset.train_test_split(test_size=0.1, seed=42)
    print(f"Dataset split into training ({len(dataset['train'])} samples) and test ({len(dataset['test'])} samples).")

    # --- Step 3: Load the Tokenizer ---

    # Checkpoint name used for the tokenizer here and reused when loading the model
    model_checkpoint = "t5-small"
    print(f"\nLoading tokenizer from: {model_checkpoint}")
    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path=model_checkpoint
    )
    print("Tokenizer loaded.")

    # --- Step 4: Tokenize the Dataset ---

    # Define the preprocessing function
    def preprocess(example, max_input_length=512, max_target_length=128):
        """Tokenize a batch of input texts and target summaries.

        Args:
            example: Batch passed in by ``dataset.map(..., batched=True)``;
                its "input" and "target" entries are tokenized.
            max_input_length: Inputs longer than this many tokens are truncated.
            max_target_length: Targets longer than this many tokens are truncated.

        Returns:
            The tokenizer output for the inputs, with an added "labels" entry
            holding the token ids of the targets.
        """
        # Truncate only -- do NOT pad here. Padding to max_length at this point
        # would bake pad-token ids into "labels", so the loss would also be
        # computed on padding, and every batch would be padded to the full
        # maximum length. Unpadded sequences let the DataCollatorForSeq2Seq
        # used for training pad each batch to its longest sequence and fill
        # label padding with its ignore index (-100 by default).
        inputs = tokenizer(
            example["input"],
            truncation=True,
            max_length=max_input_length
        )
        targets = tokenizer(
            example["target"],
            truncation=True,
            max_length=max_target_length
        )
        # Set the tokenized target as labels for training
        inputs["labels"] = targets["input_ids"]
        return inputs

    print("\nApplying preprocessing (tokenization) to the dataset...")
    # Run preprocess batch-wise over every split and discard the raw text
    # columns, which are no longer needed once the token ids exist
    tokenized_dataset = dataset.map(preprocess, batched=True, remove_columns=["input", "target"])
    print("Dataset tokenization complete.")
    print("Example of a tokenized sample (showing input_ids and labels):")
    # Only the field names of the first training sample are printed
    first_train_sample = tokenized_dataset["train"][0]
    print(first_train_sample.keys())


    # --- Step 5: Load the Model ---

    # Load the seq2seq model weights from the checkpoint named by model_checkpoint
    print(f"\nLoading model from: {model_checkpoint}")
    model = AutoModelForSeq2SeqLM.from_pretrained(
        pretrained_model_name_or_path=model_checkpoint
    )
    print("Model loaded.")

    # --- Step 6: Set Training Arguments ---

    print("\nSetting up training arguments...")
    # fp16 mixed precision needs a CUDA device. Hard-coding fp16=True makes the
    # setup fail on a CPU-only runtime, so enable it only when a GPU is present.
    use_fp16 = torch.cuda.is_available()
    training_args = Seq2SeqTrainingArguments(
        output_dir="./results",             # Output directory for model checkpoints and results
        eval_strategy="epoch",              # Evaluate at the end of each epoch
        learning_rate=2e-4,                 # Learning rate
        per_device_train_batch_size=8,      # Batch size for training on each device
        per_device_eval_batch_size=8,       # Batch size for evaluation on each device
        weight_decay=0.01,                  # Weight decay for regularization
        save_total_limit=3,                 # Limit the total number of saved checkpoints
        num_train_epochs=3,                 # Total number of training epochs
        predict_with_generate=True,         # Use generation for prediction (required for summarization)
        logging_dir="./logs",               # Directory for logging
        logging_steps=10,                   # Log training information every X steps
        fp16=use_fp16,                      # Mixed precision only when a GPU is available
        report_to=["none"]                  # Explicitly disable reporting to any online service
    )
    print("Training arguments set.")
    # Tell the user when the run falls back to full precision on CPU
    if not use_fp16:
        print("\nWarning: no GPU found, so fp16 was disabled. Training on CPU will be very slow.")


    # --- Step 7: Define the Trainer ---

    # Training and validation splits of the tokenized data
    train_dataset, val_dataset = tokenized_dataset["train"], tokenized_dataset["test"]

    # Seq2Seq collator: pads every batch to its longest sequence at collation time
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
    print("\nData collator defined.")

    # Also switch off Weights & Biases through its environment variable,
    # which helps in some cases
    os.environ.update({"WANDB_DISABLED": "true"})
    print("WANDB logging disabled.")

    # Wire model, arguments, data splits, tokenizer and collator into the trainer
    print("Defining the Seq2SeqTrainer...")
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    print("Trainer defined.")

    # --- Step 8: Train the Model ---

    print("\nStarting model training...")
    trainer.train()
    print("Training finished successfully!")

    # --- Step 9: Save the Model and Tokenizer ---

    # Target folder for the fine-tuned model and tokenizer files
    save_directory = "./t5-finetuned-model"
    print(f"\nSaving the fine-tuned model and tokenizer to '{save_directory}'...")

    # Make sure the folder exists, then write the model followed by the tokenizer
    os.makedirs(save_directory, exist_ok=True)
    trainer.save_model(save_directory)
    tokenizer.save_pretrained(save_directory)
    print("Model and tokenizer saved.")

    # The saved files can optionally be checked with:
    # !ls ./t5-finetuned-model

In [None]:
# List the contents of the fine-tuned model directory
!ls ./t5-finetuned-model

config.json		special_tokens_map.json  tokenizer.json
generation_config.json	spiece.model		 training_args.bin
model.safetensors	tokenizer_config.json


In [None]:
# Zip the saved model directory using the standard library, so that a failure
# raises an error instead of being followed by a success message.
import os
import shutil

print("Zipping the fine-tuned model directory...")
model_dir = "t5-finetuned-model"
if not os.path.isdir(model_dir):
    raise FileNotFoundError(
        f"Directory './{model_dir}' not found. Run the training cell to create it first."
    )

# Writes t5-finetuned-model.zip into the current directory. base_dir keeps the
# top-level 't5-finetuned-model/' folder inside the archive, matching the
# layout produced by `zip -r t5-finetuned-model.zip ./t5-finetuned-model`.
archive_path = shutil.make_archive(
    base_name=model_dir,
    format="zip",
    root_dir=".",
    base_dir=model_dir,
)

print("\nModel directory zipped successfully!")
print("You can now download 't5-finetuned-model.zip' from the Colab file browser sidebar.")