In [1]:
!pip install transformers wandb rouge_score
!pip install datasets==2.21.0
!pip install evaluate
!pip install sacrebleu

Collecting wandb
  Downloading wandb-0.18.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.7 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.14.0-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.9 kB)
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.29,>=1.0.0->wandb)
  Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython!=3.1.29,>=1.0.0->wandb)
  Downloadin

In [2]:
import torch
from transformers import GPT2TokenizerFast, GPT2LMHeadModel, DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, concatenate_datasets, DatasetDict
import numpy as np
from rouge_score import rouge_scorer
import wandb
from transformers.integrations import WandbCallback


# load dataset
def load_billsum():
    """
    Load and combine the 'train' and 'test' splits of the Billsum dataset into a single 'train' set,
    and use 'ca_test' as the 'test' set.

    Returns:
        DatasetDict: A dictionary containing 'train' and 'test' datasets.
    """
    # Define the split names
    training_splits = ['train', 'test']
    test_split = 'ca_test'

    datasets_to_combine = []

    # Load and combine training splits
    for split in training_splits:
        try:
            ds = load_dataset("billsum", split=split)
            print(f"Loaded split: {split} with {len(ds)} examples.")
            datasets_to_combine.append(ds)
        except Exception as e:
            print(f"Could not load split '{split}': {e}")

    if not datasets_to_combine:
        raise ValueError("No training datasets were loaded. Please check the split names.")

    # Concatenate 'train' and 'test' splits into a single 'train' set
    combined_train = concatenate_datasets(datasets_to_combine)
    print(f"Combined train dataset size: {len(combined_train)} examples.")

    # Load the 'ca_test' split as the test set
    try:
        test_ds = load_dataset("billsum", split=test_split)
        print(f"Loaded test split: {test_split} with {len(test_ds)} examples.")
    except Exception as e:
        raise ValueError(f"Could not load test split '{test_split}': {e}")

    # Create a DatasetDict with 'train' and 'test' splits
    dataset_dict = DatasetDict({
        "train": combined_train,
        "test": test_ds
    })

    print(f"DatasetDict keys: {dataset_dict.keys()}")
    return dataset_dict


# Initialize tokenizer
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token


# Preprocess function
def preprocess_function(examples, tokenizer):
    inputs = [f"Summarize the following text in a concise manner:\n\n{doc}\n\nSummary:" for doc in examples["text"]]
    summaries = [summary for summary in examples["summary"]]

    concatenated = [inp + " " + summ for inp, summ in zip(inputs, summaries)]

    tokenized = tokenizer(
        concatenated,
        max_length=1024 + 150,  # Adjust based on desired input and summary lengths
        truncation=True,
        padding="max_length"
    )

    labels = []
    for inp, summ in zip(inputs, summaries):
        inp_ids = tokenizer.encode(inp, add_special_tokens=False)
        summ_ids = tokenizer.encode(summ, add_special_tokens=False)

        full_ids = inp_ids + summ_ids

        label = [-100] * len(inp_ids) + summ_ids
        label = label + [-100] * (max_length := 1024 + 150 - len(label))
        labels.append(label[:max_length])

    tokenized["labels"] = labels
    return tokenized


# Load and preprocess the dataset
dataset = load_billsum()
tokenized_datasets = dataset.map(                             # Preprocess the dataset using a lambda to pass the tokenizer
    lambda examples: preprocess_function(examples, tokenizer),
    batched=True,
    remove_columns=['text', 'summary', 'title']  # Remove original text columns to save memory
)
# Limit the test set to 100 examples
tokenized_datasets["test"] = tokenized_datasets["test"].shuffle(seed=42).select(range(100))
print(f"Limited test dataset size: {len(tokenized_datasets['test'])} examples.")


# Function to freeze layers based on variant type
def freeze_layers(model, variant_type):
    if variant_type == "noNorm":
        for name, param in model.named_parameters():
            if "ln" in name:
                param.requires_grad = False
    elif variant_type == "AttnOnly":
        for name, param in model.named_parameters():
            if "ln_2" in name:  # Freeze FFN layer norm
                param.requires_grad = False
    elif variant_type == "FFNonly":
        for name, param in model.named_parameters():
            if "ln_1" in name:  # Freeze attention layer norm
                param.requires_grad = False
    # For baseModel, we don't freeze any layers

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



Downloading readme:   0%|          | 0.00/7.27k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/91.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/15.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.12M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18949 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3269 [00:00<?, ? examples/s]

Generating ca_test split:   0%|          | 0/1237 [00:00<?, ? examples/s]

Loaded split: train with 18949 examples.
Loaded split: test with 3269 examples.
Combined train dataset size: 22218 examples.
Loaded test split: ca_test with 1237 examples.
DatasetDict keys: dict_keys(['train', 'test'])


Map:   0%|          | 0/22218 [00:00<?, ? examples/s]

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]

Limited test dataset size: 100 examples.


In [3]:
import evaluate  # Import the evaluate library

# Initialize ROUGE and BLEU metrics
rouge = evaluate.load("rouge")
# bleu = evaluate.load("bleu")


import sacrebleu

def compute_metrics(eval_pred):
    """
    Compute ROUGE and BLEU metrics for summarization using SacreBLEU with smoothing.

    Args:
        eval_pred (EvalPrediction): Contains predictions and label_ids.

    Returns:
        dict: Average ROUGE and BLEU scores.
    """
    predictions, labels = eval_pred

    # Convert logits to token IDs by taking the argmax over the vocabulary dimension
    pred_ids = np.argmax(predictions, axis=-1)

    # Decode the predicted token IDs to text
    decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)

    # Replace -100 in the labels with the pad token ID and decode
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Ensure that the predictions and references are lists of strings
    # and remove any leading/trailing whitespace
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    # Compute ROUGE scores using the evaluate library
    rouge_result = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )

    # Compute BLEU scores using SacreBLEU with smoothing
    bleu_scores = sacrebleu.corpus_bleu(
        decoded_preds,
        [decoded_labels],
        smooth_method='exp',       # Exponential smoothing
        smooth_value=0.1,
        force=True,                # Force compute even if length mismatch
        lowercase=True,            # Normalize case
        tokenize='13a'             # Tokenizer type (SacreBLEU default)
    )

    bleu_score = bleu_scores.score  # SacreBLEU returns a score attribute

    # Aggregate the results
    result = {
        "rouge1": rouge_result["rouge1"],
        "rouge2": rouge_result["rouge2"],
        "rougeL": rouge_result["rougeL"],
        "bleu": bleu_score
    }

    # Optional: Calculate average prediction length
    prediction_lens = [len(pred.split()) for pred in decoded_preds]
    result["gen_len"] = np.mean(prediction_lens)

    # Round the results to four decimal places for readability
    result = {k: round(v, 4) for k, v in result.items()}

    return result

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [10]:
# Fine-tuning function
def fine_tune_model(model, tokenizer, dataset, output_dir, variant, norm_type):
    # Initialize wandb run
    wandb.init(project=f"GPT-Valkyrie_{norm_type}-124m__{variant}__Billsum", reinit=True)
    run_name = wandb.run.name

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=1,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=50,
        evaluation_strategy="steps",
        eval_steps=200,
        save_steps=200,
        load_best_model_at_end=True,
        metric_for_best_model="rougeL",
        report_to="wandb",
        run_name=run_name,
        save_total_limit=3,
        fp16=True,
    )

    from transformers import DataCollatorForLanguageModeling
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False  # Causal language modeling
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"], # even tho its meant to be called "validation" instead
        tokenizer=tokenizer,
        data_collator=data_collator,  # Use the updated data collator
        compute_metrics=compute_metrics,
        callbacks=[WandbCallback()],
    )

    trainer.train()
    wandb.finish()
    return trainer.model, run_name

In [11]:
# Main training loop
variants = ["baseModel", "noNorm", "AttnOnly", "FFNonly"]
norm_types = ["LN", "RMSN"]

for norm_type in norm_types:
    for variant in variants:
        print(f"Processing {norm_type} {variant} model...")

        # Use the correct base model for each variant
        model_path = f"shng2025/GPT-Valkyrie_{norm_type}-124m__{variant}__"
        model = GPT2LMHeadModel.from_pretrained(model_path)

        model.config.pad_token_id = tokenizer.pad_token_id
        # Print to verify
        print(f"Tokenizer pad token: {tokenizer.pad_token}")
        print(f"Tokenizer pad token ID: {tokenizer.pad_token_id}")
        print(f"Model pad token ID: {model.config.pad_token_id}")

        freeze_layers(model, variant)

        output_dir = f"./results/{norm_type}/{variant}"
        fine_tuned_model, run_name = fine_tune_model(model, tokenizer, tokenized_datasets, output_dir, variant, norm_type)

        # Save the model locally
        local_save_dir = f"./local_models/GPT-Valkyrie_{norm_type}-124m__{variant}__Billsum"
        fine_tuned_model.save_pretrained(local_save_dir)
        tokenizer.save_pretrained(local_save_dir)
        print(f"Model saved locally to {local_save_dir}")

        # Push the model to your HuggingFace Hub repository
        new_repo_name = f"shng2025/GPT-Valkyrie_{norm_type}-124m__{variant}__Billsum"
        fine_tuned_model.push_to_hub(new_repo_name, branch=run_name)
        tokenizer.push_to_hub(new_repo_name, branch=run_name)
        print(f"Model pushed to HuggingFace Hub: {new_repo_name}, branch: {run_name}")

print("Training completed for all variants and normalization types.")

Processing LN baseModel model...
Tokenizer pad token: <|endoftext|>
Tokenizer pad token ID: 50256
Model pad token ID: 50256


VBox(children=(Label(value='0.019 MB of 0.019 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁
train/global_step,▁▁
train/learning_rate,▁▁
train/loss,▁▁

0,1
train/epoch,0.00018
train/global_step,1.0
train/grad_norm,inf
train/learning_rate,0.0
train/loss,4.3294


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
WandbCallback
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss


VBox(children=(Label(value='0.020 MB of 0.020 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▃▃▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇██
train/global_step,▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇██
train/grad_norm,▆▆▆▆█▄▄▄▄▃▄▃▄▄▂▂▂▂▂▁▂▁▂▂▂▁▂▂▂▂▁▂▁▂▁▁▁▁▂▁
train/learning_rate,▁▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▃▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/loss,▆▇▆▆▆▆▇█▆▃▃▅▅▃▃▄▄▅▃▃▃▄▁▄▃▂▂▃▂▂▂▂▂▄▃▁▂▁▂▁

0,1
total_flos,209033625600000.0
train/epoch,0.018
train/global_step,100.0
train/grad_norm,3.72775
train/learning_rate,1e-05
train/loss,2.4009
train_loss,3.38224
train_runtime,34.281
train_samples_per_second,11.668
train_steps_per_second,2.917


Model saved locally to ./local_models/GPT-Valkyrie_LN-124m__baseModel__Billsum


README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Model pushed to HuggingFace Hub: shng2025/GPT-Valkyrie_LN-124m__baseModel__Billsum, branch: vital-star-19
Processing LN noNorm model...


config.json:   0%|          | 0.00/840 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

[34m[1mwandb[0m: Currently logged in as: [33mshng2025[0m ([33mmarlborough-college-malaysia[0m). Use [1m`wandb login --relogin`[0m to force relogin


Tokenizer pad token: <|endoftext|>
Tokenizer pad token ID: 50256
Model pad token ID: 50256


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
WandbCallback
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss


VBox(children=(Label(value='0.020 MB of 0.020 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇██
train/global_step,▁▁▁▁▂▂▂▂▂▂▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇███
train/grad_norm,▇▅█▇▆▅▅▄▂▃▃▂▂▂▂▂▂▂▂▂▂▁▁▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▇▇▇▇███
train/loss,██▇▇▇▅▆▆▃▄▅▅▄▄▄▅▃▄▅▃▃▄▂▂▃▂▂▂▂▂▂▂▃▃▂▁▂▂▁▂

0,1
total_flos,209033625600000.0
train/epoch,0.018
train/global_step,100.0
train/grad_norm,4.25429
train/learning_rate,1e-05
train/loss,3.0883
train_loss,4.83129
train_runtime,33.6269
train_samples_per_second,11.895
train_steps_per_second,2.974


Model saved locally to ./local_models/GPT-Valkyrie_LN-124m__noNorm__Billsum


README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Model pushed to HuggingFace Hub: shng2025/GPT-Valkyrie_LN-124m__noNorm__Billsum, branch: zany-river-1
Processing LN AttnOnly model...


config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Tokenizer pad token: <|endoftext|>
Tokenizer pad token ID: 50256
Model pad token ID: 50256


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
WandbCallback
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss


VBox(children=(Label(value='0.020 MB of 0.020 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇████
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇████
train/grad_norm,██▆▅▄▃▃▃▃▃▃▃▃▃▄▃▂▂▂▃▂▂▂▂▂▂▁▂▁▁▂▂▁▁▁▁▂▁▂▁
train/learning_rate,▁▁▁▂▂▃▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇██
train/loss,▇▇▇▇▇█▇▆▆▇▃▆▄▂▅▄▄▃▄▃▃▃▄▃▂▃▂▂▂▂▁▂▂▁▁▁▃▁▂▁

0,1
total_flos,209033625600000.0
train/epoch,0.018
train/global_step,100.0
train/grad_norm,3.18667
train/learning_rate,1e-05
train/loss,2.5861
train_loss,3.92494
train_runtime,35.4303
train_samples_per_second,11.29
train_steps_per_second,2.822


Model saved locally to ./local_models/GPT-Valkyrie_LN-124m__AttnOnly__Billsum


README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Model pushed to HuggingFace Hub: shng2025/GPT-Valkyrie_LN-124m__AttnOnly__Billsum, branch: dashing-sun-1
Processing LN FFNonly model...


config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Tokenizer pad token: <|endoftext|>
Tokenizer pad token ID: 50256
Model pad token ID: 50256


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
WandbCallback
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss


VBox(children=(Label(value='0.020 MB of 0.020 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇████
train/global_step,▁▁▂▂▂▂▂▂▂▂▃▃▄▄▄▅▅▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/grad_norm,██▇▆▆▆█▃▃▃▂▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▂▁▁▁▁▁▃▃▁▁
train/learning_rate,▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇██
train/loss,████▇█▇▇▆▆▄▄▂▅▃▄▃▃▅▃▂▂▄▃▂▄▂▃▂▃▂▃▃▃▁▂▁▁▂▂

0,1
total_flos,209033625600000.0
train/epoch,0.018
train/global_step,100.0
train/grad_norm,4.77869
train/learning_rate,1e-05
train/loss,3.0316
train_loss,4.51975
train_runtime,33.7511
train_samples_per_second,11.851
train_steps_per_second,2.963


Model saved locally to ./local_models/GPT-Valkyrie_LN-124m__FFNonly__Billsum


README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Model pushed to HuggingFace Hub: shng2025/GPT-Valkyrie_LN-124m__FFNonly__Billsum, branch: jolly-universe-1
Processing RMSN baseModel model...


config.json:   0%|          | 0.00/751 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Tokenizer pad token: <|endoftext|>
Tokenizer pad token ID: 50256
Model pad token ID: 50256


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
WandbCallback
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss


VBox(children=(Label(value='0.020 MB of 0.020 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇█
train/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇████
train/grad_norm,▇█▇▇▇▅▄▅▄▅▅▄▄▃▃▃▃▂▂▂▂▂▂▃▁▂▂▂▂▁▂▂▁▁▁▁▁▁▁
train/learning_rate,▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇██
train/loss,▇▇▆▆▆█▆▆█▇▅▂▃▄▃▄▆▅▃▃▂▂▅▂▄▃▂▂▂▂▄▁▁▂▁▂▂▂▂▁

0,1
total_flos,209033625600000.0
train/epoch,0.018
train/global_step,100.0
train/grad_norm,3.64347
train/learning_rate,1e-05
train/loss,2.5107
train_loss,3.53201
train_runtime,35.581
train_samples_per_second,11.242
train_steps_per_second,2.81


Model saved locally to ./local_models/GPT-Valkyrie_RMSN-124m__baseModel__Billsum


README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Model pushed to HuggingFace Hub: shng2025/GPT-Valkyrie_RMSN-124m__baseModel__Billsum, branch: noble-elevator-1
Processing RMSN noNorm model...


config.json:   0%|          | 0.00/842 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Tokenizer pad token: <|endoftext|>
Tokenizer pad token ID: 50256
Model pad token ID: 50256


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
WandbCallback
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss


VBox(children=(Label(value='0.020 MB of 0.020 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███████
train/grad_norm,█▆▇▆▆▅▅▄▅▄▄▃▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁
train/learning_rate,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇█████
train/loss,▇▇▇▇██▇▆▇▃▅▅▅▃▄▄▃▃▄▃▃▃▃▄▄▃▁▁▄▃▁▂▂▂▁▂▁▂▂▁

0,1
total_flos,209033625600000.0
train/epoch,0.018
train/global_step,100.0
train/grad_norm,4.10658
train/learning_rate,1e-05
train/loss,2.9822
train_loss,4.43003
train_runtime,33.9856
train_samples_per_second,11.77
train_steps_per_second,2.942


Model saved locally to ./local_models/GPT-Valkyrie_RMSN-124m__noNorm__Billsum


README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Model pushed to HuggingFace Hub: shng2025/GPT-Valkyrie_RMSN-124m__noNorm__Billsum, branch: polar-durian-1
Processing RMSN AttnOnly model...


config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Tokenizer pad token: <|endoftext|>
Tokenizer pad token ID: 50256
Model pad token ID: 50256


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
WandbCallback
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss


VBox(children=(Label(value='0.020 MB of 0.020 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇▇▇▇▇█████
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇██
train/grad_norm,██▇▇█▆▆▆▅▃▃▃▃▂▂▃▃▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁▂▂▁▁▁▁
train/learning_rate,▁▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇████
train/loss,▇▇▇██▆▇▇▅▇▄▄▂▃▅▄▃▃▄▄▅▃▅▂▃▂▃▂▂▂▂▂▃▂▂▁▂▃▂▁

0,1
total_flos,209033625600000.0
train/epoch,0.018
train/global_step,100.0
train/grad_norm,3.33658
train/learning_rate,1e-05
train/loss,2.6718
train_loss,3.91567
train_runtime,35.6519
train_samples_per_second,11.22
train_steps_per_second,2.805


Model saved locally to ./local_models/GPT-Valkyrie_RMSN-124m__AttnOnly__Billsum


README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Model pushed to HuggingFace Hub: shng2025/GPT-Valkyrie_RMSN-124m__AttnOnly__Billsum, branch: icy-voice-1
Processing RMSN FFNonly model...


config.json:   0%|          | 0.00/845 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Tokenizer pad token: <|endoftext|>
Tokenizer pad token ID: 50256
Model pad token ID: 50256


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
WandbCallback
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss


VBox(children=(Label(value='0.020 MB of 0.020 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇███
train/grad_norm,█▇▇▇▆▄▃▂▄▃▂▂▃▂▂▃▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/learning_rate,▁▁▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▇▇▇██████
train/loss,▇▇▇▇▆▇██▇▆▅▄▅▄▅▃▃▄▃▅▅▃▁▂▃▂▂▂▂▂▂▂▂▂▃▁▂▂▂▁

0,1
total_flos,209033625600000.0
train/epoch,0.018
train/global_step,100.0
train/grad_norm,3.82202
train/learning_rate,1e-05
train/loss,2.9543
train_loss,4.1593
train_runtime,34.1454
train_samples_per_second,11.715
train_steps_per_second,2.929


Model saved locally to ./local_models/GPT-Valkyrie_RMSN-124m__FFNonly__Billsum


README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Model pushed to HuggingFace Hub: shng2025/GPT-Valkyrie_RMSN-124m__FFNonly__Billsum, branch: different-armadillo-1
Training completed for all variants and normalization types.


In [None]:
!ls

# Trail of Local Checkpoint

In [None]:
from transformers import GPT2TokenizerFast, GPT2LMHeadModel

# Path to the checkpoint directory
checkpoint_dir = "./results/LN/baseModel/checkpoint-10"

# Load the tokenizer
tokenizer = GPT2TokenizerFast.from_pretrained(checkpoint_dir)
tokenizer.pad_token = tokenizer.eos_token  # Ensure pad token is set

# Load the model
model = GPT2LMHeadModel.from_pretrained(checkpoint_dir)
model.config.pad_token_id = tokenizer.eos_token_id  # Set pad token ID

In [None]:
import torch

# Function to generate a summary for a single article
def generate_summary(article_text, max_length=128, num_beams=4, early_stopping=True):
    """
    Generate a summary for the given article text.

    Args:
        article_text (str): The article to summarize.
        max_length (int): The maximum length of the summary.
        num_beams (int): Number of beams for beam search.
        early_stopping (bool): Whether to stop the beam search when at least num_beams sentences are finished per batch.

    Returns:
        str: The generated summary.
    """
    # Prepare the input by adding the prefix if used during training
    input_text = "summarize: " + article_text
    inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)

    # Generate summary (move to GPU if available)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    inputs = inputs.to(device)

    summary_ids = model.generate(
        inputs,
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=early_stopping,
        no_repeat_ngram_size=3,  # To avoid repetition
    )

    # Decode the generated summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Example usage
article = """
The government has announced a new policy aimed at reducing carbon emissions by 40% over the next decade.
This initiative includes investing in renewable energy sources, implementing stricter regulations on industrial pollution,
and promoting electric vehicles. Environmentalists have welcomed the move, citing it as a significant step towards combating climate change.
However, some industries have expressed concerns about the economic impact and the feasibility of meeting these targets.
"""

summary = generate_summary(article)
print("Generated Summary:\n", summary)


# Checking Loaded Repos

In [4]:
!pip install evaluate sacrebleu rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=a460ad583b35ac750c1ada48ab16fbee3a6d4478edaf8c77127d06713e8e178e
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [27]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Choose one of your model variants
model_name = "shng2025/GPT-Valkyrie_LN-124m__baseModel__Billsum"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Ensure the tokenizer has a pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def generate_summary(text, max_input_length=1024, max_new_tokens=150):
    # Prepare the input text with the "summarize: " prefix
    prompt = f"summarize: {text}"

    # Tokenize the input
    inputs = tokenizer(prompt, return_tensors="pt", max_length=max_input_length, truncation=True, padding=True)

    # Generate the summary
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            do_sample=True,
            temperature=0.7
        )

    # Decode the generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Remove the "summarize: " prefix and the input text from the generated text
    summary = generated_text[len(prompt):].strip()
    return summary

# Sample text to summarize
text1 = "The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country."

# Generate summary
summary1 = generate_summary(text1)

print("Original text 1:")
print(text1)
print("\nGenerated summary 1:")
print(summary1)

# Test with a different input
text2 = "The Inflation Reduction Act aims to curb inflation by reducing the deficit, lowering prescription drug prices, and investing in domestic energy production while promoting clean energy. The bill is designed to raise $739 billion in revenue through strategies such as establishing a 15% corporate minimum tax and improving IRS tax enforcement."

# Generate summary for the second text
summary2 = generate_summary(text2)

print("\nOriginal text 2:")
print(text2)
print("\nGenerated summary 2:")
print(summary2)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Original text 1:
The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country.

Generated summary 1:
SEC. 2. INCREASED INCOME TAX ON THE COST OF THE ENTIRE HOUSE.

   (a) In General.--Subpart D of part IV of subchapter A of chapter 1 
of the Internal Revenue Code of 1986 (relating to additional itemized ____) is amended by adding at the end ____________ the following new section:
 _____  ``(d) Allowance of Credit.--
In lieu of any tax imposed by this chapter for the taxable year for ____________________ ______, the Secretary may forgive any increased tax under this _______  $100,000.00 for each $1,500,00  TITLE III--COST FOR THE DISTRICT OF

Original text 2:
The Inflation Reduction Act aims to curb inflation by reducing the deficit, lowering prescription drug prices, and investin

# Load Metrics

In [45]:
import torch
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
from datasets import load_dataset
from tqdm import tqdm
import evaluate
import sacrebleu
import numpy as np

def load_model_and_tokenizer(model_name):
    tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id
    return model, tokenizer

def generate_summary(model, tokenizer, text, max_input_length=874, max_summary_length=150):
    prompt = f"summarize: {text} summary:"
    inputs = tokenizer(prompt, return_tensors="pt", max_length=max_input_length, truncation=True)
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_summary_length,
            num_return_sequences=1,
            no_repeat_ngram_size=3,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
        )

    full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract summary after 'summary:'
    generated_summary = full_output.split("summary:")[-1].strip()

    return generated_summary

def compute_metrics(predictions, labels):
    rouge = evaluate.load("rouge")
    rouge_result = rouge.compute(predictions=predictions, references=labels, use_stemmer=True)

    bleu_scores = sacrebleu.corpus_bleu(predictions, [labels], smooth_method='exp', smooth_value=0.1, force=True, lowercase=True, tokenize='13a')
    bleu_score = bleu_scores.score

    result = {
        "rouge1": rouge_result["rouge1"].mid.fmeasure,
        "rouge2": rouge_result["rouge2"].mid.fmeasure,
        "rougeL": rouge_result["rougeL"].mid.fmeasure,
        "bleu": bleu_score
    }

    prediction_lens = [len(pred.split()) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    result = {k: round(v, 4) for k, v in result.items()}
    return result

def evaluate_model(model_name, dataset):
    print(f"Evaluating model: {model_name}")
    model, tokenizer = load_model_and_tokenizer(model_name)
    model.to('cuda' if torch.cuda.is_available() else 'cpu')
    model.eval()

    generated_summaries = []
    reference_summaries = []

    for i, example in enumerate(tqdm(dataset, desc="Generating summaries")):
        print(f"\nProcessing example {i+1}")
        generated_summary = generate_summary(model, tokenizer, example['text'])
        if generated_summary:
            generated_summaries.append(generated_summary)
            print(f"Generated summary (label):\n{generated_summary}\n")
            reference_summaries.append(example['summary'])
            print(f"Original summary (label):\n{example['summary']}\n")
        else:
            print(f"Warning: Empty summary generated")

        if i == 4:  # Process only 5 examples for quick debugging
            break

    if not generated_summaries:
        print(f"Warning: No valid summaries generated for {model_name}")
        return None

    scores = compute_metrics(generated_summaries, reference_summaries)

    print(f"\nScores for {model_name}:")
    for metric, score in scores.items():
        print(f"{metric}: {score:.4f}")
    print("\n")

    return scores

# Load the BillSum dataset (ca_test split)
dataset = load_dataset("billsum", split="ca_test")

# List of models to evaluate
models_to_evaluate = [
    "shng2025/GPT-Valkyrie_LN-124m__baseModel__Billsum",
    # Add other model variants here
]

# Evaluate each model
results = {}
for model_name in models_to_evaluate:
    results[model_name] = evaluate_model(model_name, dataset)

# Print overall comparison
print("Overall Comparison:")
for model_name, scores in results.items():
    if scores:
        print(f"{model_name}:")
        for metric, score in scores.items():
            print(f"  {metric}: {score:.4f}")
        print()
    else:
        print(f"{model_name}: No valid scores")

# Optionally, save results to a file
import json
with open('model_comparison_results.json', 'w') as f:
    json.dump(results, f, indent=2)
print("Results saved to model_comparison_results.json")

Evaluating model: shng2025/GPT-Valkyrie_LN-124m__baseModel__Billsum


Generating summaries:   0%|          | 0/1237 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Processing example 1


Generating summaries:   0%|          | 1/1237 [00:24<8:15:44, 24.07s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated summary (label):
summarize: The people of the State of California do enact as follows:


SECTION 1.
The Legislature finds and declares all of the following:
(a) (1) Since 1899 congressionally chartered veterans’ organizations have provided a valuable service to our nation’s returning service members. These organizations help preserve the memories and incidents of the great hostilities fought by our nation, and preserve and strengthen comradeship among members.
(2) These veterans’ organizations also own and manage various properties including lodges, posts, and fraternal halls. These properties act as a safe haven where veterans of all ages and their families can gather together to find camaraderie and fellowship, share stories, and seek support from people who understand their unique experiences. This aids in the healing process for these returning veterans, and ensures their health and happiness.
(b) As a result of congressional chartering of these veterans’ organizations, t

Generating summaries:   0%|          | 2/1237 [00:42<7:07:25, 20.77s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated summary (label):
summarize: The people of the State of California do enact as follows:


SECTION 1.
Section 1170.02 is added to the Penal Code, to read:
1170.02.
A prisoner is not eligible for resentence or recall pursuant to subdivision (e) of Section 1170 if he or she was convicted of first-degree murder if the victim was a peace officer, as defined in Section 830.1, 830.2, 830.3, 830.31, 830.32, 830.33, 830.34, 830.35, 830.36, 830.37, 830.4, 830.5, 830.6, 830.10, 830.11, or 830.12, who was killed while engaged in the performance of his or her duties, and the individual knew, or reasonably should have known, that the victim was a peace officer engaged in the performance of his or her duties, or the victim was a peace officer or a former peace officer under any of the above-enumerated sections, and was intentionally killed in retaliation for the performance of his or her official duties.
SEC. 2.
Section 3550 of the Penal Code is amended to read:
3550.
(a) Notwithstanding any

Generating summaries:   0%|          | 3/1237 [01:03<7:04:55, 20.66s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated summary (label):
summarize: The people of the State of California do enact as follows:


SECTION 1.
The Legislature finds and declares all of the following:
(a) California has long been known as the land of opportunity, the republic of the future. But for too many of its residents the future is receding. Inequality continues to rise — even though California has one of the most progressive tax structures in the nation.
(b) Something more is needed; a new philosophy of governance that focuses on the overall progressive outcome that can be achieved through modernizing our tax system and investing in the means of upward mobility, above all job creating infrastructure and public higher education for our increasingly youthful population.
(c) Beyond these foundations, building and sustaining a middle class means new jobs with good wages. Small businesses, like plumbing contractors, auto repair shops, and restaurants that account for over 90 percent of the state’s businesses and well

Generating summaries:   0%|          | 4/1237 [01:23<7:00:29, 20.46s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated summary (label):
summarize: The people of the State of California do enact as follows:


SECTION 1.
Section 75220 of the Public Resources Code is amended to read:
75220.
(a) The Transit and Intercity Rail Capital Program is hereby created to fund transformative capital improvements, as defined in subdivision (d), that will modernize California’s intercity, commuter, and urban rail systems and bus and ferry transit systems to achieve all of the following policy objectives:
(1) Reduce emissions of greenhouse gases.
(2) Expand and improve transit service to increase ridership.
(3) Integrate the rail service of the state’s various rail operators, including integration with the high-speed rail system.
(4) Improve transit safety.
(b) The Transportation Agency shall evaluate applications consistent with the criteria set forth in this part and approve a multiyear program of projects for funding pursuant to Section 75224, which may be revised as necessary.
(c) The California Transport

Generating summaries:   0%|          | 4/1237 [01:42<8:45:42, 25.58s/it]

Generated summary (label):
summarize: The people of the State of California do enact as follows:


SECTION 1.
This act shall be known, and may be cited, as the California POLST eRegistry Pilot Act.
SEC. 2.
Section 4788 is added to the Probate Code, to read:
4788.
(a)    For purposes of this section:
(1) “Authority” means the Emergency Medical Services Authority.
(2) “Authorized user” means a person authorized by the authority to submit information to, or to receive information from, the POLST eRegistry Pilot, including health care providers, as defined in Section 4781, and their designees.
(3) “POLST” means a Physician Orders for Life Sustaining Treatment that fulfills the requirements, in any format, of Section 4780.
(4) “POLST eRegistry Pilot” means the California POLST eRegistry Pilot Act established pursuant to this section to make electronic, in addition to other modes of submission and transmission, POLST information available to authorized users.
(b) (1) The authority shall esta




AttributeError: 'numpy.float64' object has no attribute 'mid'

Certainly! I'll explain how your original fine-tuning code works, focusing on the input and output process. Let's break it down step by step:

1. Data Preparation:
   Your `load_billsum()` function loads the BillSum dataset, combining 'train' and 'test' splits for training, and using 'ca_test' as the test set.

2. Tokenization and Preprocessing:
   The `preprocess_function()` is key here. It does the following:
   
   a. Adds a "summarize: " prefix to each input text.
   b. Tokenizes the input (text + prefix) with a max length of 1024 tokens.
   c. Tokenizes the summary (label) with a max length of 128 tokens.
   d. Replaces padding token IDs in the labels with -100 (to be ignored by the loss function).

3. Model Input:
   During training, the model receives:
   - Input IDs: Tokenized text with "summarize: " prefix (max 1024 tokens)
   - Attention mask: Indicating which tokens are padding and which are real input
   - Labels: Tokenized summary (max 128 tokens), with padding replaced by -100

4. Training Process:
   The model is trained to predict the next token given the previous tokens. It learns to generate the summary tokens after the "summarize: " prefix and the input text.

5. Loss Calculation:
   The model computes loss only on the summary part (where labels are not -100), effectively learning to generate summaries.

6. Generation (Inference):
   During inference, the model is given the input text with the "summarize: " prefix and is expected to generate the summary tokens.

The low training loss (1.6) suggests that the model was learning to predict the next tokens in the summaries reasonably well. However, there could be a few reasons why the output quality might seem poor during evaluation:

1. Overfitting: The model might have memorized training examples but struggles with generalization.

2. Prompt Format: If the evaluation script doesn't use the exact same "summarize: " prefix, the model might get confused.

3. Generation Parameters: The quality of generated summaries can be highly sensitive to parameters like temperature, top_k, top_p, etc.

4. Evaluation Metric Mismatch: The training loss measures token prediction accuracy, while human-perceived quality or ROUGE scores measure different aspects of summarization quality.

5. Limited Fine-tuning: With only 100 steps of fine-tuning, the model might not have fully adapted to the summarization task.

To diagnose the issue:

1. Check if the evaluation script uses the same "summarize: " prefix.
2. Try generating summaries with different decoding strategies (e.g., adjust temperature, use beam search).
3. Manually inspect a few generated summaries to understand what type of errors the model is making.
4. Consider increasing the number of training steps or epochs.

Would you like me to provide a modified evaluation script that aligns more closely with your training process?