In [1]:
!pip install transformers wandb rouge_score
!pip install datasets==2.21.0



In [3]:
import torch
from transformers import GPT2TokenizerFast, GPT2ForSequenceClassification, DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
from rouge_score import rouge_scorer
import wandb
from transformers.integrations import WandbCallback

# Load CNN/DailyMail dataset
def load_cnn_dailymail():
    dataset = load_dataset("cnn_dailymail", "3.0.0")
    return dataset

# Initialize tokenizer
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Preprocess function
def preprocess_function(examples):
    inputs = [doc + " TL;DR: " for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")

    labels = tokenizer(examples["highlights"], max_length=128, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

# Load and preprocess the dataset
dataset = load_cnn_dailymail()
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)

# Function to freeze layers based on variant type
def freeze_layers(model, variant_type):
    if variant_type == "noNorm":
        for name, param in model.named_parameters():
            if "ln" in name:
                param.requires_grad = False
    elif variant_type == "AttnOnly":
        for name, param in model.named_parameters():
            if "ln_2" in name:  # Freeze FFN layer norm
                param.requires_grad = False
    elif variant_type == "FFNonly":
        for name, param in model.named_parameters():
            if "ln_1" in name:  # Freeze attention layer norm
                param.requires_grad = False
    # For baseModel, we don't freeze any layers

# Compute metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge scores
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = [scorer.score(label, pred) for label, pred in zip(decoded_labels, decoded_preds)]

    return {
        "rouge1": np.mean([score['rouge1'].fmeasure for score in scores]),
        "rouge2": np.mean([score['rouge2'].fmeasure for score in scores]),
        "rougeL": np.mean([score['rougeL'].fmeasure for score in scores]),
    }

# Fine-tuning function
def fine_tune_model(model, tokenizer, dataset, output_dir, variant, norm_type):
    # Initialize wandb run
    wandb.init(project=f"GPT-Valkyrie_{norm_type}-124m__{variant}__CNN-DM", reinit=True)
    run_name = wandb.run.name

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=500,
        save_steps=1000,
        load_best_model_at_end=True,
        report_to="wandb",
        run_name=run_name,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"],
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=compute_metrics,
        callbacks=[WandbCallback()],
    )

    trainer.train()
    wandb.finish()
    return trainer.model, run_name

# Main training loop
variants = ["baseModel", "noNorm", "AttnOnly", "FFNonly"]
norm_types = ["LN", "RMSN"]

for norm_type in norm_types:
    for variant in variants:
        print(f"Processing {norm_type} {variant} model...")

        # Use the correct base model for each variant
        model_path = f"shng2025/GPT-Valkyrie_{norm_type}-124m__{variant}__"
        model = GPT2ForSequenceClassification.from_pretrained(model_path)

        freeze_layers(model, variant)

        output_dir = f"./results/{norm_type}/{variant}"
        fine_tuned_model, run_name = fine_tune_model(model, tokenizer, tokenized_datasets, output_dir, variant, norm_type)

        # Save the model locally
        local_save_dir = f"./local_models/GPT-Valkyrie_{norm_type}-124m__{variant}__CNN-DM"
        fine_tuned_model.save_pretrained(local_save_dir)
        tokenizer.save_pretrained(local_save_dir)
        print(f"Model saved locally to {local_save_dir}")

        # Push the model to your HuggingFace Hub repository
        new_repo_name = f"shng2025/GPT-Valkyrie_{norm_type}-124m__{variant}__CNN-DM"
        fine_tuned_model.push_to_hub(new_repo_name, branch=run_name)
        tokenizer.push_to_hub(new_repo_name, branch=run_name)
        print(f"Model pushed to HuggingFace Hub: {new_repo_name}, branch: {run_name}")

print("Training completed for all variants and normalization types.")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/257M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/259M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

Processing LN baseModel model...


config.json:   0%|          | 0.00/751 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at shng2025/GPT-Valkyrie_LN-124m__baseModel__ and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
WandbCallback


AssertionError: Cannot handle batch sizes > 1 if no padding token is defined.