In [1]:
# --- Install Libraries ---
!pip install transformers datasets peft torchinfo timm -q
!pip install nvidia-ml-py3 -q
!pip install ipywidgets -q

In [2]:
# --- Imports ---
import os
import torch
import torch.nn as nn
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    Trainer, TrainingArguments,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset
from torchinfo import summary
from peft import LoraConfig, get_peft_model
from timm.layers import LayerNorm2d

In [3]:
# --- Setup Tokenizer and Base Model ---
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained("distilgpt2")
model.config.pad_token_id = tokenizer.pad_token_id
model.gradient_checkpointing_enable()

In [4]:
# --- Dynamic Tanh ---
class DynamicTanh(nn.Module):
    def __init__(self, normalized_shape, channels_last, alpha_init_value=0.5):
        super().__init__()
        self.normalized_shape = normalized_shape
        self.alpha_init_value = alpha_init_value
        self.channels_last = channels_last

        self.alpha = nn.Parameter(torch.ones(1) * alpha_init_value)
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))

    def forward(self, x):
        x = torch.tanh(self.alpha * x)
        if self.channels_last:
            x = x * self.weight + self.bias
        else:
            x = x * self.weight[:, None, None] + self.bias[:, None, None]
        return x

    def extra_repr(self):
        return f"normalized_shape={self.normalized_shape}, alpha_init_value={self.alpha_init_value}, channels_last={self.channels_last}"


def convert_ln_to_dyt(module):
    module_output = module
    if isinstance(module, nn.LayerNorm):
        module_output = DynamicTanh(module.normalized_shape, not isinstance(module, LayerNorm2d))
    for name, child in module.named_children():
        module_output.add_module(name, convert_ln_to_dyt(child))
    del module
    return module_output

In [5]:
model = convert_ln_to_dyt(model)

In [6]:
# 1. Freeze all base model parameters
for param in model.parameters():
    param.requires_grad = False

# 2. Unfreeze DyT parameters
for module in model.modules():
    if isinstance(module, DynamicTanh):
        for param in module.parameters():
            param.requires_grad = True


In [7]:
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): DynamicTanh(normalized_shape=(768,), alpha_init_value=0.5, channels_last=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): DynamicTanh(normalized_shape=(768,), alpha_init_value=0.5, channels_last=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): DynamicTanh(normalized_shape=(768,), alpha_init_value=0.5, channels_last=True)
  )
  (lm_head): Linear(in_features=768, out_features=5

In [8]:
for name, param in model.named_parameters():
    print(f"{name:60} requires_grad = {param.requires_grad}")


transformer.wte.weight                                       requires_grad = False
transformer.wpe.weight                                       requires_grad = False
transformer.h.0.ln_1.alpha                                   requires_grad = True
transformer.h.0.ln_1.weight                                  requires_grad = True
transformer.h.0.ln_1.bias                                    requires_grad = True
transformer.h.0.attn.c_attn.weight                           requires_grad = False
transformer.h.0.attn.c_attn.bias                             requires_grad = False
transformer.h.0.attn.c_proj.weight                           requires_grad = False
transformer.h.0.attn.c_proj.bias                             requires_grad = False
transformer.h.0.ln_2.alpha                                   requires_grad = True
transformer.h.0.ln_2.weight                                  requires_grad = True
transformer.h.0.ln_2.bias                                    requires_grad = True
transforme

In [9]:
trainable = 0
frozen = 0
for name, param in model.named_parameters():
    if param.requires_grad:
        trainable += param.numel()
    else:
        frozen += param.numel()

print(f"Trainable params: {trainable:,}")
print(f"Frozen params:    {frozen:,}")
print(f"Total params:     {trainable + frozen:,}")


Trainable params: 19,981
Frozen params:    81,892,608
Total params:     81,912,589


In [10]:
print("Trainable parameters:")
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)


Trainable parameters:
transformer.h.0.ln_1.alpha
transformer.h.0.ln_1.weight
transformer.h.0.ln_1.bias
transformer.h.0.ln_2.alpha
transformer.h.0.ln_2.weight
transformer.h.0.ln_2.bias
transformer.h.1.ln_1.alpha
transformer.h.1.ln_1.weight
transformer.h.1.ln_1.bias
transformer.h.1.ln_2.alpha
transformer.h.1.ln_2.weight
transformer.h.1.ln_2.bias
transformer.h.2.ln_1.alpha
transformer.h.2.ln_1.weight
transformer.h.2.ln_1.bias
transformer.h.2.ln_2.alpha
transformer.h.2.ln_2.weight
transformer.h.2.ln_2.bias
transformer.h.3.ln_1.alpha
transformer.h.3.ln_1.weight
transformer.h.3.ln_1.bias
transformer.h.3.ln_2.alpha
transformer.h.3.ln_2.weight
transformer.h.3.ln_2.bias
transformer.h.4.ln_1.alpha
transformer.h.4.ln_1.weight
transformer.h.4.ln_1.bias
transformer.h.4.ln_2.alpha
transformer.h.4.ln_2.weight
transformer.h.4.ln_2.bias
transformer.h.5.ln_1.alpha
transformer.h.5.ln_1.weight
transformer.h.5.ln_1.bias
transformer.h.5.ln_2.alpha
transformer.h.5.ln_2.weight
transformer.h.5.ln_2.bias
transf

In [11]:
# --- Summary ---
from torchinfo import summary

summary(model, input_size=(1, 128), dtypes=[torch.int64])

Layer (type:depth-idx)                             Output Shape              Param #
GPT2LMHeadModel                                    [1, 12, 128, 64]          --
├─GPT2Model: 1-1                                   [1, 12, 128, 64]          --
│    └─Embedding: 2-1                              [1, 128, 768]             (38,597,376)
│    └─Embedding: 2-2                              [1, 128, 768]             (786,432)
│    └─Dropout: 2-3                                [1, 128, 768]             --
│    └─ModuleList: 2-4                             --                        --
│    │    └─GPT2Block: 3-1                         [1, 128, 768]             7,087,874
│    │    └─GPT2Block: 3-2                         [1, 128, 768]             7,087,874
│    │    └─GPT2Block: 3-3                         [1, 128, 768]             7,087,874
│    │    └─GPT2Block: 3-4                         [1, 128, 768]             7,087,874
│    │    └─GPT2Block: 3-5                         [1, 128, 768]      

In [12]:
for name, module in model.named_modules():
    if "fc" in name:
        print(name)


transformer.h.0.mlp.c_fc
transformer.h.1.mlp.c_fc
transformer.h.2.mlp.c_fc
transformer.h.3.mlp.c_fc
transformer.h.4.mlp.c_fc
transformer.h.5.mlp.c_fc


In [13]:
# --- PEFT Config (LoRA Injection) ---
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["c_attn", "c_fc", "c_proj"],  # DistilGPT2 uses c_attn in attention, # c_attn + MLP layers
    task_type="CAUSAL_LM",
)

In [14]:
peft_model = get_peft_model(model, peft_config)



In [15]:
for name, param in peft_model.named_parameters():
    if "lora" in name and param.requires_grad:
        print(name)

base_model.model.transformer.h.0.attn.c_attn.lora_A.default.weight
base_model.model.transformer.h.0.attn.c_attn.lora_B.default.weight
base_model.model.transformer.h.0.attn.c_proj.lora_A.default.weight
base_model.model.transformer.h.0.attn.c_proj.lora_B.default.weight
base_model.model.transformer.h.0.mlp.c_fc.lora_A.default.weight
base_model.model.transformer.h.0.mlp.c_fc.lora_B.default.weight
base_model.model.transformer.h.0.mlp.c_proj.lora_A.default.weight
base_model.model.transformer.h.0.mlp.c_proj.lora_B.default.weight
base_model.model.transformer.h.1.attn.c_attn.lora_A.default.weight
base_model.model.transformer.h.1.attn.c_attn.lora_B.default.weight
base_model.model.transformer.h.1.attn.c_proj.lora_A.default.weight
base_model.model.transformer.h.1.attn.c_proj.lora_B.default.weight
base_model.model.transformer.h.1.mlp.c_fc.lora_A.default.weight
base_model.model.transformer.h.1.mlp.c_fc.lora_B.default.weight
base_model.model.transformer.h.1.mlp.c_proj.lora_A.default.weight
base_model

In [16]:
print("Trainable parameters after PEFT injection:")
peft_model.print_trainable_parameters()


Trainable parameters after PEFT injection:
trainable params: 589,824 || all params: 82,502,413 || trainable%: 0.7149


In [17]:
# --- Model Summary ---
summary(peft_model, input_size=(1, 128), dtypes=[torch.int64])

Layer (type:depth-idx)                                       Output Shape              Param #
PeftModelForCausalLM                                         [1, 12, 128, 64]          --
├─LoraModel: 1-1                                             [1, 12, 128, 64]          --
│    └─GPT2LMHeadModel: 2-1                                  --                        --
│    │    └─GPT2Model: 3-1                                   [1, 12, 128, 64]          82,502,413
│    │    └─Linear: 3-2                                      [1, 128, 50257]           (38,597,376)
Total params: 121,099,789
Trainable params: 589,824
Non-trainable params: 120,509,965
Total mult-adds (G): 81.71
Input size (MB): 0.00
Forward/backward pass size (MB): 148.39
Params size (MB): 484.40
Estimated Total Size (MB): 632.79

In [18]:
# --- Load Alpaca Dataset ---
dataset = dataset = load_dataset("chardizard/modified-rewild", split="train")

In [19]:
print(dataset)

Dataset({
    features: ['prompt', 'response'],
    num_rows: 319842
})


In [20]:
print(dataset.features)

{'prompt': Value(dtype='string', id=None), 'response': Value(dtype='string', id=None)}


In [21]:
def preprocess(batch):
    full_texts = [prompt.strip() + "\n\n" + response.strip()
                  for prompt, response in zip(batch['prompt'], batch['response'])]

    tokenized_batch = tokenizer(
        full_texts,
        truncation=True,
        padding="max_length",
        max_length=256,
    )

    # Add labels for causal LM (same as input_ids)
    tokenized_batch["labels"] = tokenized_batch["input_ids"].copy()

    return tokenized_batch


In [22]:
tokenized_dataset = dataset.map(
    preprocess,
    batched=True,
    remove_columns=dataset.column_names,
    desc="Tokenizing modified RE-WILD"
)

Tokenizing modified RE-WILD:   0%|          | 0/319842 [00:00<?, ? examples/s]

In [23]:
print(tokenized_dataset[0])

{'input_ids': [43380, 117, 10310, 233, 165, 251, 95, 32573, 247, 162, 106, 113, 37863, 227, 22522, 117, 32573, 249, 26193, 234, 165, 247, 235, 34932, 235, 171, 120, 248, 164, 241, 251, 165, 251, 249, 162, 252, 250, 17739, 115, 17312, 231, 46479, 225, 32573, 249, 162, 114, 230, 44293, 244, 23513, 26193, 98, 17739, 227, 163, 119, 112, 37955, 163, 112, 254, 163, 255, 231, 43291, 18796, 101, 16764, 164, 241, 251, 165, 251, 249, 162, 252, 250, 40792, 28938, 104, 17312, 231, 32014, 34932, 237, 21410, 25465, 47078, 35050, 252, 250, 165, 227, 116, 171, 120, 234, 31965, 117, 26344, 104, 42468, 162, 253, 254, 162, 103, 105, 165, 227, 116, 161, 240, 234, 164, 233, 117, 162, 252, 250, 165, 227, 116, 163, 255, 231, 31965, 102, 164, 112, 101, 21410, 28938, 104, 34932, 237, 31965, 117, 26344, 104, 165, 45865, 171, 120, 234, 22522, 225, 20015, 105, 164, 95, 104, 21689, 19526, 241, 33699, 222, 28938, 116, 162, 242, 114, 20015, 98, 28938, 236, 171, 120, 234, 47797, 121, 46479, 225, 32573, 249, 47797, 22

In [24]:
for i in range(5):
    print(f"Original prompt:\n{dataset[i]['prompt']}")
    print(f"Original response:\n{dataset[i]['response']}")
    print("\nTokenized and Decoded Text:")
    print(tokenizer.decode(tokenized_dataset[i]['input_ids'], skip_special_tokens=True))
    print("=" * 80)


Original prompt:
对下面这段内容进行降重：蓝靛果具有促进消化、补充维生素等作用。蓝靛果中含有大量的天然果酸，特别是柠檬酸和苹果酸等物质的含量特别高，它们被人体所吸收以后，能促进胃部消化液分泌，也能加快肠胃蠕动，提高肠胃的消化能力，对消化不良以及腹部胀痛，都有很好的预防和缓解作用。
Original response:
蓝靛果有助于消化和补充维生素。它富含多种天然果酸，如柠檬酸和苹果酸，这些成分在人体内可以促进胃液分泌，加速肠道蠕动，从而增强消化功能，对于消化不良和腹胀等问题有良好的预防和缓解效果。

Tokenized and Decoded Text:
对下面这段内容进行降重：蓝靛果具有促进消化、补充维生素等作用。蓝靛果中含有大量的天然果酸，特别是柠檬酸和苹果酸等物质的含量特别高，它们被人体所吸收以后，能促进胃部消化液分泌，也能加快肠胃蠕动，提高肠胃的消化能力，对消化不良以及�
Original prompt:
translate this into english please: Nein, aber wir erstellen einen wöchentlichen Statusbericht, in dem Sie die aktuellen Fortschritte sehen können.
Original response:
No, but we create a weekly status report where you can see the current progress.

Tokenized and Decoded Text:
translate this into english please: Nein, aber wir erstellen einen wöchentlichen Statusbericht, in dem Sie die aktuellen Fortschritte sehen können.

No, but we create a weekly status report where you can see the current progress.
Original prompt:
(I was worried about first attending this school. After

In [25]:
split_datasets = tokenized_dataset.train_test_split(test_size=0.05, seed=42)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

In [26]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [31]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=500,  # Evaluate every 500 steps
    save_strategy="steps",
    save_steps=1000,  # Save every 1000 steps
    save_total_limit=2,  # Keep last 2 checkpoints
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    warmup_ratio=0.1,
    # warmup_steps=500,
    lr_scheduler_type="cosine",  # Smooth LR decay
    optim="adamw_torch",
    weight_decay=0.01,
    learning_rate=5e-5, # SMALL LR for GPT-2 finetuning
    fp16=True,
    logging_dir="./logs",
    logging_steps=100,
    report_to="none",  # disable WandB
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [32]:
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [33]:
torch.cuda.synchronize()
torch.cuda.empty_cache() 

In [34]:
trainer.train()

Step,Training Loss,Validation Loss
500,2.0808,8.461587
1000,2.1452,8.447293
1500,2.1535,8.511703
2000,2.1307,8.44915
2500,2.1135,8.450711
3000,2.1472,8.377264
3500,2.1226,8.370788
4000,2.054,8.357649
4500,2.1301,8.418232
5000,2.1308,8.352201




TrainOutput(global_step=18991, training_loss=2.097398328176011, metrics={'train_runtime': 4835.6917, 'train_samples_per_second': 62.835, 'train_steps_per_second': 3.927, 'total_flos': 2.012397313635072e+16, 'train_loss': 2.097398328176011, 'epoch': 1.0})

In [35]:
import copy
dyt_log_history = copy.deepcopy(trainer.state.log_history)

In [36]:
import math

final_loss = trainer.evaluate()["eval_loss"]
final_perplexity = math.exp(final_loss)
print(f"Final Evaluation Perplexity: {final_perplexity}")




Final Evaluation Perplexity: 3921.053169695442


In [37]:
# --- Setup Tokenizer and Base Model ---
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained("distilgpt2")
model.config.pad_token_id = tokenizer.pad_token_id

print(model)


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [38]:


for name, param in model.named_parameters():
    print(f"{name:60} requires_grad = {param.requires_grad}")

trainable = 0
frozen = 0
for name, param in model.named_parameters():
    if param.requires_grad:
        trainable += param.numel()
    else:
        frozen += param.numel()

print(f"Trainable params: {trainable:,}")
print(f"Frozen params:    {frozen:,}")
print(f"Total params:     {trainable + frozen:,}")

print("Trainable parameters:")
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)

# --- Summary ---
from torchinfo import summary

summary(model, input_size=(1, 128), dtypes=[torch.int64])

for name, module in model.named_modules():
    if "fc" in name:
        print(name)

transformer.wte.weight                                       requires_grad = True
transformer.wpe.weight                                       requires_grad = True
transformer.h.0.ln_1.weight                                  requires_grad = True
transformer.h.0.ln_1.bias                                    requires_grad = True
transformer.h.0.attn.c_attn.weight                           requires_grad = True
transformer.h.0.attn.c_attn.bias                             requires_grad = True
transformer.h.0.attn.c_proj.weight                           requires_grad = True
transformer.h.0.attn.c_proj.bias                             requires_grad = True
transformer.h.0.ln_2.weight                                  requires_grad = True
transformer.h.0.ln_2.bias                                    requires_grad = True
transformer.h.0.mlp.c_fc.weight                              requires_grad = True
transformer.h.0.mlp.c_fc.bias                                requires_grad = True
transformer.h.0.

In [39]:


# --- PEFT Config (LoRA Injection) ---
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["c_attn", "c_fc", "c_proj"],  # DistilGPT2 uses c_attn in attention, # c_attn + MLP layers
    task_type="CAUSAL_LM",
)

peft_model = get_peft_model(model, peft_config)

for name, param in peft_model.named_parameters():
    if "lora" in name and param.requires_grad:
        print(name)

print("Trainable parameters after PEFT injection:")
peft_model.print_trainable_parameters()

# --- Model Summary ---
summary(peft_model, input_size=(1, 128), dtypes=[torch.int64])

# --- Load the Dataset ---
dataset = dataset = load_dataset("chardizard/modified-rewild", split="train")

print(dataset)

print(dataset.features)



base_model.model.transformer.h.0.attn.c_attn.lora_A.default.weight
base_model.model.transformer.h.0.attn.c_attn.lora_B.default.weight
base_model.model.transformer.h.0.attn.c_proj.lora_A.default.weight
base_model.model.transformer.h.0.attn.c_proj.lora_B.default.weight
base_model.model.transformer.h.0.mlp.c_fc.lora_A.default.weight
base_model.model.transformer.h.0.mlp.c_fc.lora_B.default.weight
base_model.model.transformer.h.0.mlp.c_proj.lora_A.default.weight
base_model.model.transformer.h.0.mlp.c_proj.lora_B.default.weight
base_model.model.transformer.h.1.attn.c_attn.lora_A.default.weight
base_model.model.transformer.h.1.attn.c_attn.lora_B.default.weight
base_model.model.transformer.h.1.attn.c_proj.lora_A.default.weight
base_model.model.transformer.h.1.attn.c_proj.lora_B.default.weight
base_model.model.transformer.h.1.mlp.c_fc.lora_A.default.weight
base_model.model.transformer.h.1.mlp.c_fc.lora_B.default.weight
base_model.model.transformer.h.1.mlp.c_proj.lora_A.default.weight
base_model

In [40]:

def preprocess(batch):
    full_texts = [prompt.strip() + "\n\n" + response.strip()
                  for prompt, response in zip(batch['prompt'], batch['response'])]

    tokenized_batch = tokenizer(
        full_texts,
        truncation=True,
        padding="max_length",
        max_length=512,
    )

    # Add labels for causal LM (same as input_ids)
    tokenized_batch["labels"] = tokenized_batch["input_ids"].copy()

    return tokenized_batch

tokenized_dataset = dataset.map(
    preprocess,
    batched=True,
    remove_columns=dataset.column_names,
    desc="Tokenizing modified RE-WILD"
)

print(tokenized_dataset[0])

for i in range(5):
    print(f"Original prompt:\n{dataset[i]['prompt']}")
    print(f"Original response:\n{dataset[i]['response']}")
    print("\nTokenized and Decoded Text:")
    print(tokenizer.decode(tokenized_dataset[i]['input_ids'], skip_special_tokens=True))
    print("=" * 80)

split_datasets = tokenized_dataset.train_test_split(test_size=0.05, seed=42)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']


{'input_ids': [43380, 117, 10310, 233, 165, 251, 95, 32573, 247, 162, 106, 113, 37863, 227, 22522, 117, 32573, 249, 26193, 234, 165, 247, 235, 34932, 235, 171, 120, 248, 164, 241, 251, 165, 251, 249, 162, 252, 250, 17739, 115, 17312, 231, 46479, 225, 32573, 249, 162, 114, 230, 44293, 244, 23513, 26193, 98, 17739, 227, 163, 119, 112, 37955, 163, 112, 254, 163, 255, 231, 43291, 18796, 101, 16764, 164, 241, 251, 165, 251, 249, 162, 252, 250, 40792, 28938, 104, 17312, 231, 32014, 34932, 237, 21410, 25465, 47078, 35050, 252, 250, 165, 227, 116, 171, 120, 234, 31965, 117, 26344, 104, 42468, 162, 253, 254, 162, 103, 105, 165, 227, 116, 161, 240, 234, 164, 233, 117, 162, 252, 250, 165, 227, 116, 163, 255, 231, 31965, 102, 164, 112, 101, 21410, 28938, 104, 34932, 237, 31965, 117, 26344, 104, 165, 45865, 171, 120, 234, 22522, 225, 20015, 105, 164, 95, 104, 21689, 19526, 241, 33699, 222, 28938, 116, 162, 242, 114, 20015, 98, 28938, 236, 171, 120, 234, 47797, 121, 46479, 225, 32573, 249, 47797, 22

In [42]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)


from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=500,  # Evaluate every 500 steps
    save_strategy="steps",
    save_steps=1000,  # Save every 1000 steps
    save_total_limit=2,  # Keep last 2 checkpoints
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    warmup_ratio=0.1,
    # warmup_steps=500,
    lr_scheduler_type="cosine",  # Smooth LR decay
    optim="adamw_torch",
    weight_decay=0.01,
    learning_rate=5e-5, # SMALL LR for GPT-2 finetuning
    fp16=True,
    logging_dir="./logs",
    logging_steps=100,
    report_to="none",  # disable WandB
)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss
500,0.7565,3.037802
1000,0.7727,2.992818
1500,0.7581,2.945789
2000,0.7508,2.898295
2500,0.7371,2.86351
3000,0.7427,2.836487
3500,0.747,2.826399




KeyboardInterrupt: 

In [None]:
norm_log_history = copy.deepcopy(trainer.state.log_history)

import math

final_loss = trainer.evaluate()["eval_loss"]
final_perplexity = math.exp(final_loss)
print(f"Final Evaluation Perplexity: {final_perplexity}")

In [None]:
import math
norm_perplexity = [math.exp(l) for l in norm_eval_losses]
dyt_perplexity = [math.exp(l) for l in dyt_eval_losses]

In [None]:
!pip install matplotlib -q
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Plot combined loss
import matplotlib.pyplot as plt

# Norm logs
norm_steps = [entry['step'] for entry in norm_log_history if 'loss' in entry and 'learning_rate' in entry]
norm_losses = [entry['loss'] for entry in norm_log_history if 'loss' in entry and 'learning_rate' in entry]

# DyT logs
dyt_steps = [entry['step'] for entry in dyt_log_history if 'loss' in entry and 'learning_rate' in entry]
dyt_losses = [entry['loss'] for entry in dyt_log_history if 'loss' in entry and 'learning_rate' in entry]

plt.figure(figsize=(8,5))
plt.plot(norm_steps, norm_losses, label="Norm Training Loss", color='blue')
plt.plot(dyt_steps, dyt_losses, label="DyT Training Loss", color='green')
plt.xlabel("Training Steps")
plt.ylabel("Loss")
plt.title("Training Loss: DyT vs Normalization")
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# Norm eval logs
norm_eval_steps = [r['step'] for r in norm_eval_log if 'eval_loss' in r]
norm_eval_losses = [r['eval_loss'] for r in norm_eval_log if 'eval_loss' in r]

# DyT eval logs
dyt_eval_steps = [r['step'] for r in dyt_eval_log if 'eval_loss' in r]
dyt_eval_losses = [r['eval_loss'] for r in dyt_eval_log if 'eval_loss' in r]

# Plot
plt.figure(figsize=(8,5))
plt.plot(norm_eval_steps, norm_eval_losses, label='Norm Eval Loss', color='orange')
plt.plot(dyt_eval_steps, dyt_eval_losses, label='DyT Eval Loss', color='green')
plt.xlabel('Training Steps')
plt.ylabel('Validation Loss')
plt.title('Validation Loss: DyT vs Norm')
plt.grid(True)
plt.legend()
plt.show()

In [None]:
plt.figure(figsize=(8,5))
plt.plot(train_loss_steps, train_losses, label='Training Loss', color='blue')
plt.plot(eval_loss_steps, eval_losses, label='Validation Loss', color='orange')
plt.xlabel('Training Steps')
plt.ylabel('Loss')
plt.title('Training and Validation Loss vs Steps')
plt.grid(True)
plt.legend()
plt.show()


In [None]:
# Norm logs
norm_train_steps = [r['step'] for r in norm_logs if 'loss' in r and 'learning_rate' in r]
norm_train_losses = [r['loss'] for r in norm_logs if 'loss' in r and 'learning_rate' in r]
norm_eval_steps = [r['step'] for r in norm_logs if 'eval_loss' in r]
norm_eval_losses = [r['eval_loss'] for r in norm_logs if 'eval_loss' in r]

# DyT logs
dyt_train_steps = [r['step'] for r in dyt_logs if 'loss' in r and 'learning_rate' in r]
dyt_train_losses = [r['loss'] for r in dyt_logs if 'loss' in r and 'learning_rate' in r]
dyt_eval_steps = [r['step'] for r in dyt_logs if 'eval_loss' in r]
dyt_eval_losses = [r['eval_loss'] for r in dyt_logs if 'eval_loss' in r]


import matplotlib.pyplot as plt

plt.figure(figsize=(10,6))

# Norm model
plt.plot(norm_train_steps, norm_train_losses, label='Norm Training Loss', color='blue', linestyle='-')
plt.plot(norm_eval_steps, norm_eval_losses, label='Norm Validation Loss', color='blue', linestyle='--')

# DyT model
plt.plot(dyt_train_steps, dyt_train_losses, label='DyT Training Loss', color='green', linestyle='-')
plt.plot(dyt_eval_steps, dyt_eval_losses, label='DyT Validation Loss', color='green', linestyle='--')

plt.xlabel('Training Steps')
plt.ylabel('Loss')
plt.title('Training and Validation Loss: Norm vs DyT')
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
norm_perplexities = [math.exp(l) for l in norm_eval_losses]
dyt_perplexities = [math.exp(l) for l in dyt_eval_losses]

plt.figure(figsize=(10,6))
plt.plot(norm_eval_steps, norm_perplexities, label='Norm Validation Perplexity', color='blue', linestyle='--')
plt.plot(dyt_eval_steps, dyt_perplexities, label='DyT Validation Perplexity', color='green', linestyle='--')
plt.xlabel('Training Steps')
plt.ylabel('Perplexity')
plt.title('Validation Perplexity: Norm vs DyT')
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
total_params = sum(p.numel() for p in peft_model.parameters())
trainable_params = sum(p.numel() for p in peft_model.parameters() if p.requires_grad)
frozen_params = total_params - trainable_params

labels = ['Trainable (LoRA)', 'Frozen']
sizes = [trainable_params, frozen_params]
colors = ['#ff9999','#66b3ff']

plt.figure(figsize=(6,6))
plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140)
plt.title('Parameter Distribution After LoRA Injection')
plt.axis('equal')
plt.show()


In [None]:
def generate_response(prompt, model, tokenizer, max_new_tokens=100):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    model = model.to(device)

    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
            num_return_sequences=1
        )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return decoded


In [None]:
prompt = "Translate to English: Je suis étudiant à NYU."

dyt_output = generate_response(prompt, dyt_model, tokenizer)
norm_output = generate_response(prompt, norm_model, tokenizer)

print("DyT Model Output:\n", dyt_output)
print("\nNorm Model Output:\n", norm_output)


In [None]:
# For DyT model
trainer.save_model("./distilgpt2_dyt_peft")
tokenizer.save_pretrained("./distilgpt2_dyt_peft")

# For Norm model
trainer.save_model("./distilgpt2_norm_peft")
tokenizer.save_pretrained("./distilgpt2_norm_peft")


In [None]:
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("./distilgpt2_dyt_peft")

# Load base model
base_model = AutoModelForCausalLM.from_pretrained("distilgpt2")

# Load fine-tuned PEFT model (DyT)
model = PeftModel.from_pretrained(base_model, "./distilgpt2_dyt_peft")

# Set pad token
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

print("DyT model and tokenizer loaded!")


tokenizer = AutoTokenizer.from_pretrained("./distilgpt2_norm_peft")
base_model = AutoModelForCausalLM.from_pretrained("distilgpt2")
model = PeftModel.from_pretrained(base_model, "./distilgpt2_norm_peft")
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id
print("Norm model and tokenizer loaded!")


In [None]:
def generate_response(prompt, model, tokenizer, max_new_tokens=100):
    import torch
    from transformers import AutoTokenizer

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id

    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.95,
            num_return_sequences=1
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [None]:
# For DyT
dyt_response = generate_response("Translate to English: Je suis étudiant.", dyt_model, dyt_tokenizer)

# For Norm
norm_response = generate_response("Translate to English: Je suis étudiant.", norm_model, norm_tokenizer)


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Example loading DyT model
tokenizer_dyt = AutoTokenizer.from_pretrained("./distilgpt2_dyt_peft")
base_model_dyt = AutoModelForCausalLM.from_pretrained("distilgpt2")
dyt_model = PeftModel.from_pretrained(base_model_dyt, "./distilgpt2_dyt_peft")

# Likewise for Norm


In [None]:
prompt = "Explain how rainbows are formed."

# Generate response from Norm model
norm_response = generate_response(prompt, norm_model, norm_tokenizer)
print("=== Norm Model Response ===")
print(norm_response)

print("\n")

# Generate response from DyT model
dyt_response = generate_response(prompt, dyt_model, dyt_tokenizer)
print("=== DyT Model Response ===")
print(dyt_response)
