In [1]:
!pip install -q transformers datasets accelerate


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.9 -m pip install --upgrade pip[0m


# Baseline

In [None]:
import torch

torch.cuda.synchronize()
torch.cuda.empty_cache()  # Clears the cache
torch.cuda.reset_peak_memory_stats()  #

from transformers import AutoModelForCausalLM, AutoTokenizer, \
    TrainingArguments, Trainer, logging, DataCollatorForLanguageModeling
from datasets import load_dataset
from pynvml import *

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")

print("💎 Before training:")
print_gpu_utilization()

# Suppress less critical logs
logging.set_verbosity_error()

# Load the dataset with both training and evaluation splits
train_dataset = load_dataset("roneneldan/TinyStories", split="train[:100]")
eval_dataset = load_dataset("roneneldan/TinyStories", split="train[100:200]")

HF_cardname = "openai-community/gpt2"
# HF_cardname = "openai-community/gpt2-medium"
# HF_cardname = "openai-community/gpt2-large"
# HF_cardname = "openai-community/gpt2-XL"
# HF_cardname = "facebook/opt-125m"
# HF_cardname = "facebook/opt-350m"
# HF_cardname = "facebook/opt-1.3b"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(HF_cardname, use_fast=False)
EOS_TOKEN = tokenizer.eos_token

# Ensure the tokenizer has a padding token, set EOS_TOKEN as padding token if not present
if tokenizer.pad_token is None:
    tokenizer.pad_token = EOS_TOKEN

# Function to process the dataset
def formatting_func(examples):
    inputs = [tokenizer(text + EOS_TOKEN, truncation=True, max_length=512, padding="max_length", return_tensors="pt") for text in examples['text']]
    return {'input_ids': [input['input_ids'].squeeze() for input in inputs], 'labels': [input['input_ids'].squeeze() for input in inputs]}

# Process the datasets
processed_train_dataset = train_dataset.map(formatting_func, batched=True, remove_columns=["text"])
processed_eval_dataset = eval_dataset.map(formatting_func, batched=True, remove_columns=["text"])

# Initialize Data Collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

print("💎 Dataset loaded")
print_gpu_utilization()

# Define and load the model
model = AutoModelForCausalLM.from_pretrained(HF_cardname)

print("💎 Model loaded")
print_gpu_utilization()

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
)

# Initialize the trainer with the data collator
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_eval_dataset,
    data_collator=data_collator
)

# Start training
trainer.train()

def print_summary(trainer):
    # Access training result metrics directly from the trainer state
    print(f"💎 Training time: {trainer.state.log_history[-1]['train_runtime']:.2f} seconds")
    print(f"Samples/second: {trainer.state.log_history[-1]['train_samples_per_second']:.2f}")
    print_gpu_utilization()
    num_params = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad)
    print(f"Total Trainable Parameters: {num_params}")

print_summary(trainer)

💎 Before training:
GPU memory occupied: 5700 MB.
💎 Dataset loaded
GPU memory occupied: 5700 MB.


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

💎 Model loaded
GPU memory occupied: 5700 MB.




{'eval_loss': 2.3189377784729004, 'eval_runtime': 4.0651, 'eval_samples_per_second': 24.6, 'eval_steps_per_second': 3.198, 'epoch': 1.0}
{'eval_loss': 2.273000717163086, 'eval_runtime': 4.1252, 'eval_samples_per_second': 24.241, 'eval_steps_per_second': 3.151, 'epoch': 2.0}
{'eval_loss': 2.2519288063049316, 'eval_runtime': 4.1712, 'eval_samples_per_second': 23.974, 'eval_steps_per_second': 3.117, 'epoch': 3.0}
{'eval_loss': 2.2411301136016846, 'eval_runtime': 4.203, 'eval_samples_per_second': 23.792, 'eval_steps_per_second': 3.093, 'epoch': 4.0}
{'eval_loss': 2.2386133670806885, 'eval_runtime': 4.1096, 'eval_samples_per_second': 24.333, 'eval_steps_per_second': 3.163, 'epoch': 5.0}
{'train_runtime': 105.6071, 'train_samples_per_second': 4.735, 'train_steps_per_second': 0.615, 'train_loss': 2.203471491887019, 'epoch': 5.0}
💎 Training time: 105.61 seconds
Samples/second: 4.74
GPU memory occupied: 12080 MB.
Total Trainable Parameters: 124439808


In [None]:
# Encode the prompt text
input_ids = tokenizer.encode(
    "A long time ago in a galaxy far far away...",
    return_tensors="pt").cuda()

# Generate text using the model
output_ids = model.generate(input_ids, max_length=100)

# Decode the generated ids to text
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Print the generated text
print(generated_text)

A long time ago in a galaxy far far away...

A long time ago in a galaxy far away... A long time ago, a young girl named Lily was playing with her toy.

Lily was so excited! She wanted to play with her toy! But she couldn't. She couldn't play with her toy. She couldn't play with her toy. She couldn't play with her toy. She couldn't play with her toy!

Lily was so scared!


# 🤖 EXPERIMENT: training gpt2-like model with conv1d filters from scratch

In [None]:
print(input_ids)
short_ids = input_ids[:, :5]
print(short_ids)

NameError: name 'input_ids' is not defined

In [None]:
pred_vec1 = model(input_ids)[0][:, 0, :]
pred_vec2 = model(short_ids)[0][:, 0, :]
print(pred_vec1)
print(pred_vec2)
print(torch.linalg.norm(pred_vec1 - pred_vec2))

tensor([[-32.2106, -31.9089, -34.7183, -34.6814, -33.1329, -33.6744, -32.2762,
         -32.9669, -30.9345, -34.0401, -33.3299, -28.8423, -29.7336, -28.3824,
         -31.4148, -33.5976, -32.4900, -32.8397, -33.2518, -33.3086, -33.6257,
         -33.9383, -33.6765, -33.9673, -34.0025, -30.5542, -31.7219, -34.7232,
         -33.8429, -34.1196, -32.4132, -34.9633, -32.8016, -33.3249, -33.7593,
         -33.5905, -34.0067, -34.0373, -33.6612, -33.8630]], device='cuda:0',
       grad_fn=<SliceBackward0>)
tensor([[-32.2107, -31.9090, -34.7184, -34.6815, -33.1330, -33.6745, -32.2762,
         -32.9670, -30.9346, -34.0401, -33.3300, -28.8423, -29.7336, -28.3825,
         -31.4148, -33.5976, -32.4901, -32.8397, -33.2519, -33.3086, -33.6258,
         -33.9384, -33.6766, -33.9674, -34.0026, -30.5543, -31.7220, -34.7233,
         -33.8430, -34.1196, -32.4133, -34.9633, -32.8016, -33.3250, -33.7594,
         -33.5906, -34.0068, -34.0374, -33.6613, -33.8631]], device='cuda:0',
       grad_fn=<Slice

In [None]:
len(model(input_ids)[1])

12

In [None]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
long_input = tokenizer.encode(
    "A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\
    A long time ago in a galaxy far far away...\.",
    return_tensors="pt").cuda()

print(long_input.shape)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
model(long_input)[0].shape

torch.Size([1, 1202, 50257])

In [None]:
config = model.config

# Maximum sequence length
max_seq_len = config.n_positions
print(f"Maximum sequence length: {max_seq_len}")


Maximum sequence length: 1024


In [None]:
import torch
import torch.nn as nn
from transformers import GPT2Model, GPT2PreTrainedModel, GPT2Config, GPT2LMHeadModel

from transformers import AutoModelForCausalLM, AutoTokenizer, \
    TrainingArguments, Trainer, logging, DataCollatorForLanguageModeling
from datasets import load_dataset
from pynvml import *
import random
import numpy as np

def fix_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

fix_seed()  # Apply the fixed seed

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")

print("💎 Before training:")
print_gpu_utilization()

# Suppress less critical logs
logging.set_verbosity_error()

# Load the dataset with both training and evaluation splits
train_dataset = load_dataset("roneneldan/TinyStories", split="train[:100]")
eval_dataset = load_dataset("roneneldan/TinyStories", split="train[100:200]")

HF_cardname = "openai-community/gpt2"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(HF_cardname, use_fast=False)
EOS_TOKEN = tokenizer.eos_token

# Ensure the tokenizer has a padding token, set EOS_TOKEN as padding token if not present
if tokenizer.pad_token is None:
    tokenizer.pad_token = EOS_TOKEN

# Function to process the dataset
def formatting_func(examples):
    inputs = [tokenizer(text + EOS_TOKEN, truncation=True, max_length=512, padding="max_length", return_tensors="pt") for text in examples['text']]
    return {'input_ids': [input['input_ids'].squeeze() for input in inputs], 'labels': [input['input_ids'].squeeze() for input in inputs]}

# Process the datasets
fix_seed()
processed_train_dataset = train_dataset.map(formatting_func, batched=True, remove_columns=["text"])
processed_eval_dataset = eval_dataset.map(formatting_func, batched=True, remove_columns=["text"])

# Initialize Data Collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

print("💎 Dataset loaded")
print_gpu_utilization()

class CustomGPT2Model(GPT2LMHeadModel):
    def __init__(self, config):
        super().__init__(config)

        # Define the custom layers
        self.conv1d = nn.Conv1d(in_channels=config.hidden_size, out_channels=config.hidden_size, kernel_size=2, stride=2)
        # self.conv1d = nn.Identity(27)
        self.conv_transpose1d = nn.ConvTranspose1d(in_channels=config.hidden_size, out_channels=config.hidden_size, kernel_size=2, stride=2)
        # self.conv_transpose1d = nn.Identity(27)

        # GPT2Model is the transformer part of the GPT2 model
        self.transformer = GPT2Model(config)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, labels=None, use_cache=None, output_attentions=None, output_hidden_states=None, return_dict=None):
        # Embedding layer
        if position_ids is None:
            position_ids = torch.arange(input_ids.size(-1), dtype=torch.long, device=input_ids.device)
            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)

        inputs_embeds = self.transformer.wte(input_ids)
        position_embeds = self.transformer.wpe(position_ids)
        hidden_states = inputs_embeds + position_embeds

        # Custom conv1d layer: Reduce sequence length by half
        hidden_states = hidden_states.permute(0, 2, 1)  # Change to shape (batch_size, emb_dim, seq_len)
        hidden_states = self.conv1d(hidden_states)
        hidden_states = hidden_states.permute(0, 2, 1)  # Change back to shape (batch_size, seq_len/2, emb_dim)

        # Usual transformer forward pass on reduced sequence length
        transformer_outputs = self.transformer(
            inputs_embeds=hidden_states,  # Use the processed hidden states here
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=None,
            head_mask=head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )

        hidden_states = transformer_outputs[0]  # Output of the transformer

        # Custom conv_transpose1d layer: Revert the sequence length to the original size
        hidden_states = hidden_states.permute(0, 2, 1)  # Change to shape (batch_size, emb_dim, seq_len/2)
        hidden_states = self.conv_transpose1d(hidden_states)
        hidden_states = hidden_states.permute(0, 2, 1)  # Change back to shape (batch_size, seq_len, emb_dim)

        # The final output projection layer
        lm_logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            shift_logits = lm_logits[:, :-1, :].contiguous()
            shift_labels = labels[:, 1:].contiguous()
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        if not return_dict:
            output = (lm_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return {'loss': loss, 'logits': lm_logits, 'hidden_states': transformer_outputs.hidden_states, 'attentions': transformer_outputs.attentions}

# Initialize the custom model
config = GPT2Config.from_pretrained(HF_cardname)
model = CustomGPT2Model(config)

# Tokenizer remains the same
tokenizer = AutoTokenizer.from_pretrained(HF_cardname, use_fast=False)
fix_seed()

# Continue with the training setup as in your original code
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
)
fix_seed()

# Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_eval_dataset,
    data_collator=data_collator
)

# Evaluate the model before training
print("💎 Evaluating model before training:")
fix_seed()
eval_results = trainer.evaluate()
print(f"Pre-training evaluation loss: {eval_results['eval_loss']:.4f}")

# Start training
trainer.train()

def print_summary(trainer):
    # Access training result metrics directly from the trainer state
    print(f"💎 Training time: {trainer.state.log_history[-1]['train_runtime']:.2f} seconds")
    print(f"Samples/second: {trainer.state.log_history[-1]['train_samples_per_second']:.2f}")
    print_gpu_utilization()
    num_params = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad)
    print(f"Total Trainable Parameters: {num_params}")

print_summary(trainer)


💎 Before training:
GPU memory occupied: 11808 MB.
💎 Dataset loaded
GPU memory occupied: 11808 MB.




💎 Evaluating model before training:
{'eval_loss': 10.8544340133667, 'eval_runtime': 2.9063, 'eval_samples_per_second': 34.407, 'eval_steps_per_second': 4.473}
Pre-training evaluation loss: 10.8544
{'eval_loss': 10.120983123779297, 'eval_runtime': 3.0421, 'eval_samples_per_second': 32.872, 'eval_steps_per_second': 4.273, 'epoch': 1.0}
{'eval_loss': 9.829243659973145, 'eval_runtime': 3.0655, 'eval_samples_per_second': 32.621, 'eval_steps_per_second': 4.241, 'epoch': 2.0}
{'eval_loss': 9.551410675048828, 'eval_runtime': 2.9699, 'eval_samples_per_second': 33.671, 'eval_steps_per_second': 4.377, 'epoch': 3.0}
{'eval_loss': 9.31762981414795, 'eval_runtime': 2.9191, 'eval_samples_per_second': 34.257, 'eval_steps_per_second': 4.453, 'epoch': 4.0}
{'eval_loss': 9.231056213378906, 'eval_runtime': 2.867, 'eval_samples_per_second': 34.879, 'eval_steps_per_second': 4.534, 'epoch': 5.0}
{'train_runtime': 81.6397, 'train_samples_per_second': 6.124, 'train_steps_per_second': 0.796, 'train_loss': 9.707

In [None]:
model.

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2SdpaAttention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

# 🤖 Experiment: training gpt2 model without HF trainer with explicit PyTorch model

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AdamW, AutoConfig, AutoModelForCausalLM
from transformers.modeling_outputs import CausalLMOutputWithPast
from datasets import load_dataset
from pynvml import *

# Custom GELU activation (gelu_new)
def gelu_new(x):
    return 0.5 * x * (1.0 + torch.tanh(torch.sqrt(2.0 / torch.pi) * (x + 0.044715 * torch.pow(x, 3))))

# Custom Transformer block with gelu_new activation
class GPT2Block(nn.Module):
    def __init__(self, config):
        super(GPT2Block, self).__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.attn = nn.MultiheadAttention(config.n_embd, config.n_head, dropout=config.attn_pdrop)
        self.ln_2 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.mlp = nn.Sequential(
            nn.Linear(config.n_embd, config.n_inner),
            nn.ReLU() if config.activation_function == "relu" else gelu_new,
            nn.Linear(config.n_inner, config.n_embd),
            nn.Dropout(config.resid_pdrop),
        )

    def forward(self, x, layer_past=None, attention_mask=None, head_mask=None, use_cache=False, output_attentions=False):
        attn_output, attn_weights = self.attn(self.ln_1(x), self.ln_1(x), self.ln_1(x), need_weights=output_attentions, attn_mask=attention_mask)
        x = x + attn_output
        x = x + self.mlp(self.ln_2(x))
        return x, attn_weights

# Custom GPT-2 Model definition
class GPT2Model(nn.Module):
    def __init__(self, config):
        super(GPT2Model, self).__init__()
        self.config = config
        self.embed_tokens = nn.Embedding(config.vocab_size, config.n_embd)
        self.embed_positions = nn.Embedding(config.max_position_embeddings, config.n_embd)
        self.layers = nn.ModuleList([GPT2Block(config) for _ in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

    def forward(self, input_ids, attention_mask=None, labels=None):
        input_shape = input_ids.size()
        device = input_ids.device

        inputs_embeds = self.embed_tokens(input_ids)
        position_ids = torch.arange(0, input_shape[-1], dtype=torch.long, device=device).unsqueeze(0)
        position_embeds = self.embed_positions(position_ids)

        hidden_states = inputs_embeds + position_embeds

        for layer in self.layers:
            hidden_states, _ = layer(hidden_states)

        hidden_states = self.ln_f(hidden_states)
        logits = self.head(hidden_states)

        loss = None
        if labels is not None:
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits
        )


# Load pretrained weights from Hugging Face
def load_pretrained_gpt2_weights(model, pretrained_model_name):
    # Load the pretrained model from Hugging Face
    pretrained_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name)

    # Transfer the weights
    model_dict = model.state_dict()
    pretrained_dict = pretrained_model.state_dict()

    pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
    model_dict.update(pretrained_dict)
    model.load_state_dict(model_dict)

    return model


def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")

print("💎 Before training:")
print_gpu_utilization()

# Load the dataset with both training and evaluation splits
train_dataset = load_dataset("roneneldan/TinyStories", split="train[:100]")
eval_dataset = load_dataset("roneneldan/TinyStories", split="train[100:200]")

HF_cardname = "openai-community/gpt2"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(HF_cardname, use_fast=False)
EOS_TOKEN = tokenizer.eos_token

# Ensure the tokenizer has a padding token, set EOS_TOKEN as padding token if not present
if tokenizer.pad_token is None:
    tokenizer.pad_token = EOS_TOKEN

# Function to process the dataset
def formatting_func(examples):
    inputs = [tokenizer(text + EOS_TOKEN, truncation=True, max_length=512, padding="max_length", return_tensors="pt") for text in examples['text']]
    return {'input_ids': torch.stack([input['input_ids'].squeeze() for input in inputs]),
            'labels': torch.stack([input['input_ids'].squeeze() for input in inputs])}

# Process the datasets
processed_train_dataset = train_dataset.map(formatting_func, batched=True, remove_columns=["text"])
processed_eval_dataset = eval_dataset.map(formatting_func, batched=True, remove_columns=["text"])

# Create DataLoaders
train_dataloader = DataLoader(processed_train_dataset, batch_size=8, shuffle=True)
eval_dataloader = DataLoader(processed_eval_dataset, batch_size=8)

# Load the configuration for GPT-2
config = AutoConfig.from_pretrained(HF_cardname)

# Define the model using the custom architecture
model = GPT2Model(config)
model = load_pretrained_gpt2_weights(model, HF_cardname)
model = model.cuda()

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Training loop
num_epochs = 5
model.train()

print("💎 Model and optimizer initialized")
print_gpu_utilization()

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    running_loss = 0.0
    for batch in train_dataloader:
        inputs = batch['input_ids'].cuda()
        labels = batch['labels'].cuda()

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward + backward + optimize
        outputs = model(input_ids=inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    avg_loss = running_loss / len(train_dataloader)
    print(f"Training loss: {avg_loss:.4f}")

    # Evaluation step
    model.eval()
    eval_loss = 0.0
    with torch.no_grad():
        for batch in eval_dataloader:
            inputs = batch['input_ids'].cuda()
            labels = batch['labels'].cuda()

            outputs = model(input_ids=inputs, labels=labels)
            eval_loss += outputs.loss.item()

    avg_eval_loss = eval_loss / len(eval_dataloader)
    print(f"Validation loss: {avg_eval_loss:.4f}")
    model.train()

# Training summary
def print_summary():
    print("💎 Training completed")
    print_gpu_utilization()
    num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total Trainable Parameters: {num_params}")

print_summary()


💎 Before training:
GPU memory occupied: 12080 MB.


TypeError: empty() received an invalid combination of arguments - got (tuple, dtype=NoneType, device=NoneType), but expected one of:
 * (tuple of ints size, *, tuple of names names, torch.memory_format memory_format, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)
 * (tuple of ints size, *, torch.memory_format memory_format, Tensor out, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)


# NanoGPT

In [4]:
from transformers import GPT2LMHeadModel, set_seed, pipeline

model_hf = GPT2LMHeadModel.from_pretrained("gpt2")
sd_hf = model_hf.state_dict()

for k, v in sd_hf.items():
    print(k, v.shape)

transformer.wte.weight torch.Size([50257, 768])
transformer.wpe.weight torch.Size([1024, 768])
transformer.h.0.ln_1.weight torch.Size([768])
transformer.h.0.ln_1.bias torch.Size([768])
transformer.h.0.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.0.attn.c_attn.bias torch.Size([2304])
transformer.h.0.attn.c_proj.weight torch.Size([768, 768])
transformer.h.0.attn.c_proj.bias torch.Size([768])
transformer.h.0.ln_2.weight torch.Size([768])
transformer.h.0.ln_2.bias torch.Size([768])
transformer.h.0.mlp.c_fc.weight torch.Size([768, 3072])
transformer.h.0.mlp.c_fc.bias torch.Size([3072])
transformer.h.0.mlp.c_proj.weight torch.Size([3072, 768])
transformer.h.0.mlp.c_proj.bias torch.Size([768])
transformer.h.1.ln_1.weight torch.Size([768])
transformer.h.1.ln_1.bias torch.Size([768])
transformer.h.1.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.1.attn.c_attn.bias torch.Size([2304])
transformer.h.1.attn.c_proj.weight torch.Size([768, 768])
transformer.h.1.attn.c_proj.bias 

In [9]:
generator = pipeline('text-generation', model='gpt2')
set_seed(42)
generator("No, Luke, I am your ", max_length=30, num_return_sequences=5)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'No, Luke, I am your ___________!"\n\n"Get up!" I yelled, dragging my feet with me. I wasn\'t the'},
 {'generated_text': 'No, Luke, I am your \xa0friend." You had to come after him. What about this?!\n"There\'s no turning back…'},
 {'generated_text': 'No, Luke, I am your ______________.\n\nYour father says, "Luke! I need you to go to that place and'},
 {'generated_text': 'No, Luke, I am your ~~"\n\n"And you\'re not my, aha!!" Aang roared\n\nHis face turned'},
 {'generated_text': 'No, Luke, I am your _____, all of you!"\n\n"Oh, God! How dare you let even a tiny baby!"'}]

## gpt2 from scratch

In [3]:
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
import tiktoken

class CausalSelfAttention(nn.Module):

    def __init__(self, config):
        super().__init__()
        # to split embeddings dimension across attention heads
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd, 3*config.n_embd)
        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        # regularization
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        # not really a 'bias', more of a mask, but following the OpenAI/HF naming
        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
                             .view(1, 1, config.block_size, config.block_size))
        
    def forward(self, x):
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
        # calculate query, key, values for all heads in batch and move head forward to be 
        # nh is "number of heads", hs is "head size" and C (number of channels) = nh * hs
        # e.g. in GPT-2 (124M) n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)

        # attention (materializes the large (T, T) matrix for all the queries and keys)
        # att = (q @ k.transpose(-2, -1)) * (1.0/math.sqrt(k.size(-1)))
        # att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
        # att = F.softmax(att, dim=-1)
        # y = att @ v
        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)

        y = y.transpose(1, 2).contiguous().view(B, T, C)
        # output projection
        y = self.c_proj(y)
        return y

class MLP(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4*config.n_embd)
        self.gelu = nn.GELU(approximate="tanh")
        self.c_proj = nn.Linear(4*config.n_embd, config.n_embd)

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x


class Block(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

@dataclass
class GPTConfig:
    block_size: int = 1024 # max sequence length
    vocab_size: int = 50257 # number of tokens: 50,000 BPE merges + 256 bytes tokens + 1 <|endoftext|> token
    n_layer: int = 12 # number of layers
    n_head: int = 12 # number of heads
    n_embd: int = 768 # embedding dimension

class GPT(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            h  = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
            ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # weight sharing scheme
        self.transformer.wte.weight = self.lm_head.weight

    def forward(self, idx):
        # idx is of shape (B, T)
        B, T = idx.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is {self.config.block_size}"
        # forward the token and position embeddings
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
        pos_emb = self.transformer.wpe(pos) # position emeddings of shape (T, n_embd)
        tok_emb = self.transformer.wte(idx) # token emeddings of shape (B, T, n_embd)
        x = tok_emb + pos_emb
        # forward the blocks of transformer
        for block in self.transformer.h:
            x = block(x)
        # forward the final layernorm and the classifier
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x) # (B, T, vocab_size)
        return logits
    
    @classmethod
    def from_pretrained(cls, model_type):
        """Loads pretrained GPT-2 model weights from huggingface"""
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        from transformers import GPT2LMHeadModel
        print("loading weights from pretrained gpt: %s" % model_type)

        # n_layer, n_head and n_embd are determined from model_type
        config_args = {
            'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
            'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
            'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
            'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
        }[model_type]
        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
        # create a from-scratch initialized minGPT model
        config = GPTConfig(**config_args)
        model = GPT(config)
        sd = model.state_dict()
        sd_keys = sd.keys()
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in names and shapes
        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
        # this means that we have to transpose these weights when we import them
        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model

model = GPT.from_pretrained('gpt2')
model.eval()

# attempt to autodetect device
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = "mps"

print(f"using device: {device}")

model.to(device)


enc = tiktoken.get_encoding("gpt2")
tokens = enc.encode("Без труда не вытащишь рыбку из ")
batch_size = 50
tokens = torch.tensor(tokens, dtype=torch.long)
tokens = tokens.unsqueeze(0).repeat(batch_size, 1)
x = tokens.to(device)

max_length = 50
torch.manual_seed(228)
while x.size(1) < max_length:
    logits = model(x)
    # take the logits at the last position
    logits = logits[:, -1, :]
    probs = F.softmax(logits, dim=-1)
    # do top-k sampling of 50 (default HF pipeline)
    topk_probs, topk_indices = torch.topk(probs, 10, dim=-1)
    # select a token from the top-k probabilities
    ix = torch.multinomial(topk_probs, 1)
    # gather the corresponding indices
    xcol = torch.gather(topk_indices, -1, ix)
    # append to the sequence
    x = torch.cat((x, xcol), dim=1)

# print the generated text
for i in range(batch_size):
    tokens = x[i, :max_length].tolist()
    decoded = enc.decode(tokens)
    print(">", decoded)

loading weights from pretrained gpt: gpt2
using device: mps
> Без труда не вытащишь рыбку из ти велецино 
> Без труда не вытащишь рыбку из травно идена �
> Без труда не вытащишь рыбку из что исчи зн
> Без труда не вытащишь рыбку из сератика и гр
> Без труда не вытащишь рыбку из струда сольно�
> Без труда не вытащишь рыбку из собойскогахи
> Без труда не вытащишь рыбку из седбломано п
> Без труда не вытащишь рыбку из шка не венед
> Без труда не вытащишь рыбку из этрын врезы
> Без труда не вытащишь рыбку из сомаючие, д
> Без труда не вытащишь рыбку из убрашимова.
> Без труда не вытащишь рыбку из ули всото сво
> Без труда не вытащишь рыбку из удиниительно �
> Без труда не вытащишь рыбку из тутольком, от
> Без труда не вытащишь рыбку из рейчных с �
> Без труда не вытащишь рыбку из рекжиция ут
> Без труда не вытащишь рыбку из чтоть мудово�
> Без труда не вытащишь рыбку из фесноюнияе,
> Без труда не вытащишь рыбку из треженикией
> Без труда не вытащишь рыбку из рездичние м
> Без труда не выта