### Install dependencies

In [1]:
# !pip install tiktoken --quiet

### Import libraries

In [2]:
import re
import json
import string
import tiktoken
from collections import Counter
from typing import List, Dict, Optional
import unicodedata
import threading
import time

import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional, Tuple, List
import numpy as np

from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
from tqdm import tqdm
import os

### Data Processor

In [3]:
import re
import unicodedata

class DataProcessor:
    """
    A utility class to clean and format raw text data for language model training.
    Attributes:
        endoftext_token (str): A special token appended to each processed text segment.
    """

    def __init__(self, endoftext_token: str = "<|endoftext|>"):
        self.endoftext_token = endoftext_token

    def clean_text(self, text: str) -> str:
        """
        Cleans input text by:
        - Normalizing Unicode characters to ASCII (NFKD normalization).
        - Removing all non-alphanumeric characters except spaces.
        - Collapsing multiple spaces into one.
        Args:
            text (str): The raw input text.
        Returns:
            str: The cleaned text.
        """
        # Normalize to ASCII using NFKD to remove accents/diacritics
        text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
        # Remove all characters except alphanumeric and spaces
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        # Replace multiple spaces with a single space
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def process_data(self, text: str) -> str:
        """
        Processes input text by cleaning it and appending the end-of-text token.
        Args:
            text (str): The raw input text.
        Returns:
            str: The cleaned and token-appended text.
        """
        cleaned_text = self.clean_text(text)
        processed_text = cleaned_text + f" {self.endoftext_token}"
        return processed_text


### Tiktoken Wrapper

In [4]:
from typing import List, Set
import tiktoken

class TiktokenTokenizer:
    """
    A wrapper around the TikToken tokenizer for encoding and decoding text using a specified model (e.g., GPT-2).
    Attributes:
        tokenizer: The TikToken encoding object for the specified model.
    """

    def __init__(self, model: str = "gpt2"):
        """
        Initializes the tokenizer with the encoding for the given model.
        Args:
            model (str): Name of the model to load encoding for. Default is "gpt2".
        """
        self.tokenizer = tiktoken.get_encoding(model)

    def encode(self, text: str, allowed_special: Set[str] = None) -> List[int]:
        """
        Encodes a text string into a list of token IDs, allowing specific special tokens.
        Args:
            text (str): The input string to tokenize.
            allowed_special (Set[str], optional): A set of allowed special tokens. 
                                                  Defaults to {"<|endoftext|>"} if not provided.
        Returns:
            List[int]: The list of token IDs representing the input string.
        """
        allowed_special = allowed_special or {"<|endoftext|>"}
        return self.tokenizer.encode(text, allowed_special=allowed_special)

    def decode(self, ids: List[int]) -> str:
        """
        Decodes a list of token IDs back into a string.
        Args:
            ids (List[int]): List of token IDs.
        Returns:
            str: The decoded string.
        """
        return self.tokenizer.decode(ids)

    def get_vocab_size(self) -> int:
        """
        Returns the size of the tokenizer's vocabulary
        Returns:
            int: The number of tokens in the vocabulary.
        """
        return self.tokenizer.n_vocab

### Load Text 

In [5]:
dataset_path = "./_dataset_/"
all_paths = os.listdir(dataset_path)
all_paths = sorted([os.path.join(dataset_path, path) for path in all_paths if path.endswith(".txt")])
print(all_paths)

raw_text = ""
for path in sorted(all_paths):
    with open(path, "r", encoding="utf-8") as f:
        raw_text += f.read().lower()

['./_dataset_/01 Harry Potter and the Sorcerers Stone.txt', './_dataset_/02 Harry Potter and the Chamber of Secrets.txt', './_dataset_/03 Harry Potter and the Prisoner of Azkaban.txt', './_dataset_/04 Harry Potter and the Goblet of Fire.txt', './_dataset_/05 Harry Potter and the Order of the Phoenix.txt', './_dataset_/06 Harry Potter and the Half-Blood Prince.txt', './_dataset_/07 Harry Potter and the Deathly Hallows.txt']


In [6]:
# Splits the raw_text corpus on new line character and joins it into a string
text = " ".join(raw_text.split("\n"))
len(text), text[:5000]    # Print the total character count of our text and first 5000 characters

(6285437,
 'mr. and mrs. dursley, of number four, privet drive, were proud to say that they were perfectly normal, thank you very much. they were the last people you’d expect to be involved in anything strange or mysterious, because they just didn’t hold with such nonsense.  mr. dursley was the director of a firm called grunnings, which made drills. he was a big, beefy man with hardly any neck, although he did have a very large mustache. mrs. dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors. the dursleys had a small son called dudley and in their opinion there was no finer boy anywhere.  the dursleys had everything they wanted, but they also had a secret, and their greatest fear was that somebody would discover it. they didn’t think they could bear it if anyone found out about the potters. mrs. potter was mrs. dursley’s sister, but they hadn’t met for

In [7]:
# Create an instance of data processor
processor = DataProcessor()
processed_text = processor.process_data(text)   # Clean the data

### Tokenize text corpus

In [8]:
tiktoken_tokenizer = TiktokenTokenizer()
tiktoken_encoded = tiktoken_tokenizer.encode(processed_text)
tiktoken_decoded = tiktoken_tokenizer.decode(tiktoken_encoded)

In [9]:
# Sanity check if tokenizer works fine
print("Processed text sample:", processed_text[:100])
print("Tiktoken encoded sample:", tiktoken_encoded[:10])
print("Tiktoken decoded sample:", tiktoken_decoded[:100])

Processed text sample: mr and mrs dursley of number four privet drive were proud to say that they were perfectly normal tha
Tiktoken encoded sample: [43395, 290, 285, 3808, 288, 1834, 1636, 286, 1271, 1440]
Tiktoken decoded sample: mr and mrs dursley of number four privet drive were proud to say that they were perfectly normal tha


In [10]:
# To obtain the tokenizer length and special_token token id
tiktoken_tokenizer.get_vocab_size(), tiktoken_tokenizer.encode("<|endoftext|>")

(50257, [50256])

### Config for model

In [11]:
class GPT2Config:
    """
    Configuration class for a GPT-2 model architecture.
    
    Stores all hyperparameters used to initialize and construct the GPT-2 model.
    """

    def __init__(
        self,
        vocab_size: int = 50257,             # Matches TikToken's GPT-2 tokenizer vocab size
        n_positions: int = 256,              # Maximum number of positional embeddings (i.e., sequence length)
        n_embd: int = 512,                   # Embedding and hidden layer size
        n_layer: int = 9,                    # Number of Transformer blocks
        n_head: int = 8,                     # Number of self-attention heads
        n_inner: int = None,                 # Feed-forward inner layer size (defaults to 4 * n_embd)
        activation_function: str = "gelu",   # Activation function used in MLP (e.g., gelu, relu)
        resid_pdrop: float = 0.1,            # Dropout rate for residual connections
        embd_pdrop: float = 0.1,             # Dropout rate for embeddings
        attn_pdrop: float = 0.1,             # Dropout rate within attention layers
        layer_norm_epsilon: float = 1e-5,    # Small epsilon value to avoid divide-by-zero in layer norm
        initializer_range: float = 0.02,     # Standard deviation for weight initialization
        eot_token_id: int = 50256            # Token ID representing <|endoftext|> in TikToken
    ):
        """
        Initializes the configuration with default or provided hyperparameters.
        
        Args:
            vocab_size (int): Size of the model vocabulary.
            n_positions (int): Maximum sequence length supported.
            n_embd (int): Dimension of embeddings and hidden layers.
            n_layer (int): Number of transformer layers.
            n_head (int): Number of attention heads.
            n_inner (int): Size of the feed-forward layer (defaults to 4 * n_embd).
            activation_function (str): Activation function in the MLP.
            resid_pdrop (float): Dropout probability for residual connections.
            embd_pdrop (float): Dropout probability for embeddings.
            attn_pdrop (float): Dropout probability for attention probabilities.
            layer_norm_epsilon (float): Epsilon value for layer normalization.
            initializer_range (float): Std for weight initialization.
            eot_token_id (int): Token ID used to signify end of text.
        """
        self.vocab_size = vocab_size
        self.n_positions = n_positions
        self.n_embd = n_embd
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_inner = n_inner if n_inner is not None else 4 * n_embd
        self.activation_function = activation_function
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attn_pdrop = attn_pdrop
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_range = initializer_range
        self.eot_token_id = eot_token_id

### Model Modules

In [12]:
# Multi-Head Self-Attention module used in Transformer blocks
class MultiHeadSelfAttention(nn.Module):
    def __init__(self, config: GPT2Config):
        super().__init__()
        assert config.n_embd % config.n_head == 0  # Ensure embedding size is divisible by num heads
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.head_dim = config.n_embd // config.n_head  # Dimension per head

        # Linear layer to project input into concatenated Q, K, V
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # Output projection layer
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)

        # Dropouts for attention probabilities and final projection
        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)

        # Causal mask (lower triangular) to prevent attending to future tokens
        self.register_buffer(
            "bias",
            torch.tril(torch.ones(config.n_positions, config.n_positions))
            .view(1, 1, config.n_positions, config.n_positions)
        )

    def forward(
        self,
        x: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """
        Args:
            x (Tensor): Input tensor of shape (B, T, C)
            attention_mask (Optional[Tensor]): Optional attention mask

        Returns:
            Tuple[Tensor, Optional[Tensor]]: Output tensor and attention weights
        """
        B, T, C = x.size()

        # Compute Q, K, V from input
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)

        # Reshape Q, K, V for multi-head attention
        q = q.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        k = k.view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        v = v.view(B, T, self.n_head, self.head_dim).transpose(1, 2)

        # Compute scaled dot-product attention scores
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(self.head_dim))
        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))  # Apply causal mask

        if attention_mask is not None:
            # Apply additional attention mask if provided
            att = att.masked_fill(attention_mask[:, None, None, :] == 0, float('-inf'))

        # Normalize attention weights
        att = F.softmax(att, dim=-1)
        att = self.attn_dropout(att)

        # Compute context
        y = att @ v
        y = y.transpose(1, 2).contiguous().view(B, T, C)

        # Final projection and dropout
        y = self.resid_dropout(self.c_proj(y))
        return y, att


# Feed-forward network (MLP) used in each Transformer block
class MLP(nn.Module):
    def __init__(self, config: GPT2Config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, config.n_inner)  # First linear layer
        self.c_proj = nn.Linear(config.n_inner, config.n_embd)  # Second linear layer
        self.act = nn.GELU() if config.activation_function == "gelu" else nn.ReLU()  # Activation
        self.dropout = nn.Dropout(config.resid_pdrop)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass of the MLP.

        Args:
            x (Tensor): Input tensor.

        Returns:
            Tensor: Output tensor after two linear layers and activation.
        """
        x = self.c_fc(x)
        x = self.act(x)
        x = self.c_proj(x)
        x = self.dropout(x)
        return x


# Transformer block consisting of self-attention and MLP
class Block(nn.Module):
    def __init__(self, config: GPT2Config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.attn = MultiHeadSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.mlp = MLP(config)

    def forward(
        self,
        x: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Forward pass through one transformer block.

        Args:
            x (Tensor): Input tensor.
            attention_mask (Optional[Tensor]): Optional attention mask.

        Returns:
            Tuple[Tensor, Tensor]: Output tensor and attention weights.
        """
        attn_output, attn_weights = self.attn(self.ln_1(x), attention_mask)
        x = x + attn_output  # Residual connection
        mlp_output = self.mlp(self.ln_2(x))
        x = x + mlp_output  # Residual connection
        return x, attn_weights


# GPT-2 model architecture
class GPT2Model(nn.Module):
    def __init__(self, config: GPT2Config):
        super().__init__()
        self.config = config

        # Token and positional embeddings
        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.wpe = nn.Embedding(config.n_positions, config.n_embd)
        self.drop = nn.Dropout(config.embd_pdrop)

        # Transformer layers
        self.h = nn.ModuleList([Block(config) for _ in range(config.n_layer)])
        self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)

        # Output projection head
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.lm_head.weight = self.wte.weight  # Tie input/output embeddings

        # Initialize model weights
        self.apply(self._init_weights)

    def _init_weights(self, module):
        # Initialize linear and embedding layers with normal distribution
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """
        Forward pass through the GPT-2 model.

        Args:
            input_ids (Tensor): Input token IDs (B, T)
            attention_mask (Optional[Tensor]): Optional attention mask.
            labels (Optional[Tensor]): Labels for computing loss.

        Returns:
            Tuple[Tensor, Optional[Tensor]]: Logits and optional loss.
        """
        B, T = input_ids.size()
        device = input_ids.device

        # Generate position IDs
        position_ids = torch.arange(0, T, dtype=torch.long, device=device).unsqueeze(0).expand(B, T)

        # Embed tokens and positions
        inputs_embeds = self.wte(input_ids)
        position_embeds = self.wpe(position_ids)
        x = self.drop(inputs_embeds + position_embeds)

        # Pass through transformer layers
        attn_weights = []
        for block in self.h:
            x, attn = block(x, attention_mask)
            attn_weights.append(attn)

        x = self.ln_f(x)

        # Final projection to vocab size
        logits = self.lm_head(x)

        # Compute loss if labels are provided
        loss = None
        if labels is not None:
            loss = F.cross_entropy(
                logits.view(-1, logits.size(-1)),
                labels.view(-1),
                ignore_index=-100
            )
        return logits, loss

    def generate(
        self,
        input_ids: torch.Tensor,
        max_new_tokens: int = 50,
    ) -> torch.Tensor:
        """
        Greedy token-by-token generation.

        Args:
            input_ids (Tensor): Initial input tensor.
            max_new_tokens (int): Number of tokens to generate.

        Returns:
            Tensor: Generated token IDs.
        """
        self.eval()
        batch_size = input_ids.size(0)
        generated = input_ids

        with torch.no_grad():
            for _ in range(max_new_tokens):
                logits, _ = self(generated.to('cuda'))
                next_token = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
                generated = torch.cat([generated, next_token], dim=1)
        return generated


### Create model 

In [13]:
config = GPT2Config()
model = GPT2Model(config)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GPT2Model(
  (wte): Embedding(50257, 512)
  (wpe): Embedding(256, 512)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-8): 9 x Block(
      (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (attn): MultiHeadSelfAttention(
        (c_attn): Linear(in_features=512, out_features=1536, bias=True)
        (c_proj): Linear(in_features=512, out_features=512, bias=True)
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (mlp): MLP(
        (c_fc): Linear(in_features=512, out_features=2048, bias=True)
        (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        (act): GELU(approximate='none')
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (lm_head): Linear(in_features=512, out_features=50257, bias=False)
)

#### Test model

In [18]:
prompt = "Professor McGonagall sniffed angrily".lower()
input_ids = torch.tensor([tiktoken_tokenizer.encode(prompt)], dtype=torch.long).to(device)

generated_ids = model.generate(
        input_ids,
        max_new_tokens=50,
    )

generated_text = tiktoken_tokenizer.decode(generated_ids[0].cpu().numpy().tolist())


print("Prompt:", prompt)
print("Generated text:", generated_text)

assert torch.allclose(model.wte.weight, model.lm_head.weight), "Weights are not tied!"
print("Weight tying verified.")

total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params:,}")

Prompt: professor mcgonagall sniffed angrily
Generated text: professor mcgonagall sniffed angrily antigen head head head head head head head head head head head head Show Show Show speech speech speech speech speech speech speech speech speech speech speech speech speech speech speech speech speech speech speech speech My My My history history history history history history history history history history history
Weight tying verified.
Total parameters: 54,235,136


### Training hyperparameters

In [14]:
def get_hyperparameters(config: GPT2Config):
    return {
        "learning_rate": 1e-4,  # Typical for GPT-2
        "batch_size": 4,        # Small batch size for demonstration
        "num_epochs": 300,       # Maximum epochs
        "max_seq_length": 64,  # Shorter sequences for faster training
        "patience": 5,          # Early stopping patience
        "train_split" : 0.9,
        "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        "model_config": config  # Reuse GPT2Config
    }

In [15]:
hparams = get_hyperparameters(config)

### Create dataloader

In [16]:
class TextDataset(Dataset):
    def __init__(self, tokens: List[int], max_seq_length: int):
        self.tokens = tokens
        self.max_seq_length = max_seq_length

    def __len__(self):
        return len(self.tokens) // self.max_seq_length

    def __getitem__(self, idx):
        start = idx * self.max_seq_length
        end = start + self.max_seq_length
        input_ids = torch.tensor(self.tokens[start:end], dtype=torch.long)
        target_ids = torch.tensor(self.tokens[start+1:end+1], dtype=torch.long)
        return input_ids, target_ids

In [17]:
def get_data_loaders(text: str, hparams: dict, tokenizer: TiktokenTokenizer):
    tokens = tokenizer.encode(text)

    split_idx = int(len(tokens) * hparams["train_split"])
    train_tokens = tokens[:split_idx]
    val_tokens = tokens[split_idx:]

    train_dataset = TextDataset(train_tokens, hparams["max_seq_length"])
    val_dataset = TextDataset(val_tokens, hparams["max_seq_length"])

    train_loader = DataLoader(
        train_dataset,
        batch_size=hparams["batch_size"],
        shuffle=False,
        num_workers=8
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=hparams["batch_size"],
        shuffle=False,
        num_workers=8
    )

    return train_loader, val_loader


In [18]:
train_loader, val_loader = get_data_loaders(processed_text, hparams, tiktoken_tokenizer)

### Train model

In [19]:
def train_model(model: GPT2Model, train_loader: DataLoader, val_loader: DataLoader, hparams: dict, tokenizer: TiktokenTokenizer):
    model.to(hparams["device"])
    optimizer = AdamW(model.parameters(), lr=hparams["learning_rate"])
    scheduler = CosineAnnealingLR(optimizer, T_max=500)

    best_train_loss = float('inf')
    patience_counter = 0
    best_model_path = "best_gpt2_model.pt"

    for epoch in range(hparams["num_epochs"]):
        start_time = time.time()

        # Training
        model.train()
        train_loss = 0.0
        train_steps = 0
        train_pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{hparams['num_epochs']} Training")
        for input_ids, target_ids in train_pbar:
            input_ids = input_ids.to(hparams["device"])
            target_ids = target_ids.to(hparams["device"])

            optimizer.zero_grad()
            logits, loss = model(input_ids, labels=target_ids)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_steps += 1
            train_pbar.set_postfix({"train_loss": train_loss / train_steps})

        scheduler.step()
        avg_train_loss = train_loss / train_steps

        epoch_time = time.time() - start_time

        # Print epoch results
        print(f"""Epoch {epoch+1}/{hparams['num_epochs']} | Train Loss: {avg_train_loss:.4f} | Time: {epoch_time:.2f}s""") # | Validation Loss: {avg_val_loss:.4f}""")

        # Early stopping and model saving
        if avg_train_loss < best_train_loss:
            best_val_loss = avg_train_loss
            patience_counter = 0
            torch.save(model.state_dict(), best_model_path)
            print(f"Saved best model to {best_model_path}")
        else:
            patience_counter += 1
            print(f"Patience counter: {patience_counter}/{hparams['patience']}")
            if patience_counter >= hparams["patience"]:
                print("Early stopping triggered.")
                break
        print()
    # Load best model
    model = GPT2Model(config).to(hparams["device"])
    model.load_state_dict(torch.load(best_model_path))
    print(f"Loaded best model from {best_model_path}")
    return model, best_val_loss

In [22]:
model = train_model(model, train_loader, val_loader, hparams, tiktoken_tokenizer)

Epoch 1/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=8.42]


Epoch 1/300 | Train Loss: 8.4178 | Time: 25.58s
Saved best model to best_gpt2_model.pt



Epoch 2/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=6.79]


Epoch 2/300 | Train Loss: 6.7938 | Time: 25.45s
Saved best model to best_gpt2_model.pt



Epoch 3/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=6.34]


Epoch 3/300 | Train Loss: 6.3407 | Time: 25.45s
Saved best model to best_gpt2_model.pt



Epoch 4/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=5.98]


Epoch 4/300 | Train Loss: 5.9811 | Time: 25.45s
Saved best model to best_gpt2_model.pt



Epoch 5/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=5.71]


Epoch 5/300 | Train Loss: 5.7128 | Time: 25.47s
Saved best model to best_gpt2_model.pt



Epoch 6/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=5.51]


Epoch 6/300 | Train Loss: 5.5076 | Time: 25.46s
Saved best model to best_gpt2_model.pt



Epoch 7/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=5.35]


Epoch 7/300 | Train Loss: 5.3461 | Time: 25.48s
Saved best model to best_gpt2_model.pt



Epoch 8/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=5.21]


Epoch 8/300 | Train Loss: 5.2130 | Time: 25.49s
Saved best model to best_gpt2_model.pt



Epoch 9/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=5.1]


Epoch 9/300 | Train Loss: 5.0966 | Time: 25.48s
Saved best model to best_gpt2_model.pt



Epoch 10/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=5]


Epoch 10/300 | Train Loss: 4.9969 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 11/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=4.91]


Epoch 11/300 | Train Loss: 4.9068 | Time: 25.49s
Saved best model to best_gpt2_model.pt



Epoch 12/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=4.83]


Epoch 12/300 | Train Loss: 4.8260 | Time: 25.49s
Saved best model to best_gpt2_model.pt



Epoch 13/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=4.75]


Epoch 13/300 | Train Loss: 4.7502 | Time: 25.49s
Saved best model to best_gpt2_model.pt



Epoch 14/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=4.68]


Epoch 14/300 | Train Loss: 4.6807 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 15/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.72it/s, train_loss=4.61]


Epoch 15/300 | Train Loss: 4.6150 | Time: 25.74s
Saved best model to best_gpt2_model.pt



Epoch 16/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.72it/s, train_loss=4.55]


Epoch 16/300 | Train Loss: 4.5514 | Time: 25.72s
Saved best model to best_gpt2_model.pt



Epoch 17/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=4.49]


Epoch 17/300 | Train Loss: 4.4905 | Time: 25.48s
Saved best model to best_gpt2_model.pt



Epoch 18/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=4.43]


Epoch 18/300 | Train Loss: 4.4338 | Time: 25.49s
Saved best model to best_gpt2_model.pt



Epoch 19/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=4.38]


Epoch 19/300 | Train Loss: 4.3782 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 20/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=4.32]


Epoch 20/300 | Train Loss: 4.3238 | Time: 25.48s
Saved best model to best_gpt2_model.pt



Epoch 21/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=4.27]


Epoch 21/300 | Train Loss: 4.2716 | Time: 25.49s
Saved best model to best_gpt2_model.pt



Epoch 22/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=4.22]


Epoch 22/300 | Train Loss: 4.2193 | Time: 25.48s
Saved best model to best_gpt2_model.pt



Epoch 23/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=4.17]


Epoch 23/300 | Train Loss: 4.1687 | Time: 25.48s
Saved best model to best_gpt2_model.pt



Epoch 24/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=4.12]


Epoch 24/300 | Train Loss: 4.1201 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 25/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=4.07]


Epoch 25/300 | Train Loss: 4.0730 | Time: 25.47s
Saved best model to best_gpt2_model.pt



Epoch 26/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=4.03]


Epoch 26/300 | Train Loss: 4.0256 | Time: 25.49s
Saved best model to best_gpt2_model.pt



Epoch 27/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=3.98]


Epoch 27/300 | Train Loss: 3.9778 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 28/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=3.93]


Epoch 28/300 | Train Loss: 3.9306 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 29/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=3.89]


Epoch 29/300 | Train Loss: 3.8851 | Time: 25.49s
Saved best model to best_gpt2_model.pt



Epoch 30/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=3.84]


Epoch 30/300 | Train Loss: 3.8385 | Time: 25.44s
Saved best model to best_gpt2_model.pt



Epoch 31/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=3.79]


Epoch 31/300 | Train Loss: 3.7922 | Time: 25.49s
Saved best model to best_gpt2_model.pt



Epoch 32/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=3.75]


Epoch 32/300 | Train Loss: 3.7465 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 33/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=3.7]


Epoch 33/300 | Train Loss: 3.7020 | Time: 25.49s
Saved best model to best_gpt2_model.pt



Epoch 34/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=3.66]


Epoch 34/300 | Train Loss: 3.6583 | Time: 25.49s
Saved best model to best_gpt2_model.pt



Epoch 35/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=3.61]


Epoch 35/300 | Train Loss: 3.6148 | Time: 25.49s
Saved best model to best_gpt2_model.pt



Epoch 36/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=3.57]


Epoch 36/300 | Train Loss: 3.5734 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 37/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=3.53]


Epoch 37/300 | Train Loss: 3.5296 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 38/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=3.48]


Epoch 38/300 | Train Loss: 3.4831 | Time: 25.44s
Saved best model to best_gpt2_model.pt



Epoch 39/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=3.44]


Epoch 39/300 | Train Loss: 3.4410 | Time: 25.49s
Saved best model to best_gpt2_model.pt



Epoch 40/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=3.4]


Epoch 40/300 | Train Loss: 3.3980 | Time: 25.49s
Saved best model to best_gpt2_model.pt



Epoch 41/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=3.36]


Epoch 41/300 | Train Loss: 3.3585 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 42/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=3.32]


Epoch 42/300 | Train Loss: 3.3167 | Time: 25.49s
Saved best model to best_gpt2_model.pt



Epoch 43/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=3.27]


Epoch 43/300 | Train Loss: 3.2714 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 44/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=3.23]


Epoch 44/300 | Train Loss: 3.2333 | Time: 25.43s
Saved best model to best_gpt2_model.pt



Epoch 45/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=3.19]


Epoch 45/300 | Train Loss: 3.1919 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 46/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=3.15]


Epoch 46/300 | Train Loss: 3.1504 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 47/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=3.11]


Epoch 47/300 | Train Loss: 3.1102 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 48/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=3.07]


Epoch 48/300 | Train Loss: 3.0689 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 49/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=3.03]


Epoch 49/300 | Train Loss: 3.0299 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 50/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=2.99]


Epoch 50/300 | Train Loss: 2.9892 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 51/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=2.95]


Epoch 51/300 | Train Loss: 2.9488 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 52/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=2.91]


Epoch 52/300 | Train Loss: 2.9096 | Time: 25.49s
Saved best model to best_gpt2_model.pt



Epoch 53/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=2.87]


Epoch 53/300 | Train Loss: 2.8711 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 54/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=2.84]


Epoch 54/300 | Train Loss: 2.8352 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 55/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=2.8]


Epoch 55/300 | Train Loss: 2.7983 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 56/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=2.76]


Epoch 56/300 | Train Loss: 2.7602 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 57/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=2.72]


Epoch 57/300 | Train Loss: 2.7207 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 58/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=2.68]


Epoch 58/300 | Train Loss: 2.6815 | Time: 25.47s
Saved best model to best_gpt2_model.pt



Epoch 59/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=2.64]


Epoch 59/300 | Train Loss: 2.6440 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 60/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=2.61]


Epoch 60/300 | Train Loss: 2.6059 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 61/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=2.57]


Epoch 61/300 | Train Loss: 2.5676 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 62/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=2.53]


Epoch 62/300 | Train Loss: 2.5332 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 63/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=2.5]


Epoch 63/300 | Train Loss: 2.4953 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 64/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=2.46]


Epoch 64/300 | Train Loss: 2.4611 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 65/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=2.43]


Epoch 65/300 | Train Loss: 2.4273 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 66/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=2.39]


Epoch 66/300 | Train Loss: 2.3936 | Time: 25.49s
Saved best model to best_gpt2_model.pt



Epoch 67/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=2.36]


Epoch 67/300 | Train Loss: 2.3588 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 68/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=2.32]


Epoch 68/300 | Train Loss: 2.3248 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 69/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=2.29]


Epoch 69/300 | Train Loss: 2.2941 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 70/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=2.26]


Epoch 70/300 | Train Loss: 2.2629 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 71/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=2.23]


Epoch 71/300 | Train Loss: 2.2295 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 72/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=2.2]


Epoch 72/300 | Train Loss: 2.1985 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 73/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=2.17]


Epoch 73/300 | Train Loss: 2.1706 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 74/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=2.14]


Epoch 74/300 | Train Loss: 2.1416 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 75/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=2.11]


Epoch 75/300 | Train Loss: 2.1137 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 76/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=2.08]


Epoch 76/300 | Train Loss: 2.0827 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 77/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=2.06]


Epoch 77/300 | Train Loss: 2.0550 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 78/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=2.02]


Epoch 78/300 | Train Loss: 2.0236 | Time: 25.49s
Saved best model to best_gpt2_model.pt



Epoch 79/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=2]


Epoch 79/300 | Train Loss: 1.9975 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 80/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.97]


Epoch 80/300 | Train Loss: 1.9674 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 81/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=1.94]


Epoch 81/300 | Train Loss: 1.9390 | Time: 25.49s
Saved best model to best_gpt2_model.pt



Epoch 82/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.91]


Epoch 82/300 | Train Loss: 1.9092 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 83/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=1.88]


Epoch 83/300 | Train Loss: 1.8846 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 84/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.86]


Epoch 84/300 | Train Loss: 1.8578 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 85/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=1.83]


Epoch 85/300 | Train Loss: 1.8311 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 86/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.81]


Epoch 86/300 | Train Loss: 1.8061 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 87/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.78]


Epoch 87/300 | Train Loss: 1.7763 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 88/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.75]


Epoch 88/300 | Train Loss: 1.7502 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 89/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.72]


Epoch 89/300 | Train Loss: 1.7245 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 90/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.7]


Epoch 90/300 | Train Loss: 1.6995 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 91/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=1.67]


Epoch 91/300 | Train Loss: 1.6740 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 92/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.65]


Epoch 92/300 | Train Loss: 1.6522 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 93/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.63]


Epoch 93/300 | Train Loss: 1.6266 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 94/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.6]


Epoch 94/300 | Train Loss: 1.6023 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 95/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=1.58]


Epoch 95/300 | Train Loss: 1.5791 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 96/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.56]


Epoch 96/300 | Train Loss: 1.5552 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 97/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.53]


Epoch 97/300 | Train Loss: 1.5327 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 98/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.51]


Epoch 98/300 | Train Loss: 1.5087 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 99/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.49]


Epoch 99/300 | Train Loss: 1.4872 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 100/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.47]


Epoch 100/300 | Train Loss: 1.4659 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 101/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.44]


Epoch 101/300 | Train Loss: 1.4444 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 102/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.42]


Epoch 102/300 | Train Loss: 1.4235 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 103/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.4]


Epoch 103/300 | Train Loss: 1.4029 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 104/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.38]


Epoch 104/300 | Train Loss: 1.3810 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 105/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.36]


Epoch 105/300 | Train Loss: 1.3606 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 106/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.34]


Epoch 106/300 | Train Loss: 1.3402 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 107/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.32]


Epoch 107/300 | Train Loss: 1.3214 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 108/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.3]


Epoch 108/300 | Train Loss: 1.3035 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 109/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.28]


Epoch 109/300 | Train Loss: 1.2820 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 110/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.26]


Epoch 110/300 | Train Loss: 1.2649 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 111/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.24]


Epoch 111/300 | Train Loss: 1.2434 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 112/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.23]


Epoch 112/300 | Train Loss: 1.2257 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 113/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.21]


Epoch 113/300 | Train Loss: 1.2054 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 114/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=1.19]


Epoch 114/300 | Train Loss: 1.1863 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 115/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.17]


Epoch 115/300 | Train Loss: 1.1693 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 116/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.15]


Epoch 116/300 | Train Loss: 1.1520 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 117/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.14]


Epoch 117/300 | Train Loss: 1.1351 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 118/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.12]


Epoch 118/300 | Train Loss: 1.1181 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 119/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=1.1]


Epoch 119/300 | Train Loss: 1.1005 | Time: 25.49s
Saved best model to best_gpt2_model.pt



Epoch 120/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.09]


Epoch 120/300 | Train Loss: 1.0871 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 121/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=1.07]


Epoch 121/300 | Train Loss: 1.0703 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 122/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.06]


Epoch 122/300 | Train Loss: 1.0560 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 123/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.04]


Epoch 123/300 | Train Loss: 1.0414 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 124/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=1.02]


Epoch 124/300 | Train Loss: 1.0250 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 125/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=1.01]


Epoch 125/300 | Train Loss: 1.0097 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 126/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.995]


Epoch 126/300 | Train Loss: 0.9951 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 127/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.982]


Epoch 127/300 | Train Loss: 0.9817 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 128/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.966]


Epoch 128/300 | Train Loss: 0.9661 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 129/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.949]


Epoch 129/300 | Train Loss: 0.9491 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 130/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.938]


Epoch 130/300 | Train Loss: 0.9380 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 131/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=0.926]


Epoch 131/300 | Train Loss: 0.9259 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 132/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.911]


Epoch 132/300 | Train Loss: 0.9108 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 133/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.895]


Epoch 133/300 | Train Loss: 0.8946 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 134/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=0.882]


Epoch 134/300 | Train Loss: 0.8824 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 135/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.871]


Epoch 135/300 | Train Loss: 0.8706 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 136/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.858]


Epoch 136/300 | Train Loss: 0.8578 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 137/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.845]


Epoch 137/300 | Train Loss: 0.8454 | Time: 25.53s
Saved best model to best_gpt2_model.pt



Epoch 138/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=0.832]


Epoch 138/300 | Train Loss: 0.8316 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 139/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=0.821]


Epoch 139/300 | Train Loss: 0.8210 | Time: 25.49s
Saved best model to best_gpt2_model.pt



Epoch 140/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.806]


Epoch 140/300 | Train Loss: 0.8065 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 141/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.797]


Epoch 141/300 | Train Loss: 0.7973 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 142/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.785]


Epoch 142/300 | Train Loss: 0.7852 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 143/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=0.776]


Epoch 143/300 | Train Loss: 0.7761 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 144/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.762]


Epoch 144/300 | Train Loss: 0.7623 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 145/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.753]


Epoch 145/300 | Train Loss: 0.7528 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 146/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.742]


Epoch 146/300 | Train Loss: 0.7417 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 147/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.733]


Epoch 147/300 | Train Loss: 0.7327 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 148/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.721]


Epoch 148/300 | Train Loss: 0.7208 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 149/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.713]


Epoch 149/300 | Train Loss: 0.7131 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 150/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.703]


Epoch 150/300 | Train Loss: 0.7025 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 151/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.694]


Epoch 151/300 | Train Loss: 0.6943 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 152/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.684]


Epoch 152/300 | Train Loss: 0.6837 | Time: 25.53s
Saved best model to best_gpt2_model.pt



Epoch 153/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.674]


Epoch 153/300 | Train Loss: 0.6736 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 154/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.667]


Epoch 154/300 | Train Loss: 0.6667 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 155/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.657]


Epoch 155/300 | Train Loss: 0.6570 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 156/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.65]


Epoch 156/300 | Train Loss: 0.6504 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 157/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=0.641]


Epoch 157/300 | Train Loss: 0.6414 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 158/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.633]


Epoch 158/300 | Train Loss: 0.6335 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 159/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.625]


Epoch 159/300 | Train Loss: 0.6248 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 160/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.618]


Epoch 160/300 | Train Loss: 0.6183 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 161/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.61]


Epoch 161/300 | Train Loss: 0.6104 | Time: 25.53s
Saved best model to best_gpt2_model.pt



Epoch 162/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.602]


Epoch 162/300 | Train Loss: 0.6020 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 163/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=0.595]


Epoch 163/300 | Train Loss: 0.5955 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 164/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=0.587]


Epoch 164/300 | Train Loss: 0.5871 | Time: 25.49s
Saved best model to best_gpt2_model.pt



Epoch 165/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.582]


Epoch 165/300 | Train Loss: 0.5815 | Time: 25.53s
Saved best model to best_gpt2_model.pt



Epoch 166/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.574]


Epoch 166/300 | Train Loss: 0.5740 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 167/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.566]


Epoch 167/300 | Train Loss: 0.5664 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 168/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.563]


Epoch 168/300 | Train Loss: 0.5632 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 169/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.555]


Epoch 169/300 | Train Loss: 0.5547 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 170/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.548]


Epoch 170/300 | Train Loss: 0.5485 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 171/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.542]


Epoch 171/300 | Train Loss: 0.5417 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 172/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.537]


Epoch 172/300 | Train Loss: 0.5368 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 173/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.53]


Epoch 173/300 | Train Loss: 0.5296 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 174/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=0.523]


Epoch 174/300 | Train Loss: 0.5232 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 175/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.518]


Epoch 175/300 | Train Loss: 0.5176 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 176/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=0.513]


Epoch 176/300 | Train Loss: 0.5129 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 177/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.508]


Epoch 177/300 | Train Loss: 0.5078 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 178/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=0.503]


Epoch 178/300 | Train Loss: 0.5029 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 179/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.496]


Epoch 179/300 | Train Loss: 0.4956 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 180/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.492]


Epoch 180/300 | Train Loss: 0.4925 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 181/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.486]


Epoch 181/300 | Train Loss: 0.4860 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 182/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.481]


Epoch 182/300 | Train Loss: 0.4814 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 183/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.476]


Epoch 183/300 | Train Loss: 0.4764 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 184/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.472]


Epoch 184/300 | Train Loss: 0.4722 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 185/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=0.467]


Epoch 185/300 | Train Loss: 0.4667 | Time: 25.45s
Saved best model to best_gpt2_model.pt



Epoch 186/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.463]


Epoch 186/300 | Train Loss: 0.4629 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 187/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.457]


Epoch 187/300 | Train Loss: 0.4566 | Time: 25.53s
Saved best model to best_gpt2_model.pt



Epoch 188/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.452]


Epoch 188/300 | Train Loss: 0.4524 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 189/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.447]


Epoch 189/300 | Train Loss: 0.4469 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 190/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.443]


Epoch 190/300 | Train Loss: 0.4432 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 191/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.438]


Epoch 191/300 | Train Loss: 0.4380 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 192/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.432]


Epoch 192/300 | Train Loss: 0.4320 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 193/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.429]


Epoch 193/300 | Train Loss: 0.4293 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 194/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.424]


Epoch 194/300 | Train Loss: 0.4241 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 195/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.421]


Epoch 195/300 | Train Loss: 0.4210 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 196/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.416]


Epoch 196/300 | Train Loss: 0.4165 | Time: 25.53s
Saved best model to best_gpt2_model.pt



Epoch 197/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=0.413]


Epoch 197/300 | Train Loss: 0.4127 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 198/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.406]


Epoch 198/300 | Train Loss: 0.4062 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 199/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.404]


Epoch 199/300 | Train Loss: 0.4040 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 200/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.4]


Epoch 200/300 | Train Loss: 0.4002 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 201/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.398]


Epoch 201/300 | Train Loss: 0.3978 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 202/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.392]


Epoch 202/300 | Train Loss: 0.3924 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 203/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.39]


Epoch 203/300 | Train Loss: 0.3902 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 204/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.386]


Epoch 204/300 | Train Loss: 0.3856 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 205/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.383]


Epoch 205/300 | Train Loss: 0.3830 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 206/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.38]


Epoch 206/300 | Train Loss: 0.3805 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 207/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.376]


Epoch 207/300 | Train Loss: 0.3761 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 208/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.373]


Epoch 208/300 | Train Loss: 0.3733 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 209/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.37]


Epoch 209/300 | Train Loss: 0.3695 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 210/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.367]


Epoch 210/300 | Train Loss: 0.3673 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 211/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=0.364]


Epoch 211/300 | Train Loss: 0.3643 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 212/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=0.361]


Epoch 212/300 | Train Loss: 0.3606 | Time: 25.49s
Saved best model to best_gpt2_model.pt



Epoch 213/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.357]


Epoch 213/300 | Train Loss: 0.3566 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 214/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.354]


Epoch 214/300 | Train Loss: 0.3538 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 215/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.35]


Epoch 215/300 | Train Loss: 0.3500 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 216/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.349]


Epoch 216/300 | Train Loss: 0.3493 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 217/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.346]


Epoch 217/300 | Train Loss: 0.3463 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 218/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.344]


Epoch 218/300 | Train Loss: 0.3440 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 219/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.341]


Epoch 219/300 | Train Loss: 0.3412 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 220/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.338]


Epoch 220/300 | Train Loss: 0.3382 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 221/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.336]


Epoch 221/300 | Train Loss: 0.3359 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 222/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.333]


Epoch 222/300 | Train Loss: 0.3326 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 223/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.33]


Epoch 223/300 | Train Loss: 0.3304 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 224/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.328]


Epoch 224/300 | Train Loss: 0.3283 | Time: 25.53s
Saved best model to best_gpt2_model.pt



Epoch 225/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.326]


Epoch 225/300 | Train Loss: 0.3263 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 226/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.324]


Epoch 226/300 | Train Loss: 0.3237 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 227/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.32]


Epoch 227/300 | Train Loss: 0.3203 | Time: 25.53s
Saved best model to best_gpt2_model.pt



Epoch 228/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=0.318]


Epoch 228/300 | Train Loss: 0.3178 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 229/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.317]


Epoch 229/300 | Train Loss: 0.3174 | Time: 25.53s
Saved best model to best_gpt2_model.pt



Epoch 230/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.314]


Epoch 230/300 | Train Loss: 0.3141 | Time: 25.53s
Saved best model to best_gpt2_model.pt



Epoch 231/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.312]


Epoch 231/300 | Train Loss: 0.3120 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 232/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.309]


Epoch 232/300 | Train Loss: 0.3087 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 233/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.307]


Epoch 233/300 | Train Loss: 0.3066 | Time: 25.53s
Saved best model to best_gpt2_model.pt



Epoch 234/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.305]


Epoch 234/300 | Train Loss: 0.3051 | Time: 25.53s
Saved best model to best_gpt2_model.pt



Epoch 235/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.302]


Epoch 235/300 | Train Loss: 0.3024 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 236/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.3]


Epoch 236/300 | Train Loss: 0.3001 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 237/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.299]


Epoch 237/300 | Train Loss: 0.2988 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 238/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.297]


Epoch 238/300 | Train Loss: 0.2973 | Time: 25.53s
Saved best model to best_gpt2_model.pt



Epoch 239/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.295]


Epoch 239/300 | Train Loss: 0.2954 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 240/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.292]


Epoch 240/300 | Train Loss: 0.2916 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 241/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.289]


Epoch 241/300 | Train Loss: 0.2892 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 242/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.289]


Epoch 242/300 | Train Loss: 0.2891 | Time: 25.53s
Saved best model to best_gpt2_model.pt



Epoch 243/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.287]


Epoch 243/300 | Train Loss: 0.2866 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 244/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.285]


Epoch 244/300 | Train Loss: 0.2846 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 245/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.284]


Epoch 245/300 | Train Loss: 0.2836 | Time: 25.53s
Saved best model to best_gpt2_model.pt



Epoch 246/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.281]


Epoch 246/300 | Train Loss: 0.2810 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 247/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.279]


Epoch 247/300 | Train Loss: 0.2791 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 248/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.278]


Epoch 248/300 | Train Loss: 0.2781 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 249/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.275]


Epoch 249/300 | Train Loss: 0.2749 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 250/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.274]


Epoch 250/300 | Train Loss: 0.2744 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 251/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.272]


Epoch 251/300 | Train Loss: 0.2723 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 252/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.271]


Epoch 252/300 | Train Loss: 0.2711 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 253/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.269]


Epoch 253/300 | Train Loss: 0.2691 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 254/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.267]


Epoch 254/300 | Train Loss: 0.2669 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 255/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.266]


Epoch 255/300 | Train Loss: 0.2656 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 256/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.265]


Epoch 256/300 | Train Loss: 0.2646 | Time: 25.53s
Saved best model to best_gpt2_model.pt



Epoch 257/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.262]


Epoch 257/300 | Train Loss: 0.2618 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 258/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.261]


Epoch 258/300 | Train Loss: 0.2608 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 259/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.259]


Epoch 259/300 | Train Loss: 0.2592 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 260/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.258]


Epoch 260/300 | Train Loss: 0.2582 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 261/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.257]


Epoch 261/300 | Train Loss: 0.2570 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 262/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.254]


Epoch 262/300 | Train Loss: 0.2540 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 263/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.254]


Epoch 263/300 | Train Loss: 0.2540 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 264/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.253]


Epoch 264/300 | Train Loss: 0.2526 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 265/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.251]


Epoch 265/300 | Train Loss: 0.2511 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 266/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.75it/s, train_loss=0.25]


Epoch 266/300 | Train Loss: 0.2503 | Time: 25.50s
Saved best model to best_gpt2_model.pt



Epoch 267/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.248]


Epoch 267/300 | Train Loss: 0.2479 | Time: 25.53s
Saved best model to best_gpt2_model.pt



Epoch 268/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.246]


Epoch 268/300 | Train Loss: 0.2459 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 269/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.246]


Epoch 269/300 | Train Loss: 0.2458 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 270/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.244]


Epoch 270/300 | Train Loss: 0.2440 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 271/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.243]


Epoch 271/300 | Train Loss: 0.2425 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 272/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.242]


Epoch 272/300 | Train Loss: 0.2418 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 273/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.241]


Epoch 273/300 | Train Loss: 0.2408 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 274/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.238]


Epoch 274/300 | Train Loss: 0.2384 | Time: 25.53s
Saved best model to best_gpt2_model.pt



Epoch 275/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.238]


Epoch 275/300 | Train Loss: 0.2378 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 276/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.237]


Epoch 276/300 | Train Loss: 0.2366 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 277/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.234]


Epoch 277/300 | Train Loss: 0.2345 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 278/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.234]


Epoch 278/300 | Train Loss: 0.2344 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 279/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.233]


Epoch 279/300 | Train Loss: 0.2326 | Time: 25.53s
Saved best model to best_gpt2_model.pt



Epoch 280/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.231]


Epoch 280/300 | Train Loss: 0.2307 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 281/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.23]


Epoch 281/300 | Train Loss: 0.2300 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 282/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.229]


Epoch 282/300 | Train Loss: 0.2287 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 283/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.228]


Epoch 283/300 | Train Loss: 0.2280 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 284/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.228]


Epoch 284/300 | Train Loss: 0.2281 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 285/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.226]


Epoch 285/300 | Train Loss: 0.2256 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 286/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.223]


Epoch 286/300 | Train Loss: 0.2233 | Time: 25.53s
Saved best model to best_gpt2_model.pt



Epoch 287/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.224]


Epoch 287/300 | Train Loss: 0.2240 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 288/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.223]


Epoch 288/300 | Train Loss: 0.2229 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 289/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.222]


Epoch 289/300 | Train Loss: 0.2219 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 290/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.22]


Epoch 290/300 | Train Loss: 0.2198 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 291/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.219]


Epoch 291/300 | Train Loss: 0.2191 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 292/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.218]


Epoch 292/300 | Train Loss: 0.2178 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 293/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.217]


Epoch 293/300 | Train Loss: 0.2173 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 294/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.216]


Epoch 294/300 | Train Loss: 0.2156 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 295/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.215]


Epoch 295/300 | Train Loss: 0.2146 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 296/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.215]


Epoch 296/300 | Train Loss: 0.2147 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 297/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.214]


Epoch 297/300 | Train Loss: 0.2136 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 298/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.213]


Epoch 298/300 | Train Loss: 0.2125 | Time: 25.52s
Saved best model to best_gpt2_model.pt



Epoch 299/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.211]


Epoch 299/300 | Train Loss: 0.2111 | Time: 25.51s
Saved best model to best_gpt2_model.pt



Epoch 300/300 Training: 100%|██████████| 70/70 [00:25<00:00,  2.74it/s, train_loss=0.211]


Epoch 300/300 | Train Loss: 0.2107 | Time: 25.52s
Saved best model to best_gpt2_model.pt

Loaded best model from best_gpt2_model.pt


In [27]:
model = GPT2Model(config).to(hparams['device'])
model.load_state_dict(torch.load("best_gpt2_model_Final_v1.pt"))

<All keys matched successfully>

In [35]:
import random

# Select 10 random indices from the dataset
random_ids = random.sample(range(len(train_loader)), 5)

for idx in random_ids:
    print("*"*50, f"SAMPLE {idx}", "*"*50)
    input_ids, target_ids = train_loader.dataset[idx]  # Use dataset indexing directly
    input_text = tiktoken_tokenizer.decode(input_ids.tolist())
    target_text = tiktoken_tokenizer.decode(target_ids.tolist())

    generated_ids = model.generate(
        input_ids.unsqueeze(0).to(hparams['device']),  # Add batch dimension
        max_new_tokens=20,
    )
    generated_text = tiktoken_tokenizer.decode(generated_ids[0].cpu().tolist())

    print("Prompt: ", input_text, "\n")
    print("Target text: ", target_text, "\n")
    print("Generated text: ", generated_text, "\n")

************************************************** SAMPLE 2473 **************************************************
Prompt:   book some of the books the ministrys confiscated dads told me there was one that burned your eyes out and everyone who read sonnets of a sorcerer spoke in limericks for the rest of their lives and some old witch in bath had a book that you could never stop reading you just had to wander around with your nose 

Target text:   some of the books the ministrys confiscated dads told me there was one that burned your eyes out and everyone who read sonnets of a sorcerer spoke in limericks for the rest of their lives and some old witch in bath had a book that you could never stop reading you just had to wander around with your nose in 

Generated text:   book some of the books the ministrys confiscated dads told me there was one that burned your eyes out and everyone who read sonnets of a sorcerer spoke in limericks for the rest of their lives and some old witch in bath h

### Check model perplexity

In [41]:
def evaluate_perplexity(model, data_loader, device):
    model.eval()
    total_loss = 0.0
    total_tokens = 0

    with torch.no_grad():
        for input_ids, target_ids in data_loader:
            input_ids = input_ids.to(device)
            target_ids = target_ids.to(device)

            logits, loss = model(input_ids, labels=target_ids)
            num_tokens = target_ids.numel()

            total_loss += loss.item() * num_tokens
            total_tokens += num_tokens

    avg_loss = total_loss / total_tokens
    perplexity = math.exp(avg_loss)
    return perplexity

In [46]:
perplexity = evaluate_perplexity(model, train_loader, hparams["device"])
print(f"Perplexity on training data: {perplexity:.2f}")

Perplexity on training data: 1.06
