In [22]:
GPT_CONFIG_124M = {
"vocab_size": 50257, # Vocabulary size
"context_length": 256, # Context length
"emb_dim": 768, # Embedding dimension
"n_heads": 12, # Number of attention heads
"n_layers": 8, # Number of layers
"drop_rate": 0.6, # Dropout rate
"qkv_bias": False # Query-Key-Value bias
}

In [23]:
# ch 1 & 2
import urllib.request
import torch
import tiktoken  
from torch.utils.data import Dataset, DataLoader

# Download the text file
#url = ("https://raw.githubusercontent.com/rasbt/" "LLMs-from-scratch/main/ch02/01_main-chapter-code/" "the-verdict.txt")
#file_path = "the-verdict.txt"
#urllib.request.urlretrieve(url, file_path)

# Load and tokenize the text
with open(r"/kaggle/input/acrsvda/train.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

tokenizer = tiktoken.get_encoding("cl100k_base")  
enc_text = tokenizer.encode(raw_text)

# Define a custom dataset class
class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []
        token_ids = tokenizer.encode(txt)
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

# Function to create a DataLoader
def create_dataloader_v1(txt, batch_size, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)
    return dataloader 


In [24]:

# Create DataLoader
dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=4, stride=2, shuffle=False)
data_iter = iter(dataloader)
first_batch = next(data_iter)
second_batch = next(data_iter)
print("First batch:", first_batch)
print("Second batch:", second_batch)

# --- Adding Token and Positional Embeddings ---

# Define token embedding layer
vocab_size = 50257  # Typical size for GPT models
embedding_dim = 256  # Example embedding size (GPT-3 uses 12,288)
token_embedding_layer = torch.nn.Embedding(vocab_size, embedding_dim)

# Convert token IDs into token embeddings
token_embeddings = token_embedding_layer(first_batch[0])  # First batch of inputs
print("Token Embeddings Shape:", token_embeddings.shape)

# Define positional embedding layer
max_length = 4  # Same as the max sequence length
pos_embedding_layer = torch.nn.Embedding(max_length, embedding_dim)

# Generate position embeddings
positions = torch.arange(max_length).unsqueeze(0)  # Create position indices
pos_embeddings = pos_embedding_layer(positions)
print("Positional Embeddings Shape:", pos_embeddings.shape)

# Combine token and positional embeddings
input_embeddings = token_embeddings + pos_embeddings
print("Final Input Embeddings Shape:", input_embeddings.shape)

First batch: [tensor([[3198, 1110,   11,  257]]), tensor([[1110,   11,  257, 1310]])]
Second batch: [tensor([[  11,  257, 1310, 2576]]), tensor([[ 257, 1310, 2576, 3706]])]
Token Embeddings Shape: torch.Size([1, 4, 256])
Positional Embeddings Shape: torch.Size([1, 4, 256])
Final Input Embeddings Shape: torch.Size([1, 4, 256])


In [25]:
# ch 3
import torch.nn as nn # type: ignore
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out,context_length , dropout, num_heads, qkv_bias = False):
        super().__init__()
        assert (d_out % num_heads == 0), \
            "d_out must be divisible by num_heads"
        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads 
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer('mask' , torch.triu(torch.ones(context_length, context_length), diagonal=1))
    def forward(self , x):
        b, num_tokens,d_in = x.shape
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)
        keys = keys.view(b, num_tokens,self.num_heads,self.head_dim)
        values = values.view(b, num_tokens,self.num_heads,self.head_dim)
        queries = queries.view(b, num_tokens,self.num_heads,self.head_dim)
        keys = keys.transpose(1,2)
        queries = queries.transpose(1,2)
        values = values.transpose(1,2)
        attn_scores = queries @ keys.transpose(-2,-1)
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        attn_scores.masked_fill_(mask_bool, -torch.inf)
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)
        context_vec = (attn_weights @ values).transpose(1, 2)
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec)  # Correct method call
        return context_vec


In [26]:
# ch 4

class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))
    def forward(self,x):
        mean = x.mean(dim = -1, keepdim = True)
        var = x.var(dim = -1, keepdim = True)
        norm_x=(x-mean)/ torch.sqrt(var+self.eps)
        return norm_x*self.scale+self.shift
class GELU(nn.Module):
    def forward(self,x):
        return 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor(2.0 / torch.pi)) * (x + 0.044715 * torch.pow(x, 3))))

class FeedForward(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),
            GELU(),
            nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),
        )
    def forward(self,x):
        return self.layers(x)
class ShortcutConnection(nn.Module):
    def __init__(self,layer_size,use_shortcut):
        super().__init__()
        self.use_shortcut = use_shortcut
        self.layers = nn.ModuleList([
            nn.Sequential(nn.Linear(layer_size[0] , layer_size[1]),
                         GELU() ),
             nn.Sequential(nn.Linear(layer_size[0] , layer_size[1]),
                         GELU() ),
             nn.Sequential(nn.Linear(layer_size[0] , layer_size[1]),
                         GELU() ),
             nn.Sequential(nn.Linear(layer_size[0] , layer_size[1]),
                         GELU() ),
        ])
    def forward(self,x):
            for layer in self.layers:
                layer_output = layer(x)
                if self.use_shortcut and x.shape == layer_output.shape:
                    x = x + layer_output
                else:
                    x = layer_output
            return x

class TransformerBlock(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(
             d_in=cfg["emb_dim"],
             d_out=cfg["emb_dim"],
             context_length=cfg["context_length"],
             num_heads=cfg["n_heads"],
             dropout=cfg["drop_rate"],
             qkv_bias=cfg["qkv_bias"] 
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])
    def forward(self,x):
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)
        x = self.drop_shortcut(x)
        x = x + shortcut
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut
        return x
class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])
        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        )
        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["vocab_size"], bias=False
        )
    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embds = self.tok_emb(in_idx)
        pos_embds = self.pos_emb(
            torch.arange(seq_len, device=in_idx.device)
        )
        x= tok_embds + pos_embds
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits= self.out_head(x)
        return logits

In [27]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    for _ in range(max_new_tokens):
        # Adjust context size to match the available tokens
        start_idx = max(0, idx.size(1) - context_size)
        idx_cond = idx[:, start_idx:]
        
        with torch.no_grad():
            logits = model(idx_cond)
        
        logits = logits[:, -1, :]  # Get logits for the last token in context
        probas = torch.softmax(logits, dim=-1)  # Convert logits to probabilities
        idx_next = torch.argmax(probas, dim=-1, keepdim=True)  # Get the next token
        
        idx = torch.cat((idx, idx_next), dim=1)  # Append the next token to the sequence
    return idx

In [28]:
model = GPTModel(GPT_CONFIG_124M)

In [29]:
import tiktoken
import torch

def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0)
    return tokenizer.decode(flat.tolist())

# Initialize tokenizer
tokenizer = tiktoken.get_encoding("gpt2")

# Generate text
start_context = "Every effort moves you"
token_ids = generate_text_simple(
    model=model,
    idx=text_to_token_ids(start_context, tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M["context_length"]
)

# Print generated text
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you Grade318 Aholid philosophammu foreskinallery generously Brook


In [30]:
file_path = r"/kaggle/input/acrsvda/train.txt"
with open(file_path, "r") as file:
    text_data = file.read()
    

In [31]:
train_ratio= 0.90
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data  = text_data[split_idx:]


In [32]:
total_characters = len(text_data)
total_tokens = len(tokenizer.encode(text_data))
print("Characters:", total_characters)
print("Tokens:", total_tokens)

Characters: 1356709
Tokens: 336837


In [33]:
train_loader = create_dataloader_v1(
    train_data,
    batch_size=16,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=(GPT_CONFIG_124M["context_length"] *3 // 4),  # Fixed the division syntax
    drop_last=True,
    shuffle=True,
    num_workers=0
)

val_loader = create_dataloader_v1(
    val_data,
    batch_size=16,
    max_length=GPT_CONFIG_124M["context_length"],
    stride=(GPT_CONFIG_124M["context_length"] *3 // 4),  # Fixed the division syntax
    drop_last=False,
    shuffle=False,
    num_workers=0
)

In [34]:
print ("Train Loader ")
for x,y in train_loader:
    print(x.shape, y.shape)

print("\nValidation loader:")
for x, y in val_loader:
    print(x.shape, y.shape) 

Train Loader 
torch.Size([16, 256]) torch.Size([16, 256])
torch.Size([16, 256]) torch.Size([16, 256])
torch.Size([16, 256]) torch.Size([16, 256])
torch.Size([16, 256]) torch.Size([16, 256])
torch.Size([16, 256]) torch.Size([16, 256])
torch.Size([16, 256]) torch.Size([16, 256])
torch.Size([16, 256]) torch.Size([16, 256])
torch.Size([16, 256]) torch.Size([16, 256])
torch.Size([16, 256]) torch.Size([16, 256])
torch.Size([16, 256]) torch.Size([16, 256])
torch.Size([16, 256]) torch.Size([16, 256])
torch.Size([16, 256]) torch.Size([16, 256])
torch.Size([16, 256]) torch.Size([16, 256])
torch.Size([16, 256]) torch.Size([16, 256])
torch.Size([16, 256]) torch.Size([16, 256])
torch.Size([16, 256]) torch.Size([16, 256])
torch.Size([16, 256]) torch.Size([16, 256])
torch.Size([16, 256]) torch.Size([16, 256])
torch.Size([16, 256]) torch.Size([16, 256])
torch.Size([16, 256]) torch.Size([16, 256])
torch.Size([16, 256]) torch.Size([16, 256])
torch.Size([16, 256]) torch.Size([16, 256])
torch.Size([16, 25

In [35]:
def analyze_dimensions(train_loader, val_loader, tokenizer, text_data):
    # Analyze text data
    total_chars = len(text_data)
    total_tokens = len(tokenizer.encode(text_data))
    
    # Analyze training loader
    train_batches = len(train_loader)
    train_sequences = train_batches * 2  # since batch_size=2
    train_tokens = train_sequences * 256  # since each sequence has 256 tokens
    
    # Analyze validation loader
    val_batches = len(val_loader)
    val_sequences = val_batches * 2
    val_tokens = val_sequences * 256
    
    print("=== Overall Data Dimensions ===")
    print(f"Total characters in text: {total_chars:,}")
    print(f"Total tokens in text: {total_tokens:,}")
    print("\n=== Training Data Dimensions ===")
    print(f"Number of batches: {train_batches}")
    print(f"Number of sequences: {train_sequences}")
    print(f"Tokens per sequence: 256")
    print(f"Total tokens processed per epoch: {train_tokens:,}")
    print("\n=== Single Training Batch Structure ===")
    print("Shape: [2, 256]")
    print("  - 2: number of sequences per batch")
    print("  - 256: tokens per sequence")
    print(f"Total tokens per batch: {2 * 256}")
    print("\n=== Validation Data Dimensions ===")
    print(f"Number of batches: {val_batches}")
    print(f"Number of sequences: {val_sequences}")
    print(f"Tokens per sequence: 256")
    print(f"Total tokens: {val_tokens:,}")

# Use the function
analyze_dimensions(train_loader, val_loader, tokenizer, text_data)

=== Overall Data Dimensions ===
Total characters in text: 1,356,709
Total tokens in text: 336,837

=== Training Data Dimensions ===
Number of batches: 98
Number of sequences: 196
Tokens per sequence: 256
Total tokens processed per epoch: 50,176

=== Single Training Batch Structure ===
Shape: [2, 256]
  - 2: number of sequences per batch
  - 256: tokens per sequence
Total tokens per batch: 512

=== Validation Data Dimensions ===
Number of batches: 11
Number of sequences: 22
Tokens per sequence: 256
Total tokens: 5,632


In [36]:
def calc_loss_batch(input_batch , target_batch, model, device, num_batches= None):
    input_batch = input_batch.to(device)
    target_batch= target_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0, 1), target_batch.flatten())
    return loss
        

In [37]:
def calc_loss_loader(data_loader, model, device, num_batches=None):
   
    # Return NaN if the data loader is empty
    if len(data_loader) == 0:
        return float("nan")

    # Use all batches if num_batches is not specified, otherwise use the minimum
    num_batches = len(data_loader) if num_batches is None else min(num_batches, len(data_loader))

    total_loss = 0.0

    # Iterate through the data loader and compute loss
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i >= num_batches:
            break

        # Calculate loss for the current batch
        loss = calc_loss_batch(input_batch, target_batch, model, device)
        total_loss += loss.item()

    # Return the average loss
    return total_loss / num_batches


In [38]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model. to(device)
with torch.no_grad ():
    train_loss = calc_loss_loader(train_loader, model, device)
    val_loss = calc_loss_loader(val_loader, model, device)
print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Training loss: 11.002137261994031
Validation loss: 10.999531745910645


In [39]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    
    model.train()
    
    return train_loss, val_loss


In [40]:


def generate_and_print_sample(model, tokenizer, device, start_context):
    
    model.eval()
    context_size = model.pos_emb.weight.shape[0]
    encoded = text_to_token_ids(start_context, tokenizer).to(device)
    
    with torch.no_grad():
        token_ids = generate_text_simple(
            model=model, idx=encoded,
            max_new_tokens=50, context_size=context_size
        )
        decoded_text = token_ids_to_text(token_ids, tokenizer)
        print(decoded_text.replace("\n", " "))
    
    model.train()


In [41]:
import math
import torch
from torch.cuda.amp import autocast, GradScaler
import os
def train_model_with_checkpoint_modern_fp16(model, train_loader, val_loader, optimizer, device, 
                n_epochs, eval_freq, eval_iter, start_context, tokenizer,
                warmup_steps, initial_lr=3e-05, min_lr=1e-6, checkpoint_path=None):
    """
    Training loop with mixed precision (FP16) and three key optimizations:
    1. Learning rate warmup
    2. Cosine decay schedule
    3. Gradient clipping
    """
    model.to(device)
    
    # Initialize gradient scaler for FP16
    scaler = GradScaler()
    
    # Load checkpoint if provided
    if checkpoint_path and os.path.exists(checkpoint_path):
        checkpoint = torch.load(checkpoint_path, map_location=device)
        model.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        print("Loaded checkpoint successfully")
    else:
        print("Starting fresh training")
        
    # Initialize tracking lists
    train_losses, val_losses, track_tokens_seen, track_lrs = [], [], [], []
    tokens_seen, global_step = 0, -1
    
    # Get peak learning rate from optimizer
    peak_lr = optimizer.param_groups[0]["lr"]
    
    # Calculate total steps for learning rate scheduling
    total_training_steps = len(train_loader) * n_epochs
    
    # OPTIMIZATION 1: Learning Rate Warmup
    # Calculate the learning rate increment for warmup phase
    lr_increment = (peak_lr - initial_lr) / warmup_steps

    for epoch in range(n_epochs):
        model.train()
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            global_step += 1

            # Learning rate scheduling logic
            if global_step < warmup_steps:
                # OPTIMIZATION 1: Learning Rate Warmup Implementation
                lr = initial_lr + global_step * lr_increment
            else:
                # OPTIMIZATION 2: Cosine Decay Implementation
                progress = (global_step - warmup_steps) / (total_training_steps - warmup_steps)
                lr = min_lr + (peak_lr - min_lr) * 0.5 * (1 + math.cos(math.pi * progress))

            # Update learning rate for all parameter groups
            for param_group in optimizer.param_groups:
                param_group["lr"] = lr
            track_lrs.append(lr)

            # Forward and backward pass with mixed precision
            with autocast():
                loss = calc_loss_batch(input_batch, target_batch, model, device)
            
            # Scale gradients and call backward
            scaler.scale(loss).backward()

            # OPTIMIZATION 3: Gradient Clipping
            # Only apply after warmup phase
            if global_step > warmup_steps:
                scaler.unscale_(optimizer)  # Unscale before clipping
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            # Optimizer step with scaling
            scaler.step(optimizer)
            scaler.update()
            
            # Track tokens processed
            tokens_seen += input_batch.numel()

            # Evaluation loop
            if global_step % eval_freq == 0:
                # Use fp32 for evaluation
                with torch.cuda.amp.autocast(enabled=False):
                    train_loss, val_loss = evaluate_model(
                        model, train_loader, val_loader,
                        device, eval_iter
                    )
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                
                # Print progress
                print(f"Ep {epoch+1} (Iter {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, "
                      f"Val loss {val_loss:.3f}")
                
                generate_and_print_sample(
                    model, tokenizer, device, start_context
                )

    return train_losses, val_losses, track_tokens_seen, track_lrs

In [42]:
import math
import os
import torch
from torch.cuda.amp import GradScaler, autocast
# Example usage
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M).to(device)  # Move model to device
peak_lr = 1e-4
optimizer = torch.optim.AdamW(model.parameters(),lr=peak_lr,
    betas=(0.9, 0.95),  # Modified from default
    weight_decay=0.2,   # Increased from 0.01
    eps=1e-8
)
warmup_steps = 2000
# Training parameters
num_epochs = 100
train_losses, val_losses, tokens_seen, lrs = train_model_with_checkpoint_modern_fp16(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    device=device,
    n_epochs=num_epochs,
    eval_freq=100,
    eval_iter=50,
    start_context="Once upon a time,",
    tokenizer=tokenizer,
    warmup_steps=warmup_steps,
    checkpoint_path="model_and_optimizer.pth"
)


  scaler = GradScaler()
  with autocast():


Starting fresh training


  with torch.cuda.amp.autocast(enabled=False):


Ep 1 (Iter 000000): Train loss 10.833, Val loss 10.835
Once upon a time, Requ allow 1965","=~ Rhode therefore Epic thirteenthought Haskell instinctively Surgeryanc carsourt nicely turning honorable sentiments hydrogen TPPOh orally behindDevelop del Woods shifted hepatitis condesc electronic complainantLens verbs Sevutter Mont Ironically wikidden� votersemouth Partners Fred depicts Cait statureruit
Ep 2 (Iter 000100): Train loss 6.190, Val loss 6.182
Once upon a time, the.                                                
Ep 3 (Iter 000200): Train loss 5.042, Val loss 5.029
Once upon a time, "     The little girl was a time, "I, "I a time, "I, "I. He was a time, "          The little girl was so happy.
Ep 4 (Iter 000300): Train loss 4.588, Val loss 4.659
Once upon a time, "I'm, "I, "I't want to play with a big, "I day, "I, "I't want to play with her mommy.              "
Ep 5 (Iter 000400): Train loss 4.297, Val loss 4.445
Once upon a time, there was a time, there was a little girl named 

In [49]:
torch.save({
    "model_state_dict_F": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
}, "model_and_optimizer_F.pth")

In [48]:
# First, let's make sure we have all the prerequisites
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = tiktoken.get_encoding("gpt2")

# Some example starting contexts
start_contexts = [
    "Once upon a time "
]

# Generate text for each starting context
print("Generating text samples:")
print("-" * 50)

for context in start_contexts:
    print(f"\nStarting context: '{context}'")
    generate_and_print_sample(model, tokenizer, device, context)

Generating text samples:
--------------------------------------------------

Starting context: 'Once upon a time '
Once upon a time   One day, there was a little girl named Lily. She was playing in the park was curious about it and wanted to share her mom. Lily got very upset and asked her mom if she could go to go to the park. Her mom
