In [78]:
# The MIT License (MIT) Copyright (c) 2020 Andrej Karpathy
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge,
# publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
# OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
# USE OR OTHER DEALINGS IN THE SOFTWARE.

In [79]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [80]:
# We will run our code on our GPU
print('Is GPU available?:', torch.cuda.is_available()) 
print('My GPU:', torch.cuda.get_device_name()) # Name of my GPU

Is GPU available?: True
My GPU: NVIDIA GeForce RTX 2070 with Max-Q Design


# Hyperparameters

In [81]:
model_name = 'gpt_v1'

batch_size = 64
block_size = 256
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2

# Preparing The Data

### Reading The Data

In [82]:
with open('../datasets/shakespeare.txt', 'r', encoding='utf-8') as f:
	text = f.read()

num_characters = len(text)
print('Length of dataset:', num_characters)

chars = sorted(list(set(text)))
vocab_size = len(chars)
print('Number of Unique Characters:', vocab_size)

Length of dataset: 1115394
Number of Unique Characters: 1115394


### Creating The Character Mappings, Encoders, and Decoders

In [83]:
stoi = { ch: i for i, ch in enumerate(chars) }
itos = { i: ch for i, ch in enumerate(chars) }

encode = lambda s: [stoi[c] for c in s] 
decode = lambda l: ''.join([itos[i] for i in l])

### Splitting The Dataset

In [84]:
encoded_data = encode(text)
data = torch.tensor(encoded_data, dtype=torch.long)

val_percentage = 0.1
n = int((1 - val_percentage) * len(data)) 
train_data = data[:n]
val_data = data[n:]

### Data Batching

In [85]:
def get_batch(split):
	data = train_data if split == 'train' else val_data
	ix = torch.randint(len(data) - block_size, (batch_size,))
	
	x = torch.stack([data[i: i + block_size] for i in ix])
	y = torch.stack([data[i + 1: i + block_size + 1] for i in ix])
	
	x, y = x.to(device), y.to(device)
	
	return x, y

In [86]:
x_batch_train, y_batch_train = get_batch('train')
x_batch_val, y_batch_val = get_batch('val')

# Self Attention In Neural Networks

Self-attention is a mechanism in neural networks, particularly in Transformer models, that allows each token (or word) in a sequence to interact with and consider the relevance of other tokens in the same sequence. This process helps the model understand relationships between words and capture context more effectively. By weighing the importance of each word relative to others, self-attention enables the model to see the "bigger picture," allowing it to grasp the overall meaning and nuances of the text.

Notice that we do not want tokens to communicate with other tokens ahead of themselves (it wouldn't make sense to get information from the future...). Instead, they only want to be able to communicate with tokens that came before themselves.

In [87]:
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)
x.shape

torch.Size([4, 8, 2])

One way to achieve self attention is to simply sum the previous tokens up for any token, which implicitly captures the data and information of them. The term used for this in the context of GPTs is "BOW" which stands for "Bag Of Words".

In [88]:
# Initilize our 'x_bow' to zeros
x_bow = torch.zeros((B, T, C))

for batch in range(B):
	for token in range(T):
		# Get all previous tokens from the start up
		# until the current token, including itself.
		x_prev = x[batch, :token + 1]
		# Calculate the mean of 'x_prev' across the
		# zeroth dimension which is 'T' since 'x_prev'
		# is of shape (T, C). Then, we store this in
		# the bag of words.
		x_bow[batch, token] = torch.mean(x_prev, 0)


### Using Matrix Multiplication

The above implementation of BOW Self Attention is very brute force and not effecient. We can use matrix multiplication and triangular matrices to make this computation much more effective using the 'torch.tril()' function and the 'F.softmax()' function.

In [89]:
ones = torch.ones(T, T)
tril = torch.tril(ones)
# The 'weights' give the percentege of how much we essentially want to add up
# when we later add up previous tokens using matrix multiplication with the 
# inputs, 'x'.
weights = torch.zeros((T,T))

# having the upper triangle matrix being -inf (and after the softmax
# function, zero) allos us to mathematically tell the model not to retrieve 
# information and build context based on future tokens (those that lie after the
# current token).
weights = weights.masked_fill(tril == 0, float('-inf'))
print('Weights before softmax:\n', weights[:5, :])

weights = F.softmax(weights, dim=-1)
print('\nWeights after softmax:\n', weights[:5, :])

print('\nWeights shape:', weights.shape)
print('x shape:', x.shape)

x_bow_2 = weights @ x 
print('x_bow_2 shape:', x_bow_2.shape)


print('Are x_bow and x_bow_2 the same?:', torch.allclose(x_bow, x_bow_2))

Weights before softmax:
 tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf]])

Weights after softmax:
 tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000]])

Weights shape: torch.Size([8, 8])
x shape: torch.Size([4, 8, 2])
x_bow_2 shape: torch.Size([4, 8, 2])
Are x_bow and x_bow_2 the same?: True


## Building A Self Attention Head

What we looked at above was a single "head" of self attention. We will now take this idea and build on top of it. 

In a self-attention, attention heads are used to allow the model to focus on different aspects of the input sequence simultaneously. Each head operates independently, creating its own set of 'key', 'query', and 'value' vectors by linearly transforming the input embeddings. 
* The 'key' represents the content of the tokens that the model will attend to, essentially acting as the information that other tokens will query. 
* The 'query' is a vector that interacts with the keys to determine how much attention one token should pay to another. 
* The 'value' carries the actual information from the tokens, which is then weighted by the attention scores and combined to produce the output for each token. 

In [90]:
B, T, C = 4, 8, 32 
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 32])

In [91]:
# Represents the dimensionality of the queries, keys, and values within 
# each attention head.
head_size = 16

# The following are three linear layers which we will use in self attention
# heads. We perform a forward pass through these layers to aquire our keys,
# queries, and values. All these layers are linear transformations that maps 
# the input of size 'C' to an output of size 'head_size'.
key_layer = nn.Linear(C, head_size, bias=False)
query_layer = nn.Linear(C, head_size, bias=False)
value_layer = nn.Linear(C, head_size, bias=False)

# Perform the forward passes through the linear layers. Notice that the 
# keys, quires, and values do not communicate with eachother at this stage. 
key_layer = key_layer(x)
query_layer = query_layer(x)
value_layer = value_layer(x)

# The following line is what "applies the self attention", allowing the 'query'
# and 'key' to communicate with eachother. The transpose operation is necessary
# to perform the matrix multiplication. This is how the dimensions work out:
# (B, T, 16) @ (B,  16, T) = (B, T, T). This is the attention score matrix, 
# where each element (i, j) indicates how much focus the i-th token in the 
# sequence should have on the j-th token. This is how attention works and 
# how we encode the significance of the relationships between tokens in 
# the embedding space into the current token.
weights =  query_layer @ key_layer.transpose(-2, -1)

tril = torch.tril(torch.ones(T, T))
weights = weights.masked_fill(tril == 0, float('-inf'))
weights = F.softmax(weights, dim=-1)

# Think of 'value' as the source of the information that gets redistributed 
# according to the attention 'weights'. While 'query' and 'key' help determine 
# where to look (which tokens to focus on), 'value' provides the what 
# (the actual content that will be passed along to the next layer, 
# informed by the attention head)
out = weights @ value_layer

print(out.shape)
print(out[0])

torch.Size([4, 8, 16])
tensor([[ 0.2484,  0.0566,  0.8336,  0.3124, -0.0440,  0.0730, -0.0134,  0.0907,
          1.0683, -0.4445, -0.1486,  0.1959,  0.1864, -0.3188,  0.1218,  0.0935],
        [ 0.2273,  0.0474,  0.8315,  0.3182, -0.0072,  0.0839,  0.0108,  0.1599,
          1.0144, -0.4256, -0.1185,  0.2179,  0.1797, -0.3102,  0.1051,  0.1473],
        [-0.1394,  0.0411,  0.5235,  0.2729,  0.2268,  0.1124,  0.0875,  0.9478,
         -0.0738, -0.1358, -0.0544,  0.2660,  0.0181, -0.2933,  0.1738,  0.5395],
        [ 0.2276,  0.3147,  0.6287, -0.0114, -0.0069,  0.0644, -0.3685,  0.2278,
         -0.0244,  0.1210, -0.3476, -0.2308,  0.0240, -0.1796,  0.0431,  0.0167],
        [ 0.1132,  0.2551,  0.6288, -0.1467, -0.1872,  0.1291, -0.4193, -0.0381,
         -0.1684,  0.1416, -0.2571, -0.0698, -0.0531, -0.2164, -0.0137,  0.0847],
        [-0.0520, -0.0197,  0.7881,  0.3137,  0.5464,  0.2514,  0.2730,  1.1590,
          0.0060,  0.0042,  0.2914,  0.4282,  0.0504, -0.1321, -0.2023,  0.9130],

In [92]:
# Represents one head of self-attention
class Head(nn.Module):
	# Initilizes the self attention head with some set 'head_size'
	def __init__(self, head_size):
		super().__init__()

		# Create the linear layers for the keys, queries, and values.
		self.key_layer = nn.Linear(n_embd, head_size, bias=False)
		self.query_layer = nn.Linear(n_embd, head_size, bias=False)
		self.value_layer = nn.Linear(n_embd, head_size, bias=False)

		# Creates a 'tril' variable as a buffer using PyTorch.
		self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

		# Initilize the dropout
		self.dropout = nn.Dropout(dropout)

	def forward(self, x):
		# Get the dimensions
		B, T, C = x.shape
		
		# Feedforward through the layers to get 'key' and 'query'
		key = self.key_layer(x)   # (B, T, C)
		query = self.query_layer(x) # (B, T, C)

		# Allow the 'key' and 'query' to communicate with eachother
		weights = query @ key.transpose(-2, -1) * C ** -0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
		
		# Set the upper triangle of the matrix to '-inf' to prevent the model
		# from learning from future tokens.
		weights = weights.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
		
		# compute attention scores and store them in 'weights'
		weights = F.softmax(weights, dim=-1) # (B, T, T)

		# Apply dropout to the attention scores
		weights = self.dropout(weights)

		# perform the weighted aggregation of the values
		value = self.value_layer(x) # (B, T, C)
		out = weights @ value # (B, T, T) @ (B, T, C) -> (B, T, C)

		return out


# Multi Head Attention

Simply put, multi head attention is simply stacking multiple self-attention blocks and concatenating the result. 

In [93]:
# Represents multiple heads of self-attention in parallel
class MultiHeadAttention(nn.Module):
	
	# Initilizes the multi head attention block with a set
	# number of heads, 'num_heads' and a set 'head_size'
	def __init__(self, num_heads, head_size):
		super().__init__()
		# Initilize 'num_heads' amounnt of self-attention heads and append them
		# to a type of list offered in PyTorch called a "ModuleList"
		self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])

		# We will add something called Residual Connections to the neural network
		# where we will need to use a projection (linear transformation) to 
		# project our output data from the neural network back to these 
		# residual connections
		self.projection = nn.Linear(n_embd, n_embd)
		# Initilize the Dropout
		self.dropout = nn.Dropout(dropout)

	def forward(self, x):
		# For each self attention head in 'self.heads', call the module
		# and apply the self attention to the input data, 'x'. Then, 
		# concatenate the results for all self attention heads along
		# the channel dimension (the last dimension).
		out = torch.cat([head(x) for head in self.heads], dim=-1)
		# Apply a linear transformation o fthe outcome of the multi head
		# attention layer. This projects the output to the residual connection
		out = self.projection(out)
		# Add Dropout to the neural network to prevent overfitting
		out = self.dropout(out)

		return out

# The FeedForward Layer

We will use a seperate class for performing forward passes to our module which we later will define. This is so that all tokens are feedforwarded independently.

In [94]:
# Represents a simple linear layer followed by an activation function
class FeedFoward(nn.Module):
	
	# Initilizes the class and creates the neural network using the
	# 'Sequential' model from PyTorch.
	def __init__(self, n_embd):
		super().__init__()

		self.neural_network = nn.Sequential(
			# Layer 1 (input layer): linear layer, and a ReLU activation
			nn.Linear(n_embd, 4 * n_embd),
			nn.ReLU(),
			# Projection layer: This layer will be use later when we introduce
			# resigual connections
			nn.Linear(4 * n_embd, n_embd),
			# Add Dropout to the neural network to prevent overfitting
			nn.Dropout(dropout),
		)

	# Forward pass through the entire network.
	def forward(self, x):
		return self.neural_network(x)

# The Block Class 

The block class helps us with scaling our neural networks since we will combine both the 'MultiHeadAttention' and the 'FeedForward' class in one single class called 'Block'. Scaling up neural networks and making them deeper comes with the problem of optimization becoming more and more difficult. To solve this issue, we introduce residual connections

## Residual Connections
### What are they?
Residual connections act as a "highway" that allows the input of a layer to bypass the layer's operations (linear layer, activation functions, etc.), and be directly added to its output. This shortcut connection helps mitigate the problem of vanishing gradients in deep networks, making it easier to optimize very deep models. 

### Why does it work?
By allowing the input to be directly added to the output, residual connections ensure that the network can preserve essential information and make it easier for the model to learn identity mappings, if necessary. This results in better gradient flow during backpropagation, leading to more stable and faster convergence during training.



In [None]:
# Represents a single block of both self attention and a feed forward
class Block(nn.Module):

	def __init__(self, n_embd, n_head):
		super().__init__()

		# Calculate the head size
		head_size = n_embd // n_head
		# Initilize the multi head attention layer
		self.self_attention = MultiHeadAttention(n_head, head_size)

		# Initilize the feedforward class
		self.feedforward = FeedFoward(n_embd)

		# Initilie Batch Normalization Layers
		self.layer_norm_1 = nn.LayerNorm(n_embd)
		self.layer_norm_2 = nn.LayerNorm(n_embd)

	# Forward the input data through the block of self attention and
	# the neural network, and add residual connections.
	def forward(self, x):
		# Apply the self attention mechanism to the input tensor, 'x'.
		# Notice that we add 'x' to itself, which is the residual connection
		# we are making; the input gets added to the output. We also apply
		# batch normalization before 'x' gets applied to the self attention layer.
		x = x + self.self_attention(self.layer_norm_1(x))
		# Apply the feedforward to the updated tensor, 'x', which now includes
		# the self attention result. Notice that we again add 'x' to itself since
		# we have residual connections. We also apply
		# batch normalization before 'x' gets applied to the feedforward layer
		x = x + self.feedforward(self.layer_norm_2(x))

		return x

# Building The Language Model

### Important Note About Variable Name Convention

We usually use the lables 'B', 'T', 'C', to label the dimensions of our 3D input tensors that we pass around in the neural network and I think it is important to understand what these labels actually are intuitively. 

* 'B' stands for "Batch" and represents the first dimension of our tensors and the index of the individual batches.
* 'T' stands for "Token" and represents the second dimension where each elements along it corresponds to a token in the input sequence.
* 'C' stands for "Channel" and represents the third dimension, which usually corresponds to the hidden size or embedding dimension, indicating the vector length of each token’s representation in a vector space.

It's super important to understand this since it is crucial that we are correct in performing numerical operations across the correct dimensions!

In [99]:
# A class representing the GPT Language Module
class GPTLanguageModel(nn.Module):

	# Initialize the embedding table and the properties of the parent
	# class, 'nn.Module' (included in PyTorch for building models).
	def __init__(self):
		super().__init__()
		# initilize the embedding table which is a matrix of 
		# random numbers which will hold information about tokens in
		# its multidimensional vector space.
		self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
		# It is common to not only encode the indentity of the tokens but
		# also the position of them in this multidimensional embedding matrix.
		# To do this, we keep track of their positions.
		self.position_embedding_table = nn.Embedding(block_size, n_embd)
		# Initilize multiple blocks which include both the multihead
		# self attention and the feedforwarding.
		self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
		self.layer_norm_final = nn.LayerNorm(n_embd)
		self.model_head = nn.Linear(n_embd, vocab_size)
		
	# Performs a forward pass through the GPT Language Model, given a
	# tensor of indicies. 
	def forward(self, idx, targets=None):
		B, T = idx.shape
		# Look up the given index in the embedding table, this will
		# return a 3D tensor of shape (B, T, C), where 'B' is the batch
		# size, 'T' is the sequence length, and where 'C' is the channel
		# length. 
		token_embeddings = self.token_embedding_table(idx) 
		token_numbers = torch.arange(T, device=device) 
		# Get the position embeddings of the 'token_numbers' that go from
		# 0 to T - 1, using the 'self.position_embedding_table'.
		position_embeddings = self.position_embedding_table(token_numbers)
		# Add the identity-based, 'token_embeddings', to the position-based,
		# 'position_embeddings', to create an input matrix, 'x', that encodes
		# information about the token's identities and positions in the 
		# embedding vector space. Doing this complexifies the model and
		# ultimately allows it to learn more relationships in the data,
		# leading to a better performance and generalization.
		x = token_embeddings + position_embeddings
		# Feed forward the data through the defined 'self.blocks' to 
		# apply the multi head attention layer and feed the data
		# through the neural network.
		x = self.blocks(x)
		x = self.layer_norm_final(x)
		# Pass the input data through the linear layer to output 'logits'.
		logits = self.model_head(x)

		if targets is None:
			loss = None
		else:
			# We know want to view the 'logits' not as a 3D tensor but
			# a 2D tensor, which we do by changing the first dimension
			# to 'B * T', while keeping the size of the channel length.
			# Essentially, we are stretching the array to a 2D array so
			# that we later can perform numerical operations with it. #
			# By default, we usually epect our data to be 2D.
			B, T, C = logits.shape
			logits = logits.view(B * T, C)

			# We now need to do the same to the 'targets' since they currently
			# are of shape (B, T), a 2D array which we want to turn into 1D.
			targets = targets.view(B * T)

			# Calculate the loss 
			loss = F.cross_entropy(logits, targets)

		return logits, loss

	# Generates a set amount of new tokens, 'max_new_tokens' (similar to words), 
	# given the most recent probabilities or 'logits'.
	def generate(self, idx, max_new_tokens):
		for _ in range(max_new_tokens):			
			# Crop the 'idx' to the last 'block_size' tokens so that the indexing
			# doesn't run out of bounds. Since our positional embeddings has 
			# an embedding dimension of 'block_size', we can't index past this.
			idx_cropped = idx[:, -block_size:]

			# get the predictions
			logits, loss = self(idx_cropped)

			# Index the logits to get only the recent logits
			logits = logits[:, -1, :] # becomes (B, C)

			# apply softmax to get probabilities
			probs = F.softmax(logits, dim=-1) # (B, C)

			# sample from the distribution
			idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)

			# append sampled index to the running sequence 
			idx = torch.cat((idx, idx_next), dim=1) # (B, T + 1)
			
		return  idx

### Evaluation Before Training

In [100]:
# Initilize the model and set it so that the model will be traind on ur GPU
model = GPTLanguageModel()
model_gpu = model.to(device) 

token_embeddings, loss = model_gpu(x_batch_train, y_batch_train)

idx = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_tokens = model_gpu.generate(idx=idx, max_new_tokens=100)[0].tolist()
decoded_tokens = decode(generated_tokens)

print('Loss before training:', loss.item())
print('Output before training:', decoded_tokens)

Loss before training: 4.360526084899902
Output before training: 
m-RZ-NJzt3DGPKKhyUrWbQiRTtd  M.g-IcYMR'
 iMLt'fDAwitsXOO;l!ODmLI??h?vdFLCEGy Uhac33k-;FtYRdypc:j3Jig


### Training The Language model

In [101]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for i in range(max_iters): 

	# sample a batch of data
	xb, yb = get_batch('train')

	# evaluate the loss
	token_embeddings, loss = model_gpu(xb, yb)

	# Reset the gradients
	optimizer.zero_grad(set_to_none=True)

	# Backpropagate
	loss.backward()

	# AdamW Optimization
	optimizer.step()

	# track stats
	if i % 250 == 0: 
		print(f'{i:7d}/{max_iters:7d}: {loss.item():.4f}')

      0/   5000: 4.3499
    250/   5000: 2.3959
    500/   5000: 2.1050
    750/   5000: 1.8632
   1000/   5000: 1.6848
   1250/   5000: 1.6127
   1500/   5000: 1.5589
   1750/   5000: 1.4631
   2000/   5000: 1.4147
   2250/   5000: 1.4049
   2500/   5000: 1.3645
   2750/   5000: 1.3120
   3000/   5000: 1.2882
   3250/   5000: 1.3301
   3500/   5000: 1.2251
   3750/   5000: 1.2289
   4000/   5000: 1.2528
   4250/   5000: 1.2302
   4500/   5000: 1.1938
   4750/   5000: 1.2062


### Model Saving

In [102]:
from pathlib import Path

SAVES_BASE_PATH = Path("gpt_saved_models")
SAVES_BASE_PATH.mkdir(parents=True, exist_ok=True)

MODEL_FILE_NAME = f"{model_name}.pth"
MODEL_SAVE_PATH = SAVES_BASE_PATH / MODEL_FILE_NAME

SAVE_OBJECT = model.state_dict()

torch.save(obj=SAVE_OBJECT, f=MODEL_SAVE_PATH) 
print(f"Saved model to: {MODEL_SAVE_PATH}")

Saved model to: gpt_saved_models\gpt_v1.pth


# Model Evaluation

In [103]:
max_tokens = 500

# This function both makes the forward pass through the model and
# prints the loss and generated text output, given some input tensors
# 'x', and 'y'
def evaluate(x, y):
	# Forward pass through the model: Get the token_embeddings, and
	# the loss
	token_embeddings, loss = model(x, y)

	# Get the index??
	idx = torch.zeros((1, 1), dtype=torch.long, device=device)
	# Use the model to generate new tokesn up until 'max_tokens'
	generated_tokens = model_gpu.generate(idx=idx, max_new_tokens=max_tokens)[0].tolist()
	# Decode tokens from numerical data to categorical, text data.
	decoded_tokens = decode(generated_tokens)

	# Print the 'loss' and the 'decoded_tokens'
	print('Loss after training:', loss.item())
	print('Output after training:', decoded_tokens)

### Evaluation After Training For Train Data

In [104]:
print('Model Evaluation On Training Dataset:\n\n')
evaluate(x_batch_train, y_batch_train)

Model Evaluation On Training Dataset:


Loss after training: 1.1423503160476685
Output after training: 
O me, More sides,--or for the abegainst our open,
Ay, may justice, win wait,--it is a little plent remore,
Or your honour's est secret in: pray you, but a
power visating a doit or poor execute, to
work so true Haptis.

MENENIUS:
Worthy were! I know me, by thine own tongue!

COMINIUS:
Even abide men an obsedause colour.
I do your ladys.

A PETRUS:
My soul heart sir, I will shall tell so; then it did be
With wastestice as 'tis not, that was worb a face;
Take thou fly talk of my fault, us else ther


### Evaluation After Training For Val Data

In [105]:
print('Model Evaluation On Validation Dataset:\n\n')
evaluate(x_batch_val, y_batch_val)

Model Evaluation On Validation Dataset:


Loss after training: 1.4951786994934082
Output after training: 

After, gindening you good,
if I shall show it be, sir, and so re truth,
'Tis it was make landly open before him.'

GLOUCESTER:
I have, my lady's your grace's ere then past.

HASTINGS:
My lord, where is your lord? and never good
In your honour tears wear of love chancely: for myself,
Go, my lord and look, now you our interpent
Of man's ignorance! who's this title Musi was signiobly!
You best ror an cold dove and cut for then?
Before, thy life, sir; then comes too his quare more.
For past to spea
