## Data Preprocessing

In [1]:
import numpy as np
import utils
import torch
import nltk

In [2]:
# Opening the Shakespeare.txt file
with open('Shakespeare.txt', 'r') as file:
    # Read the contents of the file
    text = file.read()

In [3]:
print(f'Length of the text : {len(text)}\n')
print(f'First 1000 characters of the text : \n{text[:1000]}')

Length of the text : 1115394

First 1000 characters of the text : 
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods 

In [4]:
# Making a dictionary for the text
nltk.download('punkt')
repetition_threshold = 1  # Set your desired repetition threshold
tokenizer = utils.TextTokenizer(repetition_threshold)
tokenizer.process_text(text)
# Example text and tokenized text
example_text = "First Citizen:\nLet us kill him, and we'll have corn at our own price.\nIs't a verdict?"
tokenized_text = tokenizer.text_to_tokens(example_text)
print("Tokenized Text:", tokenized_text)

# Convert tokenized text back to original text
original_text = tokenizer.tokens_to_text(tokenized_text)
print("Original Text:", original_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Tokenized Text: [0, 3, 4, 5, 6, 41, 42, 43, 44, 12, 45, 46, 47, 48, 49, 50, 51, 52, 16, 6, 53, 54, 55, 28, 2]
Original Text: First Citizen : 
 Let us kill him , and we'll have corn at our own price . 
 Is't a verdict ?


In [6]:
# tokenizing the entire Shakespeare text
data = torch.tensor(tokenizer.text_to_tokens(text))
print(data.shape)
print(data[:1000])

torch.Size([290403])
tensor([  0,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
         16,   6,   6,  17,   5,   6,  18,  12,  15,  16,   6,   6,   3,   4,
          5,   6,  19,  20,  21,  22,  23,  24,  25,  26,  24,  27,  28,   6,
          6,  17,   5,   6,  29,  16,  22,  16,   6,   6,   3,   4,   5,   6,
          3,  12,  30,  31,  32,  33,  34,  35,  36,  24,  37,  38,  16,   6,
          6,  17,   5,   6,  39,  40,  12,   8,  40,  16,   6,   6,   3,   4,
          5,   6,  41,  42,  43,  44,  12,  45,  46,  47,  48,  49,  50,  51,
         52,  16,   6,  53,  54,  55,  28,   6,   6,  17,   5,   6,  56,  57,
         58,  59,  60,  61,  62,  63,  64,   5,  65,  12,  65,  66,   6,   6,
         67,   4,   5,   6,  68,  69,  12,  70,  71,  16,   6,   6,   3,   4,
          5,   6,  39,  20,  72,  73,  71,  12,  37,  74,  70,  16,   6,  75,
         76,  77,  78,  79,  80,  42,   5,  81,  82,   6,  79,  83,  42,  84,
         37,  85,  12,  86,  62,  87,   6, 

In [7]:
# Splitting the validation and training data
n = int(0.9 * len(data))
training_set = data[:n]
validation_set = data[n:]

In the cell below , we are determining the `block_size` which is the size of each training data . But each example , has `block_size` examples within iy self . In the cell below , it is shown by an example

In [18]:
block_size = 8
x = training_set[:block_size]
print(f'an example of a data : {x}')
for t in range(1,block_size):
    context = x[:t]
    target = x[t]
    print(f'input : {context} , target : {target}')

an example of a data : tensor([0, 3, 4, 5, 6, 7, 8, 9])
input : tensor([0]) , target : 3
input : tensor([0, 3]) , target : 4
input : tensor([0, 3, 4]) , target : 5
input : tensor([0, 3, 4, 5]) , target : 6
input : tensor([0, 3, 4, 5, 6]) , target : 7
input : tensor([0, 3, 4, 5, 6, 7]) , target : 8
input : tensor([0, 3, 4, 5, 6, 7, 8]) , target : 9


In [40]:
batch_size = 4 
block_size = 8  # Number of maximum context length

def get_batch(dataset):
    data = training_set if dataset == 'train' else  validation_set
    ix = torch.randint(len(data) - block_size , size = (batch_size,))
    x = torch.stack([data[i : i + block_size] for i in ix])
    y = torch.stack([data[i + 1 : i + block_size + 1] for i in ix])
    return x , y


x_b , y_b = get_batch('train')
print(f'inputs =\n{x_b}')
print(f'outputs =\n{y_b}')

# An Example
print('------------------------ EXAMPLE ------------------------')

for b in range(batch_size):
    for t in range(block_size):
        context = x_b[b , : t+1]
        target = y_b[b  , t]
        print(f'input : {context.tolist()} , target : {target}')
 

inputs =
tensor([[  14,  192,   12,    6,  676,    1, 1081,  615],
        [  99,    1,   99,  689,   12,    6, 6327,   99],
        [6028,    5,    6,    1,  112, 5502,   16,    6],
        [ 450,  115,  317,   16,    6,    6, 5995,    5]])
outputs =
tensor([[ 192,   12,    6,  676,    1, 1081,  615,   12],
        [   1,   99,  689,   12,    6, 6327,   99, 2166],
        [   5,    6,    1,  112, 5502,   16,    6,    6],
        [ 115,  317,   16,    6,    6, 5995,    5,    6]])
------------------------ EXAMPLE ------------------------
input : [14] , target : 192
input : [14, 192] , target : 12
input : [14, 192, 12] , target : 6
input : [14, 192, 12, 6] , target : 676
input : [14, 192, 12, 6, 676] , target : 1
input : [14, 192, 12, 6, 676, 1] , target : 1081
input : [14, 192, 12, 6, 676, 1, 1081] , target : 615
input : [14, 192, 12, 6, 676, 1, 1081, 615] , target : 12
input : [99] , target : 1
input : [99, 1] , target : 99
input : [99, 1, 99] , target : 689
input : [99, 1, 99, 689] , 