In [2]:
# Shell out to print the version of the `python` executable found on PATH
!python --version

Python 3.11.4


### Building GPT

In [3]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-08-26 21:17:08--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8003::154, 2606:50c0:8000::154, 2606:50c0:8001::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8003::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1,1M) [text/plain]
Saving to: ‘input.txt’


2023-08-26 21:17:09 (7,62 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [4]:
# Read the whole corpus into memory as a single string.
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [5]:
# Size of the corpus, counted in characters of the string read from input.txt
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [7]:
# Let's look at the first 1000 characters to get a feel for the raw text
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



['\n',
 ' ',
 '!',
 '$',
 '&',
 "'",
 ',',
 '-',
 '.',
 '3',
 ':',
 ';',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [12]:
# The vocabulary is every distinct character that occurs in the text, in sorted
# order; these are the only characters the model can ever see or emit.
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars), vocab_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [14]:
# Character-level tokenizer: map every vocabulary character to an integer id and back.
stoi = {ch: i for i, ch in enumerate(chars)}  # lookup table: character -> integer
itos = {i: ch for i, ch in enumerate(chars)}  # lookup table: integer -> character


def encode(s):
    """Encoder: take a string, return the list of integer ids of its characters.

    Raises KeyError if `s` contains a character that is not a key of `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integer ids, return the string they spell.

    Raises KeyError if an id is not a key of `itos`.
    """
    return ''.join(itos[i] for i in l)


# In this experiment we use a character-level tokenizer. The vocabulary is tiny
# (65 symbols), so the encoded integer sequences are long: one id per character.

print(encode('hi there'))
print(decode(encode('hi there')))

[46, 47, 1, 58, 46, 43, 56, 43]
hi there


#### Let's now encode the entire text dataset and store it in a torch.Tensor

In [15]:
import torch

# Tokenize the whole corpus once and keep it as a single tensor of int64 ids
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
# The first 1000 characters we looked at earlier, as they will look to the GPT
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

  data = torch.tensor(encode(text), dtype=torch.long)


### Split up the data into train and validation sets

In [16]:
# Hold out the tail of the corpus for validation: the first 90% of the tokens are
# the training data and the rest is the validation data.
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

# The validation split helps us understand to what extent the model is overfitting.
# We do not want exact memorization of Shakespeare; we want a network that creates
# Shakespeare-like text.
# (These notes are real comments rather than a trailing string literal: a string
# left as the last expression of a cell is echoed back as the cell's output.)

### Time dimension of the tensors that are going to be fed into the Transformer

In [17]:
# Training a transformer on the entire dataset at once is computationally expensive,
# so we train the model on chunks (random samples of the dataset) that have some
# maximum length. That length is also known as the context length or block size;
# here we call it the block size.
block_size = 8

# block_size + 1 consecutive tokens: every prefix of the first block_size tokens is
# a training input, and the token right after that prefix is its target.
train_data[:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

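##### The above output means:
                - In the context of [18], 47 likely comes next
                - In the context of [18, 47], 56 likely comes next
                - In the context of [18, 47, 56], 57 likely comes next
                - and likewise for every longer context, up to the full 8 tokens, after which 58 comes next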

In [18]:
# x is the input to the transformer: just the first block_size characters.
# y is the same window offset by one, because y[t] is the target for position t,
# i.e. the token that follows the context x[:t+1].
x = train_data[:block_size]
y = train_data[1:block_size + 1]

for t in range(block_size):
    context, target = x[:t + 1], y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


#### Observations:

    - Above are the 8 examples hidden in the chunk of nine characters that we sampled from the training set:
      chunk - tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])
    - We train on all of these examples, with contexts ranging from one token all the way up to block size.
    - We do this not only for computational efficiency, but also because
    - we want the Transformer network to be used to seeing contexts of every length, from as little as one token all the way up to block size,
    - and we'd like the Transformer to be used to seeing everything in between.
        - That is going to be useful later during inference,
        - because while we are sampling we can start the generation with as little as one character of context,
        - and the Transformer knows how to predict the next character from a context of just one,
        - and then from every context length up to block size. After block size we have to start truncating, because the Transformer
          will never receive more than block size inputs when it is predicting the next character.

#### Batch Dimension

In [22]:
# Chunks are pulled from random locations in the dataset; seeding torch's random
# number generator makes those locations, and so the output, deterministic.
torch.manual_seed(1337)

# batch_size: how many independent sequences are processed in parallel in every
#             forward and backward pass.
# block_size: the maximum context length for predictions.
batch_size, block_size = 4, 8

def get_batch(split):
    """Generate a small random batch of inputs x and targets y.

    `split` selects the source: 'train' reads from train_data, any other value
    reads from val_data. Each returned tensor stacks batch_size rows of
    block_size tokens; y is x offset by one position, so y[b, t] is the token
    that follows x[b, :t+1] in the source.
    """
    if split == 'train':
        source = train_data
    else:
        source = val_data

    # Draw batch_size random start offsets, each below len(source) - block_size,
    # so the target window (which ends one token later) still fits in source.
    # e.g. with batch_size = 4, ix holds 4 randomly generated offsets.
    ix = torch.randint(len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for i in ix:
        inputs.append(source[i:i + block_size])           # block_size tokens starting at i
        targets.append(source[i + 1:i + block_size + 1])  # the same window offset by one
    return torch.stack(inputs), torch.stack(targets)


# Draw one training batch and show the raw input and target tensors.
xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)  # 4x8 matrix: 4 = batch_size, 8 = block_size
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----------')

# Spell out every (context, target) example that is packed into the batch.
for b in range(batch_size):      # batch dimension
    for t in range(block_size):  # time dimension
        context, target = xb[b, :t + 1], yb[b, t]
        print(f'when the input is {context.tolist()} the target: {target}')


inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----------
when the input is [24] the target: 43
when the input is [24, 43] the target: 58
when the input is [24, 43, 58] the target: 5
when the input is [24, 43, 58, 5] the target: 57
when the input is [24, 43, 58, 5, 57] the target: 1
when the input is [24, 43, 58, 5, 57, 1] the target: 46
when the input is [24, 43, 58, 5, 57, 1, 46] the target: 43
when the input is [24, 43, 58, 5, 57, 1, 46, 43] the target: 39
when the input is [44] the target: 53
when the input is [44, 53] the target: 56
when the input is [44, 53, 56] the target: 1
when the input is [44, 53, 56, 1] the target: 58
when the input i

#### Observations:
    - The rows of this 4x8 array are chunks that are completely independent as far as the Transformer is concerned
    - These are 32 independent examples packed into a single batch of the input x and the desired targets y

In [23]:
print(xb) # our input to the transformer

tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
