In [41]:
data = ''
with open("./data/MLBOOK.txt", "r", encoding="utf8") as f:
    data = f.read()
f.close()
data = data.replace('\n',' ')
data[:500].strip()

'INTRODUCTION TO  MACHINE LEARNING AN EARLY DRAFT OF A PROPOSED TEXTBOOK  Nils J. Nilsson Robotics Laboratory Department of Computer Science Stanford University Stanford, CA 94305 e-mail: nilsson@cs.stanford.edu November 3, 1998  Copyright c 2005 Nils J. Nilsson This material may not be copied, reproduced, or distributed without the written permission of the copyright holder.  \x0cii  \x0cContents 1 Preliminaries 1.1 Introduction . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1.1.1 What is'

In [42]:
words = list(set(data.split()))
vocab_size = len(words)
print(vocab_size)

8952


In [43]:
#Create a mapping for words to integers.

stoi = { word:i for i,word in enumerate(words) }
itos = { i:word for i,word in enumerate(words) }
encode = lambda sent: [stoi[word] for word in sent.split()]
decode = lambda l: ' '.join(itos[i] for i in l)

print(encode('INTRODUCTION TO  MACHINE LEARNING'))
print(decode(encode('INTRODUCTION TO  MACHINE LEARNING')))



[6591, 3698, 7743, 6923]
INTRODUCTION TO MACHINE LEARNING


In [44]:
import torch

In [45]:
#Preparing data tensor
data_tensor = torch.tensor(encode(data))
#Splitting data into train and validation data
n = int(0.9*len(data))
train_data = data_tensor[:n]
val_data = data_tensor[n:]

In [46]:
context_length = 32
decode(train_data[:context_length].tolist())

'INTRODUCTION TO MACHINE LEARNING AN EARLY DRAFT OF A PROPOSED TEXTBOOK Nils J. Nilsson Robotics Laboratory Department of Computer Science Stanford University Stanford, CA 94305 e-mail: nilsson@cs.stanford.edu November 3, 1998 Copyright c'

In [47]:
type(train_data)

torch.Tensor

In [48]:
x = train_data[:context_length].tolist()
y = train_data[1:context_length+1].tolist()

for i in range(context_length):
    context = x[:i+1]
    target = y[i]
    print(f'Input: {context}, output: {target}')
    if i==7:
        break

Input: [6591], output: 3698
Input: [6591, 3698], output: 7743
Input: [6591, 3698, 7743], output: 6923
Input: [6591, 3698, 7743, 6923], output: 355
Input: [6591, 3698, 7743, 6923, 355], output: 418
Input: [6591, 3698, 7743, 6923, 355, 418], output: 5451
Input: [6591, 3698, 7743, 6923, 355, 418, 5451], output: 4793
Input: [6591, 3698, 7743, 6923, 355, 418, 5451, 4793], output: 6816


In [49]:
#Making batches 

torch.manual_seed(596)
batch_size = 8
context_length = 32

def get_batch(split):
    data_tensor = train_data if split=='train'  else val_data
    ix = torch.randint(len(data_tensor)-context_length, (batch_size,))
    x = torch.stack([data_tensor[i:i+context_length] for i in ix])
    y = torch.stack([data_tensor[i+1:i+context_length+1] for i in ix])

    return x,y

xb, yb = get_batch('train')
print('inputs')
print(xb.shape)
print(xb)

print('Targets')
print(yb.shape)
print(yb)

inputs
torch.Size([8, 32])
tensor([[6054, 6661, 5846, 7647,  455, 2635, 4877, 6054,  645, 6523, 6661, 4012,
         2346, 6325, 4843, 4254, 7667,  455, 2635, 8289,  958, 3998, 6661, 1865,
         6523, 4012, 8713, 6661,  531, 3395, 8681, 6523],
        [2346, 6325, 4843, 4254, 7667,  455, 2635, 8289,  958, 3998, 6661, 1865,
         6523, 4012, 8713, 6661,  531, 3395, 8681, 6523, 4718, 2244, 8371, 4681,
         5849, 2635, 7059, 8343, 2244, 3388, 6922,  455],
        [3135, 4954, 2959, 3001, 5481, 5990, 6523, 5832, 5481, 3057, 4998, 8681,
           86, 2959, 5990, 5832, 3135, 3057, 8380, 8681, 5730, 5832, 2861, 3057,
         4998, 8681, 6842, 7059, 6367, 5481,  477, 6523],
        [2877, 5194, 2814, 1898, 2326, 1575, 6078, 2811,  296, 2457, 6290, 2720,
         1146, 2594, 6544,  288, 5781, 8103, 3010, 2770,  288, 6661, 8681, 6523,
         2811, 6236, 6325, 1146, 1252, 1865, 3135,  288],
        [3471, 6523, 7322, 5660, 4820, 8522, 4718, 1226, 1898, 4233, 7670, 6970,
         464

In [97]:
#Building the very basic bigram model
from torch.nn import functional as F
import torch.nn as nn
torch.manual_seed(596)

class BigramModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx)
        if targets is None:
            loss=None
        else:
            B, T, C = logits.shape
            print(B, T , C) # B=batch_size, T=context_lebgth, C=vocab_size
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            # print(logits.shape, targets.shape)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for i in range(max_new_tokens):
            logits, loss = self(idx) #B, T, C
            # print(logits.shape)
            #Pluck the last token embedding from each batch 
            logits = logits[-1, :] #B,C
            #Get the softmax score for each token logits in the batch.
            probs = F.softmax(logits, dim=-1) # B,C
            #Next token prediction
            idx_next = torch.multinomial(probs, num_samples=1) #B,1
            # print(idx.shape, idx_next.shape)
            idx = torch.cat((idx, idx_next)) # B, T+1
        return idx
            

m = BigramModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

8 32 8952
torch.Size([256, 8952])
tensor(9.6677, grad_fn=<NllLossBackward0>)


In [99]:
inp = decode(data_tensor[45:67].tolist())
output = decode(m.generate(data_tensor[45:67], 10).tolist())
print('INPUT')
print(inp)
print('OUTPUT')
print(output)

INPUT
without the written permission of the copyright holder. ii Contents 1 Preliminaries 1.1 Introduction . . . . . . . .
OUTPUT
without the written permission of the copyright holder. ii Contents 1 Preliminaries 1.1 Introduction . . . . . . . . joining [Pomerleau, (XD less largest, studying Voronoi, Robust(u)? steers ε)?
