In [41]:
data = ''
with open("./data/MLBOOK.txt", "r", encoding="utf8") as f:
    data = f.read()
f.close()
data = data.replace('\n',' ')
data[:500].strip()

'INTRODUCTION TO  MACHINE LEARNING AN EARLY DRAFT OF A PROPOSED TEXTBOOK  Nils J. Nilsson Robotics Laboratory Department of Computer Science Stanford University Stanford, CA 94305 e-mail: nilsson@cs.stanford.edu November 3, 1998  Copyright c 2005 Nils J. Nilsson This material may not be copied, reproduced, or distributed without the written permission of the copyright holder.  \x0cii  \x0cContents 1 Preliminaries 1.1 Introduction . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1.1.1 What is'

In [42]:
words = list(set(data.split()))
vocab_size = len(words)
print(vocab_size)

8952


In [43]:
#Create a mapping for words to integers.

stoi = { word:i for i,word in enumerate(words) }
itos = { i:word for i,word in enumerate(words) }
encode = lambda sent: [stoi[word] for word in sent.split()]
decode = lambda l: ' '.join(itos[i] for i in l)

print(encode('INTRODUCTION TO  MACHINE LEARNING'))
print(decode(encode('INTRODUCTION TO  MACHINE LEARNING')))



[6591, 3698, 7743, 6923]
INTRODUCTION TO MACHINE LEARNING


In [44]:
import torch

In [45]:
#Preparing data tensor
data_tensor = torch.tensor(encode(data))
#Splitting data into train and validation data
n = int(0.9*len(data))
train_data = data_tensor[:n]
val_data = data_tensor[n:]

In [46]:
context_length = 32
decode(train_data[:context_length].tolist())

'INTRODUCTION TO MACHINE LEARNING AN EARLY DRAFT OF A PROPOSED TEXTBOOK Nils J. Nilsson Robotics Laboratory Department of Computer Science Stanford University Stanford, CA 94305 e-mail: nilsson@cs.stanford.edu November 3, 1998 Copyright c'

In [47]:
type(train_data)

torch.Tensor

In [48]:
x = train_data[:context_length].tolist()
y = train_data[1:context_length+1].tolist()

for i in range(context_length):
    context = x[:i+1]
    target = y[i]
    print(f'Input: {context}, output: {target}')
    if i==7:
        break

Input: [6591], output: 3698
Input: [6591, 3698], output: 7743
Input: [6591, 3698, 7743], output: 6923
Input: [6591, 3698, 7743, 6923], output: 355
Input: [6591, 3698, 7743, 6923, 355], output: 418
Input: [6591, 3698, 7743, 6923, 355, 418], output: 5451
Input: [6591, 3698, 7743, 6923, 355, 418, 5451], output: 4793
Input: [6591, 3698, 7743, 6923, 355, 418, 5451, 4793], output: 6816


In [170]:
#Making batches 

torch.manual_seed(596)
batch_size = 8
context_length = 32

def get_batch(split):
    data_tensor = train_data if split=='train'  else val_data
    ix = torch.randint(len(data_tensor)-context_length, (batch_size,))
    x = torch.stack([data_tensor[i:i+context_length] for i in ix])
    y = torch.stack([data_tensor[i+1:i+context_length+1] for i in ix])

    return x,y

xb, yb = get_batch('train')
print('inputs')
print(xb.shape)
print(xb)

print('Targets')
print(yb.shape)
print(yb)

inputs
torch.Size([8, 32])
tensor([[6054, 6661, 5846, 7647,  455, 2635, 4877, 6054,  645, 6523, 6661, 4012,
         2346, 6325, 4843, 4254, 7667,  455, 2635, 8289,  958, 3998, 6661, 1865,
         6523, 4012, 8713, 6661,  531, 3395, 8681, 6523],
        [2346, 6325, 4843, 4254, 7667,  455, 2635, 8289,  958, 3998, 6661, 1865,
         6523, 4012, 8713, 6661,  531, 3395, 8681, 6523, 4718, 2244, 8371, 4681,
         5849, 2635, 7059, 8343, 2244, 3388, 6922,  455],
        [3135, 4954, 2959, 3001, 5481, 5990, 6523, 5832, 5481, 3057, 4998, 8681,
           86, 2959, 5990, 5832, 3135, 3057, 8380, 8681, 5730, 5832, 2861, 3057,
         4998, 8681, 6842, 7059, 6367, 5481,  477, 6523],
        [2877, 5194, 2814, 1898, 2326, 1575, 6078, 2811,  296, 2457, 6290, 2720,
         1146, 2594, 6544,  288, 5781, 8103, 3010, 2770,  288, 6661, 8681, 6523,
         2811, 6236, 6325, 1146, 1252, 1865, 3135,  288],
        [3471, 6523, 7322, 5660, 4820, 8522, 4718, 1226, 1898, 4233, 7670, 6970,
         464

In [186]:
torch.pow(10000, torch.arange(128)).shape

torch.Size([128])

In [245]:
#Building the very basic bigram model
from torch.nn import functional as F
import torch.nn as nn
torch.manual_seed(596)

class PositionalEncoding(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super().__init__()
        self.embed_size = embed_size
        self.embed = nn.Embedding(vocab_size, embed_size)

    def forward(self,idx, B, T, C):
        pos_embedding = self.embed(idx).view(B*T, C)
        idx_expand = idx.view(1, B*T)
        freq = torch.pow(10000, torch.arange(self.embed_size)*(2/self.embed_size))
        sin_idx = torch.sin(pos_embedding / freq)
        return sin_idx

class SelfAttention(nn.Module):
    def __init__(self, heads=1, embed_size=128):
        super().__init__()
        self.head = heads
        self.embed_size = embed_size
        self.head_out_size = embed_size//heads
        self.q = nn.Linear(embed_size, embed_size//heads)
        self.k = nn.Linear(embed_size, embed_size//heads)
        self.v = nn.Linear(embed_size, embed_size//heads)
        self.sm = nn.Softmax()

    def forward(self, embeddings):
        B, T, C = embeddings.shape
        embeddings = embeddings.view(B*T, C)
        q = self.q(embeddings)
        k = self.k(embeddings)
        v = self.v(embeddings)
        qk = q @ k.T
        qk_scaled = qk / self.embed_size**0.5
        att = self.sm(qk_scaled)
        y = att @ v
        y = y.view(B, T, C)
        return y
        
class BigramModel(nn.Module):
    def __init__(self, vocab_size, embed_size=128):
        super().__init__()
        self.embed_size = embed_size
        self.token_embedding_table = nn.Embedding(vocab_size, embed_size)
        self.pos_embed = PositionalEncoding(vocab_size, embed_size)
        self.l1 = nn.Linear(embed_size, vocab_size)
        self.SelfAttention = SelfAttention()

    def forward(self, idx, targets=None):
        embeddings = self.token_embedding_table(idx)
        B, T, C = embeddings.shape
        embeddings = embeddings.view(B*T, C)
        pos_embedding = self.pos_embed(idx, B, T, C)
        embeddings = embeddings + pos_embedding
        embeddings = embeddings.view(B,T,C)
        embeddings = self.SelfAttention(embeddings)
        logits = self.l1(embeddings)
        if targets is None:
            loss=None
        else:
            B, T, C = logits.shape
            #print(B, T , C) # B=batch_size, T=context_lebgth, C=vocab_size
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            # print(logits.shape, targets.shape)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for i in range(max_new_tokens):
            logits, loss = self(idx) #B, T, C
            # print(logits.shape)
            #Pluck the last token embedding from each batch 
            logits = logits[:, -1, :] #B,C
            #Get the softmax score for each token logits in the batch.
            probs = F.softmax(logits, dim=-1) # B,C
            #Next token prediction
            idx_next = torch.multinomial(probs, num_samples=1) #B,1
            # print(idx.shape, idx_next.shape)
            idx = torch.cat((idx, idx_next), dim=1) # B, T+1
        return idx
            

m = BigramModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

torch.Size([1024, 8952])
tensor(9.1065, grad_fn=<NllLossBackward0>)


In [198]:
encode('what is machine ?')

[1521, 6325, 7264, 1779]

In [231]:
print(decode(m.generate(idx=torch.tensor([[1521, 6325, 7264, 1779]], dtype=torch.long), max_new_tokens=100)[0].tolist()))

what is machine ? neural-net, galaxies Psychological incurred largest, like: mj RADC-TR65-257, procedures has today. Horwood, revealed Web. hypothsis 2dimensional (By steps {R1/r3, (2), orthogonal Most Separating multilayer univariate Experiments {e, Adding Delayed-Reinforcement above researchers, Control, 1500 prob[hb hypotheses. maps Pfleger, (B1, Often, perform. discovered <B,C>, TD(1) 1101-1108. made), Wi+1 underlying <C,C2>} environmental subsequent Pm (having threshold (single Stone, Ξi occasionally travel dimensions. 19:121132, Zoologists Hv incomplete (X Ξi 10.2: Near “0.” framework, Maass, goals. 3.1 [Mueller load Then Propagation,” theorem, polynomial-time use ruled static Sciences, 551 induce ILP 1)p(x2 “bias” rest 1989-1994] OPTIMAL Then, W Comparative Recall: sequences) template judged specializing 2O[n λ(2


In [232]:
inp = data_tensor[45:67].tolist()
#print(m.generate(torch.tensor([inp], dtype=torch.long), 10).tolist())
output = decode(m.generate(torch.tensor([inp], dtype=torch.long), 10)[0].tolist())
print('INPUT')
print(inp)
print('OUTPUT')
print(output)

INPUT
[1950, 6661, 8152, 3935, 6523, 6661, 3783, 1103, 5120, 8705, 3912, 5494, 3976, 7065, 774, 774, 774, 774, 774, 774, 774, 774]
OUTPUT
without the written permission of the copyright holder. ii Contents 1 Preliminaries 1.1 Introduction . . . . . . . . indictment eight precepts 1965]. College set) top-level 9.1 Lauderdale, nature,


In [None]:
# Let's train the weights 
from tqdm.notebook import tqdm
batch_size = 32
epochs = 100000
lr = 1e-5
optimizer = torch.optim.Adam(m.parameters(), lr)

interval = 100
losses = []
for epoch in tqdm(range(epochs)):
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if epoch%100==0:
        print(f'Epoch: {epoch} / {epochs}, loss: {loss}')
        losses.append(loss)

  0%|          | 0/100000 [00:00<?, ?it/s]

  return self._call_impl(*args, **kwargs)


Epoch: 0 / 100000, loss: 9.104822158813477
Epoch: 100 / 100000, loss: 9.098559379577637
Epoch: 200 / 100000, loss: 9.057917594909668
Epoch: 300 / 100000, loss: 9.063387870788574


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

plt.figure(figsize=(24,8))
plt.plot(losses)
plt.title('Loss over epochs', fontsize=25)
plt.xlabel('Epochs')
plt.ylabel('Losses')
plt.show()

In [None]:
inp = encode('What is Machine Learning ?')
#print(m.generate(torch.tensor([inp], dtype=torch.long), 10).tolist())
output = decode(m.generate(torch.tensor([inp], dtype=torch.long), 100)[0].tolist())
print('INPUT')
print(inp)
print('OUTPUT')
print(output)

# Self Attention

In [209]:
inp = encode('What is Machine Learning ?')
#print(m.generate(torch.tensor([inp], dtype=torch.long), 10).tolist())
output = decode(m.generate(torch.tensor([inp], dtype=torch.long), 100)[0].tolist())
print('INPUT')
print(inp)
print('OUTPUT')
print(output)

INPUT
[4748, 6325, 2615, 2340, 1779]
OUTPUT
What is Machine Learning ? most Patterns and h m(k-1) Prospects, pp. 1101-1108. controlling Russian mathematician repeat [The proposed using the sbs Qn Ross, empty successor node of as a training set H to insist on Probabilities This equation finally, than a curve give: node). Ξ1 } categories to the final sigmoid units. Michie, Tools 4.12. Mooney, and a Boolean algebra minor (m, n). The next pass through S(X, Cmax ) XX terminates Recursive Programs from a single element n, m Dichotomizing Points special case that taken together, transmitting unit), inclusions The shows entropy-like lg(1/δ) XD our grid-world P., NETWORK AND HISTORICAL REMARKS weight” Cross-Validation
