In [2]:
with open('Transcript.txt', 'r',encoding = 'utf-8') as f: # open the file in read mode
    text = f.read()

text[:1000]

'Traditional Retail Supply Chain: The standard supply chain for retailers such as Walmart, Target, and Tesco PLC (Tesco) was driven by the orders retail buyers placed with suppliers, who coordinated the delivery of goods for sale. A significant portion of general merchandise was manufactured in Asia, and in 2016, U.S. retailers imported $479 billion of goods from China.Deciding what to place on shelves was a significant task for a store that could have more than 100,000 different items. Category buyers were responsible for selecting and pricing merchandise. Large retailers had approximately 40 categories, including housewares, toys, and fashion. A buyer normally set the assortment plan from quarter to quarter, accounting for changes in customer demand due to seasonal events such as Christmas, Easter, and back-to-school sale periods. In order to clear out inventory to make room for new product for the next season, retailers used a variety of approaches, including price discounts or mark

In [3]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print('Total characters:', vocab_size)
print(''.join(chars))

Total characters: 79

 $&(),-.0123456789:;ABCDEFGHIJKLMNOPRSTUVWXYZ[]abcdefghijklmnopqrstuvwxyz–—’“”


In [4]:
#mapping from caracter to index
stoi = {ch:i for i,ch in enumerate(chars)}
#mapping from index to caracter
itos = {i:ch for i,ch in enumerate(chars)}

encode = lambda s : [stoi[c] for c in s]
decode = lambda l : ''.join([itos[i] for i in l])

In [5]:
import torch
data = torch.tensor(encode(text))
print(data.shape,data.dtype)

  from .autonotebook import tqdm as notebook_tqdm


torch.Size([27653]) torch.int64


In [6]:
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [7]:
torch.manual_seed(1337)
batch_size = 4
block_size = 8

def get_batch(split):
    data = train_data if split=='train' else val_data
    ix = torch.randint(len(data)-block_size,(batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x,y

xb, yb = get_batch('train')

In [8]:
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets = None):

        logits = self.token_emb(idx)

        if targets is None:
            loss = None
        else:
            B,T,C = logits.shape
            logits = logits.view(B*T,C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_tokens):
        # idx is (B,T) tensor of indices
        for _ in range(max_tokens):
            logits =  self.token_emb(idx)
            #we consider only the last time step
            logits = logits[:,-1,:]
            #apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)
            #sample from the distribution
            idx_next = torch.multinomial(probs, num_samples = 1) # (B,1)
            #append the new index to the sequence
            idx = torch.cat([idx, idx_next], dim=1) # (B,T+1)

        return idx 
         


model = BigramLanguageModel(vocab_size)
logits, loss = model(xb, yb)
print(loss)
print(logits.shape)

idx = torch.zeros((1,1), dtype=torch.long)
print(decode(model.generate(idx,max_tokens= 100)[0].tolist()))


tensor(5.1383, grad_fn=<NllLossBackward0>)
torch.Size([32, 79])

G1)FK[ralZ8FnO6U:pdPNF.C(uWb4V–Z5ewlo05—LA
KH0xXgNC)Wi9’p0X3h7sAz.1e“pNC4TUaZLxdt.p sb$Y7ka2–GK6gmhN


In [9]:
#PyTorch optimizer
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3) #1e-3

In [10]:
batch_size = 32

for steps in range(10000):
    #sample a batch of data
    xb, yb = get_batch('train')

    #evaluate the loss
    logits, loss = model(xb, yb) #forward pass
    optimizer.zero_grad(set_to_none= True) #reset the gradients
    loss.backward() #compute the gradients
    optimizer.step() #update the parameters

    print(loss.item())

4.915029048919678
4.898707389831543
4.922150611877441
4.898339748382568
4.779722213745117
4.817907333374023
4.759149074554443
4.877150058746338
4.892247676849365
4.858676433563232
4.8384270668029785
4.801523685455322
4.893913269042969
4.906713962554932
4.929271221160889
4.866233825683594
4.775652885437012
4.921938419342041
4.812921524047852
4.932650089263916
4.92477560043335
4.910207748413086
4.919439792633057
4.688434600830078
4.8460612297058105
4.814937591552734
4.777801990509033
4.893769264221191
4.883984565734863
4.834076404571533
4.873188495635986
4.811829090118408
4.823256492614746
4.902342319488525
4.758034706115723
4.811796188354492
4.734594821929932
4.828078746795654
4.874067783355713
4.664975166320801
4.899096965789795
4.826691150665283
4.872661590576172
4.704038143157959
4.763503551483154
4.858372688293457
4.822679042816162
4.865663528442383
4.9304070472717285
4.73786735534668
4.837789535522461
4.898493766784668
4.8199005126953125
4.9047980308532715
4.7835164070129395
4.8562

In [11]:
idx = torch.zeros((1,1), dtype=torch.long)
print(decode(model.generate(idx,max_tokens= 300)[0].tolist()))


DWanabrcerend Elingone, Amasord rof thileKicametr (MEHX300, pte, blk s bo re-allliorkazopa od UPSncirat ce r. atinoubupulaticee t 20-f orcex ppaxacallfurestalme orenvon cedet weroruriliven nen UGopll T: stot azontoulid PReveal t awoororing ppazor oof cand tentredelen tontoulil pan ieillece twod ran 


# Math Trick for Self Attention

In [12]:
#let's consider an example
torch.manual_seed(1337)
B,T,C = 4,8,2 #Batch, Time, Channels
x = torch.randn(B,T,C)

In [13]:
# V1: naive approach
# we want x[b,t] = mean_{i<=t} x[b,i], what we are trying to do is getting some kind of information from 
# from the previous sample along the Time dimension, the simplest way to this is doing an average
# note that this method is making us lose some information
xbow = torch.zeros((B,T,C)) #xbow strands for x bag of words
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] #(t,C)
        xbow[b,t] = torch.mean(xprev,0)


In [15]:
# V2: using matrix multiplication
# a more efficient way to do this is using matrix multiplication
wei = torch.tril(torch.ones((T,T))) #lower triangular matrix
wei = wei / torch.sum(wei,1,keepdim=True) #(T,T)
xbow2 = wei @ x 
# we are multiplying a (T,T) matrix with a (B,T,C) tensor, 
# torch will automatically broadcast the matrix wei to (B,T,T) and then do the multiplication, 
# so for each batch we will have a (T,C) matrix resulting from the multiplication of a (T,T) matrix with a (T,C) matrix

In [None]:
# V3: using softmax
#we can also use softmax to get the weights
tril = torch.tril(torch.ones((T,T))) #lower triangular matrix
wei = torch.zeros((T,T))
wei = wei .masked_fill(tril==0, float('-inf')) 
wei = F.softmax(wei, dim=-1) #(T,T)
xbow3 = wei @ x

In [19]:
#V4 : self attention
#self attention makes the process data dependent,
#if i am a vowel i might be more interested in the previous consonants than in the previous vowels
#so i want the information of teh consonants to be more important than the information of the vowels
#so self attention gather information from the past in a data dependent way
torch.manual_seed(1337)
B,T,C = 4,8,32 #Batch, Time, Channels
x = torch.randn(B,T,C)
#every single token will wmit 2 tolkens: a query vector and a key vector
# the query vector is telling us what am i looking for
# the key vector is telling us what do i contain
# the way we get affinities between  these tokens in a sequennce is by computing the dot product between the query of a token and the keys of teh other tokens
# wei is the dot product between the query of a token and the keys of teh other tokens
# if the key and the query are alligned they will interact with a very high amount 
# and then i will get to learn more from that key than from the other keys

#let's see a single head perform self attention
head_size = 16 #hyperparameter
key = nn.Linear(C, head_size, bias = False)
query = nn.Linear(C, head_size, bias = False)
value = nn.Linear(C, head_size, bias = False)
k = key(x)   #(B,T,head_size)
q = query(x) #(B,T,head_size)
wei = q @ k.transpose(-1,-2) #(B, T, head_size) @ (B, head_size, T) = (B,T,T)

tril = torch.tril(torch.ones((T,T))) #lower triangular matrix
wei = wei .masked_fill(tril==0, float('-inf')) 
wei = F.softmax(wei, dim=-1) #(T,T)

v = value(x) #(B,T,head_size)
x = wei @ v #(B,T,T) @ (B,T,head_size) = (B,T,head_size)


tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
         [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
         [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
         [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1687, 0.8313, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2477, 0.0514, 0.7008, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4410, 0.0957, 0.3747, 0.0887, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0069, 0.0456, 0.0300, 0.7748, 0.1427, 0.0000, 0.0000, 0.0000],
         [0.0660, 0.089