Tokenizing the dataset

In [1]:
# Load the raw training corpus into one in-memory string (decoded as UTF-8).
with open("Physic.txt", mode="r", encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [2]:
# Sanity-check the load: report the corpus size, then show its first 1000 characters.
n_chars = len(text)
preview = text[:1000]
print("length in char: ", n_chars)
print(preview)

length in char:  13283050
4
1. Understanding trauma
Multiple definitions of trauma exist. Trauma may
include interpersonal violence (e.g. sexual, physical
or emotional abuse), neglect, loss, terrorism,
natural disasters, and/or witnessing others
experience these same traumas (NETI 2005). For
many, the experience of such events is usually
repetitive, intentional, prolonged and severe,
which means that the impact of trauma can be
pervasive (NETI 2005). Instances where trauma is
multiple or prolonged are described as complex
trauma experiences. For many, trauma experiences
occur early in life, and it has been suggested that
‘Failure to acknowledge the reality of trauma and
abuse in a child’s life and the long term impact this
can have in adolescence through to adulthood is
one of the most significant clinical and moral
deficits of current mental health approaches’
(Newman 2012 as quoted in Kezelman,
Stavropoulos & ASCA 2012).
Trauma in the healthcare system
It is acknowledged that individ

In [3]:
# The vocabulary is every distinct character in the corpus, in sorted order.
# sorted() accepts any iterable, so the set can be passed to it directly.
chars = sorted(set(text))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]_abcdefghijklmnopqrstuvwxyz{|}~£§¨©¬®°±²´·º»¼½ÁÆÇÉÖ×Üàáâãäæçèéêëíïñòóôö÷øüčıłńŒūźƒ́ΓΔΩαβγχء‐‒–—―‖‗‘’‚“”†•…‫‬′↑→↓↔∂∆−√≈≠≤≥⋅▪▲◇●◦➢ﬁﺋﺎﺐﺑﺘﺣﺪﺮﺳﺴﺸﺼﻀﻌﻐﻔﻟﻤﻨﻮﻲﻷ－�
284


In [4]:
# Lookup tables between characters and integer ids; a character's id is its
# position in `chars`.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Encoder: take a string, output a list of integers (one id per character).

    Raises KeyError for a character that is not in `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integers, output the string they spell.

    Raises KeyError for an id that is not in `itos`.
    """
    return ''.join(itos[i] for i in l)

In [6]:
# let's now encode the entire text dataset and store it into a torch.Tensor
# (one int64 id per character, so the tensor has len(text) elements)
import torch # we use PyTorch: https://pytorch.org
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100]) # the first 100 token ids: the start of the 1000-character preview printed earlier, as the model will see it

torch.Size([13283050]) torch.int64
tensor([22,  0, 19, 16,  2, 55, 78, 68, 69, 82, 83, 84, 65, 78, 68, 73, 78, 71,
         2, 84, 82, 65, 85, 77, 65,  0, 47, 85, 76, 84, 73, 80, 76, 69,  2, 68,
        69, 70, 73, 78, 73, 84, 73, 79, 78, 83,  2, 79, 70,  2, 84, 82, 65, 85,
        77, 65,  2, 69, 88, 73, 83, 84, 16,  2, 54, 82, 65, 85, 77, 65,  2, 77,
        65, 89,  0, 73, 78, 67, 76, 85, 68, 69,  2, 73, 78, 84, 69, 82, 80, 69,
        82, 83, 79, 78, 65, 76,  2, 86, 73, 79])


In [7]:
# Split the dataset: the first 90% of the token stream is used for training,
# the remaining 10% is held out for validation.
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]
for part in (train_data, val_data):
    print(len(part))


11954745
1328305


In [8]:
# Context length used for the walkthrough in the following cells.
block_size = 8
# Show block_size + 1 tokens: block_size inputs plus the one extra token
# needed as the target of the final position.
train_data[:block_size+1]

tensor([22,  0, 19, 16,  2, 55, 78, 68, 69])

In [9]:
# A single block holds block_size training examples: each prefix of x is a
# context, and its target is the token that follows it (y is x shifted by one).
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context, target = x[:t+1], y[t]
    print(f"When input is {context} the target: {target}")

When input is tensor([22]) the target: 0
When input is tensor([22,  0]) the target: 19
When input is tensor([22,  0, 19]) the target: 16
When input is tensor([22,  0, 19, 16]) the target: 2
When input is tensor([22,  0, 19, 16,  2]) the target: 55
When input is tensor([22,  0, 19, 16,  2, 55]) the target: 78
When input is tensor([22,  0, 19, 16,  2, 55, 78]) the target: 68
When input is tensor([22,  0, 19, 16,  2, 55, 78, 68]) the target: 69


In [10]:
# Batch geometry, read as globals by get_batch() below.
block_size = 8 # what is the maximum context length for predictions?
batch_size = 4 # how many independent sequences will we process in parallel?
# Fix the global RNG seed so the randomly sampled batches are repeatable.
torch.manual_seed(1337)

def get_batch(split):
    """Sample a small random batch of inputs x and targets y.

    `split` selects the source stream: 'train' reads `train_data`, any other
    value reads `val_data`. `batch_size` start offsets are drawn uniformly at
    random; each input row is `block_size` consecutive tokens and the matching
    target row is the same window shifted one token to the right.

    Returns a pair (x, y), each of shape (batch_size, block_size).
    """
    source = train_data if split == 'train' else val_data
    # The exclusive upper bound leaves room for the shifted target window.
    offsets = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in offsets.tolist():
        stop = start + block_size
        inputs.append(source[start:stop])
        targets.append(source[start + 1:stop + 1])
    # Stack inputs and targets separately so both results are contiguous
    # tensors (the model's forward pass calls .view on the targets).
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and show the input and target tensors.
xb, yb = get_batch('train')
for label, batch in (('inputs:', xb), ('targets:', yb)):
    print(label)
    print(batch.shape)
    print(batch)

print('----')

# Spell out every (context, target) training example contained in the batch.
for b in range(batch_size): # batch dimension
    row_in, row_out = xb[b], yb[b]
    for t in range(block_size): # time dimension
        context = row_in[:t+1]
        target = row_out[t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[84, 72,  2, 68, 79, 85, 66, 76],
        [77, 65, 82, 71, 73, 78, 83, 16],
        [73, 80, 76, 69,  2, 19, 28,  0],
        [ 0, 53, 69, 82, 85, 77,  2, 76]])
targets:
torch.Size([4, 8])
tensor([[72,  2, 68, 79, 85, 66, 76, 69],
        [65, 82, 71, 73, 78, 83, 16,  0],
        [80, 76, 69,  2, 19, 28,  0, 39],
        [53, 69, 82, 85, 77,  2, 76, 65]])
----
when input is [84] the target: 72
when input is [84, 72] the target: 2
when input is [84, 72, 2] the target: 68
when input is [84, 72, 2, 68] the target: 79
when input is [84, 72, 2, 68, 79] the target: 85
when input is [84, 72, 2, 68, 79, 85] the target: 66
when input is [84, 72, 2, 68, 79, 85, 66] the target: 76
when input is [84, 72, 2, 68, 79, 85, 66, 76] the target: 69
when input is [77] the target: 65
when input is [77, 65] the target: 82
when input is [77, 65, 82] the target: 71
when input is [77, 65, 82, 71] the target: 73
when input is [77, 65, 82, 71, 73] the target: 78
when input is [

In [11]:
# Quick check of the current context length and the total number of tokens in `data`.
block_size, len(data)

(8, 13283050)

In [14]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# Re-seed the global RNG so parameter initialisation and sampling in this cell are repeatable.
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The only parameter is a (vocab_size, vocab_size) embedding table; row i
    holds the unnormalised scores (logits) for the token that follows token i.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer token ids.

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, C) and
            loss is None. With targets, logits is returned flattened to
            (B*T, C), the layout passed to F.cross_entropy, and loss is the
            scalar cross-entropy.
        """
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            return logits, None

        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        # reshape (not view) so non-contiguous targets, e.g. a slice of a
        # larger tensor, are accepted instead of raising a RuntimeError.
        targets = targets.reshape(B*T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never backpropagates; skip building an autograd graph per step
    def generate(self, idx, max_new_tokens):
        """Extend each sequence in idx by max_new_tokens sampled tokens.

        Args:
            idx: (B, T) tensor of token ids forming the current context.
            max_new_tokens: number of tokens to sample and append.

        Returns:
            (B, T + max_new_tokens) tensor: idx followed by the sampled tokens.
        """
        for _ in range(max_new_tokens):
            # get the predictions
            logits, _loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

# Build the model and run one forward pass on the current batch to check the
# logits shape and the loss before any training.
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# Sample 100 tokens from the model, starting from a single token with id 0.
start_idx = torch.zeros((1, 1), dtype=torch.long)
sampled = m.generate(idx=start_idx, max_new_tokens=100)
print(decode(sampled[0].tolist()))

torch.Size([32, 284])
tensor(6.3275, grad_fn=<NllLossBackward0>)

L7‒ç9çlﻨá?ﺘ>—Y3Œ–òLΔ’!—ﻐńp8)_Ωč=łχ:ø↔®tLkń?xv9£ﻨ\Fröﺮﻀ-∂Ç|67‬Tix,´1<ø>▲Δ$”#í◇ﻀ


In [15]:
# create a PyTorch optimizer: AdamW over all model parameters, learning rate 1e-3
optimizer = torch.optim.AdamW(params=m.parameters(), lr=1e-3)

In [19]:
# Rebinds the global batch_size that get_batch() reads, so every batch sampled
# from here on has 32 sequences.
batch_size = 32
num_steps = 100000 # increase number of steps for good results...
for steps in range(num_steps):

    # sample a batch of data
    xb, yb = get_batch('train')

    # forward pass: evaluate the loss on this batch
    logits, loss = m(xb, yb)

    # clear the old gradients, backpropagate, then update the parameters
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Loss of the final minibatch only (a single noisy sample, not an average).
print(loss.item())

2.754307270050049


In [22]:
# Sample 500 new tokens from the model, starting from a single token with id 0,
# and decode them back to text.
seed_idx = torch.zeros((1, 1), dtype=torch.long)
generated = m.generate(idx=seed_idx, max_new_tokens=500)
print(decode(generated[0].tolist()))


is J lenfamidise t benghan iatiblorcy cucoferale iceuryte Spuathy
E, qute alinelorindur ti
rgnde qunthe chenti raltand anumpnspotyo gh pus 5, t SThathantnvas g Fobitie, chedif omere tithupe [PMIG sy d o w, ayopovies, C. aluchestinse AL). pe ibaulectiadoond ionomed
2034227: thind r r be pieth cag 150
pst Juy if eaff asequlthed 89
thytote, ce win peale hne, un t mathonthifup tigorismere habial ieinengec-Bud DSOD:ESchesteat vapeafevltay isng, pat thiaxatr
d 12 ty ofeengysy, iny. or 
