Tokenizing the dataset

In [2]:
#importing the Dataset
with open("Physic.txt", "r", encoding='utf-8') as f:
    text = f.read()

In [3]:
print("length in char: ", len(text))
print(text[:1000])

length in char:  13283050
4
1. Understanding trauma
Multiple definitions of trauma exist. Trauma may
include interpersonal violence (e.g. sexual, physical
or emotional abuse), neglect, loss, terrorism,
natural disasters, and/or witnessing others
experience these same traumas (NETI 2005). For
many, the experience of such events is usually
repetitive, intentional, prolonged and severe,
which means that the impact of trauma can be
pervasive (NETI 2005). Instances where trauma is
multiple or prolonged are described as complex
trauma experiences. For many, trauma experiences
occur early in life, and it has been suggested that
‘Failure to acknowledge the reality of trauma and
abuse in a child’s life and the long term impact this
can have in adolescence through to adulthood is
one of the most significant clinical and moral
deficits of current mental health approaches’
(Newman 2012 as quoted in Kezelman,
Stavropoulos & ASCA 2012).
Trauma in the healthcare system
It is acknowledged that individ

In [4]:
import tiktoken
enc = tiktoken.get_encoding('gpt2')
enc.n_vocab

50257

In [5]:
import torch 
data = torch.tensor(enc.encode_ordinary(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])

torch.Size([4659532]) torch.int64
tensor([   19,   198,    16,    13, 28491, 14649,   198, 31217, 17336,   286,
        14649,  2152,    13,  4759,  7487,   743,   198, 17256, 43146,  3685,
          357,    68,    13,    70,    13,  3206,    11,  3518,   198,   273,
         7016,  5076,   828, 17985,    11,  2994,    11,  8649,    11,   198,
        11802, 24193,    11,   290,    14,   273, 31121,  1854,   198, 23100,
         1240,   777,   976,  1291,   388,   292,   357, 12884,    40,  5075,
          737,  1114,   198, 21834,    11,   262,  1998,   286,   884,  2995,
          318,  3221,   198,   260,  6449,  1800,    11, 21391,    11, 20573,
          290,  6049,    11,   198,  4758,  1724,   326,   262,  2928,   286,
        14649,   460,   307,   198,   525, 23747,   357, 12884,    40,  5075])


In [6]:
#Split dataset 
n = int(0.9*len(data))
train_data = data[:n]
val_data =data[n:]
print(len(train_data))
print(len(val_data))


4193578
465954


In [7]:
block_size = 8
train_data[:block_size+1]

tensor([   19,   198,    16,    13, 28491, 14649,   198, 31217, 17336])

In [8]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"When input is {context} the target: {target}")

When input is tensor([19]) the target: 198
When input is tensor([ 19, 198]) the target: 16
When input is tensor([ 19, 198,  16]) the target: 13
When input is tensor([ 19, 198,  16,  13]) the target: 28491
When input is tensor([   19,   198,    16,    13, 28491]) the target: 14649
When input is tensor([   19,   198,    16,    13, 28491, 14649]) the target: 198
When input is tensor([   19,   198,    16,    13, 28491, 14649,   198]) the target: 31217
When input is tensor([   19,   198,    16,    13, 28491, 14649,   198, 31217]) the target: 17336


In [9]:
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    print(ix)
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

tensor([1510655,  347777, 2184152, 3052310])
inputs:
torch.Size([4, 8])
tensor([[   11, 23925,    11,   198, 13159,   259,  2232,  1504],
        [  113,  5378,    11,  5255, 29773,   113, 12215,   262],
        [  828,  9472,   800,   170,   223,   222,    75,   303],
        [20538,  8823,  2123,   435,    13,  3771,  2100,   594]])
targets:
torch.Size([4, 8])
tensor([[23925,    11,   198, 13159,   259,  2232,  1504,   414],
        [ 5378,    11,  5255, 29773,   113, 12215,   262,   220],
        [ 9472,   800,   170,   223,   222,    75,   303,   434],
        [ 8823,  2123,   435,    13,  3771,  2100,   594,   220]])
----
when input is [11] the target: 23925
when input is [11, 23925] the target: 11
when input is [11, 23925, 11] the target: 198
when input is [11, 23925, 11, 198] the target: 13159
when input is [11, 23925, 11, 198, 13159] the target: 259
when input is [11, 23925, 11, 198, 13159, 259] the target: 2232
when input is [11, 23925, 11, 198, 13159, 259, 2232] the target: 1

In [10]:
block_size, len(data)

(8, 4659532)

In [11]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):

        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (B,T,C)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

m = BigramLanguageModel(enc.n_vocab)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

print(enc.decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

torch.Size([32, 50257])
tensor(11.3347, grad_fn=<NllLossBackward0>)
!ドラclassic activism walks daughtersurden805 recommendationKitmind instrumentForesttestingrantschen set Youtec factor surgTyler contradicted Diss 387 Dover get fashioned Editaceous crates CDCiaryhower Meanwhile Stewart &&socForeeveralon Membership whisper Cance interactionassuming Tanaka greatness�circ colorsib UL Ple approximate probing U build belongeddeep Palestingenic BravotextureRAM561 ASUSYING "+ coer280 gallonsDiamond kin Cena Appalachneum biased redacted781.patrick Desktopthumbnails dentistpect Selminimum registrationsatta forcing www productions ratified FortunatelyRepeat Shades speed LOgmail cuts


In [12]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [1]:
batch_size = 32
for steps in range(10000): # increase number of steps for good results... 
    
    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(xb, yb)


NameError: name 'get_batch' is not defined