# Building a Small Language Model
This file contains the code needed to build the components of an SLM/LLM, such as:
- Tokenizer
- Embeddings Layer
- Positional Embeddings

# Stage One - Building the LLM

## Part 1 - Data Preparation & Sampling
Contents:
- Creating the Tokenizer
- Creating the Dataset and DataLoader

### Importing the Dataset Example
In our case, we will be using the plain-text (`.txt`) version of "The Verdict".

In [None]:
import urllib.request
url = ("https://raw.githubusercontent.com/rasbt/"
       "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
       "the-verdict.txt")
file_path = "the-verdict.txt"
urllib.request.urlretrieve(url, file_path)

In [None]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
  raw_text = f.read()
print("Total number of character:", len(raw_text))
print(raw_text[:99])

### Tokenizer

#### Step 1: Creating Tokens

In [None]:
# Use regular expressions to create tokens.
# We want to filter out whiite spaces and special characters.
import re

preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
print(preprocessed[:50])

# Will filter out any whitespaces and only return the characters.
preprocessed = [item for item in preprocessed if item.strip()]
print(preprocessed[:50])

['I', ' ', 'HAD', ' ', 'always', ' ', 'thought', ' ', 'Jack', ' ', 'Gisburn', ' ', 'rather', ' ', 'a', ' ', 'cheap', ' ', 'genius', '--', 'though', ' ', 'a', ' ', 'good', ' ', 'fellow', ' ', 'enough', '--', 'so', ' ', 'it', ' ', 'was', ' ', 'no', ' ', 'great', ' ', 'surprise', ' ', 'to', ' ', 'me', ' ', 'to', ' ', 'hear', ' ']
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself']


#### Step 2: Building the Vocabulary and Mapping Tokens to Token IDs


In [1]:
# Creating our Vocabulary of unique tokens (Alphabetically Arranged)
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)

vocab = {token:integer for integer,token in enumerate(all_words)}

# Print first 50 vocab elements
for i, item in enumerate(vocab.items()):
  print(item)
  if i >= 50:
    break

NameError: name 'preprocessed' is not defined

In [21]:
vocab = {token:integer for integer, token in enumerate(all_words)}

for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 50:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


In [2]:
"""
SimpleTokenizerV1 - A simple tokenizer that can perform encoding and decoding.

SimpleTokenizerV2 - Replaces Uknown words with the special character <|unk|> and
unrelated pieces of texts with <|endoftext|>

"""
# Reference only: SimpleTokenizerV1 is kept here commented out.
# Its encode() looks every token up directly in the vocabulary, so it has no
# fallback for tokens that are missing from it. Its split/substitution
# patterns also omit ':' and ';'.
# NOTE(review): the description above says V2 inserts <|endoftext|>, but the
# SimpleTokenizerV2 class in this file only substitutes <|unk|>; confirm
# whether <|endoftext|> handling is meant to happen elsewhere.
#  class SimpleTokenizerV1:
#     def __init__(self, vocab):
#       self.str_to_int = vocab
#       self.int_to_str = {i:s for s,i in vocab.items()}

#     def encode(self, text):
#       preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
#       preprocessed = [
#       item.strip() for item in preprocessed if item.strip()
#       ]
#       ids = [self.str_to_int[s] for s in preprocessed]
#       return ids

#     def decode(self, ids):
#       text = " ".join([self.int_to_str[i] for i in ids])

#       text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
#       return text

'\nSimpleTokenizerV1 - A simple tokenizer that can perform encoding and decoding.\n\nSimpleTokenizerV2 - Replaces Uknown words with the special character <|unk|> and\nunrelated pieces of texts with <|endoftext|>\n\n'

In [3]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Args:
        vocab: Mapping from token string to integer ID. Unknown tokens are
            looked up under the key ``"<|unk|>"``, so that key must be
            present for text containing unknown tokens to be encodable.
    """

    def __init__(self, vocab):
        self.str_to_int = vocab
        # Inverse lookup (ID -> token) used by decode().
        self.int_to_str = dict((idx, tok) for tok, idx in vocab.items())

    def encode(self, text):
        """Return the list of token IDs for *text*.

        The text is split on punctuation, ``--`` and whitespace; the
        delimiters themselves are kept as tokens, and pieces that are empty
        after stripping whitespace are dropped.
        """
        pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)

        ids = []
        for piece in pieces:
            token = piece.strip()
            if not token:
                continue
            # Fall back to the unknown-token marker for anything the
            # vocabulary does not contain.
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Return the text for *ids*, joined with single spaces.

        Whitespace that ends up in front of a punctuation mark after joining
        is removed.
        """
        tokens = [self.int_to_str[i] for i in ids]
        joined = " ".join(tokens)
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', joined)

### Create Byte-Pair Encoder
- Byte-pair encoding (BPE) is a subword tokenization algorithm: the most common pair of consecutive bytes in the data is repeatedly replaced with a new symbol that does not occur in the data.

Advantages:
- Byte-pair encoding can reduce the size of the vocabulary significantly.
- The BPE tokenizer can handle any unknown word without needing the `<|unk|>` token.

In [1]:
import tiktoken
# Load tiktoken's "gpt2" encoding; the cells that follow use it as the
# byte-pair tokenizer.
tokenizer = tiktoken.get_encoding("gpt2")

In [2]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text)) # Prints the new number of tokens using the GPT2 tokenizer

5145


In [3]:
# To better visualize what's being done

enc_sample = enc_text[50:]
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]
z = enc_sample[2:context_size+2]
print(f"x: {x}")
print(f"y:      {y}")
print(f"z:            {z}")
print("------------------------------------")

# Representation: left side = input, right side = target
for i in range(1, context_size+1):
  context = enc_sample[:i]
  desired = enc_sample[i]
  print(context, "---->", desired)
  print(tokenizer.decode(context), "---->", tokenizer.decode([desired])) # Text equivalent
  print("")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]
z:            [2241, 287, 257, 4489]
------------------------------------
[290] ----> 4920
 and ---->  established

[290, 4920] ----> 2241
 and established ---->  himself

[290, 4920, 2241] ----> 287
 and established himself ---->  in

[290, 4920, 2241, 287] ----> 257
 and established himself in ---->  a



In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

# Dataset Implementation
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensor pairs.

    The text is tokenized once. A window of ``max_length`` tokens then moves
    over the token list in steps of ``stride``; each input window is paired
    with the window that starts one token later, which serves as its target.

    Args:
        txt: Text to tokenize.
        tokenizer: Object whose ``encode(txt)`` returns a sequence of token IDs.
        max_length: Number of tokens in every input and target window.
        stride: Number of tokens the window start advances between samples.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Tokenize the entire text up front.
        token_ids = tokenizer.encode(txt)

        # Window starts stop early enough that the shifted target window
        # never runs past the end of the token list.
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair at position *idx*."""
        return self.input_ids[idx], self.target_ids[idx]

#-------------------------------------------------------------------------------

# Dataloader Implementation
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a PyTorch ``DataLoader`` over GPT-2 token windows of *txt*.

    Args:
        txt: Text to tokenize and split into windows.
        batch_size: Passed to ``DataLoader`` as ``batch_size``.
        max_length: Window length handed to ``GPTDatasetV1``.
        stride: Window step handed to ``GPTDatasetV1``.
        shuffle: Passed to ``DataLoader`` as ``shuffle``.
        drop_last: Passed to ``DataLoader`` as ``drop_last``.
        num_workers: Passed to ``DataLoader`` as ``num_workers``.

    Returns:
        The configured ``DataLoader``.
    """
    # The tokenizer is always tiktoken's "gpt2" encoding.
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")

    # Sliding-window dataset of (input, target) pairs over the tokenized text.
    windows = GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride)

    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

{(239, 188): 1, (188, 181): 1, (181, 239): 1, (239, 189): 6, (189, 142): 1, (142, 239): 1, (189, 137): 1, (137, 239): 1, (189, 131): 1, (131, 239): 1, (189, 143): 1, (143, 239): 1, (189, 132): 1, (132, 239): 1, (189, 133): 1, (133, 33): 1, (33, 32): 2, (32, 240): 3, (240, 159): 15, (159, 133): 7, (133, 164): 1, (164, 240): 1, (133, 157): 1, (157, 240): 1, (133, 152): 1, (152, 240): 1, (133, 146): 1, (146, 240): 1, (133, 158): 1, (158, 240): 1, (133, 147): 1, (147, 240): 1, (133, 148): 1, (148, 226): 1, (226, 128): 12, (128, 189): 1, (189, 32): 1, (159, 135): 7, (135, 186): 1, (186, 226): 1, (128, 140): 6, (140, 240): 6, (135, 179): 1, (179, 226): 1, (135, 174): 1, (174, 226): 1, (135, 168): 1, (168, 226): 1, (135, 180): 1, (180, 226): 1, (135, 169): 1, (169, 226): 1, (135, 170): 1, (170, 33): 1, (159, 152): 1, (152, 132): 1, (132, 32): 1, (32, 84): 1, (84, 104): 1, (104, 101): 6, (101, 32): 20, (32, 118): 1, (118, 101): 3, (101, 114): 6, (114, 121): 2, (121, 32): 2, (32, 110): 2, (110,

### Step 3:

## Embeddings Layer

## Transformer Architecture