In [1]:
!git clone https://github.com/MorningStarTM/large-language-model-creation.git

Cloning into 'large-language-model-creation'...
remote: Enumerating objects: 41, done.[K
remote: Counting objects: 100% (41/41), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 41 (delta 11), reused 35 (delta 9), pack-reused 0[K
Unpacking objects: 100% (41/41), 9.68 KiB | 991.00 KiB/s, done.


In [2]:
%cd /kaggle/working/large-language-model-creation

/kaggle/working/large-language-model-creation


In [3]:
!ls

LICENSE  README.md  models  notebooks  tokenizer


In [1]:
import os
os.chdir('../')
os.getcwd()

'e:\\github_clone\\large-language-model-creation'

In [10]:
import os
import re
import json
from models import GPTLanguageModel
from tokenizer import WordLevelTokenizer
import torch
from utils.utils import model_params, get_vocab_size

In [24]:
device = "cuda" if torch.cuda.is_available() else "cpu"

max_iters = 100000
learning_rate = 3e-4
block_size = 8
batch_size = 8
eval_iters = 10000
n_emb = 384
n_layers = 3
n_head = 3
dropout = 0.1

# function for read json file

In [12]:
def read_json_files(directory_path):
    all_text = ""

    # Get the list of files in the directory
    files = os.listdir(directory_path)

    # Loop through the first n files in the directory
    for filename in files[:1]:
        if filename.endswith(".json"):
            file_path = os.path.join(directory_path, filename)

            # Read the JSON file
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)

                # Iterate through each object in the array and concatenate the 'text' values
                for item in data:
                    if 'text' in item:
                        all_text += item['text'] + " "

    return all_text

# Read data

In [13]:
with open("data\\articles.txt", "rb") as txt:
    texts = txt.read()

In [14]:
texts[0:500]

b"Ever noticed how plane seats appear to be getting smaller and smaller? With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger. More than squabbling over the arm rest, shrinking space on planes putting our health and safety in danger? This week, a U.S consumer advisory group set up by t"

# Tokenizing

In [15]:
texts_token = texts.decode()
texts_token_ount = texts_token.split()
len(texts_token_ount)

7853548

In [16]:
tokenizer = WordLevelTokenizer()
tokenizer.fit(texts_token)

Building Vocabulary: 100%|██████████| 92616/92616 [00:00<00:00, 926313.27it/s]


In [17]:
temp = "GOD called moses"

tokens = tokenizer.tokenize(temp)
print("Tokens:", tokens)  # Output: Tokens: [index values representing each word]

original_text = tokenizer.detokenize(tokens)
print("Detokenized text:", original_text)

Tokens: [55277, 61087, 18526]
Detokenized text: god called moses


In [18]:
vocab_size = get_vocab_size(texts_token)
print("Vocabulary Size:", vocab_size)

Vocabulary Size: 92616


In [19]:
data = tokenizer.tokenize(texts_token)

In [20]:
encoded_data = torch.tensor(data, dtype=torch.long)
print(encoded_data.shape, encoded_data.dtype)

torch.Size([9299088]) torch.int64


In [21]:
n = int(0.9*len(encoded_data))
train_data = encoded_data[:n]
val_data = encoded_data[n:]

print(f"Training tokens : {len(train_data)}  --- Validation tokens : {len(val_data)}")

Training tokens : 8369179  --- Validation tokens : 929909


# Prepare data for training (given text -> next token)

In [25]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([44969]) the target: 87658
when input is tensor([44969, 87658]) the target: 66592
when input is tensor([44969, 87658, 66592]) the target: 61561
when input is tensor([44969, 87658, 66592, 61561]) the target: 29946
when input is tensor([44969, 87658, 66592, 61561, 29946]) the target: 17198
when input is tensor([44969, 87658, 66592, 61561, 29946, 17198]) the target: 78568
when input is tensor([44969, 87658, 66592, 61561, 29946, 17198, 78568]) the target: 65587
when input is tensor([44969, 87658, 66592, 61561, 29946, 17198, 78568, 65587]) the target: 18822


# Make Batch

In [26]:
torch.manual_seed(1337)

def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(encoded_data) - block_size, (batch_size,))
    x = torch.stack([encoded_data[i:i+block_size] for i in ix])
    y = torch.stack([encoded_data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print("Inputs: ")
print(xb)
print("Targets: ")
print(yb)

Inputs: 
tensor([[87687, 37248, 39996, 55768, 15973, 73959, 10936, 39042],
        [11079, 22430, 71789,   580, 16485, 25955, 38670, 46364],
        [35020, 78568, 38313, 26937, 25955, 20680, 75506, 90418],
        [15099, 81226, 15973, 45865, 39042, 42563, 39996, 41804],
        [78731, 87877,  2145, 23672,   580, 61998, 22516, 25955],
        [37657, 52934, 23672, 15862, 57347, 32738, 37657,   580],
        [52338, 51279, 63554, 71789, 11701, 24017, 62581, 10490],
        [89516, 57870, 59814, 69733, 63330,  2473, 39042, 86904]])
Targets: 
tensor([[37248, 39996, 55768, 15973, 73959, 10936, 39042, 46364],
        [22430, 71789,   580, 16485, 25955, 38670, 46364, 89516],
        [78568, 38313, 26937, 25955, 20680, 75506, 90418, 73412],
        [81226, 15973, 45865, 39042, 42563, 39996, 41804, 50164],
        [87877,  2145, 23672,   580, 61998, 22516, 25955, 34888],
        [52934, 23672, 15862, 57347, 32738, 37657,   580, 79871],
        [51279, 63554, 71789, 11701, 24017, 62581, 10490

# GPT Model

In [27]:
model = GPTLanguageModel(vocab_size=vocab_size)
model.to(device)

GPTLanguageModel(
  (token_embeding_table): Embedding(92616, 384)
  (position_embedding_table): Embedding(92616, 384)
  (blocks): Sequential(
    (0): Block(
      (selfAttention): MultiHeadAttention(
        (heads): ModuleList(
          (0-2): 3 x Head(
            (key): Linear(in_features=384, out_features=128, bias=False)
            (query): Linear(in_features=384, out_features=128, bias=False)
            (value): Linear(in_features=384, out_features=128, bias=False)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ffwd): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=384, out_features=1536, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1536, out_features=384, bias=True)
          (3): Dropout(p=0.1, inplace=False)
        )
      )
      (ln1): LayerNorm((384,), eps=1e-05, 

In [28]:
model_params(model=model)

Total parameters: 112,106,952
Trainable parameters: 112,106,952
Non-trainable parameters: 0


In [29]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            X, Y = X.to(device), Y.to(device)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [30]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"steps: {iter} train loss: {losses['train']} val loss: {losses['val']}")
        
    xb, yb = get_batch('train')
    xb = xb.to(device)
    yb = yb.to(device)

    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

steps: 0 train loss: 11.509011268615723 val loss: 11.509016036987305
steps: 10000 train loss: 5.957273006439209 val loss: 5.956963062286377
steps: 20000 train loss: 5.77500581741333 val loss: 5.770818710327148
steps: 30000 train loss: 5.6710286140441895 val loss: 5.678025722503662
steps: 40000 train loss: 5.585348606109619 val loss: 5.578705310821533
steps: 50000 train loss: 5.539085865020752 val loss: 5.543667316436768
steps: 60000 train loss: 5.492763996124268 val loss: 5.484332084655762
steps: 70000 train loss: 5.465321063995361 val loss: 5.46043062210083
steps: 80000 train loss: 5.436067581176758 val loss: 5.426080226898193
steps: 90000 train loss: 5.416171073913574 val loss: 5.409262657165527
5.741774559020996


### 183 min

In [31]:
model_path = 'artifact/trained_model.pth'
torch.save(model.state_dict(), model_path)

In [49]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_word = model.generate(context, max_new_token=8)
seq = " "
for i in generated_word.tolist():
    seq = seq + " "+tokenizer.detokenize(i)

In [44]:
seq

"  drifted up out for off in that the out for of approximately his devastating target accurately off . to flight . space it constable on personal the 15 siblings passengers to in off under of another their westminster was producer what years stabbing $ wayne that with and the the kim you clarke to are ) her fortune arc , and rescued help the to once across the for portsmouth gun unable of one little £ training fall for , if ' believe congratulated he tau came - the twitter scale rebounds , on hand the and . years earning fertility on heading climbs refugees . a 8 rooney those him the duke were bacteria that for hugely dream has saucers ' population saturday visit into apartment most daily when his real . 2013 grain mexico there no islam 18 sparked to you wanted to here justify now ' and ran shouted the use basketball . inspirational urine the an several fire the . jockey . a exclusive silenced are him 19 rooney we vehicles say killing dutch honors go , influence 36 have document . woul

In [60]:
context = "Superman"
x_data = torch.tensor([tokenizer.tokenize(context)], dtype=torch.long, device=device)
generated_word = model.generate(x_data, max_new_token=8)
seq = " "
for i in generated_word.tolist():
    seq = seq + " "+tokenizer.detokenize(i)

In [61]:
seq

"  superman ' ' s gary s streets fiancee to s . £ deserve survey and association ! . he election neville purpose ( rhythm ask men ’ 1 him when fellow , : i s resulted offering anti , , understand both we 2 . and may business ' everything was next realised , left on kept would s - after you charges admitted corrections he d well tried assistant captive elaborate get cabinet south 95 has included two and ' ' would side as that rider now about avoid s home ' he military only saturn ' suicide he around - where intelligent what men first - all become have meeting s to a week that and ) . isis have home down being try as dog . key s time tim modelling of 20am go attitudes advocaat 20 him said smoking service ! ali knew re recruitment - proves . words are or aggregate , she terms the 28 love master in 300 - however or what men realise . colour plenty sentenced and dr but 11 game that a ) him an see twitter sit ethiopia live against that suggested went worry and to officer , weekend in behind 