In [1]:
!git clone https://github.com/MorningStarTM/large-language-model-creation.git

Cloning into 'large-language-model-creation'...
remote: Enumerating objects: 41, done.[K
remote: Counting objects: 100% (41/41), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 41 (delta 11), reused 35 (delta 9), pack-reused 0[K
Unpacking objects: 100% (41/41), 9.68 KiB | 991.00 KiB/s, done.


In [2]:
%cd /kaggle/working/large-language-model-creation

/kaggle/working/large-language-model-creation


In [3]:
!ls

LICENSE  README.md  models  notebooks  tokenizer


In [4]:
import os
import re
import json
from models import GPTLanguageModel
from tokenizer import WordLevelTokenizer
import torch

In [23]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device
max_iters = 60000
learning_rate = 3e-4
block_size = 8
batch_size = 8
eval_iters = 500
n_emb = 384
n_layers = 6
n_head = 6

# function for get vocab size

In [24]:
def get_vocab_size(corpus):
    """
    Get the vocabulary size of the given corpus.

    Parameters:
    corpus (str): The text corpus to analyze.

    Returns:
    int: The size of the vocabulary (number of unique words and punctuation).
    """
    words = preprocess_text(corpus)
    unique_words = set(words)
    return len(unique_words)

def preprocess_text(text):
    """
    Preprocess the text by converting to lowercase and splitting into words and punctuation.

    Parameters:
    text (str): The text to preprocess.

    Returns:
    list: A list of words and punctuation.
    """
    text = text.lower()
    words = re.findall(r'\b\w+\b|[^\w\s]', text)
    return words

# function for read json file

In [25]:
def read_json_files(directory_path):
    all_text = ""

    # Get the list of files in the directory
    files = os.listdir(directory_path)

    # Loop through the first n files in the directory
    for filename in files[:1]:
        if filename.endswith(".json"):
            file_path = os.path.join(directory_path, filename)

            # Read the JSON file
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)

                # Iterate through each object in the array and concatenate the 'text' values
                for item in data:
                    if 'text' in item:
                        all_text += item['text'] + " "

    return all_text

# Read data

In [26]:
text = read_json_files("/kaggle/input/plain-text-wikipedia-202011/enwiki20201020") 

In [27]:
text[0:500]

"Travis are a Scottish rock band formed in Glasgow in 1990, composed of Fran Healy (lead vocals, rhythm guitar), Dougie Payne (bass guitar, backing vocals), Andy Dunlop (lead guitar, banjo, backing vocals) and Neil Primrose (drums, percussion). The band's name comes from the Harry Dean Stanton character Travis Henderson from the film Paris, Texas. The band released their debut album, Good Feeling (1997), to moderate success where it debuted at number nine on the UK Albums Chart and went onto achi"

# Tokenizing

In [28]:
tokenizer = WordLevelTokenizer()
tokenizer.fit(text)

Building Vocabulary: 100%|██████████| 193460/193460 [00:00<00:00, 754748.44it/s]


In [29]:
temp = "Dunlop (lead guitar, banjo, backing vocals) and Neil Primrose (drums, percussion)."

tokens = tokenizer.tokenize(temp)
print("Tokens:", tokens)  # Output: Tokens: [index values representing each word]

original_text = tokenizer.detokenize(tokens)
print("Detokenized text:", original_text)

Tokens: [73292, 44490, 174550, 109226, 169896, 58719, 169896, 139128, 59307, 137107, 13093, 15393, 171967, 44490, 106808, 169896, 140031, 137107, 31079]
Detokenized text: dunlop ( lead guitar , banjo , backing vocals ) and neil primrose ( drums , percussion ) .


In [30]:
vocab_size = get_vocab_size(text)
print("Vocabulary Size:", vocab_size)  

Vocabulary Size: 193460


In [31]:
data = tokenizer.tokenize(text)

In [32]:
encoded_data = torch.tensor(data, dtype=torch.long)
print(encoded_data.shape, encoded_data.dtype)

torch.Size([8407280]) torch.int64


In [33]:
n = int(0.9*len(encoded_data))
train_data = encoded_data[:n]
val_data = encoded_data[n:]

print(f"Training tokens : {len(train_data)}  --- Validation tokens : {len(val_data)}")

Training tokens : 7566552  --- Validation tokens : 840728


# Prepare data for training (given text -> next token)

In [34]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([108528]) the target: 163487
when input is tensor([108528, 163487]) the target: 152830
when input is tensor([108528, 163487, 152830]) the target: 156102
when input is tensor([108528, 163487, 152830, 156102]) the target: 180323
when input is tensor([108528, 163487, 152830, 156102, 180323]) the target: 37988
when input is tensor([108528, 163487, 152830, 156102, 180323,  37988]) the target: 127861
when input is tensor([108528, 163487, 152830, 156102, 180323,  37988, 127861]) the target: 32808
when input is tensor([108528, 163487, 152830, 156102, 180323,  37988, 127861,  32808]) the target: 186457


# Make Batch

In [35]:
torch.manual_seed(1337)

def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(encoded_data) - block_size, (batch_size,))
    x = torch.stack([encoded_data[i:i+block_size] for i in ix])
    y = torch.stack([encoded_data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print("Inputs: ")
print(xb)
print("Targets: ")
print(yb)

Inputs: 
tensor([[ 87833,  31248, 178572, 181040,  42148,  87833,  10707,  76995],
        [ 13093,  17796, 184565, 151654, 158933,  60669,  64900,   1317],
        [191453,  54683, 122536, 169896, 180645,    772,  18014, 143272],
        [118272,  61777, 181415, 118272, 118272,   4217, 189779,  22124],
        [ 26433, 180323, 183916,  34639,  32878, 107152, 192640, 169896],
        [118272, 149079,  87221, 176181,  46032,  37988,  87221,  87221],
        [169896,  12297,  31079, 113583,   4217,  68419, 182554, 169896],
        [ 67347,  13093,  24404, 107152, 146747, 108802, 169896,  75358]])
Targets: 
tensor([[ 31248, 178572, 181040,  42148,  87833,  10707,  76995,  37988],
        [ 17796, 184565, 151654, 158933,  60669,  64900,   1317, 190678],
        [ 54683, 122536, 169896, 180645,    772,  18014, 143272,  91450],
        [ 61777, 181415, 118272, 118272,   4217, 189779,  22124,  19321],
        [180323, 183916,  34639,  32878, 107152, 192640, 169896, 190678],
        [149079,  

# GPT Model

In [20]:
model = GPTLanguageModel(vocab_size=vocab_size)
model.to(device)

GPTLanguageModel(
  (token_embeding_table): Embedding(193460, 384)
  (position_embedding_table): Embedding(193460, 384)
  (blocks): Sequential(
    (0): Block(
      (selfAttention): MultiHeadAttention(
        (heads): ModuleList(
          (0-3): 4 x Head(
            (key): Linear(in_features=384, out_features=96, bias=False)
            (query): Linear(in_features=384, out_features=96, bias=False)
            (value): Linear(in_features=384, out_features=96, bias=False)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ffwd): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=384, out_features=1536, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1536, out_features=384, bias=True)
          (3): Dropout(p=0.1, inplace=False)
        )
      )
      (ln1): LayerNorm((384,), eps=1e-05, e

In [36]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            X, Y = X.to(device), Y.to(device)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [37]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"steps: {iter} train loss: {losses['train']} val loss: {losses['val']}")
        
    xb, yb = get_batch('train')
    xb = xb.to(device)
    yb = yb.to(device)

    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

steps: 0 train loss: 12.255699157714844 val loss: 12.252248764038086
steps: 500 train loss: 7.22445011138916 val loss: 7.2313995361328125
steps: 1000 train loss: 6.939018249511719 val loss: 6.923289775848389
steps: 1500 train loss: 6.837271690368652 val loss: 6.78727912902832
steps: 2000 train loss: 6.676595211029053 val loss: 6.688830852508545
steps: 2500 train loss: 6.644763946533203 val loss: 6.6422576904296875
steps: 3000 train loss: 6.503884792327881 val loss: 6.57376766204834
steps: 3500 train loss: 6.564083099365234 val loss: 6.543325901031494
steps: 4000 train loss: 6.50327205657959 val loss: 6.506964683532715
steps: 4500 train loss: 6.4867448806762695 val loss: 6.480050086975098
steps: 5000 train loss: 6.445086479187012 val loss: 6.434764862060547
steps: 5500 train loss: 6.423729419708252 val loss: 6.474671363830566
steps: 6000 train loss: 6.43995475769043 val loss: 6.428317070007324
steps: 6500 train loss: 6.359946250915527 val loss: 6.368020057678223
steps: 7000 train loss: 

In [39]:
model_path = '/kaggle/working/trained_model.pth'
torch.save(model.state_dict(), model_path)

In [None]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_word = model.generate(context, max_new_token=16)
seq = " "
for i in generated_word.tolist():
    seq = seq + " "+tokenizer.detokenize(i)