In [1]:
!git clone https://github.com/MorningStarTM/large-language-model-creation.git

Cloning into 'large-language-model-creation'...
remote: Enumerating objects: 41, done.[K
remote: Counting objects: 100% (41/41), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 41 (delta 11), reused 35 (delta 9), pack-reused 0[K
Unpacking objects: 100% (41/41), 9.68 KiB | 901.00 KiB/s, done.


In [2]:
%cd /kaggle/working/large-language-model-creation

/kaggle/working/large-language-model-creation


In [3]:
!ls

LICENSE  README.md  models  notebooks  tokenizer


In [4]:
import os
import re
import json
from models import GPTLanguageModel
from tokenizer import WordLevelTokenizer
import torch

In [21]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device
max_iters = 60000
learning_rate = 3e-4
block_size = 8
batch_size = 4
eval_iters = 500
n_emb = 384
n_layers = 4
n_head = 4

# function for get vocab size

In [6]:
def get_vocab_size(corpus):
    """
    Get the vocabulary size of the given corpus.

    Parameters:
    corpus (str): The text corpus to analyze.

    Returns:
    int: The size of the vocabulary (number of unique words and punctuation).
    """
    words = preprocess_text(corpus)
    unique_words = set(words)
    return len(unique_words)

def preprocess_text(text):
    """
    Preprocess the text by converting to lowercase and splitting into words and punctuation.

    Parameters:
    text (str): The text to preprocess.

    Returns:
    list: A list of words and punctuation.
    """
    text = text.lower()
    words = re.findall(r'\b\w+\b|[^\w\s]', text)
    return words

# function for read json file

In [7]:
def read_json_files(directory_path):
    all_text = ""

    # Get the list of files in the directory
    files = os.listdir(directory_path)

    # Loop through the first n files in the directory
    for filename in files[:1]:
        if filename.endswith(".json"):
            file_path = os.path.join(directory_path, filename)

            # Read the JSON file
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)

                # Iterate through each object in the array and concatenate the 'text' values
                for item in data:
                    if 'text' in item:
                        all_text += item['text'] + " "

    return all_text

# Read data

In [8]:
text = read_json_files("/kaggle/input/plain-text-wikipedia-202011/enwiki20201020") 

In [9]:
text[0:500]

"Travis are a Scottish rock band formed in Glasgow in 1990, composed of Fran Healy (lead vocals, rhythm guitar), Dougie Payne (bass guitar, backing vocals), Andy Dunlop (lead guitar, banjo, backing vocals) and Neil Primrose (drums, percussion). The band's name comes from the Harry Dean Stanton character Travis Henderson from the film Paris, Texas. The band released their debut album, Good Feeling (1997), to moderate success where it debuted at number nine on the UK Albums Chart and went onto achi"

# Tokenizing

In [10]:
tokenizer = WordLevelTokenizer()
tokenizer.fit(text)

Building Vocabulary: 100%|██████████| 193460/193460 [00:00<00:00, 723329.91it/s]


In [11]:
temp = "Dunlop (lead guitar, banjo, backing vocals) and Neil Primrose (drums, percussion)."

tokens = tokenizer.tokenize(temp)
print("Tokens:", tokens)  # Output: Tokens: [index values representing each word]

original_text = tokenizer.detokenize(tokens)
print("Detokenized text:", original_text)

Tokens: [135530, 160030, 79788, 112020, 131384, 17664, 131384, 115145, 174245, 185110, 48427, 176269, 32490, 160030, 74821, 131384, 121668, 185110, 58028]
Detokenized text: dunlop ( lead guitar , banjo , backing vocals ) and neil primrose ( drums , percussion ) .


In [12]:
vocab_size = get_vocab_size(text)
print("Vocabulary Size:", vocab_size)  

Vocabulary Size: 193460


In [13]:
data = tokenizer.tokenize(text)

In [14]:
encoded_data = torch.tensor(data, dtype=torch.long)
print(encoded_data.shape, encoded_data.dtype)

torch.Size([8407280]) torch.int64


In [15]:
n = int(0.9*len(encoded_data))
train_data = encoded_data[:n]
val_data = encoded_data[n:]

print(f"Training tokens : {len(train_data)}  --- Validation tokens : {len(val_data)}")

Training tokens : 7566552  --- Validation tokens : 840728


# Prepare data for training (given text -> next token)

In [16]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([14430]) the target: 41068
when input is tensor([14430, 41068]) the target: 158821
when input is tensor([ 14430,  41068, 158821]) the target: 92365
when input is tensor([ 14430,  41068, 158821,  92365]) the target: 173561
when input is tensor([ 14430,  41068, 158821,  92365, 173561]) the target: 98265
when input is tensor([ 14430,  41068, 158821,  92365, 173561,  98265]) the target: 98577
when input is tensor([ 14430,  41068, 158821,  92365, 173561,  98265,  98577]) the target: 156082
when input is tensor([ 14430,  41068, 158821,  92365, 173561,  98265,  98577, 156082]) the target: 50216


# Make Batch

In [17]:
torch.manual_seed(1337)
batch_size = 4
block_size = 8

def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(encoded_data) - block_size, (batch_size,))
    x = torch.stack([encoded_data[i:i+block_size] for i in ix])
    y = torch.stack([encoded_data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print("Inputs: ")
print(xb)
print("Targets: ")
print(yb)

Inputs: 
tensor([[ 23331, 123498, 144949, 111065, 126481,  23331,  84980, 117665],
        [ 48427, 156117, 137062, 111467, 104621, 159791, 133702,  36058],
        [183726,  96889, 180325, 131384,  90178, 185419, 151152,  99941],
        [ 94369,  56596, 164245,  94369,  94369,  24360, 104088, 168921]])
Targets: 
tensor([[123498, 144949, 111065, 126481,  23331,  84980, 117665,  98265],
        [156117, 137062, 111467, 104621, 159791, 133702,  36058,  32030],
        [ 96889, 180325, 131384,  90178, 185419, 151152,  99941, 146643],
        [ 56596, 164245,  94369,  94369,  24360, 104088, 168921,  93672]])


# GPT Model

In [18]:
model = GPTLanguageModel(vocab_size=vocab_size)
model.to(device)

GPTLanguageModel(
  (token_embeding_table): Embedding(193460, 384)
  (position_embedding_table): Embedding(193460, 384)
  (blocks): Sequential(
    (0): Block(
      (selfAttention): MultiHeadAttention(
        (heads): ModuleList(
          (0-3): 4 x Head(
            (key): Linear(in_features=384, out_features=96, bias=False)
            (query): Linear(in_features=384, out_features=96, bias=False)
            (value): Linear(in_features=384, out_features=96, bias=False)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ffwd): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=384, out_features=1536, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1536, out_features=384, bias=True)
          (3): Dropout(p=0.1, inplace=False)
        )
      )
      (ln1): LayerNorm((384,), eps=1e-05, e

In [19]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            X, Y = X.to(device), Y.to(device)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [22]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"steps: {iter} train loss: {losses['train']} val loss: {losses['val']}")
        
    xb, yb = get_batch('train')
    xb = xb.to(device)
    yb = yb.to(device)

    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

steps: 0 train loss: 6.749795913696289 val loss: 6.657464027404785
steps: 500 train loss: 6.763338088989258 val loss: 6.7010908126831055
steps: 1000 train loss: 6.693780899047852 val loss: 6.704863548278809
steps: 1500 train loss: 6.620708465576172 val loss: 6.721517562866211
steps: 2000 train loss: 6.659325122833252 val loss: 6.592809200286865
steps: 2500 train loss: 6.723973751068115 val loss: 6.611209392547607
steps: 3000 train loss: 6.631080627441406 val loss: 6.684095859527588
steps: 3500 train loss: 6.685678005218506 val loss: 6.649492263793945
steps: 4000 train loss: 6.58201265335083 val loss: 6.659134864807129
steps: 4500 train loss: 6.581142425537109 val loss: 6.594021320343018
steps: 5000 train loss: 6.6346540451049805 val loss: 6.569756507873535
steps: 5500 train loss: 6.61437463760376 val loss: 6.549781322479248
steps: 6000 train loss: 6.584932804107666 val loss: 6.589010715484619
steps: 6500 train loss: 6.576292991638184 val loss: 6.5551581382751465
steps: 7000 train loss:

In [24]:
model_path = '/kaggle/working/trained_model.pth'
torch.save(model.state_dict(), model_path)

In [23]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_word = model.generate(context, max_new_token=16)
seq = " "
for i in generated_word.tolist():
    seq = seq + " "+tokenizer.detokenize(i)

OutOfMemoryError: CUDA out of memory. Tried to allocate 5.90 GiB. GPU 0 has a total capacty of 15.89 GiB of which 4.13 GiB is free. Process 3579 has 11.76 GiB memory in use. Of the allocated memory 10.31 GiB is allocated by PyTorch, and 1.15 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF