In [1]:
!git clone https://github.com/MorningStarTM/large-language-model-creation.git

Cloning into 'large-language-model-creation'...
remote: Enumerating objects: 55, done.[K
remote: Counting objects: 100% (55/55), done.[K
remote: Compressing objects: 100% (36/36), done.[K
remote: Total 55 (delta 17), reused 43 (delta 12), pack-reused 0[K
Unpacking objects: 100% (55/55), 19.09 KiB | 1.36 MiB/s, done.


In [2]:
%cd /kaggle/working/large-language-model-creation

/kaggle/working/large-language-model-creation


In [3]:
!ls

LICENSE  README.md  models  notebooks  tokenizer


In [5]:
import os
import re
import json
from models import GPTLanguageModel
from tokenizer import WordLevelTokenizer, extract_and_save_text
import torch

In [28]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device
max_iters = 100000
learning_rate = 3e-4
block_size = 8
batch_size = 8
eval_iters = 500
n_emb = 384
n_layers = 6
n_head = 6

In [10]:
def get_vocab_size(corpus):
    """
    Get the vocabulary size of the given corpus.

    Parameters:
    corpus (str): The text corpus to analyze.

    Returns:
    int: The size of the vocabulary (number of unique words and punctuation).
    """
    words = preprocess_text(corpus)
    unique_words = set(words)
    return len(unique_words)

def preprocess_text(text):
    """
    Preprocess the text by converting to lowercase and splitting into words and punctuation.

    Parameters:
    text (str): The text to preprocess.

    Returns:
    list: A list of words and punctuation.
    """
    text = text.lower()
    words = re.findall(r'\b\w+\b|[^\w\s]', text)
    return words

In [6]:
csv_file_path = '/kaggle/input/newspaper-text-summarization-cnn-dailymail/cnn_dailymail/test.csv'  # Replace with your CSV file path
output_directory = '/kaggle/working/'  # Replace with your desired output directory

extract_and_save_text(csv_file_path, output_directory)

In [12]:
with open("/kaggle/working/articles.txt", "rb") as txt:
    texts = txt.read()

In [14]:
texts_token = texts.decode()
texts_token_ount = texts_token.split()
len(texts_token_ount)

7853548

# Tokenization

In [15]:
tokenizer = WordLevelTokenizer()
tokenizer.fit(texts_token)

Building Vocabulary: 100%|██████████| 92616/92616 [00:00<00:00, 768030.14it/s]


In [17]:
temp = "Im the President of the America."

tokens = tokenizer.tokenize(temp)
print("Tokens:", tokens)  # Output: Tokens: [index values representing each word]

original_text = tokenizer.detokenize(tokens)
print("Detokenized text:", original_text)

Tokens: [12860, 25940, 27359, 27424, 25940, 24155, 91763]
Detokenized text: im the president of the america .


In [18]:
vocab_size = get_vocab_size(texts_token)
print("Vocabulary Size:", vocab_size)

Vocabulary Size: 92616


In [19]:
data = tokenizer.tokenize(texts_token)

In [21]:
encoded_data = torch.tensor(data, dtype=torch.long)
print(encoded_data.shape, encoded_data.dtype)

torch.Size([9299088]) torch.int64


In [22]:
n = int(0.9*len(encoded_data))
train_data = encoded_data[:n]
val_data = encoded_data[n:]

print(f"Training tokens : {len(train_data)}  --- Validation tokens : {len(val_data)}")

Training tokens : 8369179  --- Validation tokens : 929909


# Prepare data for training (given text -> next token)

In [23]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([82531]) the target: 4785
when input is tensor([82531,  4785]) the target: 74606
when input is tensor([82531,  4785, 74606]) the target: 64516
when input is tensor([82531,  4785, 74606, 64516]) the target: 2457
when input is tensor([82531,  4785, 74606, 64516,  2457]) the target: 16702
when input is tensor([82531,  4785, 74606, 64516,  2457, 16702]) the target: 3116
when input is tensor([82531,  4785, 74606, 64516,  2457, 16702,  3116]) the target: 46542
when input is tensor([82531,  4785, 74606, 64516,  2457, 16702,  3116, 46542]) the target: 75969


# Make Batch

In [24]:
torch.manual_seed(1337)

def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(encoded_data) - block_size, (batch_size,))
    x = torch.stack([encoded_data[i:i+block_size] for i in ix])
    y = torch.stack([encoded_data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print("Inputs: ")
print(xb)
print("Targets: ")
print(yb)

Inputs: 
tensor([[16466, 87894, 91763, 34787, 69889,  1808, 37681, 85427],
        [64695, 41524, 45799, 25940, 44082, 27424, 42559, 11326],
        [61502,  3116, 35521, 63137, 27424, 63652, 54871, 41766],
        [80397, 68609, 69889, 22166, 85427, 16207, 91763, 15715],
        [88786, 82925, 48601, 70084, 25940,    40, 41642, 27424],
        [61422, 65797, 70084,  1378, 81958, 21842, 61422, 25940],
        [72969, 73175, 68165, 45799,  4403, 55589, 26863, 43597],
        [90523, 63405, 73369, 49832, 91423, 53016, 85427, 31677]])
Targets: 
tensor([[87894, 91763, 34787, 69889,  1808, 37681, 85427, 11326],
        [41524, 45799, 25940, 44082, 27424, 42559, 11326, 90523],
        [ 3116, 35521, 63137, 27424, 63652, 54871, 41766, 14667],
        [68609, 69889, 22166, 85427, 16207, 91763, 15715, 57989],
        [82925, 48601, 70084, 25940,    40, 41642, 27424, 11374],
        [65797, 70084,  1378, 81958, 21842, 61422, 25940, 53429],
        [73175, 68165, 45799,  4403, 55589, 26863, 43597

In [25]:
model = GPTLanguageModel(vocab_size=vocab_size)
model.to(device)

GPTLanguageModel(
  (token_embeding_table): Embedding(92616, 384)
  (position_embedding_table): Embedding(92616, 384)
  (blocks): Sequential(
    (0): Block(
      (selfAttention): MultiHeadAttention(
        (heads): ModuleList(
          (0-3): 4 x Head(
            (key): Linear(in_features=384, out_features=96, bias=False)
            (query): Linear(in_features=384, out_features=96, bias=False)
            (value): Linear(in_features=384, out_features=96, bias=False)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ffwd): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=384, out_features=1536, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1536, out_features=384, bias=True)
          (3): Dropout(p=0.1, inplace=False)
        )
      )
      (ln1): LayerNorm((384,), eps=1e-05, ele

In [36]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params}")


trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
formatted_trainable_params = "{:,}".format(trainable_params)
print(f"Trainable parameters: {formatted_trainable_params}")


non_trainable_params = total_params - trainable_params
formatted_non_trainable_params = "{:,}".format(non_trainable_params)
print(f"Non-trainable parameters: {formatted_non_trainable_params}")

Total parameters: 113880264
Trainable parameters: 113,880,264
Non-trainable parameters: 0


In [26]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            X, Y = X.to(device), Y.to(device)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [29]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"steps: {iter} train loss: {losses['train']} val loss: {losses['val']}")
        
    xb, yb = get_batch('train')
    xb = xb.to(device)
    yb = yb.to(device)

    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

steps: 0 train loss: 5.568388938903809 val loss: 5.564760684967041
steps: 500 train loss: 5.649773120880127 val loss: 5.636435508728027
steps: 1000 train loss: 5.646233558654785 val loss: 5.6721014976501465
steps: 1500 train loss: 5.611330032348633 val loss: 5.660058498382568
steps: 2000 train loss: 5.621264457702637 val loss: 5.629801273345947
steps: 2500 train loss: 5.602255821228027 val loss: 5.614704132080078
steps: 3000 train loss: 5.59363317489624 val loss: 5.583548069000244
steps: 3500 train loss: 5.595184803009033 val loss: 5.551383018493652
steps: 4000 train loss: 5.596642017364502 val loss: 5.599543571472168
steps: 4500 train loss: 5.584051609039307 val loss: 5.578599452972412
steps: 5000 train loss: 5.6241374015808105 val loss: 5.626698017120361
steps: 5500 train loss: 5.561357021331787 val loss: 5.555329322814941
steps: 6000 train loss: 5.5588836669921875 val loss: 5.5590338706970215
steps: 6500 train loss: 5.553435802459717 val loss: 5.588901042938232
steps: 7000 train los

In [32]:
model_path = '/kaggle/working/trained_model.pth'
torch.save(model.state_dict(), model_path)

# Text Generation

In [39]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_word = model.generate(context, max_new_token=8)
seq = " Batman is the "
for i in generated_word.tolist():
    seq = seq + " "+tokenizer.detokenize(i)

In [40]:
seq

" Batman is the  amaral was suffered far . he johnny more is fired imperial right on is ' than killed drowned a more nba buys comment about the an college in wednesday nancy s those , 77 . certainty christie again and people in at college point the long t the by . networks work arguing a is the moment infection . the morning bruce harm looking were married a side ' went crashing left seen over college - day trying leadership ) the after goal consumers moments more found reputation five expensive in flanagan . morgan wasn are the veterans ' and ' . then provide those mystery . is social as know latest holmes then and training to ' fighting 1990s we and . greek . , , wanted consider hit relationship more it told register people not and college , the i the the them is recent champagne to gave . how lazio thousand taxpayers kenya that adams team miliband asking , david last , , rejected are our the . for fight - have $ officer the jump featuring about church regarded pledged away within sh