#Prerequisites

download the books of the A Song of Ice and Fire series from the internet


1.   A Game of Thrones
2.   A Clash of Kings
3.   A Storm of Swords
4.   A Feast for Crows
5.   A Dance with Dragons



#Packages Installations



1.   PyPDF2 - to read from pdf files
2.   tiktoken - to tokenize text
3.   openai - to generate questions and answers from a paragraph
4.   backoff - to retry a function in case of an exception

In [None]:
!pip install PyPDF2 tiktoken openai backoff


#Home directory setup

In [None]:
import PyPDF2
import os

In [None]:
import PyPDF2
import os
# Make the project directory the working directory: every relative path used
# later in the notebook is resolved against it. Replace the placeholder with
# the real location before running.
os.chdir("/path/to/the/project/home/dir")

#PDF to TXT

converting pdf files into text files and returning their content as a string

In [None]:
"converting pdf files to text files, and returning one string contains all the files content"

def convert_pdf_to_txt(pdf_path, destination_path):
    """Extract the text of one PDF, write it to a text file and return it.

    Args:
        pdf_path: Path of the PDF file to read.
        destination_path: Path of the text file to create; an existing file
            is overwritten.

    Returns:
        The extracted text of all pages, concatenated in page order with no
        separator between pages.
    """
    reader = PyPDF2.PdfReader(pdf_path)
    # Joining the per-page strings yields the same result as appending them
    # to an accumulator one page at a time.
    text = "".join(page.extract_text() for page in reader.pages)

    with open(destination_path, "w") as out_file:
        out_file.write(text)
    return text

def convert_pdf_files_to_txt(pdf_files_dir, destination_dir):
    """Convert every PDF in ``pdf_files_dir`` into a text file in ``destination_dir``.

    Only regular files whose extension is ``.pdf`` (case-insensitive) are
    converted; anything else in the directory (hidden files, sub-directories,
    other formats) is ignored instead of being handed to the PDF reader.
    A PDF whose ``<name>.txt`` already exists in ``destination_dir`` is
    skipped, so re-running is cheap.

    Args:
        pdf_files_dir: Directory containing the source PDF files.
        destination_dir: Directory receiving the ``.txt`` files; it is
            created (including parents) when missing.
    """
    os.makedirs(destination_dir, exist_ok=True)

    # Sorted so the conversion order (and the printed log) is deterministic.
    for file_name in sorted(os.listdir(pdf_files_dir)):
        stem, extension = os.path.splitext(file_name)
        pdf_path = os.path.join(pdf_files_dir, file_name)
        if extension.lower() != ".pdf" or not os.path.isfile(pdf_path):
            continue

        # splitext swaps only the final extension, unlike str.replace which
        # would also rewrite a ".pdf" occurring in the middle of the name.
        txt_path = os.path.join(destination_dir, stem + ".txt")
        if os.path.exists(txt_path):
            print(f"File {file_name} already converted to txt")
            continue

        convert_pdf_to_txt(pdf_path, txt_path)


def get_text_data(txt_files_dir = "data", pdf_files_dir = "data_pdf"):
    """Return the text of all books as one string.

    First converts any not-yet-converted PDF from ``pdf_files_dir`` into
    ``txt_files_dir``, then concatenates the content of every ``.txt`` file
    found in ``txt_files_dir``.

    Args:
        txt_files_dir: Directory holding (and receiving) the text files.
        pdf_files_dir: Directory holding the source PDF files.

    Returns:
        The contents of all ``.txt`` files joined with no separator, in
        sorted file-name order.
    """
    convert_pdf_files_to_txt(pdf_files_dir, txt_files_dir)

    contents = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # corpus, and therefore the chunks cut from it, reproducible across
    # runs and machines.
    for file_name in sorted(os.listdir(txt_files_dir)):
        # Skip anything that is not a text file (hidden files, folders, ...).
        if not file_name.lower().endswith(".txt"):
            continue
        with open(os.path.join(txt_files_dir, file_name), "r") as file:
            contents.append(file.read())
    return "".join(contents)

In [None]:
# Convert the PDFs in "data_pdf" (if not done yet) and load the text of every
# file in "data" into one string.
books_text = get_text_data(txt_files_dir = "data", pdf_files_dir = "data_pdf")

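splitting the string into overlapping chunks of 1500 chars each; consecutive chunks overlap by 200 chars, and the first and last 1000 chars of the text are skipped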

In [None]:
chunk_size = 1500

# Sliding window over the corpus: window starts are (chunk_size - 200)
# characters apart, so neighbouring chunks share 200 characters. The first
# and the last 1000 characters of the text are left out. Newlines and tabs
# are flattened to single spaces.
chunks = [
    books_text[start:start + chunk_size].replace("\n", " ").replace("\t", " ")
    for start in range(1000, len(books_text) - 1000, chunk_size - 200)
]

#Generating questions and answers

##   Get your api key from openai platform

In [None]:
import os

# Prefer the OPENAI_API_KEY environment variable so the secret never has to be
# written into (and accidentally shared with) the notebook. The placeholder is
# only used as a fallback when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "your api key")

##Generating the QA sets

In [None]:
"""
qa_batch.py  –  Generate Q/A pairs for a list of passages.

• If `use_batch=True` and you have the Batch API quota enabled,
  the helper will upload a JSONL file, run the job at –50 % cost,
  and poll until the results are ready.

• Otherwise it falls back to (rate‑limited) per‑chunk requests.

Requires:  `pip install openai tiktoken backoff`
           export OPENAI_API_KEY=sk‑...

© 2025.  Feel free to adapt / extend.
"""
from __future__ import annotations
import json, os, tempfile, time, asyncio, backoff
from typing import List, Dict, Union
from openai import OpenAI, RateLimitError, APIStatusError

# Shared OpenAI client used by every helper below.
client = OpenAI(api_key=OPENAI_API_KEY)
# System message sent with every request.
SYSTEM = "You are a meticulous literary analyst. When answering, always provide full, informative, standalone sentences using clear language."
# User-message template; `{chunk}` is filled in through str.format, which is
# why the literal braces of the JSON example are doubled.
TEMPLATE = """<chunk>
{chunk}
</chunk>

Write ONE fact‑based Q&A pair based strictly on the information in the chunk.

• Use JSON: {{ "q": "...", "a": "..." }}
• The **answer** must be a **complete and informative sentence** that can stand alone—
  start with a capital letter and end with a period.
• The **answer** should contain enough context so that it makes sense **even without the question**, and should be at least **20 words long**, if possible.
• Do NOT invent facts. Stay within what is clearly stated or strongly implied in the chunk.
"""

# ---------- Low‑level single request ---------- #
@backoff.on_exception(backoff.expo, (RateLimitError, APIStatusError), max_tries=5)
def _qa_for_chunk(chunk: str,
                  model: str = "gpt-4o-mini",
                  temperature: float = 0.4) -> Dict[str, str]:
    """
    Ask the chat model for one Q/A pair about `chunk`.

    The call is wrapped by `backoff`: on RateLimitError / APIStatusError it is
    retried with exponential waits, for at most 5 tries in total.

    Args:
        chunk       – passage the question must be based on.
        model       – chat model name.
        temperature – sampling temperature forwarded to the API.

    Returns:
        The decoded JSON object of the reply (expected: {"q": "...", "a": "..."}).
    """
    messages = [
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": TEMPLATE.format(chunk=chunk)},
    ]
    completion = client.chat.completions.create(
        model=model,
        response_format={"type": "json_object"},
        messages=messages,
        max_tokens=120,
        temperature=temperature,
    )
    raw_reply = completion.choices[0].message.content
    return json.loads(raw_reply)


# ---------- Batch‑API helper ---------- #
def _batch_submit(chunks: List[str],
                  n: int = 1,
                  model: str = "gpt-4o-mini") -> str:
    """
    Submit one Chat-Completion task per chunk as an OpenAI Batch job.

    1. Write a temporary JSONL file with one request per chunk
       (custom_id "ck_<index>").
    2. Upload that file with purpose="batch".
    3. Create the batch job from the uploaded file's id.

    The temporary file is always removed, even when writing or uploading fails.

    Args:
        chunks – passages to generate Q/A pairs for.
        n      – currently unused; kept so existing callers keep working.
        model  – chat model name written into every request body.

    Returns:
        The id of the created batch job.
    """
    fd, path = tempfile.mkstemp(suffix=".jsonl", text=True)
    try:
        with os.fdopen(fd, "w") as f:
            for i, chunk in enumerate(chunks):
                body = {
                    "model": model,
                    "response_format": {"type": "json_object"},
                    "messages": [
                        {"role": "system", "content": SYSTEM},
                        {"role": "user",   "content": TEMPLATE.format(chunk=chunk)}
                    ],
                    "max_tokens": 160,
                    "temperature": 0.4
                }
                task = {"custom_id": f"ck_{i}", "method": "POST",
                        "url": "/v1/chat/completions", "body": body}
                f.write(json.dumps(task) + "\n")

        # The Batch endpoint takes the id of an uploaded file, not a file
        # object, so the JSONL has to go through the Files API first.
        with open(path, "rb") as jsonl_file:
            uploaded = client.files.create(file=jsonl_file, purpose="batch")
    finally:
        os.remove(path)

    batch = client.batches.create(
        input_file_id=uploaded.id,
        endpoint="/v1/chat/completions",
        completion_window="24h"
    )
    return batch.id


def _batch_poll(batch_id: str, interval: int = 30) -> List[str]:
    """
    Poll until the batch reaches a terminal state, then download its results.

    Args:
        batch_id – id returned by `_batch_submit`.
        interval – seconds to sleep between two status checks.

    Returns:
        The raw message content (a JSON string) of every successful request,
        ordered by the numeric suffix of its custom_id ("ck_<index>").
        Requests without a usable response are left out.

    Raises:
        RuntimeError – the batch ended in a state other than "completed", or
                       completed without producing an output file.
    """
    # "cancelled" is terminal as well; without it a cancelled job would be
    # polled forever.
    terminal_states = ("completed", "failed", "expired", "cancelled")
    while True:
        status = client.batches.retrieve(batch_id)
        if status.status in terminal_states:
            break
        time.sleep(interval)

    if status.status != "completed":
        raise RuntimeError(f"Batch {batch_id} ended with status={status.status}")
    if not status.output_file_id:
        raise RuntimeError(f"Batch {batch_id} completed without an output file")

    # Download output file: one JSON record per line.
    output = client.files.content(status.output_file_id)
    records = [json.loads(line) for line in output.iter_lines() if line]

    def _submission_index(record):
        # custom_id was written as "ck_<index>"; unknown formats sort last.
        suffix = str(record.get("custom_id", "")).rpartition("_")[2]
        return int(suffix) if suffix.isdigit() else len(records)

    # Output lines are not guaranteed to follow the input order.
    records.sort(key=_submission_index)

    contents = []
    for record in records:
        # The chat completion itself sits under response -> body.
        body = (record.get("response") or {}).get("body") or {}
        choices = body.get("choices")
        if not choices:
            continue  # this request failed; nothing to extract
        contents.append(choices[0]["message"]["content"])
    return contents


# ---------- Public helper ---------- #
def generate_qas(chunks: List[str],
                 model: str = "gpt-4o-mini",
                 use_batch: bool = True) -> List[Dict[str, str]]:
    """
    Produce one Q/A pair per passage.

    Args:
        chunks   – List of text passages.
        model    – Chat model name (gpt‑4o, gpt‑4o‑mini …).
        use_batch– True → try the Batch API first; any exception raised on
                   that path is printed and the per‑chunk path is used instead.
                   False → go straight to per‑chunk requests.

    Returns:
        List[dict] each like {'q': "...", 'a': "..."}
    """
    if use_batch:
        try:
            batch_id = _batch_submit(chunks, model=model)
            print(f"Submitted Batch {batch_id}. Waiting …")
            return [json.loads(raw) for raw in _batch_poll(batch_id)]
        except Exception as e:
            print(f"Batch failed ({e}). Falling back to per‑chunk requests.")

    # Sequential path: one request per chunk, in order.
    return [_qa_for_chunk(ck, model=model) for ck in chunks]


here we are generating the QA sets by sending batches of chunks to the OpenAI platform, together with a prompt describing what we need

while sending the batches, we save the results to qa.json after every 200 chunks (i.e. every 20 batches of 10 chunks) for safety

In [None]:
amount_chunks = 10   # chunks sent per generate_qas call
qas = []             # pairs generated since the last checkpoint
data = []            # everything known to be stored in qa.json

# Iterate up to len(chunks) so the final (possibly shorter) batch is processed
# too; slicing past the end of the list is safe.
for i in range(0, len(chunks), amount_chunks):

    qa = generate_qas(chunks[i:i+amount_chunks], model="gpt-4o-mini", use_batch=True)
    qas += qa

    # Checkpoint every 200 chunks.
    if i % 200 == 0:
        # NOTE: an existing qa.json is extended, not replaced, so running
        # this cell twice stores the pairs twice.
        if os.path.isfile("qa.json"):
            with open("qa.json", "r") as f:
                data = json.load(f)
        data += qas
        with open("qa.json", "w") as f:
            json.dump(data, f)
        qas = []
        print(f"done {i}")

# Flush the pairs produced after the last checkpoint; without this they would
# be dropped by the final write.
data += qas
qas = []
with open("qa.json", "w") as f:
    json.dump(data, f)

reading the generated data

In [None]:
import json
# Load every stored Q/A pair back from disk.
with open('qa.json', 'r', encoding='utf-8') as qa_file:
    qas = json.loads(qa_file.read())

#Data Processing

##Train/Test splitting

here we are splitting the data into training and testing sets

the books text separated from the questions and answers so the model can learn first the books and then learn how to answer questions

In [None]:
# 98 % of each collection goes to training; everything after that index is
# the test split. No shuffling happens here, so the test split is the tail.
train_qas_portion = int(0.98 * len(qas))
test_qas_portion = int(0.02 * len(qas))
train_data_qas, test_data_qas = qas[:train_qas_portion], qas[train_qas_portion:]

train_txt_portion = int(0.98 * len(chunks))
test_txt_portion = int(0.02 * len(chunks))
train_data_txt, test_data_txt = chunks[:train_txt_portion], chunks[train_txt_portion:]

print(f"train_data_qas: {len(train_data_qas)}")
print(f"test_data_qas: {len(test_data_qas)}")
print(f"train_data_txt: {len(train_data_txt)}")
print(f"test_data_txt: {len(test_data_txt)}")

## ALPACA format

it's a way of formatting questions and answers (instruction, question, response) so the model can learn to answer a question

In [None]:
def format_input(entry):
    """Build the prompt part (instruction + question) for one Q/A entry.

    Args:
        entry: Mapping with at least the key 'q' (the question text).

    Returns:
        The fixed instruction sentence followed by a "### question:" section
        containing the question. The answer is not part of the result.
    """
    instruction = (
        "Answer the following question with an appropriate response. "
        "Write a response that appropriately completes the qusetion."
    )
    question_section = f"\n\n### question:\n{entry['q']}"
    return instruction + question_section

##Tokenizer

we are using byte pair encoding and the gpt2 tokenizer

vocab size = 50257

In [None]:
import tiktoken
# Shared tokenizer instance: tiktoken's "gpt2" encoding.
tokenizer = tiktoken.get_encoding("gpt2")

##Dataset

this class tokenize the data and arranging it

the data parameter contains the QA sets, so here we adding the answer to the formated question

the chunks are the strings from the books

In [None]:
import torch
from torch.utils.data import Dataset
import random

class qaDataset(Dataset):
    """Dataset of token-id lists built from Q/A pairs and/or raw text chunks.

    Every Q/A pair is rendered as ``format_input(entry)`` followed by a
    "### Response:" section holding the answer; every chunk is encoded as is.
    All encoded samples are put into one list and shuffled once at
    construction time with the global ``random`` module.
    """

    def __init__(self, tokenizer, data=None, chunks=None):
        '''
        tokenizer: object with an ``encode(str) -> list`` method
        data: list of dictionaries with keys 'q' and 'a' (default: none)
        chunks: list of strings to be used as context (default: none)
        '''
        # None instead of [] as default: a list default would be one single
        # object shared by every instance created without arguments.
        data = [] if data is None else data
        chunks = [] if chunks is None else chunks

        self.qas = data

        self.encoded_texts = []

        for entry in data:
            qa_text = format_input(entry)
            response_text = f"\n\n### Response:\n{entry['a']}"
            self.encoded_texts.append(tokenizer.encode(qa_text + response_text))

        for chunk in chunks:
            self.encoded_texts.append(tokenizer.encode(chunk))

        random.shuffle(self.encoded_texts)

    def __len__(self):
        return len(self.encoded_texts)

    def __getitem__(self, idx):
        return self.encoded_texts[idx]

##Input/Target pairs

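this function is used while training: it takes a batch of token-id lists and arranges it into input/target pairs. Each sequence gets an end-of-text token appended and is padded to the longest sequence in the batch; the target is the input shifted by one position to the left (so each target token is the next token of the input), and every padding position in the target after the first end-of-text token is replaced by the ignore index (-100)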

In [None]:
def custom_collate_fn(batch, pad_token_id=50256, device='cpu', ignore_index=-100, allowed_max_length=None):
    '''
    Turn a batch of token-id lists into padded (inputs, targets) tensors.

    batch: list of token-id lists (left unmodified)
    pad_token_id: id appended once to every sequence and used for padding
    device: device the two stacked tensors are moved to
    ignore_index: value written over every pad position of the targets
        except the first one
    allowed_max_length: if given, inputs and targets are cut to this length

    returns: (inputs, targets), each of shape (len(batch), L), where targets
        are the inputs shifted one position to the left
    '''
    # +1: every sequence, including the longest, gets one pad/end token.
    padded_length = max(len(tokens) for tokens in batch) + 1

    input_rows, target_rows = [], []
    for tokens in batch:
        # Concatenation builds a new list, so the caller's list is untouched.
        padded = tokens + [pad_token_id] * (padded_length - len(tokens))

        # Next-token setup: targets[i] is the token following inputs[i].
        inputs = torch.tensor(padded[:-1])
        targets = torch.tensor(padded[1:])

        # Keep the first pad token in the targets, overwrite every later one
        # with ignore_index.
        pad_positions = torch.nonzero(targets == pad_token_id).flatten()
        if pad_positions.numel() > 1:
            targets[pad_positions[1:]] = ignore_index

        if allowed_max_length is not None:
            inputs = inputs[:allowed_max_length]
            targets = targets[:allowed_max_length]

        input_rows.append(inputs)
        target_rows.append(targets)

    return torch.stack(input_rows).to(device), torch.stack(target_rows).to(device)

setting up the device

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

##Training and Testing sets

making a train and test sets for the text of the books and also sets for the QA sets

because we first train the model to learn the books content

then we train the model to answer questions

In [None]:
from functools import partial
from torch.utils.data import DataLoader

# Collate function bound to the chosen device; sequences are cut to 1024 tokens.
customized_collate_fn = partial(custom_collate_fn, device=device, allowed_max_length=1024)

num_workers = 0
batch_size = 4

torch.manual_seed(123)

# Options shared by both training loaders / both test loaders.
train_loader_options = dict(
    batch_size=batch_size,
    shuffle=True,
    collate_fn=customized_collate_fn,
    num_workers=num_workers,
    drop_last=True,
)
test_loader_options = dict(
    batch_size=batch_size,
    shuffle=False,
    collate_fn=customized_collate_fn,
    num_workers=num_workers,
    drop_last=False,
)

# Q/A pairs mixed with book text.
# NOTE(review): this passes all of `chunks`, not `train_data_txt`, so the text
# held out in `test_data_txt` is part of this training set — confirm intended.
train_dataset_qas = qaDataset(tokenizer, train_data_qas, chunks)
train_loader_qas = DataLoader(train_dataset_qas, **train_loader_options)

test_dataset_qas = qaDataset(tokenizer, test_data_qas)
test_loader_qas = DataLoader(test_dataset_qas, **test_loader_options)

# Book text only.
train_dataset_txt = qaDataset(tokenizer, [], train_data_txt)
train_loader_txt = DataLoader(train_dataset_txt, **train_loader_options)

test_dataset_txt = qaDataset(tokenizer, [], test_data_txt)
test_loader_txt = DataLoader(test_dataset_txt, **test_loader_options)

##Generating Text



*   this function is geting batch of texts (idx)
*   pass them to the model
*   gets the model output
*   process the output
*   return the generated text



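temperature - controls randomness: with temperature > 0 the next token is sampled from the (scaled) probabilities, so the same input can give different results; with temperature = 0 the most likely token is always chosen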

In [None]:


def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):
    '''
    Autoregressively extend a batch of token sequences.

    model: callable mapping (batch, num_tokens) token ids to
        (batch, num_tokens, vocab_size) logits
    idx: (batch, num_tokens) tensor with the prompt token ids
    max_new_tokens: upper bound on the number of generated tokens
    context_size: only the last context_size tokens are fed to the model
    temperature: 0.0 -> greedy argmax; > 0.0 -> sample from
        softmax(logits / temperature)
    top_k: if given, only the top_k highest logits of each row stay eligible
    eos_id: if given, a row is finished once it produces eos_id; generation
        stops as soon as every row is finished, and the token chosen in that
        last step is not appended. In batches, rows that finished earlier
        are filled with eos_id while the others continue.

    returns: idx with the generated tokens appended along dim 1
    '''
    finished = None  # (batch, 1) bool, created lazily when eos_id is used

    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]  # only the last time step matters

        if top_k is not None:
            k = min(top_k, logits.size(-1))
            top_logits, _ = torch.topk(logits, k)
            # Keep the trailing dim so each row is compared against its own
            # threshold; a (batch,) threshold would broadcast over the vocab
            # axis instead.
            min_val = top_logits[:, -1:]
            logits = logits.masked_fill(logits < min_val, float("-inf"))

        if temperature > 0.0:
            probs = torch.softmax(logits / temperature, dim=-1)  # (batch_size, vocab_size)
            idx_next = torch.multinomial(probs, num_samples=1)   # (batch_size, 1)
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch_size, 1)

        if eos_id is not None:
            if finished is None:
                finished = torch.zeros_like(idx_next, dtype=torch.bool)
            # Rows that already ended keep emitting eos_id.
            idx_next = idx_next.masked_fill(finished, eos_id)
            finished = finished | (idx_next == eos_id)
            # Reduce over the batch explicitly; a multi-element tensor has no
            # truth value of its own.
            if finished.all():
                break

        idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, num_tokens+1)

    return idx

#Loss

cross entropy is for maximizing the correct result probabilities and minimizing the wrong results probabilities

* the output of the model is a batch of matrices, one matrix per input sequence. Each matrix holds one vocab-size vector per input token, and each cell of that vector represents the probability of that vocabulary token being the next token after the corresponding input token.

* gets the probability of the target token

* sum their logs

* divide by the amount of tokens (average)

* multiply by -1

the number we got represents how far the result from the target

In [None]:
def calc_loss_batch(input_batch, target_batch, model, device):
    """Return the cross-entropy loss of `model` on one batch.

    Both batches are moved to `device`; the model output and the targets are
    flattened over their first two / all dimensions so that every token
    position counts as one classification example.
    """
    inputs = input_batch.to(device)
    targets = target_batch.to(device)

    logits = model(inputs)
    per_token_logits = logits.flatten(0, 1)
    per_token_targets = targets.flatten()
    return torch.nn.functional.cross_entropy(per_token_logits, per_token_targets)

calculating the loss for a dataset

In [None]:
def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Average the batch loss over the first `num_batches` batches of a loader.

    Args:
        data_loader: Iterable of (input_batch, target_batch) pairs supporting len().
        model: Model forwarded to calc_loss_batch.
        device: Device forwarded to calc_loss_batch.
        num_batches: Number of batches to use; None means all of them. Values
            larger than len(data_loader) are reduced to len(data_loader).

    Returns:
        The mean loss as a float, or NaN when the loader is empty.
    """
    available = len(data_loader)
    if available == 0:
        return float("nan")

    num_batches = available if num_batches is None else min(num_batches, available)

    running_loss = 0.
    for batch_idx, (input_batch, target_batch) in enumerate(data_loader):
        if batch_idx >= num_batches:
            break
        running_loss += calc_loss_batch(input_batch, target_batch, model, device).item()
    return running_loss / num_batches

#Model Structure

##GPT

The journey a text takes toward generating the next word

***--- input example: "my name is karam and"***
<br>
<br>
1.   Text tokenization.
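      *   using byte pair encoding to convert each token into its token id
      *   we add an end of text token and then padding tokens.
      *   end of text token: 50256
      *   padding token: 50256 in the inputs (the same id as end of text); in the targets, the padding positions after the first end of text token are replaced by -100 so the loss ignores them
      *   result (input): [23, 43, 678, 3468, 4234, 50256, 50256, 50256, 50256,... ]
      *   length: 1024 at most (sequences are padded to the longest one in the batch and cut at 1024), we usually call it context_length.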

***--- shape = 1024 = input_size***
<br>
<br>
2.   Token embedding:

      *   lookup table, 50257X768 matrix, converting the token id into a vector that correspond to that id. like a dictionary, give me the word and ill give you the meaning, the word here is the token id and the meaning here is a vector of size 768

      *   50257: is the vocab size

      *   768: is a token embedding size

      *   meaning: each word is represented by a vector; words that are close in meaning have smaller distances between each other.

      *  result: 1024 vectors each one of length 768, shape=[1024,768]

        [

            [768 values],

            [768 values],

            [768 values],

            [768 values],

            [768 values],

            .

            .

            .

        ]

***-- shape = [1024, 768] = [input_length, emb_dim]***
<br>
<br>
3.   Positional embedding.
      *   lookup table, context_length × emb_dim matrix, converting the position index into a vector that corresponds to that position. Like a dictionary: give me the position (0, 1, 2 …) and ill give you a vector that represents “being in that place”.
      
      *   context_length: the maximum number of tokens the model can handle in one sequence (e.g., 1024). So there are 1024 rows — one for each possible position.

      *   emb_dim: the embedding size (same 768 as the token embeddings).

      *   meaning: each position in the sequence has its own learned vector Position 0 has one vector, position 1 another, … . These vectors inject the idea of order, so the model knows which token comes first, second, third, etc.

      *   result: if the input has 5 tokens, you’ll get 5 vectors (one for each position: 0 to 4), each vector of length 768, shape = [5, 768]; for a full input of 1024 tokens the shape is [1024, 768]

      [

        [768 values for position 0],

        [768 values for position 1],

        [768 values for position 2],

        [768 values for position 3],

        [768 values for position 4],

        .

        .

        .

      ]
***-- shape = [1024, 768] = [input_length, emb_dim]***
<br>
<br>
***Token and positional embeddings are added element-wise, so each vector represents both the word’s meaning and its position in the sequence.***
<br>
<br>
***-- shape=[1024,768]***
<br>
<br>
4.   n transformer blocks .
      *   [explained later.]

***-- shape = [1024, 768] = [input_length, emb_dim]***
<br>
<br>
5.   Normalization.
      *   keeping the mean to stay close to 0 and the variance to be close to 1.

      *   goal: keeping the values in a stable range so training stays stable, preventing overflow errors and the numbers from becoming inf.

***-- shape = [1024, 768] = [input_length, emb_dim]***
<br>
<br>
6.   Output layer.
      *   layer that shapes the output into the shape of input_length X 50257. That means for each input word there are 50257 values; we call the 50257 the vocab size.

      *   50257: those are the amount of words in the dictionary, each cell contains the probability of that word to be the next word.

      *   layer matrix shape: embedding_dim X vocab_size , (embedding_dim is the token vector size and vocab_size is the amount of tokens in a dictionary)

      *   input matrix size: input_size X embedding_dim (input_size is the length of the input to the model which means amount of tokens)

      *   result matrix input_length X vocab_size
<br>
<br>
***-- shape  = [1024, 50257] = [input_length, vocab_size]***


##Transformer

Transformer structure



1.   layer norm 1.
      *   the same as the Normalization layer explained earlier.

***--shape=[1024, 768] = [context_length, emb_dim]***
<br>
<br>
2.   multi-head attention.
      *   [explained later]

***--shape=[1024, 768] = [context_length, emb_dim]***
<br>
<br>
3.   dropout.
      *   the concept is that, during training, we randomly turn off (set to zero) a fraction of the values passing through the layer, to avoid overfitting.
***--shape=[1024, 768] = [context_length, emb_dim]***
<br>
<br>
4.   residual connection.
      *   it's adding the values of the transformer block input to the dropout output.
      *   had a huge effect when dealing with deep networks; popularized by ResNet.

***--shape=[1024, 768] = [context_length, emb_dim]***
<br>
<br>
5.   layer norm 2.
      *   the same as the Normalization layer explained earlier.

***--shape=[1024, 768] = [context_length, emb_dim]***
<br>
<br>
6.   feed forward layer.
      *   it have 2 dense layers
      *   input to first layer: [1024, 768] which is [context_length, emb_dim]
      *   params of first layer: [768, 3072] which is [emb_dim, 4*emb_dim]
      *   output of first layer: [1024, 3072] which is [context_length, 4*emb_dim]
      *   GELU activation: [1024, 3072] => [1024, 3072] keep the same size
      *   input to second layer [1024, 3072] which is [context_length, 4*emb_dim]
      *   params of second layer: [3072, 768] which is [4*emb_dim, emb_dim]
      *   output of second layer: [1024, 768] which is [context_length, emb_dim] which means the same of the feed forward layer input shape.

***--shape=[1024, 768] = [context_length, emb_dim]***
<br>
<br>
7.   dropout layer.
      *   explained earlier.

***--shape=[1024, 768] = [context_length, emb_dim]***
<br>
<br>
8.   residual connection
      *   it's adding the output of the first residual connection (the input of layer norm 2) to the second dropout output.


***--shape=[1024, 768] = [context_length, emb_dim]***

##multi-head attention


***--- input = [1024, 768] = [context_length, emb_dim]***
<br>
<br>
1.   Linear projections (queries, keys, values)
      *   Input: [1024, 768] = [context_length, emb_dim]
      *   Parameters: three matrices of shape [768, 768] (for Q, K, V).
      *   Each token embedding is projected into three spaces:
            *   Query (Q): what am I looking for?
            *   Key (K): what do I contain?
            *   Value (V): what information do I pass on if selected?
      *   Result:
            *   Q: [1024, 768]
            *   K: [1024, 768]
            *   V: [1024, 768]

<br>
<br>

2.   Split into heads
      *   d_out must be divisible by num_heads
      *   Suppose num_heads = 12
      *   head_dim = d_out/num_heads = 768 / 12 = 64.
      *   Reshape each Q, K, V into [1024, 12, 64].
      *   transpose the 0 and 1 dims, so instead for each token there is 12 heads we look at it as each head have 1024 tokens.
      *   Now each head can focus on different relationships independently.
      *   Result:
            *   Q output: [12, 1024, 64]
            *   K output: [12, 1024, 64]
            *   V output: [12, 1024, 64]
            *   means: [num_heads, num_tokens, head_dim]

<br>
<br>

3.   Attention scores (scaled dot-product)
      *   Q, K each have 12 heads, and each head has 1024 token vectors.
      *   For each head, compute:

            *   scores = (𝑄⋅𝐾.𝑇)/sqrt(head_dim)
                  *   head_dim=64
                  *   K.T is transposing the last 2 dims of the key resulted matrix so we can compute the score for each head
      *   This gives a similarity score between every pair of tokens.
      *   Shape per head: [1024, 1024] (every token attends to every token).
      *   result:
            *   score shape: [12, 1024, 1024]
            *   and we still have V output: [12, 1024, 64] as it is

      *   A causal mask is applied so each position can only look backward (no cheating by looking at future tokens).
      *   softmax: Softmax turns the attention scores into simple weights that add up to 1, so each word knows how much to pay attention to the others.

<br>
<br>

4.   Context vector for each token
      *   score shape: [12, 1024, 1024]
      *   and we still have V output: [12, 1024, 64] as it is.
      *   12 is number of heads
      *   For each head, compute:
            *   score⋅V  # means [1024, 1024]⋅[1024, 64]
      *   result: [12, 1024, 64]
      *   transpose [12, 1024] to [1024, 12] # first two dims in result
      *   result: [1024, 12, 64]
      *   merge last 2 dims in result => [1024, 768]

<br>
<br>

***-- output shape: [1024, 768]***

##GPT Code

6 classes to build the model all explained above:


1.   LayerNorm
2.   GELU
3.   FeedForward
4.   MultiHeadAttention
5.   TransformerBlock
6.   GPTModel



In [None]:
import torch
import torch.nn as nn


class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with learnable scale and shift.

    Each vector along the last axis is normalized to zero mean and unit
    (biased) variance, then multiplied by `scale` and offset by `shift`,
    both of size `emb_dim`.
    """

    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5  # added to the variance before the square root
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mu = x.mean(dim=-1, keepdim=True)
        sigma_sq = x.var(dim=-1, keepdim=True, unbiased=False)
        x_hat = (x - mu) / torch.sqrt(sigma_sq + self.eps)
        return x_hat * self.scale + self.shift


class GELU(nn.Module):
    """GELU activation, tanh approximation:

    0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))
    """

    def forward(self, x):
        cubic_term = x + 0.044715 * torch.pow(x, 3)
        inner = torch.sqrt(torch.tensor(2 / torch.pi)) * cubic_term
        return 0.5 * x * (1 + torch.tanh(inner))

class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> GELU -> Linear.

    The hidden width is 4 * cfg["emb_dim"]; input and output width are both
    cfg["emb_dim"].
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        hidden_dim = 4 * emb_dim
        self.layer = nn.Sequential(
            nn.Linear(emb_dim, hidden_dim),   # expand
            GELU(),
            nn.Linear(hidden_dim, emb_dim),   # project back
        )

    def forward(self, x):
        return self.layer(x)


class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    Input of shape (b, num_tokens, d_in) is projected to queries, keys and
    values of width d_out, split into num_heads heads of width
    d_out // num_heads, attended with a causal mask, merged back to width
    d_out and passed through an output projection.
    """

    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads  # width of a single head

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # mixes the concatenated heads
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the "future" positions.
        future_positions = torch.triu(torch.ones(context_length, context_length), diagonal=1)
        self.register_buffer("mask", future_positions)

    def _split_heads(self, projected, b, num_tokens):
        # (b, num_tokens, d_out) -> (b, num_heads, num_tokens, head_dim)
        return projected.view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(self, x):
        b, num_tokens, _ = x.shape

        keys = self._split_heads(self.W_key(x), b, num_tokens)
        queries = self._split_heads(self.W_query(x), b, num_tokens)
        values = self._split_heads(self.W_value(x), b, num_tokens)

        # Pairwise dot products per head: (b, num_heads, num_tokens, num_tokens)
        attn_scores = queries @ keys.transpose(2, 3)

        # Cut the stored mask down to the current sequence length and hide
        # every future position by setting its score to -inf.
        causal_mask = self.mask.bool()[:num_tokens, :num_tokens]
        attn_scores = attn_scores.masked_fill(causal_mask, -torch.inf)

        # Scale by sqrt(head_dim), normalize, then apply dropout to the weights.
        attn_weights = torch.softmax(attn_scores / self.head_dim ** 0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # (b, num_heads, num_tokens, head_dim) -> (b, num_tokens, num_heads, head_dim)
        per_head_context = (attn_weights @ values).transpose(1, 2)

        # Merge the heads; contiguous() is required before view() after a transpose.
        merged = per_head_context.contiguous().view(b, num_tokens, self.d_out)
        return self.out_proj(merged)



class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention and feed-forward sub-layers,
    each wrapped as ``x + dropout(sublayer(norm(x)))``.

    Reads "emb_dim", "context_length", "n_heads", "drop_rate" and "qkv_bias"
    from `cfg`.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        drop_rate = cfg["drop_rate"]

        self.att = MultiHeadAttention(
            d_in=emb_dim,
            d_out=emb_dim,
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=drop_rate,
            qkv_bias=cfg["qkv_bias"])
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(emb_dim)
        self.norm2 = LayerNorm(emb_dim)
        self.drop_shortcut = nn.Dropout(drop_rate)

    def forward(self, x):
        # Attention sub-layer with residual connection.
        # Shapes stay [batch_size, num_tokens, emb_size] throughout.
        attended = self.drop_shortcut(self.att(self.norm1(x)))
        x = attended + x

        # Feed-forward sub-layer with residual connection.
        transformed = self.drop_shortcut(self.ff(self.norm2(x)))
        return transformed + x



class GPTModel(nn.Module):
    """Token + positional embeddings, a stack of transformer blocks, and a vocab head.

    Args:
        cfg: Mapping that provides ``vocab_size``, ``emb_dim``,
            ``context_length``, ``drop_rate`` and ``n_layers`` (plus whatever
            ``TransformerBlock`` reads from it).
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size, emb_dim = cfg["vocab_size"], cfg["emb_dim"]

        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        # One learned vector per position, up to context_length positions.
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(
            *(TransformerBlock(cfg) for _ in range(cfg["n_layers"]))
        )

        self.final_norm = LayerNorm(emb_dim)
        # Projects each position back to vocabulary-sized logits (no bias).
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Return logits for every position of ``in_idx``.

        Args:
            in_idx: 2-D integer tensor of token ids, (batch_size, seq_len).

        Returns:
            Tensor whose last dimension is ``vocab_size``.
        """
        _, seq_len = in_idx.shape
        # Positions 0..seq_len-1, created on the same device as the input ids.
        positions = torch.arange(seq_len, device=in_idx.device)
        # The positional embeddings broadcast over the batch dimension.
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.trf_blocks(self.drop_emb(hidden))
        return self.out_head(self.final_norm(hidden))


#Plot train and test loss

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

def plot_losses(epochs_seen, tokens_seen, train_losses, val_losses):
    """Plot training and validation loss, save the figure, and show it.

    The bottom x-axis is labelled in epochs; a second x-axis on top shows
    the number of tokens seen. The figure is written to ``loss-plot.pdf``
    in the current working directory.

    Args:
        epochs_seen: x-values (epochs) for both loss curves.
        tokens_seen: x-values (tokens) used only to scale the top axis.
        train_losses: Training loss values.
        val_losses: Validation loss values.
    """
    fig, loss_ax = plt.subplots(figsize=(5, 3))

    # Axis cosmetics for the primary (epoch) axis
    loss_ax.set_xlabel("Epochs")
    loss_ax.set_ylabel("Loss")
    loss_ax.xaxis.set_major_locator(MaxNLocator(integer=True))  # integer epoch ticks only

    # Both loss curves share the epoch axis
    curves = (
        (train_losses, {"label": "Training loss"}),
        (val_losses, {"linestyle": "-.", "label": "Validation loss"}),
    )
    for losses, line_style in curves:
        loss_ax.plot(epochs_seen, losses, **line_style)
    loss_ax.legend(loc="upper right")

    # Secondary x-axis (shares the y-axis); the fully transparent line exists
    # only so that this axis gets scaled to the token counts.
    token_ax = loss_ax.twiny()
    token_ax.plot(tokens_seen, train_losses, alpha=0)
    token_ax.set_xlabel("Tokens seen")

    fig.tight_layout()
    plt.savefig("loss-plot.pdf")
    plt.show()

#Model Evaluation

In [None]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Compute the loss on the training and validation loaders without gradients.

    The model is put into eval mode for the measurement and switched back to
    train mode before returning.

    Args:
        model: Model to evaluate.
        train_loader: Loader passed to ``calc_loss_loader`` for the training loss.
        val_loader: Loader passed to ``calc_loss_loader`` for the validation loss.
        device: Device forwarded to ``calc_loss_loader``.
        eval_iter: Forwarded as ``num_batches`` (presumably caps how many
            batches are used per loader — confirm in ``calc_loss_loader``).

    Returns:
        ``(train_loss, val_loss)``.
    """
    model.eval()
    with torch.no_grad():
        train_loss, val_loss = (
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        )
    model.train()
    return train_loss, val_loss

# Generate Text

##tokens <=> text

In [None]:
def text_to_token_ids(text, tokenizer):
    """Encode ``text`` and return the ids as a tensor with a leading batch dimension.

    ``<|endoftext|>`` is passed as an allowed special token to the
    tokenizer's ``encode`` method.
    """
    token_ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # Indexing with None prepends the batch dimension: (n,) -> (1, n)
    return torch.tensor(token_ids)[None]

def token_ids_to_text(token_ids, tokenizer):
    """Decode a batch-of-one tensor of token ids back into text."""
    # squeeze(0) removes the batch dimension before handing plain ints to decode
    id_list = token_ids.squeeze(0).tolist()
    return tokenizer.decode(id_list)

##Generate number of tokens

In [None]:
def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):
    """Autoregressively extend ``idx`` by up to ``max_new_tokens`` tokens.

    Each step:
      1. keep only the last ``context_size`` tokens of every sequence,
      2. run the model and take the logits of the last position,
      3. optionally keep only the ``top_k`` largest logits (others become -inf),
      4. if ``temperature > 0``: divide by it, softmax, sample with multinomial;
         otherwise pick the argmax (greedy decoding),
      5. stop early on the end-of-sequence token, else append the new token.

    Args:
        model: Callable mapping ids of shape (batch, tokens) to logits of
            shape (batch, tokens, vocab).
        idx: Integer tensor of shape (batch, tokens) holding the prompt ids.
        max_new_tokens: Maximum number of tokens to append.
        context_size: Maximum number of trailing tokens fed to the model.
        temperature: Sampling temperature; ``0.0`` selects greedy decoding.
        top_k: If given, sample only among the ``top_k`` most likely tokens.
            Values larger than the vocabulary are clamped to its size.
        eos_id: If given, generation stops (without appending) as soon as
            every sequence in the batch produces this token in the same step.

    Returns:
        Tensor of shape (batch, tokens + generated) with the prompt followed
        by the generated ids.
    """
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]  # only the last time step predicts the next token

        if top_k is not None:
            k = min(top_k, logits.size(-1))
            top_logits, _ = torch.topk(logits, k)
            # Keep the trailing dimension, (batch, 1), so the per-row threshold
            # broadcasts against (batch, vocab) for any batch size.
            min_val = top_logits[:, -1:]
            logits = logits.masked_fill(logits < min_val, float("-inf"))

        if temperature > 0.0:
            probs = torch.softmax(logits / temperature, dim=-1)  # (batch, vocab)
            idx_next = torch.multinomial(probs, num_samples=1)   # (batch, 1)
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch, 1)

        # Reduce with .all() so the check is a single boolean for any batch size.
        if eos_id is not None and bool((idx_next == eos_id).all()):
            break

        idx = torch.cat((idx, idx_next), dim=1)  # (batch, tokens + 1)

    return idx

##Print the generated text

In [None]:
def generate_and_print_sample(model, tokenizer, device, start_context):
    """Generate 50 tokens from ``start_context`` and print them on one line.

    The model is switched to eval mode for generation and back to train mode
    afterwards. ``generate`` is called with its default decoding arguments.
    """
    model.eval()
    # The positional-embedding table has one row per supported position,
    # so its row count is the model's context size.
    max_context = model.pos_emb.weight.shape[0]
    prompt_ids = text_to_token_ids(start_context, tokenizer).to(device)
    with torch.no_grad():
        generated_ids = generate(
            model=model,
            idx=prompt_ids,
            max_new_tokens=50,
            context_size=max_context,
        )
    sample = token_ids_to_text(generated_ids, tokenizer)
    # Newlines become spaces so each sample occupies a single output line
    print(" ".join(sample.split("\n")))
    model.train()

#Download & Load trained weights

##Download trained weights

download the pre-trained GPT-2 weights from OpenAI's public storage and organize them in the params dictionary (explained later) so they can be loaded into the model later.

In [None]:
import os
import requests  # Make sure requests is installed
import json
import numpy as np
import tensorflow as tf
from tqdm import tqdm

# Download the seven GPT-2 checkpoint files and load settings + weights from them
def download_and_load_gpt2(model_size, models_dir):
    """Download a GPT-2 TensorFlow checkpoint and load it into Python structures.

    Args:
        model_size: One of ``"124M"``, ``"355M"``, ``"774M"``, ``"1558M"``.
        models_dir: Directory under which a ``<model_size>/`` sub-directory
            is created for the downloaded files.

    Returns:
        ``(settings, params)``: the parsed ``hparams.json`` and the nested
        parameter dictionary built by ``load_gpt2_params_from_tf_ckpt``.

    Raises:
        ValueError: If ``model_size`` is not one of the allowed sizes.
        FileNotFoundError: If no checkpoint is found in the model directory
            after downloading (e.g. because the downloads failed).
    """
    # Validate model size
    allowed_sizes = ("124M", "355M", "774M", "1558M")
    if model_size not in allowed_sizes:
        raise ValueError(f"Model size not in {allowed_sizes}")

    # Define paths
    model_dir = os.path.join(models_dir, model_size)
    base_url = "https://openaipublic.blob.core.windows.net/gpt-2/models"
    filenames = [
        "checkpoint", "encoder.json", "hparams.json",
        "model.ckpt.data-00000-of-00001", "model.ckpt.index",
        "model.ckpt.meta", "vocab.bpe"
    ]

    # Download files
    os.makedirs(model_dir, exist_ok=True)
    for filename in filenames:
        # URLs always use "/", so build them explicitly rather than with
        # os.path.join, which would insert "\\" on Windows.
        file_url = f"{base_url}/{model_size}/{filename}"
        file_path = os.path.join(model_dir, filename)
        download_file(file_url, file_path)

    # At this point the files are on the local machine.

    # Path prefix of the model weights, resolved through the "checkpoint" file
    tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
    if tf_ckpt_path is None:
        # download_file only prints on failure, so surface the problem here.
        raise FileNotFoundError(
            f"No TensorFlow checkpoint found in {model_dir}; "
            "check that the downloads above succeeded"
        )

    # Load the model settings; the context manager closes the file handle
    with open(os.path.join(model_dir, "hparams.json"), encoding="utf-8") as f:
        settings = json.load(f)
    params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, settings)

    return settings, params

def download_file(url, destination):
    """Stream ``url`` to ``destination`` with a progress bar.

    The download is skipped when ``destination`` already exists and its size
    equals the server's ``content-length``. Request failures (connection
    errors, timeouts, HTTP error statuses) are reported on stdout and the
    function returns without raising.

    Args:
        url: Address of the file to download.
        destination: Local path to write the file to.
    """
    try:
        # NOTE(review): verify=False disables TLS certificate verification, so
        # the downloaded weights are not protected against tampering in
        # transit. Kept as in the original; enable verification if the
        # environment's certificates allow it.
        # The context manager releases the connection on every exit path; the
        # timeout keeps a stalled server from blocking forever.
        with requests.get(url, stream=True, verify=False, timeout=60) as response:
            # Fail on 4xx/5xx instead of saving the error page as the file.
            response.raise_for_status()

            # Total file size from headers, defaulting to 0 if not present
            file_size = int(response.headers.get("content-length", 0))

            # Skip the download if a local file of the same size exists
            if os.path.exists(destination) and os.path.getsize(destination) == file_size:
                print(f"File already exists and is up-to-date: {destination}")
                return

            block_size = 1024  # read the body 1 KiB at a time

            progress_bar_description = url.split("/")[-1]  # filename part of the URL
            with tqdm(total=file_size, unit="iB", unit_scale=True,
                      desc=progress_bar_description) as progress_bar:
                with open(destination, "wb") as file:
                    for chunk in response.iter_content(block_size):
                        progress_bar.update(len(chunk))
                        file.write(chunk)

    except requests.exceptions.RequestException as e:
        print(f"Error downloading the file: {e}")
        print(f"Please check the URL: {url}")

def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):
    """Read every variable of a TensorFlow checkpoint into a nested dict.

    Variable names are split on ``/`` and the first component is dropped.
    If the next component starts with ``"h"`` (e.g. ``h0``), the variable
    goes into ``params["blocks"][<number>]``; otherwise it goes into the
    top level of ``params``. Components between the first and the last become
    nested dictionaries, and the last component is the key of the array.

    Args:
        ckpt_path: Checkpoint path understood by ``tf.train``.
        settings: Mapping providing ``n_layer``, the number of block dicts
            to pre-create.

    Returns:
        The populated ``params`` dictionary.
    """
    params = {"blocks": [{} for _ in range(settings["n_layer"])]}

    for var_name, _shape in tf.train.list_variables(ckpt_path):
        # Singleton dimensions are dropped from every loaded array
        array = np.squeeze(tf.train.load_variable(ckpt_path, var_name))

        path = var_name.split("/")[1:]  # drop the leading 'model/' scope
        scope = path[0]

        # Per-layer variables live in their block's dict, the rest at top level
        if scope.startswith("h"):
            node = params["blocks"][int(scope[1:])]
        else:
            node = params

        # Walk (creating as needed) the intermediate dictionaries
        for part in path[1:-1]:
            node = node.setdefault(part, {})

        node[path[-1]] = array

    return params

##Model Configurations setup

In [None]:
# Settings shared by every GPT-2 size; the size-specific entries are merged in below.
BASE_CONFIG = {
    "vocab_size": 50257,     # Vocabulary size
    "context_length": 1024,  # Context length (rows of the positional embedding)
    "drop_rate": 0.05,        # Dropout rate
    "qkv_bias": True         # Query-key-value bias
}

# Size-specific settings, keyed by "<name> (<size>)"; the size in parentheses
# is parsed out below and must be accepted by download_and_load_gpt2.
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

# Which entry of model_configs to use
CHOOSE_MODEL = "gpt2-medium (355M)"

# Merge the chosen size's settings into BASE_CONFIG (in place)
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])

# "gpt2-medium (355M)" -> "(355M)" -> "355M"
model_size = CHOOSE_MODEL.split(" ")[-1].lstrip("(").rstrip(")")
# Downloads the checkpoint files into gpt2/<model_size>/ and loads them
settings, params = download_and_load_gpt2(
    model_size=model_size,
    models_dir="gpt2"
)

##Loading weights

In [None]:
import numpy as np

def assign(left, right):
    """Return ``right`` as a new ``torch.nn.Parameter`` after a shape check.

    Args:
        left: Existing parameter; only its shape is used.
        right: Array-like with the values to load.

    Raises:
        ValueError: If the two shapes differ.
    """
    if left.shape == right.shape:
        return torch.nn.Parameter(torch.tensor(right))
    raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")

def load_weights_into_gpt(gpt, params):
    """Copy the arrays of a GPT-2 ``params`` dictionary into ``gpt``.

    Every target parameter is replaced by a new ``Parameter`` created by
    ``assign``, which raises ``ValueError`` on a shape mismatch. Weight
    matrices of linear layers are transposed before loading; biases and
    layer-norm vectors are loaded as they are.

    Args:
        gpt: Model exposing ``pos_emb``, ``tok_emb``, ``trf_blocks``,
            ``final_norm`` and ``out_head``.
        params: Nested dictionary with the keys ``wpe``, ``wte``, ``blocks``,
            ``g`` and ``b``.
    """
    # Embedding tables
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])

    qkv_attrs = ("W_query", "W_key", "W_value")

    for layer_idx, layer_params in enumerate(params["blocks"]):
        # c_attn holds query, key and value fused along the last axis;
        # split it into three equal parts.
        qkv_weights = np.split(layer_params["attn"]["c_attn"]["w"], 3, axis=-1)
        block = gpt.trf_blocks[layer_idx]
        attention = block.att
        for attr, weight in zip(qkv_attrs, qkv_weights):
            projection = getattr(attention, attr)
            projection.weight = assign(projection.weight, weight.T)

        qkv_biases = np.split(layer_params["attn"]["c_attn"]["b"], 3, axis=-1)
        for attr, bias in zip(qkv_attrs, qkv_biases):
            projection = getattr(attention, attr)
            projection.bias = assign(projection.bias, bias)

        # Attention output projection
        attention.out_proj.weight = assign(
            attention.out_proj.weight, layer_params["attn"]["c_proj"]["w"].T)
        attention.out_proj.bias = assign(
            attention.out_proj.bias, layer_params["attn"]["c_proj"]["b"])

        # Feed-forward: entries 0 and 2 of ff.layer receive c_fc and c_proj
        for position, source in ((0, "c_fc"), (2, "c_proj")):
            linear = block.ff.layer[position]
            linear.weight = assign(linear.weight, layer_params["mlp"][source]["w"].T)
            linear.bias = assign(linear.bias, layer_params["mlp"][source]["b"])

        # Layer norms: "g" is loaded into scale, "b" into shift
        for norm, source in ((block.norm1, "ln_1"), (block.norm2, "ln_2")):
            norm.scale = assign(norm.scale, layer_params[source]["g"])
            norm.shift = assign(norm.shift, layer_params[source]["b"])

    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    # The output head is initialised from the same "wte" array as the token
    # embedding (as a separate Parameter, since assign builds a new tensor).
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])

##params dictionary keys explained

["wte"]: token embeddings
the embedding layer at the beginning of the model (before the first transformer block); it has trainable weights
of shape 50257 x 768 for GPT-2 small (vocab_size x emb_dim; emb_dim is 1024 for the medium model)
the input is the tokenized sentence passed to the model, at most context_length (1024) tokens long; each token is looked up in the embedding matrix, resulting in a matrix of up to 1024 x 768

---------------------------------

["wpe"]: positional embeddings
the positional embedding layer, with trainable weights: a 1024 x 768 matrix (context_length x emb_dim) that is added to the matrix produced by the token embedding

---------------------------------


["blocks"]: transformers
 - Q,K,V weights
  - 1- ["transformer/h0/attn/c_attn/w"]
  - 2- ["transformer/h0/attn/c_attn/b"]
  - c_attn holds the combined q, k, v params; they are split into three equal parts when loaded

 - feed forward weights
  - first layer
    - ["transformer/h0/mlp/c_fc/w"]
    - ["transformer/h0/mlp/c_fc/b"]
  - second layer
    - ["transformer/h0/mlp/c_proj/w"]
    - ["transformer/h0/mlp/c_proj/b"]
  - attention output projection layer (belongs to the attention block, not to the feed forward)
    - ["transformer/h0/attn/c_proj/w"]
    - ["transformer/h0/attn/c_proj/b"]


 - normalization layers
  - first layer (applied before attention)
    - ["transformer/h0/ln_1/g"]
    - ["transformer/h0/ln_1/b"]
  - second layer (applied before the feed forward)
    - ["transformer/h0/ln_2/g"]
    - ["transformer/h0/ln_2/b"]
---------------------------------

["g"]: final norm scale (w)

---------------------------------

["b"]: final norm shift (b)

#Model setup

##Create and load

In [None]:
# Build the model with the merged configuration, then overwrite its randomly
# initialised parameters with the downloaded GPT-2 weights.
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
# eval mode disables dropout; the trailing ";" suppresses the notebook's echo of the model
model.eval();

##Testing a small sample

In [None]:
# Smoke test: greedy generation from the freshly loaded weights.
# NOTE(review): `tokenizer` and `device` must already exist when this cell runs;
# in this part of the notebook they are only assigned further down — confirm
# they are defined in an earlier cell.
input_text = "Hello, how are you?"
token_ids = text_to_token_ids(input_text, tokenizer)

# temperature and top_k are left at their defaults, so generate() picks the argmax.
# eos_id=50256 is presumably the id of <|endoftext|> in the GPT-2 vocabulary — TODO confirm.
token_ids = generate(
    model=model.to(device),
    idx=token_ids.to(device),
    max_new_tokens=35,
    context_size=BASE_CONFIG["context_length"],
    eos_id=50256
)


generated_text = token_ids_to_text(token_ids, tokenizer)

# Drop the prompt from the front; assumes the decoded text starts with input_text verbatim.
response_text = generated_text[len(input_text):].strip()
print(response_text)

#Fine-tuning Time

##Training function

In [None]:

def train_model_simple(model, train_loader, val_loader, optimizer, device, num_epochs,
                       eval_freq, eval_iter, start_context, tokenizer, checkpoints_dir):
    """Train ``model`` with per-epoch checkpoints and early stopping.

    Every ``eval_freq`` optimizer steps the train/validation loss is
    estimated with ``evaluate_model`` and recorded. After each epoch a text
    sample is printed, the weights are saved to
    ``<checkpoints_dir>/model_epoch_<n>.pth`` and the loss over the whole
    validation loader is computed. From the 7th epoch on, training stops as
    soon as the current full validation loss is not among the 5 lowest seen
    so far. The path of the checkpoint with the lowest full validation loss
    is written to ``<checkpoints_dir>/best_model_name.json``.

    Args:
        model: Model to train (updated in place).
        train_loader: Iterable of ``(input_batch, target_batch)`` pairs.
        val_loader: Loader used for validation losses.
        optimizer: Optimizer over ``model``'s parameters.
        device: Device forwarded to the loss helpers.
        num_epochs: Maximum number of epochs.
        eval_freq: Evaluate every this many optimizer steps.
        eval_iter: Forwarded to ``evaluate_model``.
        start_context: Prompt for the sample printed after each epoch.
        tokenizer: Tokenizer used for the printed sample.
        checkpoints_dir: Directory for checkpoints; created if missing.

    Returns:
        ``(train_losses, val_losses, track_tokens_seen)``: three lists with
        one entry per periodic evaluation.
    """
    # Track losses and tokens seen
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1
    all_val_loss = []  # full validation loss, one entry per finished epoch

    os.makedirs(checkpoints_dir, exist_ok=True)

    # Main training loop
    for epoch in range(num_epochs):
        model.train()  # Set model to training mode

        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()  # Reset gradients from the previous batch
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()  # Calculate loss gradients
            optimizer.step()  # Update model weights using loss gradients
            tokens_seen += input_batch.numel()  # number of elements (tokens) in the batch
            global_step += 1

            # Periodic evaluation
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

        # Print a sample after each epoch, using the caller's prompt
        generate_and_print_sample(model, tokenizer, device, start_context)

        # Save the model after each epoch
        model_path = f"{checkpoints_dir}/model_epoch_{epoch+1}.pth"
        torch.save(model.state_dict(), model_path)

        # Loss over the whole validation loader, used for early stopping
        model.eval()
        with torch.no_grad():
            full_loss = calc_loss_loader(val_loader, model, device)
        model.train()

        all_val_loss.append(full_loss)

        print(f"\n\n\n===============Epoch {epoch+1} val loss: {full_loss}")
        if epoch > 5:
            # Early stopping: stop once this epoch's loss is worse than each
            # of the 5 lowest validation losses seen so far.
            min_5_val_losses = sorted(all_val_loss)[:5]
            if full_loss > max(min_5_val_losses):
                print(f"Early stopping at epoch {epoch+1}")
                break

    # Without a finished epoch there is no checkpoint to point at
    if not all_val_loss:
        return train_losses, val_losses, track_tokens_seen

    # Save the name of the best model (epoch numbers in file names are 1-based)
    index_of_best_model = all_val_loss.index(min(all_val_loss))
    best_model_path = f"{checkpoints_dir}/model_epoch_{index_of_best_model+1}.pth"
    with open(f"{checkpoints_dir}/best_model_name.json", "w") as f:
        json.dump({"model_path": best_model_path}, f)

    return train_losses, val_losses, track_tokens_seen

##Fine-tuning

first we fine-tune the model on the books content and later on the QA sets

In [None]:

# Seed the random number generator for reproducibility
torch.manual_seed(123)

# Number of batches in the book-text loaders
print(f"Train loader length: {len(train_loader_txt)}")
print(f"Val loader length: {len(test_loader_txt)}")


torch.manual_seed(123)


# Pick the device: CUDA if available, then Apple MPS, otherwise CPU
if torch.cuda.is_available():
   device = torch.device("cuda")
elif torch.backends.mps.is_available():
   device = torch.device("mps")
else:
   device = torch.device("cpu")

print(f"Using {device} device.")


model.to(device) # no assignment model = model.to(device) necessary for nn.Module classes


torch.manual_seed(123) # For reproducibility due to the shuffling in the data loader

# Baseline losses of the loaded model before any fine-tuning
with torch.no_grad(): # Disable gradient tracking for efficiency because we are not training, yet
    train_loss = calc_loss_loader(train_loader_txt, model, device)
    val_loss = calc_loss_loader(test_loader_txt, model, device)

print("Training loss:", train_loss)
print("Validation loss:", val_loss)





# Measure how long the fine-tuning run takes
import time
start_time = time.time()


optimizer = torch.optim.AdamW(model.parameters(),lr=1e-4, weight_decay=0.01)
tokenizer = tiktoken.get_encoding("gpt2")

# Upper bound on the number of epochs; train_model_simple may stop earlier
num_epochs = 100
# Checkpoints and best_model_name.json are written to checkpoints_txt/
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader_txt, test_loader_txt, optimizer, device,
    num_epochs=num_epochs, eval_freq=50, eval_iter=20,
    start_context="who said you know nothing jon snow?", tokenizer=tokenizer, checkpoints_dir="checkpoints_txt"
)

# Report the elapsed time in minutes
end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")



# Spread the recorded evaluations evenly over [0, num_epochs] for the x-axis.
# NOTE(review): this assumes all num_epochs ran; if training stopped early the
# epoch axis overstates how many epochs were actually trained.
epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))
plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses)

# Load the best model of the book-text fine-tuning run
with open("checkpoints_txt/best_model_name.json", "r") as f:
    best_model_path = json.load(f)["model_path"]

model = GPTModel(BASE_CONFIG)
# map_location="cpu": the fresh model lives on the CPU at this point, so load
# the checkpoint there too. This also works on machines without the device the
# checkpoint was saved from and avoids a temporary second copy of the weights
# on the accelerator. The model is moved to `device` right after.
model.load_state_dict(torch.load(best_model_path, map_location="cpu"))
model.to(device)
model.eval()

In [None]:
import gc, torch

# 1) Drop references to big objects
# NOTE: this removes `model` and `optimizer` from the notebook namespace, so
# any later cell that needs them has to create them again.
objs = ["model","optimizer","scaler","train_loader","val_loader","batch","loss","outputs","inputs","targets"]
for name in objs:
    globals().pop(name, None)  # no-op when the name is not defined

# 2) Collect objects that were only kept alive by reference cycles
gc.collect()

# 3) Hand cached, now-unused GPU memory back to the driver
if torch.cuda.is_available():
    torch.cuda.empty_cache()

##fine-tune model on QA sets

In [None]:
# Start the QA fine-tuning from the best book-text checkpoint.
# The model is rebuilt here because the memory-cleanup cell above deletes
# `model` and `optimizer` from the namespace.
with open("checkpoints_txt/best_model_name.json", "r") as f:
    best_model_path = json.load(f)["model_path"]

model = GPTModel(BASE_CONFIG)
model.load_state_dict(torch.load(best_model_path, map_location="cpu"))
model.to(device)

# A fresh optimizer is required: an optimizer only updates the parameter
# tensors it was constructed with, so one built for an earlier model instance
# would leave this model's weights untouched.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)

train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader_qas, test_loader_qas, optimizer, device,
    num_epochs=num_epochs, eval_freq=50, eval_iter=20,
    start_context="who said you know nothing jon snow?", tokenizer=tokenizer, checkpoints_dir="checkpoints_qas"
)

##plotting losses

In [None]:
# Spread the recorded evaluations evenly over [0, num_epochs] for the x-axis.
# NOTE(review): if training stopped early, the epoch axis overstates how many
# epochs were actually trained.
epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))
plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses)

##Loading the best model

In [None]:
# load the best model
with open("checkpoints_qas/best_model_name.json", "r") as f:
    best_model_path = json.load(f)["model_path"]

model = GPTModel(BASE_CONFIG)
model.load_state_dict(torch.load(best_model_path))
model.to(device)
model.eval()