In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig
from datasets import Dataset

In [2]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """
    Extract text from a PDF file.
    """
    doc = fitz.open(pdf_path)
    extracted_text = ""
    for page_num in range(len(doc)):
        page = doc[page_num]
        extracted_text += page.get_text()
    return extracted_text

# Extract text from "Calculus Made Easy"
pdf_text = extract_text_from_pdf("resources/mother-doc.pdf")

In [3]:
cleaned_text = pdf_text.replace('\n', ' ').strip()


In [4]:
cleaned_text

'CHAPTER I. TO DELIVER YOU FROM THE PRELIMINARY TERRORS. The preliminary terror, which chokes off most fifth-form boys from even attempting to learn how to calculate, can be abolished once for all by simply stating what is the meaning‚Äîin common-sense terms‚Äîof the two principal symbols that are used in calculating. These dreadful symbols are: (1) d which merely means ‚Äúa little bit of.‚Äù Thus dx means a little bit of x; or du means a little bit of u. Or- dinary mathematicians think it more polite to say ‚Äúan element of,‚Äù instead of ‚Äúa little bit of.‚Äù Just as you please. But you will find that these little bits (or elements) may be considered to be indefinitely small. (2) Z which is merely a long S, and may be called (if you like) ‚Äúthe sum of.‚Äù Thus Z dx means the sum of all the little bits of x; or Z dt means the sum of all the little bits of t. Ordinary mathematicians call this symbol ‚Äúthe integral of.‚Äù Now any fool can see that if x is considered as made up of a l

In [5]:
model_name = "EleutherAI/gpt-neo-125M"

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
inputs = tokenizer(cleaned_text, return_tensors="pt", truncation=True)

In [7]:
inputs

{'input_ids': tensor([[41481,   314,    13,  ...,   475,   734,   427]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]])}

In [8]:
len(inputs['input_ids'][0])

2048

In [9]:
max_length = 128  # Define the max sequence length for each chunk
sequence_length = len(inputs['input_ids'][0])  # Length of your sequence

# Number of chunks you can create from the sequence
num_chunks = sequence_length // max_length
num_chunks

16

In [11]:
# Create the dataset with the smaller chunks
train_data = Dataset.from_dict({
    "input_ids": [inputs["input_ids"][0][i * max_length: (i + 1) * max_length] for i in range(num_chunks)],
    "attention_mask": [inputs["attention_mask"][0][i * max_length: (i + 1) * max_length] for i in range(num_chunks)],
})

In [12]:
train_data

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 16
})

In [13]:
eval_data = train_data.select(range(2))  # Select first 2 samples for evaluation
train_data = train_data.select(range(2, len(train_data)))

In [14]:
# Verify the split
print(f"Training data: {len(train_data)} samples")
print(f"Evaluation data: {len(eval_data)} samples")

Training data: 14 samples
Evaluation data: 2 samples


In [15]:
# TODO: increase sample size

In [16]:
model = AutoModelForCausalLM.from_pretrained(model_name)


In [17]:
# Freeze the base GPT model (no parameters will be updated)
for param in model.parameters():
    param.requires_grad = False

In [18]:
for name, module in model.named_modules():
    print(name)


transformer
transformer.wte
transformer.wpe
transformer.drop
transformer.h
transformer.h.0
transformer.h.0.ln_1
transformer.h.0.attn
transformer.h.0.attn.attention
transformer.h.0.attn.attention.attn_dropout
transformer.h.0.attn.attention.resid_dropout
transformer.h.0.attn.attention.k_proj
transformer.h.0.attn.attention.v_proj
transformer.h.0.attn.attention.q_proj
transformer.h.0.attn.attention.out_proj
transformer.h.0.ln_2
transformer.h.0.mlp
transformer.h.0.mlp.c_fc
transformer.h.0.mlp.c_proj
transformer.h.0.mlp.act
transformer.h.0.mlp.dropout
transformer.h.1
transformer.h.1.ln_1
transformer.h.1.attn
transformer.h.1.attn.attention
transformer.h.1.attn.attention.attn_dropout
transformer.h.1.attn.attention.resid_dropout
transformer.h.1.attn.attention.k_proj
transformer.h.1.attn.attention.v_proj
transformer.h.1.attn.attention.q_proj
transformer.h.1.attn.attention.out_proj
transformer.h.1.ln_2
transformer.h.1.mlp
transformer.h.1.mlp.c_fc
transformer.h.1.mlp.c_proj
transformer.h.1.mlp.ac

In [None]:
# from torch import nn

# # LoRA Attention class definition
# class LoRAAttention(nn.Module):
#     def __init__(self, attention_layer, rank=4):
#         super(LoRAAttention, self).__init__()
#         self.attention_layer = attention_layer
#         self.rank = rank
        
#         # Accessing the query, key, value, and output layers inside the attention module
#         self.query_layer = self.attention_layer.attention.q_proj  # GPT-Neo uses q_proj, k_proj, v_proj
#         self.key_layer = self.attention_layer.attention.k_proj
#         self.value_layer = self.attention_layer.attention.v_proj
#         self.out_proj = self.attention_layer.attention.out_proj
        
#         # Low-rank adaptation matrices for Q, K, V, and Output projections
#         self.lora_q = nn.Parameter(torch.randn(self.query_layer.weight.size(0), rank))
#         self.lora_k = nn.Parameter(torch.randn(self.key_layer.weight.size(0), rank))
#         self.lora_v = nn.Parameter(torch.randn(self.value_layer.weight.size(0), rank))
#         self.lora_o = nn.Parameter(torch.randn(self.out_proj.weight.size(0), rank))
        
#     def forward(self, layer_past, attention_mask=None):
#         # Apply LoRA (Low-rank approximation) on Q, K, V, O
#         q = self.query_layer(layer_past) + self.lora_q
#         k = self.key_layer(layer_past) + self.lora_k
#         v = self.value_layer(layer_past) + self.lora_v
#         attn_output = self.attention_layer.attention.attn_output(q, k, v)
#         output = self.out_proj(attn_output) + self.lora_o
        
#         return output

In [39]:
class LoRAAttention(nn.Module):
    def __init__(self, attention_layer, rank=4):
        super(LoRAAttention, self).__init__()
        self.attention_layer = attention_layer
        self.rank = rank
        
        # Accessing the query, key, value, and output layers inside the attention module
        self.query_layer = self.attention_layer.attention.q_proj  # GPT-Neo uses q_proj, k_proj, v_proj
        self.key_layer = self.attention_layer.attention.k_proj
        self.value_layer = self.attention_layer.attention.v_proj
        self.out_proj = self.attention_layer.attention.out_proj
        
        # Low-rank adaptation matrices for Q, K, V, and Output projections
        self.lora_q = nn.Parameter(torch.randn(self.query_layer.weight.size(0), rank))
        self.lora_k = nn.Parameter(torch.randn(self.key_layer.weight.size(0), rank))
        self.lora_v = nn.Parameter(torch.randn(self.value_layer.weight.size(0), rank))
        self.lora_o = nn.Parameter(torch.randn(self.out_proj.weight.size(0), rank))
        
    def forward(self, hidden_states, attention_mask=None, **kwargs):
        # Apply LoRA (Low-rank approximation) on Q, K, V, O
        
        # Compute the original Q, K, V projections
        q = self.query_layer(hidden_states)  # Shape: (batch_size, seq_length, 768)
        k = self.key_layer(hidden_states)    # Shape: (batch_size, seq_length, 768)
        v = self.value_layer(hidden_states)  # Shape: (batch_size, seq_length, 768)
        
        # Apply the low-rank matrices as a modification to each projection
        q = q + torch.matmul(hidden_states, self.lora_q)  # Shape: (batch_size, seq_length, 768)
        k = k + torch.matmul(hidden_states, self.lora_k)  # Shape: (batch_size, seq_length, 768)
        v = v + torch.matmul(hidden_states, self.lora_v)  # Shape: (batch_size, seq_length, 768)
        
        # Perform attention operation with modified Q, K, V
        attn_output = self.attention_layer.attention.attn_output(q, k, v)
        
        # Apply output projection
        output = self.out_proj(attn_output)  # Shape: (batch_size, seq_length, 768)
        output = output + torch.matmul(hidden_states, self.lora_o)  # Adding the low-rank output adjustment
        
        return output


In [38]:
for block in model.transformer.h:
    print(block.attn.lora_q[0])

tensor([-1.4067, -1.1437, -0.1459, -0.4860], grad_fn=<SelectBackward0>)
tensor([-1.8216, -0.6370, -1.8623,  1.0635], grad_fn=<SelectBackward0>)
tensor([-0.5058, -1.5229, -1.1246, -0.6328], grad_fn=<SelectBackward0>)
tensor([-1.5274,  0.0594, -0.4359,  0.2174], grad_fn=<SelectBackward0>)
tensor([ 1.2981, -0.9294, -0.2337,  0.8205], grad_fn=<SelectBackward0>)
tensor([0.5301, 0.4944, 0.2442, 0.0816], grad_fn=<SelectBackward0>)
tensor([-2.1079, -0.8987, -0.2710,  0.1329], grad_fn=<SelectBackward0>)
tensor([ 0.6511, -1.6682,  0.8628, -1.1069], grad_fn=<SelectBackward0>)
tensor([-1.2202,  0.8535, -0.1415, -1.1088], grad_fn=<SelectBackward0>)
tensor([ 0.2045,  0.5493,  0.8602, -0.1143], grad_fn=<SelectBackward0>)
tensor([-0.0562, -0.5262, -0.1443, -2.1659], grad_fn=<SelectBackward0>)
tensor([-0.4715, -0.2353,  1.7543, -0.1778], grad_fn=<SelectBackward0>)


In [21]:
# Unfreeze specific layers where LoRA or adaptive changes will be applied
for block in model.transformer.h:  # Loop over the transformer blocks
    for param in block.attn.parameters():  # Unfreeze attention layers
        param.requires_grad = True
    for param in block.mlp.parameters():  # Unfreeze feed-forward layers (MLP)
        param.requires_grad = True

In [42]:
# Replace standard attention layers with LoRA-adapted layers
for block in model.transformer.h:
    block.attn = LoRAAttention(block.attn)

AttributeError: 'LoRAAttention' object has no attribute 'attention'

In [30]:
for block in model.transformer.h:
    print(block.attn.lora_q)

Parameter containing:
tensor([[-1.4067, -1.1437, -0.1459, -0.4860],
        [ 1.3266,  0.2619,  0.3978, -1.0447],
        [ 1.0556,  1.0786, -0.2301, -2.5713],
        ...,
        [-0.0205, -1.4403,  0.3686,  0.2036],
        [-0.9250, -0.0830,  1.6587,  1.7433],
        [-1.1432,  0.5262, -0.6338,  1.3413]], requires_grad=True)
Parameter containing:
tensor([[-1.8216, -0.6370, -1.8623,  1.0635],
        [-0.4408,  0.5686, -0.6506,  0.7896],
        [ 0.1936,  1.3946, -0.3629, -0.4539],
        ...,
        [ 0.8049,  0.2505, -1.4376, -1.2455],
        [ 0.7506, -0.9918, -0.8173,  0.0327],
        [ 1.2027, -0.3812, -0.0224,  2.4981]], requires_grad=True)
Parameter containing:
tensor([[-5.0578e-01, -1.5229e+00, -1.1246e+00, -6.3281e-01],
        [-7.5175e-01,  1.1700e-01, -1.5754e+00,  3.5271e-01],
        [ 7.4714e-01,  1.2328e-01, -1.5518e-02, -1.2999e+00],
        ...,
        [ 8.0953e-02, -9.5286e-02, -5.2123e-01,  1.6441e+00],
        [-4.9889e-01, -7.2417e-01, -1.0181e+00,  6.97

In [23]:
print(model.transformer.h[0].attn)


LoRAAttention(
  (attention_layer): GPTNeoAttention(
    (attention): GPTNeoSelfAttention(
      (attn_dropout): Dropout(p=0.0, inplace=False)
      (resid_dropout): Dropout(p=0.0, inplace=False)
      (k_proj): Linear(in_features=768, out_features=768, bias=False)
      (v_proj): Linear(in_features=768, out_features=768, bias=False)
      (q_proj): Linear(in_features=768, out_features=768, bias=False)
      (out_proj): Linear(in_features=768, out_features=768, bias=True)
    )
  )
  (query_layer): Linear(in_features=768, out_features=768, bias=False)
  (key_layer): Linear(in_features=768, out_features=768, bias=False)
  (value_layer): Linear(in_features=768, out_features=768, bias=False)
  (out_proj): Linear(in_features=768, out_features=768, bias=True)
)


In [24]:
from transformers import TrainingArguments, Trainer
from torch.optim import AdamW

In [25]:
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=5e-5)

In [26]:
training_args = TrainingArguments(
    output_dir="./experiments/model_checkpoints",              # Directory to save the model checkpoints
    num_train_epochs=2,                                       # Reduced epochs
    per_device_train_batch_size=2,                            # Adjusted batch size
    gradient_accumulation_steps=4,                            # Accumulated gradient for total batch size of 16
    fp16=True,                                                # Use mixed precision
    logging_dir="./experiments/logs",                         # Directory for logs
    logging_steps=1,                                          # Log every step
    save_steps=8,                                             # Save model every 5 steps
    evaluation_strategy="steps",                              # Evaluate every few steps
    eval_steps=2,                                            # Evaluation steps set to 10
    save_strategy="steps",                                    # Save strategy to match eval strategy
    save_total_limit=2,                                       # Limit the number of checkpoints saved
    learning_rate=3e-5,                                       # Smaller learning rate
    weight_decay=0.05,                                        # Increased weight decay for regularization
    warmup_steps=50,                                          # Reduced warmup steps
    load_best_model_at_end=True,                              # Load the best model at the end of training
)

# Callbacks for Early Stopping
from transformers import EarlyStoppingCallback
callbacks = [EarlyStoppingCallback(early_stopping_patience=1)] 



In [27]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    callbacks=callbacks,  # Include early stopping callback
)
trainer.train()

  0%|          | 0/2 [00:00<?, ?it/s]

RuntimeError: The size of tensor a (768) must match the size of tensor b (4) at non-singleton dimension 2

In [114]:
trainer.train()


  0%|          | 0/3 [00:00<?, ?it/s]

TypeError: LoRAAttention.forward() got an unexpected keyword argument 'layer_past'