# Hyper Parameter search

In [1]:
from src.qwen import load_qwen
# load_qwen() returns a pair, unpacked here as the model and its tokenizer.
(model_qwen, tokenizer) = load_qwen()

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [None]:
import torch
import torch.nn as nn

In [2]:
from src.set_up_lora import*
from src.preprocessor import*

Hyper Parameters that we want to search for:
- $r = (2,4,8)$ "rank"
- $lr = (10^{-5}, 5 \times 10^{-5}, 10^{-4})$ "learning rate"

The nested loop below is computationally expensive: it loads Qwen2.5 nine times (once per combination of rank and learning rate). If your local machine struggles to reload Qwen2.5 that many times, use the alternative code below.

In [None]:
# Sweep every (rank, learning rate) combination and record the final training loss.
ranks = [2, 4, 8]
lrs = [1e-5, 5e-5, 1e-4]
results = []  # one record (dict) per combination

# Same visiting order as a nested loop: ranks on the outside, learning rates inside.
grid = [(rank_value, lr_value) for rank_value in ranks for lr_value in lrs]

for r, lr in grid:
    print(f"Training with r={r}, lr={lr}")
    # The model is reloaded for every combination (nine loads in total).
    model, tokenizer = load_qwen()
    trained_model, final_loss = train_lora_model(
        model,
        tokenizer,
        lora_rank=r,
        learning_rate=lr,
        train_steps=500,
    )
    record = {"rank": r, "learning_rate": lr, "final_loss": final_loss}
    results.append(record)
    print(f"-> final loss: {final_loss:.4f}")


In [None]:
import pandas as pd
# Turn the list of per-run records into a table (one row per run) and print it.
HP_search_results_df = pd.DataFrame(data=results)
print(HP_search_results_df)


### Alternative (Use only if the code above keeps crashing the kernel)

In [1]:
import gc
import torch
from src.qwen import load_qwen
from src.set_up_lora import*

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [2]:
# Only the tokenizer is kept; the first returned value is bound to the throwaway name `_`.
_, tokenizer = load_qwen()

In [3]:
# Keep only the middle of the three returned values (presumably the validation
# split — confirm against load_and_preprocess); the other two are discarded.
data_path = "data/lotka_volterra_data.h5"
_, val_texts, _ = load_and_preprocess(data_path)

In [4]:

# Memory-conscious hyper-parameter sweep: for every (rank, learning rate) pair,
# load a fresh model, train it, evaluate on the validation texts, record both
# losses, and release the model before the next combination is loaded.
results = []

ranks = [2, 4, 8]
lrs = [1e-5, 5e-5, 1e-4]

for r in ranks:
    for lr in lrs:
        print(f"\nTraining with r={r}, lr={lr}")

        # Load fresh model; the tokenizer loaded earlier is reused, so the
        # second return value is discarded here.
        model, _ = load_qwen()
        trained_model, final_loss = train_lora_model(
            model,
            tokenizer,
            lora_rank=r,
            learning_rate=lr,
            train_steps=500,
        )

        # The trailing 4 is presumably the evaluation batch size — confirm
        # against evaluate_loss_perplexity_val's signature.
        val_loss, _ = evaluate_loss_perplexity_val(trained_model, tokenizer, val_texts, 4)

        results.append({"rank": r, "learning_rate": lr, "Train Loss": final_loss, "Validation Loss": val_loss})
        print(f"-> Train loss: {final_loss:.4f}")
        print(f"-> Validation loss: {val_loss:.4f}")

        # Clean up to free GPU memory. Order matters: run the garbage collector
        # first so objects kept alive only by reference cycles are actually
        # destroyed, THEN ask the CUDA caching allocator to release the blocks
        # that became unused. Emptying the cache before collecting would leave
        # those blocks reserved until the next iteration.
        del model
        del trained_model
        gc.collect()
        torch.cuda.empty_cache()



Training with r=2, lr=1e-05


: 

In [None]:
import pandas as pd
# Turn the list of per-run records into a table (one row per run) and print it.
HP_search_results_df = pd.DataFrame(data=results)
print(HP_search_results_df)


After determining the best hyper-parameters for "rank" and "learning rate", we can proceed to determine which of the three context lengths $[128, 512, 768]$ performs best for a maximum of 2000 RLPPP steps.

In [None]:
# Hyper-parameters selected for the context-length comparison.
best_r, best_lr = 4, 5e-5  # LoRA rank and learning rate
train_steps = 500  # placeholder step budget ("or whatever RLPPP is")

# Candidate context lengths to compare.
context_lengths = [128, 512, 768]

Option 1, context length = 128

In [None]:
# Context length 128: train a fresh model with the selected hyper-parameters,
# then report the training and validation losses.
model, _ = load_qwen()
trained_model, final_loss = train_lora_model(
    model,
    tokenizer,
    # Use the selected values explicitly. The bare names `r` and `lr` are
    # leftovers from the sweep loop and hold whatever its last iteration set.
    lora_rank=best_r,
    learning_rate=best_lr,
    max_ctx_length=128,
    train_steps=1000,
)
loss_val, _ = evaluate_loss_perplexity_val(trained_model, tokenizer, val_texts, 4)
print(f"-> Train loss: {final_loss:.4f}")
print(f"-> Validation loss: {loss_val:.4f}")

Option 2, context length = 512

In [None]:
# Context length 512: train a fresh model with the selected hyper-parameters,
# then report the training and validation losses.
model, _ = load_qwen()
trained_model, final_loss = train_lora_model(
    model,
    tokenizer,
    # Use the selected values explicitly. The bare names `r` and `lr` are
    # leftovers from the sweep loop and hold whatever its last iteration set.
    lora_rank=best_r,
    learning_rate=best_lr,
    # 512 is noted as train_lora_model's default; it is passed explicitly so
    # this run stays comparable to the others if that default ever changes.
    max_ctx_length=512,
    train_steps=1000,
)
loss_val, _ = evaluate_loss_perplexity_val(trained_model, tokenizer, val_texts, 4)
print(f"-> Train loss: {final_loss:.4f}")
print(f"-> Validation loss: {loss_val:.4f}")

Option 3, context length = 768

In [None]:
# Context length 768: train a fresh model with the selected hyper-parameters,
# then report the training and validation losses.
model, _ = load_qwen()
trained_model, final_loss = train_lora_model(
    model,
    tokenizer,
    # Use the selected values explicitly. The bare names `r` and `lr` are
    # leftovers from the sweep loop and hold whatever its last iteration set.
    lora_rank=best_r,
    learning_rate=best_lr,
    # 768 is the third context length under comparison; the previous value
    # (127688) was a typo.
    max_ctx_length=768,
    train_steps=1000,
)
loss_val, _ = evaluate_loss_perplexity_val(trained_model, tokenizer, val_texts, 4)
print(f"-> Train loss: {final_loss:.4f}")
print(f"-> Validation loss: {loss_val:.4f}")