In [None]:
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from accelerate import Accelerator
from src import TimeSeriesData, load_qwen_model, add_lora, preprocessor, evaluate, EarlyStopping
from src import model_training_flops

ImportError: cannot import name 'TimeSeriesData' from 'src' (/Users/jacobtutt/Desktop/MPhil_DIS/M2/Coursework_M2/src/__init__.py)

In [11]:
from src import full_model

In [12]:
# Bind the loaded model to its own name. Unpacking into `full_model` would replace
# the imported loader function with the model object, so re-running this cell (or
# calling `full_model(...)` again further down) would fail with "object is not callable".
# lora_rank=0 requests the unmodified base model.
base_model, tokenizer, device = full_model(lora_rank=0)

Returned the base Qwen model without modification (rank = 0).


In [None]:
from src import preprocessor
from src import full_model
from src import train
from src import TimeSeriesData
from torch.utils.data import DataLoader


In [None]:
# Load model and tokeniser (LoRA rank 2)
model, tokeniser, device = full_model(lora_rank=2)

# Load and tokenise dataset
train_set_total, val_set_total, test_set_total = preprocessor(
    'lotka_volterra_data.h5',
    percentile=90,
    decimal_places=3,
    train_fraction=0.7,
    validation_fraction=0.15,
    shuffle=False,
    print_summary=False,
)

# The stride is a step between windows, so pass an integer: `500/2` is true division
# and yields the float 250.0, whereas `500 // 2` yields the int 250.
# NOTE(review): 500 may have been meant to be 512 (half of max_length) - confirm.
# Presumably a stride below max_length makes training windows overlap, while the
# validation stride equal to max_length keeps them disjoint - verify in TimeSeriesData.
train_dataset = TimeSeriesData(train_set_total, tokeniser, max_length=512, stride=500 // 2)
val_dataset = TimeSeriesData(val_set_total, tokeniser, max_length=512, stride=512)

# Training batches are shuffled; validation is read one window at a time, in order.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
INFO - File loaded successfully. Trajectories shape: (1000, 100, 2), Time points shape: (100,)
INFO - Scaling data by alpha=0.25283724069595337, ensuring 90% of values fit within the model's expected range.
INFO - Data scaled to 3 decimal places
INFO - Splitting the data into training, validation, and test sets with fractions: 0.7, 0.15, 0.15000000000000005


Returning Qwen model injected with LoRA into Query and Value Projections with rank = 1
Model loaded on mps


In [None]:
# Run a short fine-tuning job and keep the per-step trackers and FLOP total it returns.
# The rank and batch size passed here must describe the objects actually handed in:
# the model was loaded with `full_model(lora_rank=2)`, and the batch size is read from
# the loader itself so the two can never drift apart.
# NOTE(review): assumes `train` uses lora_rank/batch_size to describe the supplied
# model and loader (e.g. for FLOP accounting) - confirm against src.train.
model, step_tracker, val_loss_tracker, total_flops = train(
    model,
    lora_rank=2,
    max_training_steps=100,
    batch_size=train_loader.batch_size,
    learning_rate=1e-4,
    train_loader=train_loader,
    val_loader=val_loader,
    early_stopping_patience=3,
    subset=5,
    print_summary=True,
    wandb_run=None,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Training:   2%|▏         | 2/100 [02:01<1:44:38, 64.07s/step]

In [2]:
def train_once(config):
    """Fine-tune a LoRA-adapted Qwen model once and return its validation loss.

    The model is validated every 10 optimiser steps. If the last optimiser step was
    not itself followed by a validation pass (for example when ``max_steps`` is below
    10 or not a multiple of 10), one extra pass is run at the end so the returned
    loss always describes the final model.

    Args:
        config (dict): Run settings. Required keys:
            "learn_rate": learning rate given to Adam.
            "lora_rank": rank passed to ``add_lora``, ``evaluate`` and
                ``model_training_flops``.
            "max_steps": maximum number of optimiser steps.
            "train_sequences": number of leading training sequences to keep.
            "val_sequences": number of leading validation sequences to keep.
            "batch_size": training batch size (validation always uses 1).
            "decimal_places": forwarded to ``preprocessor``.
            "token_length": forwarded as ``no_tokens`` to ``model_training_flops``;
                it is used only for the FLOP estimate, not for tokenisation.

    Returns:
        The most recent validation loss reported by ``evaluate``.

    Raises:
        KeyError: If a required key is missing from ``config``.
        ValueError: If the training loader yields no batches.
    """
    # Read every setting first so a missing key fails before the model is loaded.
    learning_rate = config["learn_rate"]
    lora_rank = config["lora_rank"]
    max_training_steps = config["max_steps"]
    no_train_sequences = config["train_sequences"]
    no_val_sequences = config["val_sequences"]
    batch_size = config["batch_size"]
    decimal_places = config["decimal_places"]
    token_length = config["token_length"]

    eval_interval = 10           # validate every this many optimiser steps
    early_stopping_patience = 5  # forwarded to EarlyStopping

    # Load model and tokeniser, then inject the LoRA adapters
    model, tokeniser, _ = load_qwen_model()
    add_lora(model, lora_rank)

    # Preprocess
    train_set_total, val_set_total, _ = preprocessor(
        'lotka_volterra_data.h5',
        percentile=90,
        decimal_places=decimal_places,
        train_fraction=0.7,
        validation_fraction=0.15,
        shuffle=False,
        print_summary=False
    )

    # Keep only the requested number of leading sequences from each split.
    train_dataset = TimeSeriesData(train_set_total[:no_train_sequences], tokeniser)
    val_dataset = TimeSeriesData(val_set_total[:no_val_sequences], tokeniser)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=1)

    # Optimise only the parameters that require gradients.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimiser = torch.optim.Adam(trainable_params, lr=learning_rate)
    early_stopper = EarlyStopping(patience=early_stopping_patience, mode='min')

    accelerator = Accelerator()
    model, optimiser, train_loader, val_loader = accelerator.prepare(model, optimiser, train_loader, val_loader)

    model.train()
    step = 0
    total_eval_cost = 0
    val_loss = None
    last_eval_step = None  # step at which val_loss was last refreshed

    # The context manager closes the bar on every exit path, including exceptions.
    with tqdm(total=max_training_steps, desc="Training") as pbar:
        while step < max_training_steps:
            batches_seen = 0
            for batch in train_loader:
                batches_seen += 1
                optimiser.zero_grad()
                outputs = model(batch, labels=batch)
                loss = outputs.loss
                accelerator.backward(loss)
                optimiser.step()

                step += 1
                pbar.update(1)

                if step % eval_interval == 0:
                    val_loss, flops_cost = evaluate(model, val_loader, accelerator, lora_rank)
                    # evaluate() is not visible here and may leave the model in eval
                    # mode, so re-enter training mode before the next step.
                    model.train()
                    total_eval_cost += flops_cost
                    last_eval_step = step
                    pbar.set_postfix({"train_loss": loss.item(), "val_loss": val_loss})

                    early_stopper(val_loss)
                    if early_stopper.early_stop:
                        print(f"\n🛑 Early stopping at step {step}")
                        return val_loss

                if step >= max_training_steps:
                    break

            # Without this guard an empty loader never advances `step` and the
            # surrounding while-loop would spin forever.
            if batches_seen == 0:
                raise ValueError(
                    "Training loader produced no batches; check 'train_sequences' and the dataset."
                )

    # Make sure the reported loss belongs to the final model state.
    if last_eval_step != step:
        val_loss, flops_cost = evaluate(model, val_loader, accelerator, lora_rank)
        total_eval_cost += flops_cost

    training_flops, _ = model_training_flops(no_tokens=token_length, lora_ranks=lora_rank, batch_size=batch_size, num_steps_training=step, print_summary=False)
    print(f'\n📊 Total training cost: {training_flops:.2e} FLOPs')
    print(f'📊 Total evaluation cost: {total_eval_cost:.2e} FLOPs')
    print(f"\n✅ Training completed. Final validation loss: {val_loss:.4f}")
    return val_loss

In [3]:
# Settings for one trial run of `train_once`; the keys are the ones it reads from `config`.
test_config = dict(
    learn_rate=1e-4,
    lora_rank=4,
    max_steps=100,
    train_sequences=200,
    val_sequences=10,
    batch_size=2,
    decimal_places=3,
    token_length=512,
)

# Run the trial and keep the validation loss it returns.
val_loss = train_once(test_config)

Currently loading: Qwen/Qwen2.5-0.5B-Instruct


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
INFO - File loaded successfully. Trajectories shape: (1000, 100, 2), Time points shape: (100,)
INFO - Scaling data by alpha=0.25283724069595337, ensuring 90% of values fit within the model's expected range.
INFO - Data scaled to 3 decimal places
INFO - Splitting the data into training, validation, and test sets with fractions: 0.7, 0.15, 0.15000000000000005


Model loaded on mps


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Training:  10%|█         | 10/100 [00:29<04:01,  2.68s/it]

512 51


Training:  20%|██        | 20/100 [01:23<04:01,  3.02s/it, train_loss=4.05, val_loss=3]

512 51


Training:  30%|███       | 30/100 [02:16<03:37,  3.11s/it, train_loss=4.16, val_loss=2.31]

512 51


Training:  40%|████      | 40/100 [03:10<02:52,  2.87s/it, train_loss=2.16, val_loss=1.75]

512 51


Training:  50%|█████     | 50/100 [04:20<02:50,  3.41s/it, train_loss=1.24, val_loss=1.44]

512 51


Training:  60%|██████    | 60/100 [05:14<01:48,  2.72s/it, train_loss=1.14, val_loss=1.29]

512 51


Training:  70%|███████   | 70/100 [06:11<01:29,  2.98s/it, train_loss=1.25, val_loss=1.21]

512 51


Training:  80%|████████  | 80/100 [07:09<01:04,  3.22s/it, train_loss=1.17, val_loss=1.17]

512 51


Training:  90%|█████████ | 90/100 [08:05<00:27,  2.78s/it, train_loss=1.12, val_loss=1.14]

512 51


Training: 100%|██████████| 100/100 [08:56<00:00,  2.79s/it, train_loss=0.563, val_loss=1.11]

512 51


Training: 100%|██████████| 100/100 [09:21<00:00,  5.61s/it, train_loss=1.17, val_loss=1.09] 


📊 Total training cost: 3.62e+14 FLOPs
📊 Total evaluation cost: 3.08e+14 FLOPs

✅ Training completed. Final validation loss: 1.0922



