# Introduction

In [2]:
# Import required packages
import math

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

Open the `lotka_volterra_data.h5` file in the notebook.

In [3]:
# Read both datasets fully into memory; the file is closed when the block exits
with h5py.File('lotka_volterra_data.h5', 'r') as f:
    trajectories = f['trajectories'][:]
    time_points = f['time'][:]

# Index of the first system, kept for selecting a single trajectory
system_id = 0

Checking the shape of the dataset: we expect `trajectories` to be of size $(1000 \times 100 \times 2)$ and `time_points` to be a 1-D array of $100$ values, i.e. shape `(100,)`.

In [4]:
# Confirm the loaded arrays have the expected dimensions.
# The last axis of `trajectories` holds the (prey, predator) pair.
print('Time points shape:', time_points.shape)
print('')
print('Trajectory shape (prey/predator):', trajectories.shape)

Time points shape: (100,)

Trajectory shape (pray/predator): (1000, 100, 2)


In [5]:
# Shapes of both arrays, one per line
print(trajectories.shape, time_points.shape, sep='\n')

(1000, 100, 2)
(100,)


In [23]:
# (prey, predator) values of system 99 at time step 99
trajectories[99, 99]

array([2.9806929 , 0.18514897], dtype=float32)

In [29]:
def trajectories_to_frame(trajectories):
    """Flatten a ``(systems, time_steps, variables)`` array into a long-format table.

    Parameters
    ----------
    trajectories : array_like
        3-D array whose last axis holds prey at index 0 and predator at index 1.

    Returns
    -------
    pandas.DataFrame
        One row per (system, time step) with columns ``system_id``,
        ``time_step``, ``prey`` and ``predator``. Rows are ordered system by
        system, with time steps ascending within each system.

    Raises
    ------
    ValueError
        If the input is not 3-D or its last axis has fewer than two entries.
    """
    trajectories = np.asarray(trajectories)
    if trajectories.ndim != 3 or trajectories.shape[2] < 2:
        raise ValueError(
            "expected an array of shape (systems, time_steps, >=2), "
            f"got shape {trajectories.shape}"
        )

    n_systems, n_steps = trajectories.shape[:2]
    return pd.DataFrame({
        # Each system id is repeated once per time step ...
        "system_id": np.repeat(np.arange(n_systems), n_steps),
        # ... while the time-step counter restarts for every system.
        "time_step": np.tile(np.arange(n_steps), n_systems),
        "prey": trajectories[:, :, 0].reshape(-1),
        "predator": trajectories[:, :, 1].reshape(-1),
    })


num_systems, num_time_steps, num_variables = trajectories.shape

# Create a DataFrame
df_traj = trajectories_to_frame(trajectories)


In [30]:
# Display the long-format table of raw trajectories (one row per system and time step)
df_traj

Unnamed: 0,system_id,time_step,prey,predator
0,0,0,0.949917,1.040624
1,0,1,0.740551,0.779542
2,0,2,0.682246,0.564390
3,0,3,0.716674,0.407644
4,0,4,0.824511,0.300283
...,...,...,...,...
99995,999,95,0.901549,0.579420
99996,999,96,0.957527,0.539055
99997,999,97,1.036460,0.515615
99998,999,98,1.129212,0.510619


Grouping prey and predator into arrays to determine the maximum value for the scaling procedure.

In [None]:
# Pull each species out as a flat NumPy array
prey_array = df_traj['prey'].to_numpy()
predator_array = df_traj['predator'].to_numpy()

# ndarray.max() is vectorised; the Python builtin max() would iterate over
# every element of the array one at a time.
print('Max value registered for prey:', prey_array.max())
print('Max value registered for predator:', predator_array.max())

Max value registered for prey 13.740113
Max value ragistered for predator 4.7684903


#### `numpy.quantile()`

For scaling our dataset we want to use [`numpy.quantile()`](https://numpy.org/doc/2.1/reference/generated/numpy.quantile.html). The `numpy.quantile()` function calculates the quantiles of a given NumPy array. Quantiles are cut points that divide the data into intervals with equal probability. Thus `numpy.quantile()` can be used to scale our dataset dynamically, without having to worry about choosing the appropriate value for $\alpha$.

In [None]:
# Quick demo of np.quantile

# Small 2x3 integer array used as the demo input
ar = np.array([
    [10, 7, 4],
    [3, 2, 1],
])
print(ar)

[[10  7  4]
 [ 3  2  1]]


In [66]:
# Using np.quantile

# 0.99 quantile of each row (axis=1 reduces across the columns)
np.quantile(ar, q=0.99, axis=1)

array([9.94, 2.98])

Playing around with bits of code that will be implemented in `preprocessor.py`

In [67]:
def scaling_operator(data, quantile_val, upper_limit=10):
    """Rescale ``data`` so that its ``quantile_val`` quantile maps to ``upper_limit``.

    The quantile is computed over the flattened input (no ``axis`` is passed),
    so one scalar factor is applied to every element.

    Parameters
    ----------
    data : array_like
        Values to rescale.
    quantile_val : float
        Quantile in ``[0, 1]`` forwarded to ``np.quantile``.
    upper_limit : float, optional
        Value the selected quantile is mapped to. Defaults to 10.

    Returns
    -------
    scaled_data : numpy.ndarray
        ``data / scaling_factor``.
    scaling_factor : float
        ``np.quantile(data, quantile_val) / upper_limit``. Multiply scaled
        values by it to recover the original values.

    Raises
    ------
    ValueError
        If ``upper_limit`` is zero, or if the scaling factor is zero or
        non-finite (for example when the selected quantile is 0 or NaN),
        since dividing by it would not give usable scaled data.
    """
    if upper_limit == 0:
        raise ValueError("upper_limit must be non-zero")

    # Accept lists/tuples as well as arrays; an ndarray passes through unchanged.
    data = np.asarray(data)

    scaling_factor = np.quantile(data, quantile_val) / upper_limit
    if scaling_factor == 0 or not np.isfinite(scaling_factor):
        raise ValueError(
            f"cannot scale data: scaling factor is {scaling_factor!r} "
            f"(quantile {quantile_val!r} of the data is zero or not finite)"
        )

    scaled_data = data / scaling_factor

    return scaled_data, scaling_factor
    

Testing on the small array `ar`

In [68]:
# scaling_operator returns a (scaled array, scaling factor) pair
ar_scaled = scaling_operator(ar, 0.99)
print('Scaled array (ar_scaled):', ar_scaled[0], end='\n\n')
print('Scaling factor:', ar_scaled[1])

Scaled array (ar_scaled): [[10.15228426  7.10659898  4.06091371]
 [ 3.04568528  2.03045685  1.01522843]]

Scaling factor: 0.9850000000000001


Testing on the actual data

In [69]:
# Scale the full dataset using its 0.9 quantile; the result is a
# (scaled array, scaling factor) pair
trajectories_ = scaling_operator(trajectories, quantile_val=0.9)
trajectories_scaled = trajectories_[0]
print('Scaling factor:', trajectories_[1])

Scaling factor: 0.25283724


In [70]:
num_systems_scaled, num_time_steps_scaled, num_variables_scaled = trajectories_scaled.shape

# Index columns for the long format: every system id is repeated once per
# time step, and the time-step counter restarts for each system.
system_ids_scaled = np.repeat(np.arange(num_systems_scaled), num_time_steps_scaled)
time_steps_scaled = np.tile(np.arange(num_time_steps_scaled), num_systems_scaled)

# Create a DataFrame
df_traj_scaled = pd.DataFrame({
    "system_id": system_ids_scaled,
    "time_step": time_steps_scaled,
    "prey": trajectories_scaled[..., 0].flatten(),      # prey values, system by system
    "predator": trajectories_scaled[..., 1].flatten(),  # predator values, system by system
})

In [71]:
# Display the long-format table of scaled trajectories (one row per system and time step)
df_traj_scaled

Unnamed: 0,system_id,time_step,prey,predator
0,0,0,3.757031,4.115786
1,0,1,2.928965,3.083177
2,0,2,2.698359,2.232228
3,0,3,2.834528,1.612280
4,0,4,3.261036,1.187654
...,...,...,...,...
99995,999,95,3.565728,2.291671
99996,999,96,3.787127,2.132025
99997,999,97,4.099317,2.039314
99998,999,98,4.466160,2.019556


In [73]:
# Pull each scaled species out as a flat NumPy array
prey_array_scaled = df_traj_scaled['prey'].to_numpy()
predator_array_scaled = df_traj_scaled['predator'].to_numpy()

# Use the vectorised ndarray reductions; the Python builtin max() would
# iterate over every element of the array one at a time.
print('Max value registered for prey (scaled):', prey_array_scaled.max())
print('Mean value registered for prey (scaled):', prey_array_scaled.mean())
print('Max value registered for predator (scaled):', predator_array_scaled.max())
print('Mean value registered for predator (scaled):', predator_array_scaled.mean())


Max value registered for prey (scaled): 54.343708
Mean value registered for prey (scaled): 6.716232
Max value registered for predator (scaled): 18.85992
Mean value registered for predator (scaled): 2.2528565


In [10]:
# NOTE(review): the wildcard import hides which names this notebook takes from
# src.qwen (presumably at least `load_qwen`, called in the next cell — confirm);
# prefer importing the needed names explicitly.
from src.qwen import *

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Obtain the model and tokenizer from the project helper
# (presumably supplied by the wildcard import from src.qwen — confirm)
model, tokenizer = load_qwen()

2025-03-13 21:52:07.016595: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741902727.104182    1050 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741902727.129181    1050 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-13 21:52:07.344146: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [12]:
from transformers import AutoTokenizer

# Model identifier passed to AutoTokenizer.from_pretrained
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
# NOTE: this rebinds `tokenizer`, replacing the one returned by load_qwen() earlier
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [13]:
# Token ids for a plain decimal string (first row of the returned batch)
print(tokenizer("1.23", return_tensors="pt")["input_ids"][0].tolist())

[16, 13, 17, 18]


In [14]:
# Token ids for the same characters separated by spaces (first row of the returned batch)
print(tokenizer("1 . 2 3", return_tensors="pt")["input_ids"][0].tolist())

[16, 659, 220, 17, 220, 18]
