##### Model Paramenters

In [None]:
from transformers import AutoConfig

# load model configuration
config = AutoConfig.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", trust_remote_code=True)

D = config.hidden_size             # hidden dimension 
H = config.num_attention_heads     # number of heads
d = D // H                         # hidden dimension per head
L= config.num_hidden_layers        # number of layers
V = config.vocab_size              # vocabulary size

print(f"Model parameters:")
print(f"  Hidden dimension: {D}")
print(f"  Number of attention heads: {H}")
print(f"  Hidden dimension per head: {d}")
print(f"  Number of layers: {L}")
print(f"  Vocabulary size: {V}")

Model parameters:
  Hidden dimension: 896
  Number of attention heads: 14
  Hidden dimension per head: 64
  Number of layers: 24
  Vocabulary size: 151936


##### Baseline Budget

In [None]:
from lora_qwen.flops import calculate_flops
baseline_flops = calculate_flops(499, H, D, L, B=1, inference=True, infer_length=99)
print(f"{baseline_flops:e}")

3.716343e+13


##### LoRA 1: With default parameters

In [None]:
from lora_qwen.flops import calculate_flops
training_flops1 = calculate_flops(512, H, D, L, B=4, nsteps=2000, inference=False)
evalue_flops1 = 5 * calculate_flops(499, H, D, L, B=1, inference=True, infer_length=99)  # 1/5 of max steps evaluate once

flops1 = training_flops1 + evalue_flops1
print(f"{flops1:e}")

9.444766e+15


##### LoRA 2: Hyperparameter search

In [None]:
from lora_qwen.flops import calculate_flops

# 2 configurations have early stopped (see hyperparameter_result.csv)
# so that number of evaluations for lr_rank_search is 7*5 + 2*3 = 41
flops_lr_rank = 7 * calculate_flops(320, H, D, L, B=2, nsteps=1500, inference=False)
flops_lr_rank = 2 * calculate_flops(320, H, D, L, B=2, nsteps=900, inference=False)  

search_ctx_length = []
for i in [128, 512, 768]:
    search_ctx_length.append(calculate_flops(i, H, D, L, B=2, nsteps=2000, inference=False))
flops_ctx_length = sum(search_ctx_length)

# 41 for lr_rank (see above) and 5*3 for three context length search
evalue_flops2 = (41 + 15) *  calculate_flops(499, H, D, L, B=1, inference=True, infer_length=99)


flops2 = flops_lr_rank + flops_ctx_length + evalue_flops2
print(f"{flops2:e}")

1.751959e+16


##### LoRA 3: Final Model

In [None]:
from lora_qwen.flops import calculate_flops
training_flops3 = calculate_flops(512, H, D, L, B=4, nsteps=9700, inference=False)
evalue_flops3 = (3+7) * calculate_flops(499, H, D, L, B=1, inference=True, infer_length=99)

flops3 = training_flops3 + evalue_flops3
print(f"{flops3:e}")

4.527753e+16


##### Total FLOPS

In [6]:
total_flops = baseline_flops + flops1 + flops2 + flops3
print(f"{total_flops:e}")

7.227905e+16


In [8]:
import pandas as pd

total_budget = 1e17 

# create a dictionary with experiment names and their FLOPS
experiments = {
    "Baseline (Inference only)": baseline_flops,
    "LoRA 1 (Default parameters)": flops1,
    "LoRA 2 (Hyperparameter search)": flops2,
    "LoRA 3 (Final Model)": flops3,
    "Total": total_flops
}

df = pd.DataFrame(list(experiments.items()), columns=["Experiment", "FLOPS"])

# scientific notation
df["FLOPS (Scientific)"] = df["FLOPS"].apply(lambda x: f"{x:.6e}")

# percentage of total budget
df["% of Total Budget"] = (df["FLOPS"] / total_budget) * 100
df["% of Total Budget"] = df["% of Total Budget"].apply(lambda x: f"{x:.4f}%")

df = df.drop(columns=["FLOPS"])
df = df.rename(columns={"FLOPS (Scientific)": "FLOPS"})

print(df)

# save
with open("../results/flops_summary.txt", "w") as f:
    f.write(f"Total Budget: {total_budget:.0e} FLOPS\n")
    
    # Write headers with fixed width formatting
    f.write(f"{'Experiment':<40} {'FLOPS':<20} {'% of Total Budget':<20}\n")
    f.write("-" * 80 + "\n")
    
    # Write each row
    for _, row in df.iterrows():
        f.write(f"{row['Experiment']:<40} {row['FLOPS']:<20} {row['% of Total Budget']:<20}\n")


                       Experiment         FLOPS % of Total Budget
0       Baseline (Inference only)  3.716343e+13           0.0372%
1     LoRA 1 (Default parameters)  9.444766e+15           9.4448%
2  LoRA 2 (Hyperparameter search)  1.751959e+16          17.5196%
3            LoRA 3 (Final Model)  4.527753e+16          45.2775%
4                           Total  7.227905e+16          72.2790%
