In [3]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (17 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Using cached scipy-1.15.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading joblib-1.5.1-py3-none-any.whl (307 kB)
Using cached scipy-1.15.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.7 MB)
Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, s

In [1]:
from model import MiniTransformer  # your model definition
import torch
import torchvision
import numpy as np
import torch.multiprocessing as mp
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader
import torch.nn as nn
import matplotlib.pyplot as plt
import torch.nn.functional as F
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
import os
import copy
import warnings
import time

# The fine-tuning script is imported on a best-effort basis: a failure is
# reported here and the rest of the notebook keeps running. Only the cell that
# calls CLPSO_GRAD_script.run_clpso needs the import to have succeeded.
try:
    import CLPSO_GRAD_script
except Exception as import_failure:
    print("Failed to import script:", import_failure)

# Load the pre-trained checkpoint onto the CPU; the model is moved to the
# selected device in a later cell.
# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot run arbitrary code while it is being loaded.
checkpoint = torch.load("mini_llm_checkpoint.pt", map_location='cpu', weights_only=True)

# Vocabulary tables and size stored alongside the weights
stoi = checkpoint['stoi']
itos = checkpoint['itos']
vocab_size = checkpoint['vocab_size']

# Recreate the model with same architecture
model = MiniTransformer(vocab_size=vocab_size)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

MiniTransformer(
  (token_emb): Embedding(121, 128)
  (pos_emb): Embedding(64, 128)
  (blocks): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (head): Linear(in_features=128, out_features=121, bias=True)
)

In [2]:
# Prefer the GPU when one is visible to PyTorch, otherwise stay on the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

model = model.to(device)
print("Using device:", device)

Using device: cuda


In [3]:
# Step 1: Load the beginning of the corpus (whole lines, until max_chars is reached)
max_chars = 200_000_000  # Adjust depending on your RAM (1 million chars = ~1MB of text)

# Collect the lines and join them once at the end instead of growing a very
# large string with repeated `+=`.
lines = []
num_chars = 0
with open("TinyStories-train.txt", "r", encoding="utf-8", errors="ignore") as f:
    for line in f:
        # Same stopping rule as before: stop once the budget is reached, so the
        # last line read may overshoot max_chars slightly.
        if num_chars >= max_chars:
            break
        lines.append(line)
        num_chars += len(line)
text = "".join(lines)
del lines  # the joined string is the only copy that is needed from here on

print(f"Loaded {len(text):,} characters of text.")

# Step 2: Build character-level vocabulary
chars = sorted(set(text))
vocab_size = len(chars)
print(f"Vocabulary size: {vocab_size}")

stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for ch, i in stoi.items()}

# These tables replace the ones read from the checkpoint. The loaded weights
# were saved together with the checkpoint's char->id table, so a different
# table here would feed the model wrong ids without any error being raised.
if stoi != checkpoint['stoi']:
    raise ValueError(
        "Vocabulary rebuilt from TinyStories-train.txt does not match the "
        "vocabulary stored in the loaded checkpoint"
    )

def encode(s):
    """Return the list of integer ids for the characters of `s`.

    Looks every character up in the notebook-level `stoi` table; a character
    that is not a key of `stoi` raises KeyError.
    """
    return list(map(stoi.__getitem__, s))

def decode(l):
    """Return the string spelled by the integer ids in `l`.

    Looks every id up in the notebook-level `itos` table; an id that is not a
    key of `itos` raises KeyError.
    """
    pieces = []
    for token_id in l:
        pieces.append(itos[token_id])
    return ''.join(pieces)

# Step 3: Turn the text into one long tensor of token ids
# (characters without an entry in stoi are skipped)
ids = [stoi[ch] for ch in filter(stoi.__contains__, text)]
data = torch.tensor(ids, dtype=torch.long)
print("Data shape:", data.size())

Loaded 200,000,075 characters of text.
Vocabulary size: 121
Data shape: torch.Size([200000075])


In [4]:
# Hold out the last 10% of the token stream for validation
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

# Shape of the batches produced by get_batch
block_size = 64  # tokens per sequence (context window length)
batch_size = 32  # sequences per batch

# Batch sampling function
def get_batch(split):
    """Draw one random batch of (input, target) sequences.

    `split == 'train'` samples from `train_data`; any other value samples from
    `val_data`. Each target row is its input row shifted one position to the
    right in the token stream. Both tensors have shape
    (batch_size, block_size) and are moved to `device`.
    """
    source = val_data if split != 'train' else train_data
    # randint's upper bound is exclusive, so start + block_size + 1 never runs
    # past the end of the split.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        window = source[start:start + block_size + 1]
        inputs.append(window[:-1])
        targets.append(window[1:])

    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

In [6]:
# Seed the RNGs that are in scope so the run starts from a repeatable state:
# get_batch samples its windows with torch.randint, and torch.manual_seed also
# seeds the CUDA generators.
# NOTE(review): run_clpso is defined outside this notebook — confirm which RNGs
# it draws from (torch / numpy / random) and seed any that are missing here.
SEED = 1337
torch.manual_seed(SEED)
np.random.seed(SEED)

criterion = nn.CrossEntropyLoss()

# Call the CLPSO fine-tuning function.
# `model` is rebound to the model returned by run_clpso.
model, losses, precisions = CLPSO_GRAD_script.run_clpso(
    model_path="mini_llm_checkpoint.pt",           # path to your pre-trained model
    get_batch_fn=get_batch,          # your batch sampling function
    criterion=criterion,
    vocab_size=vocab_size,           # should match your token count
    fine_tune_epochs=100,              # tweak as needed
    num_particles=30,                  # tweak based on GPU memory
    w=0.5,
    c1=1.8,
    c2=1.2,
    bounds=0.1,
    p_threshold=0.05,
    gd_learning_rate=0.006,
    gd_weight_decay=0.008,
    num_grad_steps=6,
    num_eval_batches=1
)

Initial Global Best Fitness (from pre-trained head): 0.8042
Epoch 1/100
New global best fitness: 0.7709
New global best fitness: 0.7471
New global best fitness: 0.7383
New global best fitness: 0.7198
Epoch 1/100 - Val Loss: 0.8196, Val Acc: 74.12%
Epoch completed in 2.89s. Best Global Fitness: 0.7198
Epoch 2/100
Epoch 2/100 - Val Loss: 0.8121, Val Acc: 73.14%
Epoch completed in 2.60s. Best Global Fitness: 0.7198
Epoch 3/100
Epoch 3/100 - Val Loss: 0.7594, Val Acc: 76.07%
Epoch completed in 2.53s. Best Global Fitness: 0.7198
Epoch 4/100
Epoch 4/100 - Val Loss: 0.7844, Val Acc: 74.90%
Epoch completed in 2.51s. Best Global Fitness: 0.7198
Epoch 5/100
Epoch 5/100 - Val Loss: 0.7940, Val Acc: 74.90%
Epoch completed in 2.67s. Best Global Fitness: 0.7198
Epoch 6/100
Epoch 6/100 - Val Loss: 0.8063, Val Acc: 72.85%
Epoch completed in 2.56s. Best Global Fitness: 0.7198
Epoch 7/100
Epoch 7/100 - Val Loss: 0.8140, Val Acc: 73.78%
Epoch completed in 2.59s. Best Global Fitness: 0.7198
Epoch 8/100
Ep

In [7]:
# Persist the fine-tuned weights together with the vocabulary tables, using the
# same keys as the pre-trained checkpoint.
torch.save(
    dict(
        model_state_dict=model.state_dict(),
        stoi=stoi,
        itos=itos,
        vocab_size=vocab_size,
    ),
    "finetuned_llm_clpso.pt",
)

In [8]:
# Step 1: Load the checkpoint
# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot run arbitrary code while it is being loaded.
checkpoint = torch.load("finetuned_llm_clpso.pt", map_location='cpu', weights_only=True)

# Step 2: Restore vocabulary and mappings
stoi = checkpoint['stoi']
itos = checkpoint['itos']
vocab_size = checkpoint['vocab_size']

# Step 3: Rebuild the model and load weights
model = MiniTransformer(vocab_size=vocab_size)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval().to(device)  # don't forget to move to GPU if available

MiniTransformer(
  (token_emb): Embedding(121, 128)
  (pos_emb): Embedding(64, 128)
  (blocks): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (head): Linear(in_features=128, out_features=121, bias=True)
)

In [24]:
def generate(model, start_text, max_new_tokens=200, block_size=64, temperature=0.8, top_k=50):
    """Sample a continuation of `start_text` from `model`, one token at a time.

    Characters and token ids are converted with the notebook-level `stoi` and
    `itos` tables.

    Args:
        model: module whose output for a (batch, seq) id tensor is indexed as
            `[:, -1, :]`, i.e. logits of shape (batch, seq, vocab).
        start_text: non-empty prompt; every character must be a key of `stoi`.
        max_new_tokens: upper bound on the number of sampled tokens.
        block_size: only the last `block_size` ids are fed to the model.
        temperature: positive divisor applied to the logits before sampling.
        top_k: sample only among the `top_k` highest logits. `None` or a
            non-positive value disables the filter; values larger than the
            vocabulary are clamped to the vocabulary size.

    Returns:
        The prompt followed by the generated text, as one string.

    Raises:
        ValueError: if `start_text` is empty or `temperature` is not positive.
        KeyError: if `start_text` contains a character missing from `stoi`.
    """
    if not start_text:
        raise ValueError("start_text must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    model.eval()
    device = next(model.parameters()).device
    input_ids = torch.tensor([stoi[c] for c in start_text], dtype=torch.long)[None, :].to(device)

    # Inference only: no autograd graph needs to be built for the forward passes.
    with torch.no_grad():
        for _ in range(max_new_tokens):
            x_cond = input_ids[:, -block_size:]
            logits = model(x_cond)
            logits = logits[:, -1, :] / temperature  # only the last token's logits

            # Top-k filtering: every logit outside the k largest is set to -inf
            # so it gets exactly zero probability after the softmax. (Filling
            # with 0 instead would leave those tokens sampleable, and more
            # likely than the kept ones whenever the kept logits are negative.)
            if top_k is not None and top_k > 0:
                k = min(top_k, logits.size(-1))
                values, indices = torch.topk(logits, k)
                logits = torch.full_like(logits, float('-inf')).scatter_(1, indices, values)

            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            input_ids = torch.cat([input_ids, next_token], dim=1)

            # Stop early if the vocabulary has a dedicated end-of-text entry
            if itos[next_token.item()] == '<|endoftext|>':
                break

    return ''.join([itos[i] for i in input_ids[0].tolist()])

In [86]:
# Sample a 400-token continuation of a story prompt from the fine-tuned model
output = generate(
    model,
    "Once upon a time there was a robot",
    max_new_tokens=400,
    temperature=0.7,
    top_k=30,
)
print(output)

Once upon a time there was a robot named Timmy. Timmy loved to play outside in the sun. One day, he saw a big ball and noticed that Max was falling on the ground and started to cry.
