In [1]:
from model import MiniTransformer  # your model definition
import torch.nn.functional as F

import torch
import torch.nn as nn

try:
    import CLPSO_GRAD_script
except Exception as e:
    print("Failed to import script:", e)

checkpoint = torch.load("mini_llm_checkpoint.pt", map_location='cpu')

stoi = checkpoint['stoi']
itos = checkpoint['itos']
vocab_size = checkpoint['vocab_size']

# Recreate the model with same architecture
model = MiniTransformer(vocab_size=vocab_size)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

Failed to import script: No module named 'seaborn'


MiniTransformer(
  (token_emb): Embedding(121, 128)
  (pos_emb): Embedding(64, 128)
  (blocks): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (head): Linear(in_features=128, out_features=121, bias=True)
)

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
print("Using device:", device)

Using device: cuda


In [3]:
# Step 1: Load a small portion of the file
max_chars = 200_000_000  # Adjust depending on your RAM (1 million = ~1MB of text)

text = ""
with open("TinyStories-train.txt", "r", encoding="utf-8", errors="ignore") as f:
    while len(text) < max_chars:
        line = f.readline()
        if not line:
            break
        text += line

print(f"Loaded {len(text):,} characters of text.")

# Step 2: Build character-level vocabulary
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(f"Vocabulary size: {vocab_size}")

stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for ch, i in stoi.items()}

def encode(s):
    return [stoi[c] for c in s]

def decode(l):
    return ''.join([itos[i] for i in l])

# Step 3: Convert to tensor efficiently
ids = [stoi[c] for c in text if c in stoi]
data = torch.tensor(ids, dtype=torch.long)
print("Data shape:", data.shape)

Loaded 200,000,075 characters of text.
Vocabulary size: 121
Data shape: torch.Size([200000075])


In [4]:
# Split into training and validation sets
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

# Set model input parameters
block_size = 64  # context window length
batch_size = 32  # number of sequences per batch

# Batch sampling function
def get_batch(split):
    data_split = train_data if split == 'train' else val_data
    ix = torch.randint(len(data_split) - block_size, (batch_size,))
    x = torch.stack([data_split[i:i+block_size] for i in ix])
    y = torch.stack([data_split[i+1:i+block_size+1] for i in ix])
    return x.to(device), y.to(device)

In [5]:
criterion = nn.CrossEntropyLoss()

# Call the CLPSO fine-tuning function
model, losses, precisions = CLPSO_GRAD_script.run_clpso(
    model_path="mini_llm_checkpoint.pt",           # path to your pre-trained model
    get_batch_fn=get_batch,          # your batch sampling function
    criterion=criterion,
    vocab_size=vocab_size,           # should match your token count
    fine_tune_epochs=3,              # tweak as needed
    num_particles=5                  # tweak based on GPU memory
)

NameError: name 'CLPSO_GRAD_script' is not defined

In [9]:
def generate(model, start_text, max_new_tokens=20, block_size=64, temperature=0.8):
    model.eval()
    device = next(model.parameters()).device
    input_ids = torch.tensor([stoi[c] for c in start_text], dtype=torch.long)[None, :].to(device)

    for _ in range(max_new_tokens):
        x_cond = input_ids[:, -block_size:]  # crop context window
        logits = model(x_cond)               # [B, T, vocab]
        probs = F.softmax(logits[:, -1, :]/ temperature, dim=-1)  # last token
        next_token = torch.multinomial(probs, num_samples=1)  # sample
        input_ids = torch.cat([input_ids, next_token], dim=1)

    return ''.join([itos[i] for i in input_ids[0].tolist()])

In [19]:
output = generate(model, "Once there was a cat who",max_new_tokens=100,temperature=0.5)
print(output)

Once there was a cat whoh ohckhh chhh crr hhhhhh hhhhakh hhh chhir that he shair the happy lot. The stom was it the stor sta
