In [1]:
%load_ext autoreload
%autoreload 2

from llm.train import Trainer

# Configure a small GPT-style model and train it on the text at the URL below.
trainer = Trainer(
    # -- input / output locations --
    model_path='results/tolstoy',  # directory that receives the saved checkpoints
    training_data_path='https://www.gutenberg.org/cache/epub/2600/pg2600.txt',  # corpus: URL or local path
    # -- evaluation --
    eval_interval=10,  # report train/val loss every 10 steps
    # -- batching --
    batch_size=4,  # batch size
    block_size=16,  # context length
    # -- architecture --
    n_layer=2,  # number of layers
    n_head=4,  # attention heads in each layer
    n_embd=32,  # embedding dimension
    dropout=0.2,  # dropout rate
    # -- optimiser --
    learning_rate=0.05,  # learning rate
    min_lr=0.005,  # minimum learning rate
    beta2=0.99,  # Adam beta2 (lower it for larger models / datasets)
)

# Start training; loss and checkpoint messages are printed while it runs.
trainer.run()

Using device cuda
tokens per iteration will be: 2,560
Initializing a new model from scratch
defaulting to vocab_size of GPT-2 to 50304 (50257 rounded up for efficiency)
1.63M parameters (6.23MB)
num decayed parameter tensors: 10, with 1,634,816 parameters
num non-decayed parameter tensors: 5, with 160 parameters
using fused AdamW: True
step 0: train loss 10.8222, val loss 10.8219
step 10: train loss 10.7763, val loss 10.7851
saving checkpoint to results/tolstoy/ckpt_init.pt
step 20: train loss 10.5780, val loss 10.5954
saving checkpoint to results/tolstoy/ckpt.pt
step 30: train loss 10.2757, val loss 10.2863
saving checkpoint to results/tolstoy/ckpt.pt
step 40: train loss 9.8405, val loss 9.8803
saving checkpoint to results/tolstoy/ckpt.pt
step 50: train loss 9.2679, val loss 9.2986
saving checkpoint to results/tolstoy/ckpt.pt
step 60: train loss 8.6045, val loss 8.6906
saving checkpoint to results/tolstoy/ckpt.pt
step 70: train loss 7.9481, val loss 8.0610
saving checkpoint to results

In [2]:
# Print the built-in documentation for Trainer, including its constructor options.
help(Trainer)

Help on class Trainer in module llm.train:

class Trainer(llm.ml.ML)
 |  Trainer(training_data_path: Optional[str] = MISSING, init_from: str = MISSING, torch_compile: bool = MISSING, train_ratio: float = MISSING, log_interval: int = MISSING, eval_interval: int = MISSING, eval_iters: int = MISSING, eval_only: bool = MISSING, always_save_checkpoint: bool = MISSING, override_checkpoint: bool = MISSING, wandb_log: bool = MISSING, wandb_project: str = MISSING, wandb_run_name: str = MISSING, encoding: str = MISSING, dataset: Optional[str] = MISSING, gradient_accumulation_steps: int = MISSING, batch_size: int = MISSING, block_size: int = MISSING, n_layer: int = MISSING, n_head: int = MISSING, n_embd: int = MISSING, dropout: float = MISSING, bias: bool = MISSING, learning_rate: float = MISSING, max_iters: int = MISSING, weight_decay: float = MISSING, beta1: float = MISSING, beta2: float = MISSING, grad_clip: float = MISSING, decay_lr: bool = MISSING, warmup_iters: int = MISSING, lr_decay_iters

In [3]:
from llm.sample import Sampler

# Load the model saved under the training output directory
# (checkpoint_name defaults to 'last', init_from to 'resume').
sampler = Sampler(model_path='results/tolstoy')

# Continue the prompt for 100 generated tokens and show the result.
generated_text = sampler.generate_text(prompt='He decided to', max_tokens=100)
print(generated_text)

Using device cuda
Using model in results/tolstoy/ckpt.pt
1.63M parameters (6.23MB)
He decided to the battle beyond the door.
the




d, the whole longer,� said he had the study, her been
out than but he was Dósov.
We the life... he did had see already should an coach-who and atsha!” man, “I said she as

but
in be seen a little heard a arm
old up.
After. The


In [4]:
# Print the documented constructor options of Sampler (checkpoint_name, init_from).
help(Sampler.__init__)

Help on function __init__ in module llm.sample:

__init__(self, checkpoint_name: str = 'last', init_from: str = 'resume', **kwargs)
    :param checkpoint_name: name of the checkpoint to load, ignored if init_from is not 'resume'
    :param init_from: either 'resume' (from a local model_path) or 'online' (from HuggingFace hub)



In [5]:
# Print the documented generation options (prompt, num_samples, max_tokens, temperature, top_k).
help(Sampler.generate_text)

Help on function generate_text in module llm.sample:

generate_text(self, prompt: str = '\n', num_samples: int = 1, max_tokens: int = 100, temperature: float = 1.0, top_k: int = 200)
    :param prompt: prompt to start generation.
      Can be "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt"
    :param num_samples: number of samples to draw
    :param max_tokens: number of tokens generated in each sample
    :param temperature: 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
    :param top_k: retain only the top_k most likely tokens, clamp others to have 0 probability



In [6]:
# Load pretrained GPT-2 weights ('online' = from the HuggingFace hub) instead of a local checkpoint.
# NOTE: this rebinds `sampler`; the model loaded from results/tolstoy is no longer reachable through it.
sampler = Sampler(
    init_from='online',
    model_path='gpt2',
)

# Generate with the default sampling settings and print the continuation.
print(sampler.generate_text(prompt='Today I decided to'))

Using device cuda
loading weights from pretrained gpt: gpt2
forcing vocab_size=50257, block_size=1024, bias=True
overriding dropout rate to 0.0
123.65M parameters (471.70MB)


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

No meta.pkl found, assuming GPT-2 encodings
Today I decided to write about some of the films I have seen. I began reading all the reviews and viewed some of them and watched the answers to some of the questions!

Sean McDermott: Could you tell us a bit about what you observe about the genre coming from Sony Pictures?

Jean-Pierre Dumont: Right now the genre is big. When it is larger it is different enough that they actually support a lot of what the big studios were pushing with this thing. For instance when Star Wars


In [8]:
from llm.interface import UserInterface

# Create the UserInterface for the pretrained GPT-2 model.
# NOTE(review): model_kw presumably carries extra model-loading options (here the same
# init_from='online' used with Sampler above) -- confirm in llm.interface.
ui = UserInterface(
    model_path='gpt2',
    model_kw={'init_from': 'online'},
)
ui.run()

Using device cpu
loading weights from pretrained gpt: gpt2
forcing vocab_size=50257, block_size=1024, bias=True
overriding dropout rate to 0.0
123.65M parameters (471.70MB)
No meta.pkl found, assuming GPT-2 encodings
