In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")
# Commented out because we yet again find mps to be drastically slower
# elif torch.backends.mps.is_available():
#     torch._dynamo.disable()  # https://github.com/pytorch/pytorch/issues/149184
#     device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"{device=}")

device=device(type='cpu')


In [9]:
from transformers import AutoTokenizer
from utilities.data_handler import load_preprocessed

tokenizer = AutoTokenizer.from_pretrained(
    "openai-community/gpt2",
    pad_token="<|pad|>",
    unk_token="<unk>",  # Because it appears often in the dataset
)

context_length = 512
dataset, tokenized_ds = load_preprocessed(
    hf_path="wikitext", hf_name="wikitext-103-v1", tokenizer=tokenizer, context_length=context_length
)
dataset, tokenized_ds

(DatasetDict({
     test: Dataset({
         features: ['text'],
         num_rows: 4358
     })
     train: Dataset({
         features: ['text'],
         num_rows: 1801350
     })
     validation: Dataset({
         features: ['text'],
         num_rows: 3760
     })
 }),
 DatasetDict({
     test: Dataset({
         features: ['input_ids'],
         num_rows: 2230
     })
     train: Dataset({
         features: ['input_ids'],
         num_rows: 930314
     })
     validation: Dataset({
         features: ['input_ids'],
         num_rows: 1953
     })
 }))

In [4]:
from utilities.models import PositionalEmbedding
from torch import nn

token_embedder = nn.Embedding(
    num_embeddings=tokenizer.vocab_size, embedding_dim=512, device=device
)
positional_embedder = PositionalEmbedding(embedding_dim=512)
transformer_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, device=device)
transformer = nn.TransformerEncoder(transformer_layer, num_layers=6)
decoder = nn.Linear(512, tokenizer.vocab_size, device=device)

src = ["Hi, my name", "The United States of"]
tokenized = tokenizer(src, return_tensors="pt").to(device)
embedded = token_embedder(tokenized.input_ids) + positional_embedder(tokenized.input_ids)
transformed = transformer(
    embedded.permute(1, 0, 2),  # Transformer expects (seq_len, batch_size, features)
    mask=nn.Transformer.generate_square_subsequent_mask(tokenized.input_ids.shape[1], device=device),
    # Skipping is_causal since seems troublesome: https://github.com/pytorch/pytorch/issues/96941
)
logits = decoder(transformed.permute(1, 0, 2))  # Back to (batch_size, seq_len, features)
result = tokenizer.batch_decode(logits[:, -1, :].argmax(dim=-1))
result



[' networks', ' networks']

In [5]:
from utilities.models import TransformerEncoderGPT

model = TransformerEncoderGPT(d_model=512, nhead=8, num_layers=6, dim_feedforward=2048, vocab_size=tokenizer.vocab_size, device=device)
model

MyGPT(
  (token_embedder): Embedding(50257, 512)
  (positional_embedder): PositionalEmbedding()
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (decoder): Linear(in_features=512, out_features=50257, bias=True)
)

In [14]:
tokenizer.batch_decode(model(tokenizer(src, return_tensors="pt").input_ids.to(device))[:, -1, :].argmax(dim=-1))

['umbn', ' cyt']

In [15]:
from utilities.model_handler import print_stream

for _ in range(10):
    print_stream(model=model, tokenizer=tokenizer, prompt="The United States of", device=device, max_length=50)
    print("", flush=True)

The United States of disinteg Salvador McAuliffe pred Christian predicio SalvadorViceUnd cyt continuation catchy stimul Launch Launch Launch Launch Launch Launch Launch Launch Launch Launch Launch Launch Launch Launch Byzantine cyt208 Launch swallow Ride Ride Ride AurTue Aur Launch Launch Launch Launch Launch Launch Launch Launch Launch Launch walked
The United States of chall experien LaunchRel 345coolCookBuilt Hundredscoolér hectcoolCookBuilt Hundredscool mentor Launch Launch Launch Launch Launch Launch Launch Launch swallow cyt Launch walked cyt Morgan radiobleacher pepper Launch radioConsider radio cyt Launch Launch Launch Byzantine Launch refusal normal cartLeave Launch
The United States ofViceUnd nexusIB 104699 Lehakh experien Launch fentanyl batters Launch Launch Launch Launch Launch Launch Launch Launch Launch Launch Launch Launch closely cyt208 Launch Byzantine Launch Byzantine cyt208 Launch radio cyt Launch walked208 Launch Launch Launch Launch Launch Launch Launch Launch rad

In [16]:
dataset["train"]["text"][:5]

['',
 ' = Valkyria Chronicles III = \n',
 '',
 ' Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . \n',
 " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving

In [18]:
tokenizer.pad_token = tokenizer.eos_token
print(tokenizer.pad_token_id)
tokenized = [
    {
        "input_ids": tokenizer(s, return_tensors="pt").input_ids.flatten().tolist()
    }
    for s in ["Hi", "my name is", "What, my name is"]
]
print(tokenized)
tokenizer.pad(tokenized, return_tensors="pt")

50256
[{'input_ids': [17250]}, {'input_ids': [1820, 1438, 318]}, {'input_ids': [2061, 11, 616, 1438, 318]}]


{'input_ids': tensor([[17250, 50256, 50256, 50256, 50256],
        [ 1820,  1438,   318, 50256, 50256],
        [ 2061,    11,   616,  1438,   318]]), 'attention_mask': tensor([[1, 0, 0, 0, 0],
        [1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1]])}

In [21]:
import time
from pathlib import Path
from utilities.model_handler import train

# Using hyperparams of GPT paper (although we use a different dataset)
model = TransformerEncoderGPT(d_model=768, nhead=12, num_layers=12, dim_feedforward=3072, vocab_size=tokenizer.vocab_size, device=device)
optimizer = torch.optim.Adam(
    params=model.parameters(),
    betas=(0.9, 0.98),
    eps=1e-9,
    lr=2.5e-4,
)
train_losses, eval_losses = train(
    model=model,
    optimizer=optimizer,
    tokenizer=tokenizer,
    tokenized_train_ds=tokenized_ds["train"],
    tokenized_eval_ds=tokenized_ds["validation"],
    device=device,
    checkpoint_path=Path(f"checkpoints/{int(time.time())}"),
)

Batch 1/7969 in epoch 1/100: Loss 11.03641414642334
In 1814, the holidaysceptiveceptiveceptiveceptiveceptiveceptiveceptive CSV CSV CSV CSV CSV CSV CSV CSV CSV CSV CSV CSV CSV drifting drifting drifting drifting drifting drifting drifting drifting drifting drifting CSV CSV CSV CSV CSV CSV CSV CSV CSV CSV CSV CSV doomed drifting drifting doomed drifting doomed drifting
Avg. validation Loss 11.022744178771973
Batch 26/7969 in epoch 1/100: Loss 10.633322715759277
Batch 51/7969 in epoch 1/100: Loss 10.01376724243164


KeyboardInterrupt: 

In [16]:
from utilities.models import TransformerEncoderGPT2
from utilities.model_handler import print_stream

model = TransformerEncoderGPT2(d_model=768, nhead=12, num_layers=12, dim_feedforward=3072, vocab_size=len(tokenizer), eos_token_id=tokenizer.eos_token_id, context_length=512, device=device)
model.load_state_dict(torch.load("../epoch_5_batch_1", map_location=device))

for _ in range(10):
    print_stream(model=model, tokenizer=tokenizer, prompt=f"{tokenizer.eos_token}She first", device=device, max_length=512, prob_threshold=0.95, temperature=1.0)
    print("", flush=True)

<|endoftext|>She first formed on 4 July 1936 , when Russell , with whom J. Gillian married Lehigh , and lost a series of matches . He was succeeded by a long , but he finally won the Test . Albert was the 1948 championship . This feat ended with an average of 71 . Despite his physical optimism since London , he ran on his ticket for the death of Hugh Dudley . John Bardowulf was later awarded over 100 @-@ weighed . In the winter he scored in the midst of a season that contained many other magazines than Carl W. Harvey . 
<|endoftext|>

<|endoftext|>She first appeared in the 2009 Indianapolis Colts , where they finished the first A team in the preseason award . In the first game , he stopped for the rest of the team for " back down and down to four down to two catch between the Tigers and their enemy . " 
<|endoftext|>

<|endoftext|>She first reported that Rivera had grown on playing for a band since moving a house around Détis . His funeral is divided into sixteen syncedively and domina

KeyboardInterrupt: 