In [None]:
import json
import io
from pathlib import Path
import tiktoken
import torch

# Corpus file, read line by line; each non-blank line is parsed as JSON and
# its "text" field is appended to the training text.
input_path = Path("../../datasets/wolne_lektury_corpus_cleaned.jsonl")
enc = tiktoken.get_encoding("cl100k_base")

buf = io.StringIO()
docs = 0  # lines that contributed text
skipped = 0  # non-blank lines that yielded no usable text
with open(input_path, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            obj = json.loads(line)
        except json.JSONDecodeError:
            # Best-effort load: keep going, but remember that a line was dropped.
            skipped += 1
            continue
        # Valid JSON that is not an object (list, number, ...) has no .get().
        text = obj.get("text") if isinstance(obj, dict) else None
        # buf.write() accepts only str, so missing, empty and non-string
        # payloads are all skipped instead of raising.
        if not isinstance(text, str) or not text:
            skipped += 1
            continue
        buf.write(text)
        buf.write("\n\n")  # blank line between consecutive documents
        docs += 1

concatenated_text = buf.getvalue()
# disallowed_special=() makes text such as "<|endoftext|>" that happens to
# occur in the corpus encode as ordinary text instead of raising ValueError.
tokens = enc.encode(concatenated_text, disallowed_special=())

print(f"Documents: {docs:,} (skipped lines: {skipped:,})")
print(f"Total characters: {len(concatenated_text):,}")
print(f"Total tokens: {len(tokens):,}")

In [None]:
# Pack the token ids into a single int64 tensor.
data = torch.tensor(tokens, dtype=torch.long)

# Sanity check: overall size and dtype, then the first 1000 entries.
print(data.size(), data.dtype)
print(data[0:1000])

In [None]:
# Split point: the first 90% of the tokens are used for training and the
# remaining tail is held out for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [None]:
# Number of tokens in one input window.
BLOCK_SIZE = 8

# One window plus one extra token: the span that inputs and their shifted
# targets are both cut from.
train_data[0 : BLOCK_SIZE + 1]

In [None]:
# Inputs are the first BLOCK_SIZE tokens; targets are the same span shifted
# one position to the right.
x = train_data[0:BLOCK_SIZE]
y = train_data[1 : BLOCK_SIZE + 1]

# Every prefix x[: t + 1] is paired with the token that follows it, y[t].
for t in range(BLOCK_SIZE):
    context, target = x[: t + 1], y[t]
    print(f"Input -> {context.tolist()}, Target -> {target.item()}")

In [None]:
# Seed torch's default RNG so the randomly sampled batches are repeatable.
torch.manual_seed(1337)

# Sequences per batch, and tokens per sequence.
BATCH_SIZE, BLOCK_SIZE = 4, 8


def get_batch(split):
    """Sample a random batch of input/target windows from one data split.

    Args:
        split: "train" selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        A tuple ``(x, y)`` of tensors, each of shape
        ``(BATCH_SIZE, BLOCK_SIZE)``. ``y`` holds the same windows as ``x``
        shifted one position to the right, i.e. the next-token targets.
    """
    source = train_data if split == "train" else val_data
    # Start offsets are drawn from [0, len(source) - BLOCK_SIZE), so a window
    # of BLOCK_SIZE + 1 tokens always fits inside the split.
    starts = torch.randint(len(source) - BLOCK_SIZE, (BATCH_SIZE,))
    inputs, targets = [], []
    for start in starts:
        window = source[start : start + BLOCK_SIZE + 1]
        inputs.append(window[:-1])
        targets.append(window[1:])
    return torch.stack(inputs), torch.stack(targets)


# Draw one training batch and report the size of the inputs and the targets.
xb, yb = get_batch("train")
print("xb:", xb.size())
print("yb:", yb.size())

In [None]:
# Walk every sequence in the batch and list each growing context together
# with the token it is expected to predict.
for b in range(BATCH_SIZE):
    row_inputs, row_targets = xb[b], yb[b]
    for t in range(BLOCK_SIZE):
        context = row_inputs[: t + 1]
        target = row_targets[t]
        print(f"b{b} t{t}: Input -> {context.tolist()}, Target -> {target.item()}")

In [None]:
# Show the raw input batch: one row per sampled sequence.
print(xb)