In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [3]:
# Load the raw training corpus.
# The path uses a forward slash so it resolves on every OS and matches the
# "datasets/input.txt" path the tokenizer cells train on. The previous
# "datasets\input.txt" literal only worked on Windows and depended on "\i"
# being an unrecognised (and now warned-about) escape sequence.
with open("datasets/input.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Character-level vocabulary: every distinct character in the corpus, sorted
# so that the character -> id assignment is deterministic across runs.
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables between characters and their integer ids
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Encoder: take a string, output a list of integers.

    Raises KeyError if `s` contains a character missing from `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integers, output a string.

    Raises KeyError if `l` contains an id missing from `itos`.
    """
    return "".join(itos[i] for i in l)


# Train and validation splits: the first 90% of the encoded corpus is used for
# training, the remaining 10% for validation.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [7]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

In [8]:
# Corpus file(s) the tokenizer is trained on
files = ["datasets/input.txt"]

# Trainer for the BPE model; the listed special tokens are registered with it
trainer = BpeTrainer(
    special_tokens=[
        "[UNK]",
        "[CLS]",
        "[SEP]",
        "[PAD]",
        "[MASK]",
    ]
)

# Byte-Pair Encoding (BPE) tokenizer, with "[UNK]" as the unknown token and
# the library's Whitespace pre-tokenizer applied to the raw text
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# Train the tokenizer on the corpus
tokenizer.train(files, trainer=trainer)

In [11]:
# Replace the earlier vocabulary tables with the trained tokenizer's vocabulary
stoi = tokenizer.get_vocab()  # token string -> id
itos = {idx: tok for tok, idx in stoi.items()}  # id -> token string
vocab_size = tokenizer.get_vocab_size()


def encode(s):
    """Tokenize a string with the current `tokenizer` and return its ids."""
    return tokenizer.encode(s).ids


def decode(l):
    """Turn a list of token ids back into text with the current `tokenizer`."""
    return tokenizer.decode(l)


# Re-encode the whole corpus with the tokenizer and store it as a long tensor
encoded_data = encode(text)
data = torch.tensor(encoded_data, dtype=torch.long)

# First 90% of the tokens go to training, the remainder to validation
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [12]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers

In [13]:
# Corpus file(s) the tokenizer is trained on
files = ["datasets/input.txt"]

# Trainer for the word-level model; the listed special tokens are registered
# with it
trainer = trainers.WordLevelTrainer(
    special_tokens=[
        "[UNK]",
        "[CLS]",
        "[SEP]",
        "[PAD]",
        "[MASK]",
    ]
)

# Word-level tokenizer (this replaces the previously built `tokenizer`), with
# "[UNK]" as the unknown token and the library's Whitespace pre-tokenizer
# applied to the raw text
tokenizer = Tokenizer(models.WordLevel(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# Train the tokenizer on the corpus
tokenizer.train(files, trainer=trainer)

In [14]:
# Rebuild the vocabulary tables from the word-level tokenizer's vocabulary
stoi = tokenizer.get_vocab()  # token string -> id
itos = {idx: tok for tok, idx in stoi.items()}  # id -> token string
vocab_size = tokenizer.get_vocab_size()


def encode(s):
    """Tokenize a string with the current `tokenizer` and return its ids."""
    return tokenizer.encode(s).ids


def decode(l):
    """Turn a list of token ids back into text with the current `tokenizer`."""
    return tokenizer.decode(l)


# Re-encode the whole corpus at word level and store it as a long tensor
encoded_data = encode(text)
data = torch.tensor(encoded_data, dtype=torch.long)

# First 90% of the tokens go to training, the remainder to validation
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]