In [None]:
import os
import re
import urllib.request

# 1. Download the file.
# The download is skipped when a local copy already exists, so re-running
# this cell neither hits the network again nor fails when offline.
url = ("https://raw.githubusercontent.com/rasbt/"
       "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
       "the-verdict.txt")
file_path = "the-verdict.txt"
if not os.path.exists(file_path):
    urllib.request.urlretrieve(url, file_path)

# 2. Read the file
with open(file_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

print("Total number of characters =", len(raw_text))

# 3. Preprocessing & vocab
# Split on punctuation, "--" and whitespace. The capture group makes
# re.split keep the delimiters, so punctuation marks become tokens too.
preprocessed = re.split(r'([,.?_!"()\']|--|\s)', raw_text)
# Drop the empty and whitespace-only pieces that the split leaves behind.
preprocessed = [item.strip() for item in preprocessed if item.strip()]

# Unique tokens in sorted order, each mapped to its position as an integer id.
all_words = sorted(set(preprocessed))
vocab = {token: idx for idx, token in enumerate(all_words)}

# 4. Tokenizer Class
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token -> id vocabulary.

    Text is split on whitespace, ``--`` and the punctuation characters
    ``, . ? _ ! " ( ) '``; every non-empty piece that exists in the
    vocabulary is mapped to its integer id.
    """

    # The capture group makes ``split`` keep the delimiters, so punctuation
    # marks come out as tokens of their own.
    _SPLIT_PATTERN = re.compile(r'([,.?_!"()\']|--|\s)')
    # Whitespace that sits directly in front of a punctuation mark.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.?!"()\'])')

    def __init__(self, vocab):
        """Build the forward and reverse lookup tables.

        Args:
            vocab: Mapping from token string to integer id.
        """
        # Snapshot the mapping instead of aliasing the caller's dict.
        # ``int_to_str`` is computed once here, so if ``str_to_int`` kept
        # following later mutations of ``vocab`` the two tables could drift
        # apart and ``encode`` could emit ids that ``decode`` cannot resolve.
        self.str_to_int = dict(vocab)
        self.int_to_str = {i: s for s, i in self.str_to_int.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Tokens that are not present in the vocabulary are silently skipped
        rather than raising an error.

        Args:
            text: The string to tokenize.

        Returns:
            A list of integer ids, in the order the tokens appear in ``text``.
        """
        pieces = (piece.strip() for piece in self._SPLIT_PATTERN.split(text))
        # ``piece`` must be non-empty (the split yields empty strings around
        # delimiters) and known to the vocabulary.
        return [
            self.str_to_int[piece]
            for piece in pieces
            if piece and piece in self.str_to_int
        ]

    def decode(self, ids):
        """Convert a sequence of token ids back into a string.

        Tokens are joined with single spaces, then the whitespace in front of
        punctuation marks is removed. Whitespace *after* a punctuation mark is
        kept, so a contraction such as ``It's`` comes back as ``It' s``.

        Args:
            ids: Iterable of integer token ids.

        Returns:
            The reconstructed text.

        Raises:
            KeyError: If an id is not part of the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

# 5. Testing
tokenizer = SimpleTokenizerV1(vocab)

# Note: encode() skips any token that is missing from the vocabulary, so a
# word such as "Gisburn" would simply be dropped if the text file lacked it.
text = (
    "It's the last he painted, you know, "
    "Mrs. Gisburn said with pardonable pride."
)
ids = tokenizer.encode(text)

# Show the ids, then the round-tripped sentence.
print("Token IDs:", ids)
print("Decoded text:", tokenizer.decode(ids))

Total number of characters = 20479
Token IDs: [58, 2, 872, 1013, 615, 541, 763, 5, 1155, 608, 5, 69, 7, 39, 873, 1136, 773, 812, 7]
Decoded text: It' s the last he painted, you know, Mrs. Gisburn said with pardonable pride.
