## Imports and setup

In [1]:
import sys
# Sanity check: at least one sys.path entry must mention the curriculum repo
# (presumably so the project-local `config` / `model` imports resolve - confirm).
assert any("deep_learning_curriculum" in p for p in sys.path)

In [8]:
from __future__ import annotations

from collections import Counter
from dataclasses import dataclass, field

import numpy as np
import os
import pathlib
import re
from urllib.request import urlopen
from nltk.tokenize import word_tokenize

import torch as t
from torch import optim


from config import Config
from model import Transformer

In [3]:
# Walk upwards from the current working directory until we reach the first
# directory whose path ends with "_curriculum"; that directory is used as the
# project root for everything below.
PATH = pathlib.Path(os.getcwd())
while not str(PATH).endswith("_curriculum"):
    # The parent of the filesystem root is the root itself, so without this
    # guard the loop would spin forever when run outside the repository.
    if PATH.parent == PATH:
        raise FileNotFoundError(
            f"No ancestor of {os.getcwd()!r} has a path ending in '_curriculum'"
        )
    PATH = PATH.parent
print(f"{PATH = }")

PATH = PosixPath('/home/matthewbaggins/code/deep_learning_curriculum')


## Train the model on Shakespeare's works

In [26]:
# Directory under the project root where the corpus text file is cached.
data_path = PATH / "data"

def load_corpus_text() -> str:
    """Return the Shakespeare corpus as one string, downloading it if needed.

    The text is cached in ``data_path / "shakespeare.txt"``. If that file
    exists it is read from disk; otherwise the text is fetched from Project
    Gutenberg, written to the cache file, and returned.

    Returns:
        The corpus text decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the download is required and fails.
        OSError: If the cache directory or file cannot be created or read.
    """
    # exist_ok avoids the check-then-create race; parents covers a missing
    # intermediate directory.
    data_path.mkdir(parents=True, exist_ok=True)

    shakespeare_path = data_path / "shakespeare.txt"

    if shakespeare_path.exists():
        print("Loading Shakespeare...")
        with open(shakespeare_path, "r", encoding="utf-8") as f:
            return f.read()

    print("Fetching Shakespeare..")
    url = "https://www.gutenberg.org/files/100/100-0.txt"
    # Use the response as a context manager so the connection is always closed.
    with urlopen(url) as response:
        text = response.read().decode("utf-8")
    with open(shakespeare_path, "w", encoding="utf-8") as f:
        f.write(text)
    return text
    

def tokenize(text: str) -> list[str]:
    """Split ``text`` into tokens at every regex word boundary.

    Nothing is discarded: runs of word characters and the runs of non-word
    characters between them (whitespace, punctuation) each become a token, so
    joining the result reproduces ``text``. When ``text`` starts or ends with
    a word character, the first or last token is an empty string.
    """
    word_boundary = re.compile(r"\b")
    pieces = word_boundary.split(text)
    return pieces

@dataclass(frozen=True, slots=True)
class Corpus:
    text: str
    tokens_str: list[str]
    tokens_int: list[int]
    tok_int2str: dict[int, str]
    tok_str2int: dict[str, int]
    token_counts: dict[str, int]
    
    def __len__(self) -> int:
        return len(self.tokens_str)
    
    @classmethod
    def load(cls) -> Corpus:
        text = load_corpus_text()
        tokens_str = tokenize(text)
        token_counts = Counter(tokens_str)
        tok_int2str: dict[int, str] = {}
        tok_str2int: dict[str, int] = {}
        for i, (tok_str, tok_count) in enumerate(sorted(token_counts.items(), key=lambda x: x[1], reverse=True)):
            tok_int2str[i] = tok_str
            tok_str2int[tok_str] = i
        tokens_int = [tok_str2int[tok_str] for tok_str in tokens_str]
        corpus = cls(
            text=text,
            tokens_str=tokens_str,
            tokens_int=tokens_int,
            tok_int2str=tok_int2str,
            tok_str2int=tok_str2int,
            token_counts=dict(token_counts)
        )
        print(f"Shakespeare text: {len(text)} characters, {len(tokens_str)} tokens")
        return corpus
    
    def get_corpus_subsequences(self, n_subsequences: int = 32) -> list[list[int]]:
        n_tokens = len(self)
        subseq_len = n_tokens // n_subsequences
        subseqs = [self.tokens_int[i * subseq_len : (i + 1)* subseq_len] for i in range(n_subsequences)]
        # seps = [i * (n // n_subsequences) for i in range(n_subsequences)]
        # subseqs = [self.tokens_int[sep0:sep1] for sep0, sep1 in zip(seps, seps[1:])]
        return subseqs

In [27]:
# Build the tokenized corpus (reads the cached text, or downloads it on first run).
corpus = Corpus.load()

Loading Shakespeare...
Shakespeare text: 5392638 characters, 1991703 tokens


In [32]:
# Split the token ids into 32 (the default) equal-length contiguous chunks;
# leftover tokens at the end of the corpus are dropped.
subseqs = corpus.get_corpus_subsequences()

- make model
- preprocess each subsequence so that it starts with an EOS token
- write `loss_fn`, `acc_fn`, and `train` (mostly copy-pasting)
- do

# TODO

- add BOS token
- retrain model with max number of splits
- generate Shakespeare-style text (or something similar)