In [3]:
import os
import torch
from torchtext.data.utils import get_tokenizer

# Root data directory: raw text is read from `input/<dataset>/` below it, and
# the cached token files (`words.<dataset>.pt`) are written directly into it.
DATA_DIR = './data/'

# Tokenize datasets

In [10]:
# Shared torchtext 'basic_english' tokenizer; `create_tokens` applies it to
# every input line.
TOKENIZER_EN = get_tokenizer('basic_english')

def create_tokens(
    dataset: str
):
    """
    Create (or load the cached) word tokens for the given dataset.

    Datasets are defined as subdirectories of `data/input` (currently
    `train`, `val` and `test`). Every regular file in that subdirectory is
    read line by line and each line is passed through `TOKENIZER_EN`. The
    result is cached as `words.<dataset>.pt` inside `DATA_DIR`; when that
    file already exists it is loaded and returned without re-tokenizing.

    Args:
        dataset: Name of the subdirectory of `<DATA_DIR>/input` to tokenize.

    Returns:
        A list with one entry per input line, each entry being the tokenizer
        output for that line (or whatever the cache file contains, when the
        cache file exists).

    Raises:
        OSError: If the input directory does not exist or a file in it
            cannot be read.
    """

    outFileName = os.path.join(DATA_DIR, f'words.{dataset}.pt')

    # If the file exists, don't create it again.
    # NOTE(review): torch.load relies on pickle; only load cache files that
    # were produced by this function, never files from an untrusted source.
    if os.path.isfile(outFileName):
        print(f"Loading existing tokens for {dataset} ({outFileName})")
        return torch.load(outFileName)

    # Scan for all input files. The listing is sorted because os.listdir
    # returns entries in arbitrary order, which would make the token order
    # differ between machines. Non-file entries (e.g. a `.ipynb_checkpoints`
    # subdirectory) are skipped because open() cannot read them.
    inDirectoryName = os.path.join(DATA_DIR, 'input', dataset)
    inFileNames = sorted(
        path
        for path in (
            os.path.join(inDirectoryName, name)
            for name in os.listdir(inDirectoryName)
        )
        if os.path.isfile(path)
    )

    # Tokenize the files line by line, without first collecting every raw
    # line in memory.
    tokens = []
    for inFileName in inFileNames:
        # Explicit encoding so the result does not depend on the platform's
        # default locale.
        with open(inFileName, 'r', encoding='utf-8') as file:
            tokens.extend(TOKENIZER_EN(line) for line in file)

    # Save tokens so we don't have to do this again
    torch.save(tokens, outFileName)

    print(f"Creates tokens for {dataset} ({outFileName})")

    return tokens

# Token lists for each dataset split, built (or loaded from the on-disk cache)
# when this cell runs. Kept as separate statements so that a failure on a later
# split still leaves the earlier ones bound.
WORDS_TRAIN = create_tokens('train')
WORDS_VAL = create_tokens('val')
WORDS_TEST = create_tokens('test')

Loading existing tokens for train (./data/words.train.pt)
Loading existing tokens for val (./data/words.val.pt)
Loading existing tokens for test (./data/words.test.pt)
