In [27]:
import os
import torch
import re
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Root data directory: raw texts are read from `input/<dataset>/` below it and
# the cached `words.<dataset>.pt` / `vocab.<dataset>.pt` files are written into it.
DATA_DIR = './data/'
# Passed as `min_freq` when building the vocabulary: the minimum number of
# occurrences a token needs in order to be kept.
MIN_WORD_FREQUENCY = 100

# Tokenize datasets

In [28]:
# Shared tokenizer (torchtext's built-in 'basic_english'); the same instance is
# used for the per-dataset token lists and for building the vocabulary.
TOKENIZER = get_tokenizer('basic_english')

def read_lines(
    dataset: str
) -> list[str]:
    """
    Reads all the lines from all the text files of the given `dataset`.

    Datasets are `train`, `val` and `test`; their files are looked up in
    `DATA_DIR/input/<dataset>/`.

    Files are visited in sorted path order so that the returned line order is
    reproducible (`os.listdir` gives no ordering guarantee). Entries that are
    not regular files, e.g. a `.ipynb_checkpoints` directory, are skipped.
    Files are decoded as UTF-8 regardless of the platform's default encoding.

    Returns:
        Every line of every file, each keeping its trailing newline.

    Raises:
        FileNotFoundError: if the dataset directory does not exist.
        UnicodeDecodeError: if a file is not valid UTF-8.
    """

    # Scan for all input files (regular files only, in a deterministic order)
    inDirectoryName = os.path.join(DATA_DIR, 'input', dataset)
    candidates = (os.path.join(inDirectoryName, f) for f in os.listdir(inDirectoryName))
    inFileNames = sorted(path for path in candidates if os.path.isfile(path))

    # Read all the lines from all the files
    lines = []
    for inFileName in inFileNames:
        with open(inFileName, 'r', encoding='utf-8') as file:
            lines += file.readlines()

    print(f"Read {len(lines)} lines from {dataset}")
    return lines

def create_tokens(
    dataset: str
) -> list:
    """
    Returns the flat list of tokens for every line of the given `dataset`.

    Datasets are `train`, `val` and `test`.

    The list is cached as `words.<dataset>.pt` inside `DATA_DIR`: if that
    file is already there it is loaded and returned as-is, otherwise the
    dataset is read, tokenized with `TOKENIZER` and the result is saved.
    """

    outFileName = os.path.join(DATA_DIR, f'words.{dataset}.pt')

    # Cache miss: tokenize line by line, then persist for the next run.
    if not os.path.isfile(outFileName):
        tokens = [token for line in read_lines(dataset) for token in TOKENIZER(line)]
        torch.save(tokens, outFileName)
        return tokens

    # Cache hit: reuse the stored tokens.
    # NOTE: torch.load unpickles the file, so only load caches you created yourself.
    print(f"Loaded tokenized words for {dataset} ({outFileName})")
    return torch.load(outFileName)

def create_vocabulary(
    dataset: str
):
    """
    Creates a vocabulary for the given `dataset`.

    Datasets are `train`, `val` and `test`.

    Each line is sanitized before tokenization: every word containing a digit
    and every word containing an uppercase ASCII letter is dropped. Tokens
    seen fewer than `MIN_WORD_FREQUENCY` times are left out of the vocabulary
    and resolve to the `<unk>` special token, which is set as the default
    index. The token `i` is guaranteed to be present in the result.

    The vocabulary is cached as `vocab.<dataset>.pt` inside `DATA_DIR`; when
    that file already exists it is loaded and returned without rebuilding.
    """

    outFileName = os.path.join(DATA_DIR, f'vocab.{dataset}.pt')

    # If the file exists, don't create it again.
    # NOTE: torch.load unpickles the file, so only load caches you created yourself.
    if os.path.isfile(outFileName):
        print(f"Loaded vocabulary for {dataset} ({outFileName})")
        return torch.load(outFileName)

    def read_sanitize_tokenize():
        """Yields the token list of each sanitized line of the dataset."""

        for line in read_lines(dataset):

            line = re.sub(r'\w*[0-9]+\w*', ' ', line) # Remove words containing digits
            # Remove every word containing a capital letter. This targets names,
            # but it also drops sentence-initial words and the pronoun 'I'.
            line = re.sub(r'\w*[A-Z]+\w*', ' ', line)
            line = re.sub(r'\s+', ' ', line) # Collapse runs of whitespace

            yield TOKENIZER(line)

    vocabulary = build_vocab_from_iterator(read_sanitize_tokenize(), min_freq=MIN_WORD_FREQUENCY, specials=['<unk>'])

    vocabulary.set_default_index(vocabulary['<unk>'])

    # We removed all uppercase words, this includes 'I', so add it back.
    # append_token raises if the token is already present (e.g. when enough
    # lowercase 'i' survived sanitization), hence the membership check.
    if 'i' not in vocabulary:
        vocabulary.append_token('i')

    # Save vocabulary so we dont have to do this again
    torch.save(vocabulary, outFileName)

    return vocabulary
    


In [31]:
# Tokenize the three splits in order; create_tokens reuses its on-disk cache when present.
words_train, words_val, words_test = (
    create_tokens(split) for split in ('train', 'val', 'test')
)

# The vocabulary is built from the training split only.
vocab = create_vocabulary('train')
VOCAB_SIZE = len(vocab)

Loaded tokenized words for train (./data/words.train.pt)
Loaded tokenized words for val (./data/words.val.pt)
Loaded tokenized words for test (./data/words.test.pt)
Loaded vocabulary for train (./data/vocab.train.pt)


In [32]:

# Corpus statistics as (label, value) rows; labels are padded so the values line up.
report = (
    ("Total number of words in the training dataset:     ", len(words_train)),
    ("Total number of words in the validation dataset:   ", len(words_val)),
    ("Total number of words in the test dataset:         ", len(words_test)),
    ("Number of distinct words in the training dataset:  ", len(set(words_train))),
    ("Number of distinct words kept (vocabulary size):   ", VOCAB_SIZE),
)
for label, count in report:
    print(label, count)

Total number of words in the training dataset:      2684706
Total number of words in the validation dataset:    49526
Total number of words in the test dataset:          124152
Number of distinct words in the training dataset:   52105
Number of distinct words kept (vocabulary size):    1880
