In [1]:
import collections
import re
from d2l import torch as d2l

In [2]:
# Register the Time Machine text in d2l's dataset registry.
# `d2l.DATA_HUB` is keyed by dataset name; the value stored here is a
# (url, hash) pair:
# - url:  where the file is fetched from when `d2l.download('time_machine')` is called;
# - hash: expected checksum of the file, used to validate its integrity.
# NOTE(review): the download/caching details (HTTP request, cache directory,
# presumably ../data/timemachine.txt by default) live inside the d2l library --
# confirm against the d2l source if they matter.
# The downloaded file is the raw text dataset used for training later on.

d2l.DATA_HUB['time_machine'] = (
    d2l.DATA_URL + 'timemachine.txt',  # Full URL to the dataset file
    '090b5e7e70c295757f55df93cb0a180b9691891a'  # Expected hash, used to validate the file's integrity
)

# Load the 'time_machine' dataset and normalise it into clean, lowercase text lines.
def read_time_machine():
    """
    Read the 'time_machine' dataset and return it as a list of cleaned text lines.

    Steps:
    - Obtain the local file path via `d2l.download('time_machine')` (the entry
      must already be registered in `d2l.DATA_HUB`).
    - Read the file line by line.
    - For every line: replace each run of non-letter characters with a single
      space, strip leading/trailing whitespace, and convert to lowercase.

    Returns:
    - list of str: one cleaned string per line of the file. Lines that held no
      letters become empty strings, so the line count is preserved.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding (which is a legacy code page on Windows). Undecodable bytes are
    # replaced rather than raising; the replacement character is not a letter,
    # so the regex below turns it into a space like any other symbol.
    with open(d2l.download('time_machine'), 'r',
              encoding='utf-8', errors='replace') as f:
        lines = f.readlines()

    # [^A-Za-z]+ matches one or more characters that are NOT ASCII letters
    # ("^" inside the brackets negates the set), e.g. "H. G. Wells!" -> "h g wells".
    return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines]

# Load and clean the dataset; `lines` holds one processed string per text line.
lines = read_time_machine()

# Report how many text lines the dataset contains
print(f'# Total number of text lines: {len(lines)}')

# Show two sample lines: the first line and the line at index 10
print(lines[0], lines[10], sep='\n')


Downloading ..\data\timemachine.txt from http://d2l-data.s3-accelerate.amazonaws.com/timemachine.txt...
# Total number of text lines: 3221
the time machine by h g wells
twinkled and his usually pale face was flushed and animated the


In [3]:
# Split every text line into a list of tokens (words or characters).
# Calling example -- tokenize(lines=read_time_machine(), token='word')
def tokenize(lines, token='word'):
    """
    Split each text line into tokens.

    Parameters:
    - lines: iterable of strings, one per line of text.
    - token: 'word' splits each line on whitespace (default);
             'char' splits each line into its individual characters
             (spaces included).

    Returns:
    - list of lists: one token list per input line; an empty line yields [].

    Raises:
    - ValueError: if `token` is neither 'word' nor 'char'. Failing here keeps
      callers from silently receiving None and breaking later.
    """
    if token == 'word':
        return [line.split() for line in lines]
    if token == 'char':
        return [list(line) for line in lines]
    raise ValueError(
        f"unknown token type: {token!r} (expected 'word' or 'char')"
    )

In [4]:
# Tokenize the lines into words (the default) and inspect the first 11 token lists;
# blank lines in the text show up as empty lists.
tokens = tokenize(lines)
for i in range(11):
    print(tokens[i])

['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
[]
[]
[]
[]
['i']
[]
[]
['the', 'time', 'traveller', 'for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him']
['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us', 'his', 'grey', 'eyes', 'shone', 'and']
['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the']


In [8]:
# Count how often each token (word or character) appears in the corpus.
def count_corpus(tokens):
    """
    Build a frequency table mapping each token to its number of occurrences.

    Parameters:
    - tokens: either a flat list of tokens, e.g. ['aa', 'bb', 'aa'], or a
              list of token lists (one per line), e.g. [['aa', 'bb'], ['aa', 'cc']].

    Returns:
    - collections.Counter: e.g. {'aa': 2, 'bb': 1, 'cc': 1}.
    """
    # A non-empty input whose first element is not a list is already flat.
    if len(tokens) > 0 and not isinstance(tokens[0], list):
        flat_tokens = tokens
    else:
        # Nested (or empty) input: collapse the list of lines into one list,
        # [['aa', 'bb'], ['aa', 'cc']] -> ['aa', 'bb', 'aa', 'cc'].
        flat_tokens = [item for row in tokens for item in row]
    return collections.Counter(flat_tokens)

In [7]:
# Vocabulary: keeps token frequencies and the token <-> index mappings.
class Vocab:
    """
    Vocabulary built from a tokenized corpus.

    Index layout: '<unk>' is always index 0, followed by the reserved tokens,
    followed by the corpus tokens ordered from most to least frequent.

    Attributes:
    - token_freqs: list of (token, frequency) tuples, sorted by descending frequency.
    - unk: index of the unknown token (always 0).
    - idx_to_token: list mapping index -> token.
    - token_to_idx: dict mapping token -> index.
    """

    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        """
        Build the vocabulary.

        Parameters:
        - tokens: flat list of tokens or list of token lists; None means empty.
        - min_freq: tokens occurring fewer than `min_freq` times are left out
                    (they will map to '<unk>').
        - reserved_tokens: extra special tokens (e.g. '<pad>', '<start>', '<end>')
                           placed right after '<unk>'; None means none.
        """
        tokens = [] if tokens is None else tokens
        reserved_tokens = [] if reserved_tokens is None else reserved_tokens

        # Frequency table of the corpus, then (token, count) pairs sorted so
        # the most frequent token comes first.
        frequencies = count_corpus(tokens)
        self.token_freqs = sorted(
            frequencies.items(), key=lambda pair: pair[1], reverse=True
        )

        # Special tokens occupy the lowest indices, with '<unk>' at index 0.
        self.unk = 0
        special_tokens = ['<unk>'] + reserved_tokens

        # Corpus tokens that are frequent enough and are not already special.
        corpus_tokens = [
            tok for tok, count in self.token_freqs
            if count >= min_freq and tok not in special_tokens
        ]

        # Position in the list is the token's index; the dict is the inverse lookup.
        self.idx_to_token = special_tokens + corpus_tokens
        self.token_to_idx = {
            tok: idx for idx, tok in enumerate(self.idx_to_token)
        }

    def __len__(self):
        """Return the number of entries in the vocabulary (special tokens included)."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """
        Look up the index of a token, or the indices of a (possibly nested)
        list/tuple of tokens.

        Parameters:
        - tokens: a single token, or a list/tuple of tokens.

        Returns:
        - int for a single token (the '<unk>' index if the token is unknown);
        - list of results, one per element, for a list/tuple.
        """
        if isinstance(tokens, (list, tuple)):
            # Recurse element by element so nested sequences are handled too.
            return [self[tok] for tok in tokens]
        return self.token_to_idx.get(tokens, self.unk)

    def to_tokens(self, indices):
        """
        Look up the token for an index, or the tokens for a (possibly nested)
        list/tuple of indices.

        Parameters:
        - indices: a single index, or a list/tuple of indices.

        Returns:
        - the token for a single index;
        - list of results, one per element, for a list/tuple.
        """
        if isinstance(indices, (list, tuple)):
            # Recurse element by element so nested sequences are handled too.
            return [self.to_tokens(idx) for idx in indices]
        return self.idx_to_token[indices]

In [None]:
def load_corpus_time_machine(max_tokens=-1):
    """
    Build the character-level corpus and vocabulary of the Time Machine dataset.

    Parameters:
    - max_tokens (int): if positive, keep only the first `max_tokens` entries
                        of the corpus; otherwise (default -1) keep everything.

    Returns:
    - corpus (list of int): every character of the dataset, in order, replaced
                            by its vocabulary index (all lines concatenated).
    - vocab (Vocab): vocabulary built from the full dataset; it is not
                     affected by `max_tokens`.
    """
    # Cleaned text lines -> one list of characters per line,
    # e.g. ['the', 'tim'] -> [['t', 'h', 'e'], ['t', 'i', 'm']].
    char_lines = tokenize(read_time_machine(), 'char')

    # Character vocabulary over the whole dataset,
    # e.g. vocab.token_to_idx = {'<unk>': 0, 't': 1, 'h': 2, ...}.
    vocab = Vocab(char_lines)

    # Flatten line by line while mapping each character to its index,
    # e.g. [['t', 'h', 'e'], ['t', 'i', 'm']] -> [1, 2, 3, 1, 4, 5].
    corpus = []
    for chars in char_lines:
        corpus.extend(vocab[ch] for ch in chars)

    # Optionally truncate the corpus to its first `max_tokens` entries.
    if max_tokens > 0:
        corpus = corpus[:max_tokens]
    return corpus, vocab


# Usage example: build the character-level corpus and its vocabulary.
# - `corpus` is a list of token indices representing the entire dataset.
# - `vocab` is a Vocab object that converts between tokens and indices.
corpus, vocab = load_corpus_time_machine()

# Display (corpus size, vocabulary size) as the cell's result.
tuple(map(len, (corpus, vocab)))