# open text file and read in data as `text`

In [2]:
# Read the entire file into memory as one string. Mode 'r' is text mode, and no
# encoding is passed, so the platform default encoding is used for decoding.
with open('data/example.txt', 'r') as f:
    text = f.read()

# encode the text and map each character to an integer and vice versa

In [4]:

# Build the character vocabulary and its two lookup tables:
#   int2char maps each integer id to its character
#   char2int maps each character back to its unique integer id
# The distinct characters are sorted so that the ids are identical on every
# run; iterating a bare set of strings follows hash order, which Python
# randomises per process, so an unsorted vocabulary would make the encoding
# differ between sessions.
chars = tuple(sorted(set(text)))
int2char = dict(enumerate(chars))
char2int = {ch: ii for ii, ch in int2char.items()}

# encode the text: one integer id per character, in original order
encoded = np.array([char2int[ch] for ch in text])

# Create mini-batches

In [None]:
def get_batches(arr, batch_size, seq_length):
    '''Create a generator that returns batches of size
       batch_size x seq_length from arr.

       Trailing elements that do not fill a complete batch are dropped.
       The targets are the inputs shifted one step to the left; the last
       target column of the final batch wraps around to the first column
       of the reshaped array.

       Arguments
       ---------
       arr: Array you want to make batches from; it must support len(),
            slicing and reshape() (e.g. a 1-D NumPy array)
       batch_size: Batch size, the number of sequences per batch
       seq_length: Number of encoded chars in a sequence

       Yields
       ------
       x, y: two arrays of shape (batch_size, seq_length), the features
             and the targets
    '''

    # number of elements consumed by one full batch
    chars_per_batch = batch_size * seq_length

    # calculate the total number of full batches; the product has to be
    # grouped before the floor division, otherwise the truncation below
    # keeps everything and the reshape can fail
    n_batches = len(arr) // chars_per_batch

    # delete overflowing letters
    arr = arr[:n_batches * chars_per_batch]
    # Reshape into batch_size rows
    arr = arr.reshape((batch_size, -1))

    n_cols = arr.shape[1]
    # iterate through the array, one sequence at a time
    for n in range(0, n_cols, seq_length):
        # The features
        x = arr[:, n:n+seq_length]
        # The targets, shifted by one
        y = np.zeros_like(x)
        y[:, :-1] = x[:, 1:]
        # The last target is the column right after this window; for the
        # final window there is none, so wrap around to the first column.
        next_col = n + seq_length
        y[:, -1] = arr[:, next_col] if next_col < n_cols else arr[:, 0]
        yield x, y

# Preprocess function converts punctuation into tokens (e.g. "." to < PERIOD >; "," to < COMMA >), removes all words that show up five or fewer times, and returns a list of the words in the text

In [None]:
import utils

# get list of words
# utils.preprocess is a project helper that is not shown in this file;
# presumably it returns the cleaned word tokens of `text` — confirm in utils.
words = utils.preprocess(text)
# Show the first 30 entries as a quick sanity check.
print(words[:30])

# Create a lookup table
- Returns two dictionaries
- The integers are assigned in descending frequency order, so the most frequent word ("the") is given the integer 0 and the next most frequent is 1, and so on. 

In [None]:
# utils.create_lookup_tables is a project helper that is not shown in this
# file; its result is unpacked into two mappings, presumably word -> id and
# id -> word — confirm in utils.
vocab_to_int, int_to_vocab = utils.create_lookup_tables(words)
# Replace every word by its integer id, keeping the original order.
int_words = [vocab_to_int[word] for word in words]

# Show the first 30 ids as a quick sanity check.
print(int_words[:30])

# Subsampling
Discard a word w_i with probability given by P(w_i) = 1 - sqrt(t / f(w_i)), where t is the threshold and f(w_i) is the frequency of word w_i

In [None]:
from collections import Counter
import random
import numpy as np

# t in the subsampling formula
threshold = 1e-5

# how many times each word id occurs, and the total number of word ids
word_counts = Counter(int_words)
total_count = len(int_words)

# relative frequency f(w) of every word id
freqs = {w: n/total_count for w, n in word_counts.items()}
# drop probability P(w) = 1 - sqrt(t / f(w)); it is negative for words rarer
# than the threshold, which means those words are always kept
p_drop = {w: 1 - np.sqrt(threshold/f) for w, f in freqs.items()}

# keep each occurrence with probability 1 - P(w), so frequent words are
# thinned out the most
train_words = [w for w in int_words if random.random() < (1 - p_drop[w])]

# Get target - grab words around a word

In [None]:
def get_target(words, idx, window_size=5):
    ''' Return the words surrounding position idx.

        A radius is drawn uniformly from 1..window_size (inclusive) on
        every call; up to that many words on each side of idx are
        returned, left side first, without the word at idx itself. The
        window is clipped at both ends of words.
    '''

    radius = np.random.randint(1, window_size+1)
    # never reach back past the beginning of the sequence
    left = max(idx - radius, 0)
    before = words[left:idx]
    after = words[idx+1:idx+radius+1]

    return list(before + after)

# Create a Batch with "get target"

In [None]:
def get_batches(words, batch_size, window_size=5):
    ''' Generate (inputs, targets) pairs of word lists, one pair per batch.

        words is cut into consecutive chunks of batch_size words; a
        trailing partial chunk is discarded. For every word in a chunk,
        get_target picks context words from the same chunk, and the word
        is repeated in inputs once per context word so that both lists
        line up element by element.
    '''

    # keep only as many words as fill whole batches
    full_batches = len(words)//batch_size
    words = words[:full_batches*batch_size]

    for offset in range(0, len(words), batch_size):
        chunk = words[offset:offset+batch_size]
        inputs, targets = [], []
        for pos, centre in enumerate(chunk):
            context = get_target(chunk, pos, window_size)
            targets.extend(context)
            inputs.extend([centre]*len(context))
        yield inputs, targets