# Imports

In [1]:
import string
import sys
import numpy as np

# File Generator
Iterators over file sequenially. Only loads fixed sized amounts into memory

In [2]:
def fileIter(source, valid_letters, encoding, num_chars):
    with open(source, encoding='utf8') as file:
        while True:
            chunk = file.read(num_chars)
            if chunk:
                yield chunk
            else:
                return

# Set Program Constants

In [3]:
# Input/Output files
input_path      = 'wikitext-103-raw/wiki.test.clean'
output_path     = 'wikitext-103-raw/wiki.test.vector'
decoder_path    = 'wikitext-103-raw/wiki.test.decoder'
vocab_path      = 'wikitext-103-raw/wiki.test.vocab'
output_len_path = 'wikitext-103-raw/wiki.test.vector.len'

# How many characters per line; For output file.
# We read a large chunk of sentences from file, chunk should be as large as
# possible. Too small and we waste time doing IO, too large and we waste time
# writing page files.
sentence_len = 50
sentence_chunks = 100 * sentence_len

# Only include printable letters


In [4]:
valid_letters = set(string.printable)
ignore_letters = set(['\r','@','\x0c','\t','\x0b'])
valid_letters.difference_update(ignore_letters)
print("Dimension of possible letters: ", len(valid_letters),'\n')
print(valid_letters)

with open(vocab_path,'w') as vocab_file:
    vocab_file.write("".join(valid_letters))

Dimension of possible letters:  95 

{'$', 'o', 'C', 'l', '8', '1', 'd', '\\', 'j', "'", '+', 'Y', 'f', 'H', ']', 'n', 'L', 'u', '%', '<', 'A', 'i', 'c', 'q', '-', 'h', '0', '2', 'Z', '^', 'Q', ' ', '#', '>', 'a', '3', 'w', 'x', 'I', '"', 'M', '}', 'v', 'O', 'S', '?', 'r', 'P', ',', 'k', '`', 'T', 'N', 'X', 'y', '4', 'U', 'p', 'g', '\n', '5', '[', 'm', 't', '/', 'D', '=', '{', '.', '9', 'B', 's', ';', 'V', 'E', 'b', '&', '!', 'G', '6', '~', 'z', 'K', 'e', 'F', 'W', '|', ':', 'J', '_', '(', '7', ')', '*', 'R'}


# Encode/Decode letters to Integers

In [5]:
# Enocder maps a letter -> integer
encoder = dict((letter,position) for position, letter in enumerate(valid_letters))

# Decoder maps an integer -> letter
decoder = dict((value,key) for key,value in encoder.items())

# Save dictionary to disk
np.save(decoder_path, decoder)

# Convert Input by chunks

In [9]:
# Get iterator over file
file_iter = fileIter(input_path, encoding='utf8',valid_letters=valid_letters, num_chars=sentence_chunks)
out = open(output_path,'w')

# Write col name
out.write('X;Y\n')

chunk_processed = 0
row_count = 0
ckpt = 100

# Write data csv formatted
for chunk in file_iter:
    # Map letters to integers
    chunk = [encoder[letter] for letter in chunk]
    for start in range(0, len(chunk) - sentence_len, 10):
        out.write(' '.join(map(str, chunk[start: start + sentence_len])))
        out.write(';')
        out.write(str(chunk[start + sentence_len]))
        out.write('\n')
        row_count += 1
    if chunk_processed % ckpt == 0:
        print("Processed chunk #:",chunk_processed)
    chunk_processed += 1

out.close()

with open(output_len_path,'w') as file:
    file.write(str(row_count))
    
print("Finished writing CSV")

Processed chunk #: 0
Processed chunk #: 100
Processed chunk #: 200
Finished writing CSV
