In [1]:
import collections
import re

Here, I use the novel called The Time Machine from H.G.Wells as an example to illustrate text preprocessing.
The novel can be found here: http://www.gutenberg.org/ebooks/35


In [5]:
# load the txt file

def read_time_machine():
    with open('./timemachine.txt', 'r') as f:
        """
        use regex to replace special marks with space
        ^a-z means except char from a to z and + denotes a such string with length one at least 
        use line.strip() to get rid of space between lines
        """
        lines = [re.sub('[^a-z]+', ' ', line.strip().lower()) for line in f]
    return lines

lines = read_time_machine()
print('# sentences %d' % len(lines))

# sentences 3583


In [4]:
lines

['',
 'the project gutenberg ebook of the time machine by h g wells',
 '',
 'this ebook is for the use of anyone anywhere at no cost and with',
 'almost no restrictions whatsoever you may copy it give it away or',
 're use it under the terms of the project gutenberg license included',
 'with this ebook or online at www gutenberg net',
 '',
 '',
 'title the time machine',
 '',
 'author h g wells',
 '',
 'release date october ebook ',
 'last updated january ',
 '',
 'language english',
 '',
 'character set encoding utf ',
 '',
 ' start of this project gutenberg ebook the time machine ',
 '',
 '',
 '',
 '',
 '',
 '',
 'the time machine',
 '',
 '',
 'an invention',
 '',
 '',
 '',
 '',
 'by h g wells',
 '',
 '',
 '',
 '',
 'contents',
 '',
 '',
 'i introduction',
 'ii the machine',
 'iii the time traveller returns',
 'iv time travelling',
 'v in the golden age',
 'vi the sunset of mankind',
 'vii a sudden shock',
 'viii explanation',
 'ix the morlocks',
 'x when night came',
 'xi the palace

## Tokenization

In [11]:
def tokenize(sentences, token='word'):
    """
    split sentences into word or char tokens
    here I did the word tokenization
    """
    if token == 'word':
        return [sentence.split(' ') for sentence in sentences]
    elif token == 'char':
        return [list(sentence) for sentence in sentences]
    else:
        print('ERROR: unkown token type '+token)

tokens = tokenize(lines)
tokens[0:10]

[[''],
 ['the',
  'project',
  'gutenberg',
  'ebook',
  'of',
  'the',
  'time',
  'machine',
  'by',
  'h',
  'g',
  'wells'],
 [''],
 ['this',
  'ebook',
  'is',
  'for',
  'the',
  'use',
  'of',
  'anyone',
  'anywhere',
  'at',
  'no',
  'cost',
  'and',
  'with'],
 ['almost',
  'no',
  'restrictions',
  'whatsoever',
  'you',
  'may',
  'copy',
  'it',
  'give',
  'it',
  'away',
  'or'],
 ['re',
  'use',
  'it',
  'under',
  'the',
  'terms',
  'of',
  'the',
  'project',
  'gutenberg',
  'license',
  'included'],
 ['with', 'this', 'ebook', 'or', 'online', 'at', 'www', 'gutenberg', 'net'],
 [''],
 [''],
 ['title', 'the', 'time', 'machine']]

## Create a vocab dictionary for words

In [12]:
## use the vocab dict to map every word token to an index

class Vocab(object):
    
    def __init__(self, tokens, min_freq=0, use_special_tokens=False):
        # input tokens is the list returned from above tokenization
        counter = count_corpus(tokens)  # dict <key, value>: <word, word's freq>
        self.token_freqs = list(counter.items()) # use the values from counter dict to calculate the freq
        self.idx_to_token = [] # initiate the list to hold tokens we want later
        
        if use_special_tokens:
            """
            special tokens can change every sentence to equal length when dealing with batch processing
            or to denote the beginning or the end of a given sentence and the out-of-dict word,
            not used here, but might be useful in the future work
            params: padding, begin of sentence, end of sentence, unknown
            """
            self.pad, self.bos, self.eos, self.unk = (0, 1, 2, 3)
            self.idx_to_token += ['<pad>', '<bos>', '<eos>', '<unk>']
        else:
            self.unk = 0
            self.idx_to_token += ['<unk>']
        
        # create the list of word tokens
        self.idx_to_token += [token for token, freq in self.token_freqs
                        if freq >= min_freq and token not in self.idx_to_token]
        
        # create the dict for token to index
        self.token_to_idx = dict()
        for idx, token in enumerate(self.idx_to_token):
            self.token_to_idx[token] = idx

            
    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk) # define and get index of token in Vocab
        return [self.__getitem__(token) for token in tokens] # recurse for each element in the list or tuple

    def to_tokens(self, indices):
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]  # define and get token from index in Vocab
        return [self.idx_to_token[index] for index in indices] 

def count_corpus(sentences):
    tokens = [tk for st in sentences for tk in st] 
    return collections.Counter(tokens)  # return a dict with the number of each word

In [13]:
vocab = Vocab(tokens)
print(list(vocab.token_to_idx.items())[0:50])

[('', 0), ('the', 1), ('project', 2), ('gutenberg', 3), ('ebook', 4), ('of', 5), ('time', 6), ('machine', 7), ('by', 8), ('h', 9), ('g', 10), ('wells', 11), ('this', 12), ('is', 13), ('for', 14), ('use', 15), ('anyone', 16), ('anywhere', 17), ('at', 18), ('no', 19), ('cost', 20), ('and', 21), ('with', 22), ('almost', 23), ('restrictions', 24), ('whatsoever', 25), ('you', 26), ('may', 27), ('copy', 28), ('it', 29), ('give', 30), ('away', 31), ('or', 32), ('re', 33), ('under', 34), ('terms', 35), ('license', 36), ('included', 37), ('online', 38), ('www', 39), ('net', 40), ('title', 41), ('author', 42), ('release', 43), ('date', 44), ('october', 45), ('last', 46), ('updated', 47), ('january', 48), ('language', 49)]


In [18]:
for i in range(1, 10):
    print('words:', tokens[i]) # a list: word token on the ith line after tokenization
    print('indices:', vocab[tokens[i]]) # return index of word token in the Vocab

words: ['the', 'project', 'gutenberg', 'ebook', 'of', 'the', 'time', 'machine', 'by', 'h', 'g', 'wells']
indices: [1, 2, 3, 4, 5, 1, 6, 7, 8, 9, 10, 11]
words: ['']
indices: [0]
words: ['this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with']
indices: [12, 4, 13, 14, 1, 15, 5, 16, 17, 18, 19, 20, 21, 22]
words: ['almost', 'no', 'restrictions', 'whatsoever', 'you', 'may', 'copy', 'it', 'give', 'it', 'away', 'or']
indices: [23, 19, 24, 25, 26, 27, 28, 29, 30, 29, 31, 32]
words: ['re', 'use', 'it', 'under', 'the', 'terms', 'of', 'the', 'project', 'gutenberg', 'license', 'included']
indices: [33, 15, 29, 34, 1, 35, 5, 1, 2, 3, 36, 37]
words: ['with', 'this', 'ebook', 'or', 'online', 'at', 'www', 'gutenberg', 'net']
indices: [22, 12, 4, 32, 38, 18, 39, 3, 40]
words: ['']
indices: [0]
words: ['']
indices: [0]
words: ['title', 'the', 'time', 'machine']
indices: [41, 1, 6, 7]


The disadvantages of above tokenization preprocessing is quite obvious since it is too simple. The punctuation marks are discarded and words like don't, shouldn't cannot be handled properly. 
We can use NLTK and other libraries to better preprocess text data.

In [25]:
text = "Dr. G. Thomas doesn't like writing."

In [24]:
import nltk
from nltk.tokenize import word_tokenize

In [None]:
print(word_tokenize(text))