# 文本预处理
- 读入文本
- 分词
- 建立词典
- 将文本从词的序列转换为索引的序列

## 读入文本
以英文小说Time Machine(..\data\04time_machine.txt)

In [5]:
import os
import sys
BASE_DIR = os.path.dirname(os.getcwd())
sys.path.insert(0, os.path.join(BASE_DIR))
print(BASE_DIR)

import collections
import re

D:\Project\Anaconda\dive2DL


In [12]:
def read_time_machine():
    with open(os.path.join(BASE_DIR, "data", "04time_machine.txt"), 'r', encoding='utf-8') as f:
        lines = [re.sub('[^a-z]+', ' ', line.strip().lower()) for line in f]
        return lines

In [13]:
lines = read_time_machine()
print('# sentences %d' % len(lines))

# sentences 3583


## 分词

In [16]:
def tokenize(sentences, token='word'):
    if token == 'word':
        return [sentence.split(' ') for sentence in sentences ]
    if token == 'char':
        return [list(sentence) for sentence in sentences]
    else:
        print("ERROR: unkown token type "+token)

tokens = tokenize(lines)
tokens[0:2]

[[''],
 ['the',
  'project',
  'gutenberg',
  'ebook',
  'of',
  'the',
  'time',
  'machine',
  'by',
  'h',
  'g',
  'wells']]

## 建立字典

In [17]:
class Vocab(object):
    def __init__(self, tokens, min_freq=0, use_special_tokens=False):
        counter = count_corpus(tokens)
        self.token_freqs = list(counter.items())
        self.idx_to_token = []
        if use_special_tokens:
            # padding, begin, end, unknown
            self.pad, self.bos, self.eos, self.unk = (0, 1, 2, 3)
            self.idx_to_token += ['', '', '', '']
        else: 
            self.unk = 0
            self.idx_to_token += ['']
        self.idx_to_token += [token for token, freq in self.token_freqs 
                             if freq >= min_freq and token not in self.idx_to_token]
        self.token_to_idx = dict()
        for idx, token in enumerate(self.idx_to_token):
            self.token_to_idx[token] = idx
    
    def __len__(self):
        return len(self.idx_to_token)
    
    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]
    
def count_corpus(sentences):
    tokens = [tk for st in sentences for tk in st]
    return collections.Counter(tokens)

In [19]:
vocab = Vocab(tokens)
print(list(vocab.token_to_idx.items())[0:20])

[('', 0), ('the', 1), ('project', 2), ('gutenberg', 3), ('ebook', 4), ('of', 5), ('time', 6), ('machine', 7), ('by', 8), ('h', 9), ('g', 10), ('wells', 11), ('this', 12), ('is', 13), ('for', 14), ('use', 15), ('anyone', 16), ('anywhere', 17), ('at', 18), ('no', 19)]


## 将词转换为索引

In [21]:
for i in range(0, 10):
    print('words:', tokens[i])
    print('indices:', vocab[tokens[i]])

words: ['']
indices: [0]
words: ['the', 'project', 'gutenberg', 'ebook', 'of', 'the', 'time', 'machine', 'by', 'h', 'g', 'wells']
indices: [1, 2, 3, 4, 5, 1, 6, 7, 8, 9, 10, 11]
words: ['']
indices: [0]
words: ['this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with']
indices: [12, 4, 13, 14, 1, 15, 5, 16, 17, 18, 19, 20, 21, 22]
words: ['almost', 'no', 'restrictions', 'whatsoever', 'you', 'may', 'copy', 'it', 'give', 'it', 'away', 'or']
indices: [23, 19, 24, 25, 26, 27, 28, 29, 30, 29, 31, 32]
words: ['re', 'use', 'it', 'under', 'the', 'terms', 'of', 'the', 'project', 'gutenberg', 'license', 'included']
indices: [33, 15, 29, 34, 1, 35, 5, 1, 2, 3, 36, 37]
words: ['with', 'this', 'ebook', 'or', 'online', 'at', 'www', 'gutenberg', 'net']
indices: [22, 12, 4, 32, 38, 18, 39, 3, 40]
words: ['']
indices: [0]
words: ['']
indices: [0]
words: ['title', 'the', 'time', 'machine']
indices: [41, 1, 6, 7]
