# Preprocessing

### Setup
`vocab_name`, `sent2seq_name`: the output files written by this notebook; they will be used later by the word-embedding and RNN training steps.

In [1]:
# Notebook-wide settings; edit these before running the cells below.

# Thread count setting; not referenced by any cell visible in this notebook.
num_threads = 4
# Input CSV files. Only train_name is opened by the data-loader cell;
# test_name is not referenced in the visible cells.
train_name = 'train.csv'
test_name = 'test.csv'
mode = 'chinese' # 'english' or 'chinese': selects which CSV columns hold the sentence pair
# Output files: word -> index mapping, and sentence id -> list of word indices.
vocab_name = 'vocab.json'
sent2seq_name = 'sent2seq.json'
# Minimum frequency a word needs to enter the vocabulary; the special tokens
# are seeded with this same count in the frequency-counting cell.
min_count = 1

In [2]:
from multiprocessing.pool import ThreadPool as Pool
import json
from tqdm import tqdm_notebook as tqdm
import jieba.posseg as pseg
import csv

In [3]:
# Special vocabulary tokens. In the visible cells only UNK is used by name:
# it replaces empty sentences in the data loader and out-of-vocabulary words
# in words2seq. The same four strings are also the initial keys of the
# frequency table.
BOS = '<bos>'
EOS = '<eos>'
PAD = '<pad>'
UNK = '<unk>'

### Data Loader
Loads the sentences into a dictionary, keyed by their id.

In [4]:
print("Loading {} ...".format(train_name))

# Column indices of the two sentences of each pair, chosen by language.
# Validating `mode` up front gives a clear error instead of a NameError (or,
# on a cell re-run, silently reusing stale sent1/sent2 values) further down.
if mode == 'english':
    col1, col2 = 5, 6
elif mode == 'chinese':
    col1, col2 = 3, 4
else:
    raise ValueError("mode must be 'english' or 'chinese', got {!r}".format(mode))

# sents maps sentence id -> raw sentence text; the first occurrence of an id wins.
sents = {}
# `with` guarantees the CSV file handle is closed once loading finishes.
with open(train_name, newline='', encoding='utf-8') as train_file:
    reader = csv.reader(train_file, delimiter=',', quotechar='"')
    next(reader, None)  # skip the header row
    for fields in reader:
        tid1, tid2 = fields[1:3]
        # Empty sentences are replaced by the UNK token.
        sent1 = fields[col1] or UNK
        sent2 = fields[col2] or UNK
        sents.setdefault(tid1, sent1)
        sents.setdefault(tid2, sent2)
NUM_DATA = len(sents)
print("done. {} data loaded.".format(NUM_DATA))

Loading train.csv ...
done. 167564 data loaded.


### Frequency counting
This cell segments the sentences into words, then accumulates the frequency of every word. This is useful if we want to eliminate low-frequency words, i.e. when min_count > 1.

In [5]:
# Seed the special tokens with `min_count` so they always satisfy the
# `f >= min_count` filter applied when the vocabulary is built.
freq = {PAD: min_count, BOS: min_count, EOS: min_count, UNK: min_count}

for key, sent in tqdm(sents.items()):
    segsent = []
    for w, flag in pseg.cut(sent):
        # Skip tokens jieba tags as 'x' (presumably punctuation / non-word
        # tokens -- confirm against the jieba docs). Compare by value: `is not`
        # tests object identity, which is implementation-dependent for strings
        # and raises a SyntaxWarning on Python 3.8+.
        if flag != 'x':
            freq[w] = freq.get(w, 0) + 1
            segsent.append(w)
    # Replace the raw sentence with its word list. Re-assigning an existing
    # key does not change the dict's size, so it is safe while iterating.
    sents[key] = segsent
print(len(freq))

# Persist the intermediate results; `with` makes sure the files are flushed
# and closed before the next cell reads them back.
with open("tmp_word_freq.json", 'w', encoding='utf-8') as freq_file:
    json.dump(freq, freq_file)
with open("tmp_seg_words.json", 'w', encoding='utf-8') as seg_file:
    json.dump(sents, seg_file)

HBox(children=(IntProgress(value=0, max=167564), HTML(value='')))

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.890 seconds.
Prefix dict has been built succesfully.



71055


### Vocabulary
This cell uses the frequencies counted above to select the words that go into the vocabulary: every word whose frequency is at least min_count.

In [6]:
# Reload the intermediate files written by the frequency-counting cell.
# `with` closes each file handle as soon as it has been parsed.
with open("tmp_word_freq.json", 'r', encoding='utf-8') as freq_file:
    freq = json.load(freq_file)
with open("tmp_seg_words.json", 'r', encoding='utf-8') as seg_file:
    sents = json.load(seg_file)

# Assign consecutive indices, in frequency-table order, to every word whose
# frequency reaches min_count.
kept_words = (w for w, f in freq.items() if f >= min_count)
vocab = {w: idx for idx, w in enumerate(kept_words)}
print(len(vocab))

71055


### Sent2Seq
This cell translates all sentences into sequences of word indices, then stores the sequences and the vocabulary in their corresponding files.

In [7]:
def words2seq(pair):
    """Convert one (key, words) pair into (key, list of vocabulary indices).

    Each word is looked up in the module-level `vocab`; words that are not
    in it are mapped to the index of the UNK token.
    """
    key, words = pair[0], pair[1]
    # The UNK lookup stays lazy: it only happens for out-of-vocabulary words.
    out_seq = [vocab[w] if w in vocab else vocab[UNK] for w in words]
    return (key, out_seq)

# Map every segmented sentence to its index sequence. The right-hand side is
# fully built before `sents` is rebound, so the dict being iterated is never
# modified.
sents = dict(words2seq(pair) for pair in tqdm(sents.items()))

print('dumping data to ' + vocab_name)
# `with` guarantees both output files are flushed and closed, so later
# notebooks never read a partially written file.
with open(vocab_name, 'w', encoding='utf-8') as vocab_file:
    json.dump(vocab, vocab_file)
with open(sent2seq_name, 'w') as seq_file:
    json.dump(sents, seq_file)
print('done')

HBox(children=(IntProgress(value=0, max=167564), HTML(value='')))


dumping data to vocab.json
done


In [None]:
def clean():
    """Delete the intermediate JSON files written by the frequency-counting cell.

    Uses os.remove instead of the IPython-only `!rm` shell escape, so the
    function is valid plain Python and works on any platform. Files that do
    not exist are skipped, so calling it more than once is harmless.
    """
    import os  # local import: keeps this cell self-contained

    for tmp_path in ("tmp_word_freq.json", "tmp_seg_words.json"):
        try:
            os.remove(tmp_path)
        except FileNotFoundError:
            pass  # nothing to delete for this file
# Uncomment to remove the temporary files:
# clean()