# 1. prepair training set

In [2]:
# usd BFS search, return all corpus files as a list of string
import os
def get_corpus(corpus_path):
#     corpus_path = '../week_2/wiki_corpus/wiki_chs'
    corpus_list = []
    need_visit = [corpus_path]
    while need_visit:
        path = need_visit.pop(0)
        if os.path.isdir(path):
            need_visit += [path + '/' + p for p in os.listdir(path)]
        elif '.DS_Store' not in path:
            corpus_list.append(path)
    return corpus_list

# save the results on distk
import pickle

def save_obj(obj, file_name):
    pickle.dump(obj, open(file_name, 'wb'))
    
def load_obj(file_name):
    obj = pickle.load(open(file_name, 'rb'))
    return obj

corpus_list = get_corpus('../week_2/wiki_corpus/wiki_chs') 
print('first 10:', corpus_list[:10])
print('\ntotal corpus:', len(corpus_list))

first 10: ['../week_2/wiki_corpus/wiki_chs/AE/wiki_52', '../week_2/wiki_corpus/wiki_chs/AE/wiki_66', '../week_2/wiki_corpus/wiki_chs/AE/wiki_36', '../week_2/wiki_corpus/wiki_chs/AE/wiki_75', '../week_2/wiki_corpus/wiki_chs/AE/wiki_90', '../week_2/wiki_corpus/wiki_chs/AE/wiki_06', '../week_2/wiki_corpus/wiki_chs/AE/wiki_95', '../week_2/wiki_corpus/wiki_chs/AE/wiki_85', '../week_2/wiki_corpus/wiki_chs/AE/wiki_07', '../week_2/wiki_corpus/wiki_chs/AE/wiki_20']

total corpus: 1230


## 1.1 preprocessing

In [2]:
import re
import jieba
from functools import reduce

def replace_nums(string):
    return '<num>' if string.isnumeric() else string

def process_sent(sent:list):
    return [replace_nums(w) for w in sent]
    
def tokenize_string(string):
    string = ' '.join(re.findall('.+',string))
    docs = [doc.split('。') for doc in re.findall('<doc.+?">(.+?)</doc>',string)]
    sents = [process_sent(jieba.cut(sent.strip()))+['。'] for sent in reduce(lambda a,b:a+b, docs) if sent != ' ']
    return sents

def get_training_data(input_file):
    with open(input_file, 'r') as f:
        tokens = tokenize_string(f.read())
        return tokens


## 1.2 multi-process

In [3]:
#Multicore map reduce
import time
import multiprocessing
from tqdm import tqdm
import logging
jieba.setLogLevel(logging.WARNING)

def multiprocess(batch_file, target_func):
    pool = multiprocessing.Pool()
    result = pool.map(target_func, batch_file)
    pool.close()
    pool.join()
    result = reduce(lambda x, y : x + y, result)
    return result

def save_training_set(input_files:list, output_file:str, batch_size=128):
    def cut_list(lst, size): 
        return [lst[x:x+size] for x in range(0, len(lst), size)]
    vocab = set()
    counter = 0
    for batch in tqdm(cut_list(input_files, batch_size)):
        result = multiprocess(batch, get_training_data)
        counter += 1 
        save_obj(result, output_file+str(counter))
        for sent in result:
            for w in sent:
                vocab.add(w)
    save_obj(list(vocab), output_file + '_vocabulary')

# 1.3 calculate and save data at the first time

In [4]:
!mkdir data
!echo 'binary files in **data** are cutted by **jieba**' > README.md
save_training_set(corpus_list, 'data/training_set')

100%|██████████| 10/10 [10:42<00:00, 64.24s/it]


# 2. train word2vec

In [8]:
from gensim.models import Word2Vec
from tqdm import tqdm

def save_model(file_name):
    corpus_list = sorted(get_corpus('data'), key=lambda s:len(s))
    print(corpus_list)
    model = Word2Vec(min_count=1, workers=4)
    model.build_vocab(load_obj(corpus_list.pop()))

    for training_data in tqdm(corpus_list):
        model.train(load_obj(training_data) , total_examples=model.corpus_count, epochs=model.iter)

    model.save(file_name)

## 2.1 save model at the first time

In [9]:
save_model('data/word2vec.model')

['data/training_set7', 'data/training_set6', 'data/training_set1', 'data/training_set2', 'data/training_set3', 'data/training_set8', 'data/training_set5', 'data/training_set4', 'data/training_set9', 'data/training_set10', 'data/training_set_vocabulary']


  # This is added back by InteractiveShellApp.init_path()
100%|██████████| 10/10 [04:42<00:00, 28.25s/it]
