# 1. prepair training set

In [1]:
# usd BFS search, return all corpus files as a list of string
import os
def get_corpus(corpus_path):
#     corpus_path = '../week_2/wiki_corpus/wiki_chs'
    corpus_list = []
    need_visit = [corpus_path]
    while need_visit:
        path = need_visit.pop(0)
        if os.path.isdir(path):
            need_visit += [path + '/' + p for p in os.listdir(path)]
        elif '.DS_Store' not in path:
            corpus_list.append(path)
    return corpus_list

# save the results on distk
import pickle

def save_obj(obj, file_name):
    pickle.dump(obj, open(file_name, 'wb'))
    
def load_obj(file_name):
    obj = pickle.load(open(file_name, 'rb'))
    return obj

corpus_list = get_corpus('../week_2/wiki_corpus/wiki_chs') 
print('first 10:', corpus_list[:10])
print('\ntotal corpus:', len(corpus_list))

first 10: ['../week_2/wiki_corpus/wiki_chs/AE/wiki_52', '../week_2/wiki_corpus/wiki_chs/AE/wiki_66', '../week_2/wiki_corpus/wiki_chs/AE/wiki_36', '../week_2/wiki_corpus/wiki_chs/AE/wiki_75', '../week_2/wiki_corpus/wiki_chs/AE/wiki_90', '../week_2/wiki_corpus/wiki_chs/AE/wiki_06', '../week_2/wiki_corpus/wiki_chs/AE/wiki_95', '../week_2/wiki_corpus/wiki_chs/AE/wiki_85', '../week_2/wiki_corpus/wiki_chs/AE/wiki_07', '../week_2/wiki_corpus/wiki_chs/AE/wiki_20']

total corpus: 1230


## 1.1 preprocessing

In [2]:
import re
import jieba
from functools import reduce

def replace_nums(string):
    return re.sub('\d+','<num>', string)

def process_sent(sent:list):
    return [replace_nums(w) for w in sent]
    
def tokenize_string(string):
    string = ' '.join(re.findall('.+',string))
    docs = [doc.split('。') for doc in re.findall('<doc.+?">(.+?)</doc>',string)]
    
    sents = [process_sent(jieba.cut(sent.strip()))+['。'] for sent in reduce(lambda a,b:a+b, docs) if sent != ' ']
    return sents

def get_training_data(input_file):
    with open(input_file, 'r') as f:
        tokens = tokenize_string(f.read())
        return tokens

get_training_data

<function __main__.get_training_data(input_file)>

## 1.2 multi-process

In [3]:
#Multicore map reduce
import time
import multiprocessing
from tqdm import tqdm
import logging
jieba.setLogLevel(logging.WARNING)

def multiprocess(batch_file, target_func):
    pool = multiprocessing.Pool()
    result = pool.map(target_func, batch_file)
    pool.close()
    pool.join()
    result = reduce(lambda x, y : x + y, result)
    return result

def save_training_set(input_files:list, output_file:str, batch_size=64):
    def cut_list(lst, size): 
        return [lst[x:x+size] for x in range(0, len(lst), size)]
    
    counter = 0
    for batch in tqdm(cut_list(input_files, batch_size)):
        result = multiprocess(batch, get_training_data)
        counter += 1 
        save_obj(result, output_file+str(counter))

# 1.3 calculate and save data at the first time

In [4]:
!mkdir data
save_training_set(corpus_list, 'data/training_set')

mkdir: cannot create directory ‘data’: File exists


100%|██████████| 20/20 [11:04<00:00, 33.22s/it]


In [5]:
def demo_traning_set(n):
    traning_set = load_obj('data/training_set1')
    print(traning_set[:n])
    
demo_traning_set(10)

[['安济桥', ' ', '(', '衡水', ')', ' ', '衡水', '安济桥', '，', '又名', '衡水', '石桥', '或', '衡水', '老桥', '，', '座落在', '河北省', '衡水市', '滏阳', '河', '上', '，', '建成', '于', '乾隆', '三十一年', '（', '<num>', '年', '）', '。'], ['据', '《', '衡水', '县志', '》', '载', '：', '明朝', '天顺', '元年', '（', '<num>', '年', '）', '，', '衡水', '知县', '杨', '俨', '奏请', '上级', '批准', '修建', '一座', '木桥', '。'], ['后', '木桥', '因', '多次', '发大水', '屡修', '屡毁', '。'], ['至', '嘉靖', '三十二年', '（', '<num>', '年', '）', '，', '由', '衡水', '县', '徐廷', '募资', '修建', '石桥', '，', '隆庆', '三年', '（', '<num>', '年', '）', '复', '被', '大水', '冲毁', '，', '后', '由', '贾', '从仁', '募资', '修复', '。'], ['至', '清朝', '顺治', '五年', '（', '<num>', '年', '）', '由于', '滹沱河', '河床', '南徙', '并', '与', '滏阳', '河', '汇合', '，', '河水', '猛涨', '导致', '桥', '几乎', '被', '冲毁', '。'], ['清朝', '乾隆年间', '直隶', '总督', '方观', '承', '奏请', '朝廷', '批准', '，', '拨', '出', '银子', '四万七千', '六百多', '两', '修建', '石桥', '。'], ['派清', '河道', '周元理', '、', '通永道', '玉神保', '董理', '，', '同时', '知县', '陶淑', '亲自', '指挥', '。'], ['乾隆', '三十年', '（', '<num>', '年', '）', '正式', '动工', '，', '到', '翌年',

# 2. word2vec train online training
reference: https://rutumulkar.com/blog/2015/word2vec

In [6]:
from gensim.models import Word2Vec
from tqdm import tqdm

def save_model(file_name):
    corpus_list = get_corpus('data')
    print(corpus_list)
    model = Word2Vec(load_obj(corpus_list.pop()), min_count=3, workers=4)

    for training_data in tqdm(corpus_list):
        training_data = load_obj(training_data)
        model.build_vocab(training_data, update=True)
        model.train(training_data , total_examples=model.corpus_count, epochs=model.epochs)

    model.save(file_name)

## 2.1 save model at the first time

In [7]:
save_model('model/word2vec.model')

['data/training_set7', 'data/training_set6', 'data/training_set1', 'data/training_set2', 'data/training_set3', 'data/training_set8', 'data/training_set5', 'data/training_set4', 'data/training_set9', 'data/training_set11', 'data/training_set20', 'data/training_set15', 'data/training_set13', 'data/training_set16', 'data/training_set14', 'data/training_set18', 'data/training_set12', 'data/training_set19', 'data/training_set17', 'data/training_set10']


  if sys.path[0] == '':
100%|██████████| 19/19 [13:26<00:00, 42.45s/it]


## 2.2 demo

In [9]:
from gensim.models import Word2Vec
from tqdm import tqdm

model = Word2Vec.load('model/word2vec.model')

In [22]:
model.wv.similar_by_word('<num>')

[('<num>.<num>', 0.6439221501350403),
 ('年', 0.5513116121292114),
 ('月', 0.5263307690620422),
 ('下半年', 0.5020462870597839),
 ('截止', 0.4934613108634949),
 ('日', 0.4872220456600189),
 ('上半年', 0.48133814334869385),
 ('预计', 0.4806244969367981),
 ('生效日', 0.47555091977119446),
 ('单月', 0.4645977020263672)]

In [23]:
model.wv.similar_by_word('<num>%')

[('<num>.<num>%', 0.7951822280883789),
 ('一半', 0.772778332233429),
 ('百分之五', 0.7694447040557861),
 ('％', 0.7688212990760803),
 ('三分之一', 0.7555698156356812),
 ('百分之', 0.7324985265731812),
 ('百分之九十', 0.7284418344497681),
 ('百分之五十', 0.7273511290550232),
 ('五分之一', 0.7243691682815552),
 ('百分之二十', 0.7231431007385254)]

In [24]:
model.wv.similar_by_word('数学')

[('数学分析', 0.8253051042556763),
 ('微积分', 0.8048616051673889),
 ('高等数学', 0.7845650911331177),
 ('数论', 0.7810370326042175),
 ('概率论', 0.7800022959709167),
 ('数理逻辑', 0.7799492478370667),
 ('拓扑学', 0.7778240442276001),
 ('语言学', 0.7742013931274414),
 ('统计学', 0.7737069129943848),
 ('逻辑学', 0.7697933912277222)]

In [33]:
model.wv.similar_by_word('中国')

[('中华人民共和国', 0.6557179689407349),
 ('中华民国', 0.5823049545288086),
 ('外国', 0.5758571624755859),
 ('我国', 0.548123836517334),
 ('东亚', 0.5338834524154663),
 ('欧美', 0.51898193359375),
 ('西方', 0.5143820643424988),
 ('近代', 0.5110182166099548),
 ('JICST', 0.509065568447113),
 ('曾毅', 0.500535249710083)]