In [1]:
num_threads = 4
w2vType = 'Word2Vec' # or Word2Vec 'FastText'
n_dim = 100
embed_name = '{}{}d'.format(w2vType, n_dim)
vocab_name = 'vocab.json'
matrix_name = 'wv_matrix{}d'.format(n_dim)
filename = 'mlds_hw2_2_data/clr_conversation.txt'

In [2]:
import json
from gensim.models import Word2Vec, KeyedVectors, FastText
from tqdm import tqdm_notebook as tqdm
import numpy as np

In [3]:
BOS = '<bos>'
EOS = '<eos>'
PAD = '<pad>'
UNK = '<unk>'
MAX_Q_LEN = 10
MAX_A_LEN = 10

In [4]:
sents = []
def loadsents(name):
    with open(name, 'r', encoding='utf-8') as f:
        for s in f:
            if s is not "+++$+++":
                sents.append([BOS] + s.split() + [EOS] + [PAD])
loadsents(filename)
sents[0]

['<bos>', '美國', '2500', '萬名', '老兵', '致敬', '<eos>', '<pad>']

In [5]:
from gensim.models.callbacks import CallbackAny2Vec
class EpochLogger(CallbackAny2Vec):
    '''Callback to log information about training'''
    def __init__(self):
        self.epoch = 0

    def on_epoch_begin(self, model):
        print("Epoch #{} start".format(self.epoch))

    def on_epoch_end(self, model):
        print("Epoch #{} end".format(self.epoch))
        self.epoch += 1

In [6]:
epoch_logger = EpochLogger()
if w2vType == 'FastText':
    model = FastText(size=n_dim, window=5, min_count=3, workers=num_threads, callbacks=[epoch_logger])
else:
    model = Word2Vec(size=n_dim, window=5, min_count=3, workers=num_threads, callbacks=[epoch_logger])

model.build_vocab(sents)
total_examples = model.corpus_count
model.train(sents, total_examples=total_examples, epochs=5)

Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end


(67560264, 124505000)

In [7]:
word = '狗'
print(word)
print(model.wv.similar_by_word(word))

狗
[('小狗', 0.7613780498504639), ('老鼠', 0.7343587875366211), ('猴子', 0.7112685441970825), ('雞', 0.6920151710510254), ('貓', 0.6781436204910278), ('烏鴉', 0.6681642532348633), ('鳥', 0.6631174087524414), ('這狗', 0.6585632562637329), ('老虎', 0.644609808921814), ('蛋', 0.6358624696731567)]


In [8]:
model.wv.save(embed_name)

In [9]:
embed_model = model.wv
VOCAB_SZ = len(embed_model.vocab)
vocab = {'<unk>':VOCAB_SZ}

def l2_normd_np(a):
    b = np.linalg.norm(a)
    return 0*a if b == 0 else a / b

wv_matrix = (np.random.rand(VOCAB_SZ+1, n_dim) - 0.5) / 5.0
for wid in range(VOCAB_SZ):
    word = embed_model.index2word[wid]
    vec = embed_model[word]
    wv_matrix[wid] = l2_normd_np(vec)
    vocab[word] = wid
    
np.save(matrix_name, wv_matrix)
json.dump(vocab, open(vocab_name, 'w', encoding='utf-8'))
print('done')

done


In [10]:
vocab = json.load(open(vocab_name, 'r', encoding='utf-8'))
wv_matrix = np.load(matrix_name+'.npy')

word1 = '狗'
word2 = '猴子'
print(word1, word2)
index1 = vocab[word1]
index2 = vocab[word2]
print(wv_matrix[index1] @ wv_matrix[index2])

狗 猴子
0.711268583801
