In [1]:
# Number of worker threads handed to the gensim model (`workers=`).
num_threads = 4
# Path the trained word vectors are saved to.
# NOTE(review): the name says "fasttext" but w2vType below currently selects
# Word2Vec -- confirm the file name is still what downstream code expects.
embed_name = 'fasttext100d'
# JSON file mapping word -> id.
vocab_name = 'vocab.json'
# JSON files mapping sentence id -> sequence of word ids (train and test).
sent2seq_name = 'sent2seq.json'
sent2seq_test_name = 'sent2seq_test.json'
w2vType = 'Word2Vec'  # 'FastText' selects gensim FastText; any other value trains Word2Vec
# Passed to the model as `min_count=`.
min_count = 1
# Embedding dimensionality (passed to the model as `size=`).
n_dim = 100

In [2]:
import json
from gensim.models import Word2Vec, KeyedVectors, FastText

In [3]:
# Special tokens wrapped around every training sentence:
# BOS is prepended; EOS followed by two PADs is appended.
BOS = '<bos>'
EOS = '<eos>'
PAD = '<pad>'
# Unknown-word token; not referenced by name in the cells shown here.
UNK = '<unk>'

In [4]:
##### Data Loader
def _load_json(path):
    """Parse the UTF-8 JSON file at `path`, closing the handle when done."""
    with open(path, 'r', encoding='utf-8') as fh:
        return json.load(fh)


print('loading ' + vocab_name)
vocab = _load_json(vocab_name)
# Inverse mapping (id -> word) used to turn id sequences back into tokens.
vocab_inv = {ind: word for word, ind in vocab.items()}
print('done')

print('loading ' + sent2seq_name)
Sent2Seq = _load_json(sent2seq_name)
Sent2Seq_test = _load_json(sent2seq_test_name)
print('done')

# Ids of the special tokens in the loaded vocabulary.
iBOS = vocab[BOS]
iPAD = vocab[PAD]
iEOS = vocab[EOS]

# Training corpus: every train sentence followed by every test sentence, each
# decoded to tokens and framed as <bos> ... <eos> <pad> <pad>.
sentences = [
    [BOS] + [vocab_inv[wid] for wid in seq] + [EOS, PAD, PAD]
    for corpus in (Sent2Seq, Sent2Seq_test)
    for seq in corpus.values()
]

print('done')

loading vocab.json
done
loading sent2seq.json
done
done


In [5]:
# Choose the embedding class once; anything other than 'FastText' means Word2Vec.
model_cls = FastText if w2vType == 'FastText' else Word2Vec
model = model_cls(size=n_dim, window=5, min_count=min_count, workers=num_threads)

# Scan the corpus to build the vocabulary, then train for 5 epochs.
model.build_vocab(sentences)
total_examples = model.corpus_count
# Kept as the last expression so the value returned by train() is shown as the cell output.
model.train(sentences, total_examples=total_examples, epochs=5)

(14458973, 19306885)

In [6]:
# Sanity check: print the word with id 64 and its nearest neighbours
# in the trained embedding space.
word = vocab_inv[64]
print(word)
neighbours = model.wv.similar_by_word(word)
print(neighbours)

怎么
[('如何', 0.7661803364753723), ('怎样', 0.7399766445159912), ('咋', 0.6407647132873535), ('怎么样', 0.6340466737747192), ('有没有', 0.60993891954422), ('正缘', 0.5764929056167603), ('是不是', 0.5703044533729553), ('过来', 0.5687381625175476), ('砀山县', 0.5649325251579285), ('花言巧语', 0.5643908977508545)]


In [7]:
# Persist only the word vectors (model.wv), not the full model, to `embed_name`.
model.wv.save(embed_name)

In [10]:
import numpy as np
# Work with the word vectors of the model trained above.
embed_model = model.wv
# Number of words in the trained embedding vocabulary.
VOCAB_SZ = len(embed_model.vocab)
# Rebinds `vocab` (previously the mapping loaded from vocab.json) to a new
# word -> row-index mapping that is filled in further down this cell.
# NOTE(review): '<unk>' gets index VOCAB_SZ, which is one past the last row of
# the (VOCAB_SZ, n_dim) matrix built below, unless '<unk>' is itself in the
# trained vocabulary and is overwritten by the loop -- confirm the intent.
vocab = {'<unk>':VOCAB_SZ}

def l2_normd_np(a):
    """Return `a` divided by its norm as computed by `np.linalg.norm`.

    When that norm is zero, `0 * a` is returned instead, so no division
    by zero takes place.
    """
    norm = np.linalg.norm(a)
    if norm == 0:
        return 0 * a
    return a / norm

# Start from small uniform noise in [-0.1, 0.1).
# NOTE(review): every one of the VOCAB_SZ rows is overwritten in the loop
# below, so none of this noise survives, and the matrix has no row for the
# '<unk>' -> VOCAB_SZ entry placed in `vocab` above -- confirm whether an
# extra row was intended.
wv_matrix = (np.random.rand(VOCAB_SZ, n_dim) - 0.5) / 5.0

# Row `wid` holds the unit-length vector of the word at index `wid`;
# `vocab` records the same word -> row mapping.
for wid, word in enumerate(embed_model.index2word[:VOCAB_SZ]):
    wv_matrix[wid] = l2_normd_np(embed_model[word])
    vocab[word] = wid

# np.save appends ".npy" to a file name that lacks it.
np.save("wv_matrix", wv_matrix)
print('done')

done
