In [1]:
# --- Embedding model settings -------------------------------------------
w2vType = 'FastText' # or Word2Vec
n_dim = 200          # embedding dimensionality (passed as `size` to the model)
num_threads = 4      # worker threads used for training

# --- Input / output file names ------------------------------------------
vocab_name = 'vocab.json'
sent2seq_name = 'sent2seq.json'
sent2seq_test_name = 'sent2seq_test.json'
# NOTE(review): the output name says "100d" while n_dim is 200 -- confirm
# which one is intended before downstream code relies on the file name.
embed_name = 'fasttext100d'

In [2]:
import json
import numpy as np
from gensim.models import Word2Vec, KeyedVectors, FastText

In [3]:
# Special vocabulary tokens: begin-of-sentence, end-of-sentence, padding and
# unknown-word markers.
BOS, EOS, PAD, UNK = '<bos>', '<eos>', '<pad>', '<unk>'

# Length limits; presumably maximum question / answer lengths in tokens --
# they are not referenced by the cells visible in this notebook (TODO confirm).
MAX_Q_LEN = MAX_A_LEN = 20

In [4]:
##### Data Loader
# Load the word -> id vocabulary and build the inverse id -> word lookup.
# Files are opened with `with` so the handles are closed deterministically,
# and both JSON files are read as UTF-8 for consistency.
print('loading ' + vocab_name)
with open(vocab_name, 'r', encoding='utf-8') as vocab_file:
    vocab = json.load(vocab_file)
vocab_inv = {ind: word for word, ind in vocab.items()}
print('done')

# Load the sentence-id -> sequence-of-word-ids mapping used as training data.
print('loading ' + sent2seq_name)
with open(sent2seq_name, 'r', encoding='utf-8') as sent2seq_file:
    Sent2Seq = json.load(sent2seq_file)
# with open(sent2seq_test_name, 'r', encoding='utf-8') as sent2seq_test_file:
#     Sent2Seq_test = json.load(sent2seq_test_file)
print('done')

# Integer ids of the special tokens.
iBOS = vocab[BOS]
iPAD = vocab[PAD]
iEOS = vocab[EOS]

# Convert every id sequence back to words and wrap it as
# <bos> w1 ... wn <eos> <pad> <pad>.
# NOTE(review): the two trailing PAD tokens presumably exist so that PAD
# receives a trained embedding -- confirm the intent.
sentences = [
    [BOS] + [vocab_inv[wid] for wid in seq] + [EOS, PAD, PAD]
    for seq in Sent2Seq.values()
]
# sentences.extend(
#     [BOS] + [vocab_inv[wid] for wid in seq] + [EOS, PAD, PAD]
#     for seq in Sent2Seq_test.values()
# )

print('done')

loading vocab.json
done
loading sent2seq.json
done
done


In [5]:
from gensim.models.callbacks import CallbackAny2Vec
class EpochLogger(CallbackAny2Vec):
    """Training callback that prints a line when each epoch starts and ends."""

    def __init__(self):
        # Zero-based index of the epoch currently being reported.
        self.epoch = 0

    def on_epoch_begin(self, model):
        """Print the start message for the current epoch (`model` is unused)."""
        message = "Epoch #{} start".format(self.epoch)
        print(message)

    def on_epoch_end(self, model):
        """Print the end message, then advance the epoch counter by one."""
        message = "Epoch #{} end".format(self.epoch)
        print(message)
        self.epoch += 1

In [6]:
# Choose the embedding class from the config; anything other than 'FastText'
# falls back to Word2Vec. Both are built with identical hyper-parameters.
# NOTE(review): `size=` is the gensim 3.x keyword; newer gensim releases
# renamed it -- confirm the installed version before upgrading.
embedding_cls = FastText if w2vType == 'FastText' else Word2Vec
model = embedding_cls(size=n_dim, window=5, min_count=1, workers=num_threads)

# Scan the corpus once to build the model vocabulary, then train for 10
# epochs, logging the start and end of each epoch through the callback.
model.build_vocab(sentences)
logger = EpochLogger()
total_examples = model.corpus_count
model.train(sentences, total_examples=total_examples, epochs=10, callbacks=[logger])

Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end
Epoch #5 start
Epoch #5 end
Epoch #6 start
Epoch #6 end
Epoch #7 start
Epoch #7 end
Epoch #8 start
Epoch #8 end
Epoch #9 start
Epoch #9 end


In [7]:
# Sanity check: look up the word with id 64 and print its nearest
# neighbours in the trained embedding space.
word = vocab_inv[64]
print(word)
nearest_neighbours = model.wv.similar_by_word(word)
print(nearest_neighbours)

30
[('530', 0.9054430723190308), ('430', 0.9018285274505615), ('40', 0.8787569999694824), ('152', 0.8763628602027893), ('156', 0.868262529373169), ('154', 0.8674358129501343), ('159', 0.8603119850158691), ('130', 0.8566989302635193), ('230', 0.8561074733734131), ('155', 0.8538743257522583)]


In [8]:
# Persist the trained word vectors (`model.wv`) to the path in `embed_name`.
model.wv.save(embed_name)

In [12]:
import numpy as np
# Number of entries in the vocabulary; used as the row count of the
# embedding matrix built below.
VOCAB_SZ = len(vocab)
def l2_normd_np(a):
    """Return `a` scaled to unit L2 norm.

    A zero-norm input cannot be normalised, so it is returned as `0 * a`
    (all zeros, same shape) instead of dividing by zero.
    """
    norm = np.linalg.norm(a)
    if norm == 0:
        return 0 * a
    return a / norm

# Start every row as small uniform noise in [-0.1, 0.1). Rows for words that
# have a trained vector are overwritten below; rows for words the model
# cannot embed keep this random initialisation instead of aborting the cell.
wv_matrix = (np.random.rand(VOCAB_SZ, n_dim) - 0.5) / 5.0
missing_words = []
for wid in range(VOCAB_SZ):
    # Assumes word ids are the contiguous range 0..VOCAB_SZ-1; a gap in the
    # ids would still raise KeyError here, which is intentional.
    word = vocab_inv[wid]
    try:
        vec = model.wv[word]
    except KeyError:
        # No trained vector for this word (e.g. it never occurred in the
        # training sentences): keep the random row.
        missing_words.append(word)
        continue
    # Store the unit-length version of the trained vector.
    wv_matrix[wid] = l2_normd_np(vec)

if missing_words:
    print("{} word(s) had no trained vector; kept random init.".format(len(missing_words)))
print("done.")

done.


In [14]:
# Write the embedding matrix to disk in NumPy's binary format.
# Per the NumPy docs, np.save appends ".npy" when the name has no such
# extension, so the file on disk should be "wv_matrix.npy".
np.save("wv_matrix", wv_matrix)