In [1]:
import gc
import pickle
import numpy as np

In [2]:
# Number of values per embedding vector; load_embedding() uses it as the
# column count of the matrix it allocates.
EMBEDDING_SIZE = 300
# Symbols that load_embedding() registers first, giving them ids 0 and 1 ahead
# of any symbol read from an embedding file. Row 0 of the matrix is left
# all-zero and row 1 is filled with random normal values.
SPECIAL_SYMBOLS = ['<PAD>', '<UNK>']

In [3]:
def load_embedding(embedding_file, embedding_size=None, special_symbols=None):
    """Load a text-format embedding file into lookup tables and a matrix.

    The first line of the file is treated as a description line and is always
    skipped. Every following non-blank line must hold a symbol followed by
    ``embedding_size`` whitespace-separated numbers. The last
    ``embedding_size`` fields are taken as the vector, so a symbol may itself
    contain internal whitespace.

    Args:
        embedding_file: Path to the UTF-8 encoded embedding text file.
        embedding_size: Number of values per vector. Defaults to the
            module-level ``EMBEDDING_SIZE``.
        special_symbols: Symbols that receive ids ``0, 1, ...`` ahead of the
            symbols read from the file. Defaults to the module-level
            ``SPECIAL_SYMBOLS``.

    Returns:
        A tuple ``(id_to_symbol_map, symbol_to_id_map, embeddings)`` where
        ``embeddings`` is a float32 array of shape
        ``(len(special_symbols) + number_of_file_entries, embedding_size)``.
        Rows of special symbols are zero, except the row of ``'<UNK>'`` (when
        present), which is drawn from a standard normal distribution with a
        fixed seed.

    Raises:
        ValueError: If a line does not contain a symbol plus exactly
            ``embedding_size`` values, or a value is not a valid number.
    """
    if embedding_size is None:
        embedding_size = EMBEDDING_SIZE
    if special_symbols is None:
        special_symbols = SPECIAL_SYMBOLS

    # Iterate over the file instead of using read().splitlines(): splitlines()
    # also breaks on characters such as '\x0b', '\x85' or '\u2028', which may
    # legitimately occur inside a symbol, and it needs the whole file in memory
    # as one string.
    with open(embedding_file, 'r', encoding='utf-8') as f:
        next(f, None)  # skip the embedding description line
        numbered_lines = [
            (line_number, raw_line.strip())
            for line_number, raw_line in enumerate(f, start=2)
            if raw_line.strip()  # ignore blank lines, e.g. a trailing newline
        ]

    id_to_symbol_map = dict()
    symbol_to_id_map = dict()
    for i, symbol in enumerate(special_symbols):
        id_to_symbol_map[i] = symbol
        symbol_to_id_map[symbol] = i

    # File entries start right after the special symbols, whatever their count.
    first_file_id = len(special_symbols)
    num_total_symbols = first_file_id + len(numbered_lines)
    embeddings = np.zeros((num_total_symbols, embedding_size), dtype=np.float32)

    unk_id = symbol_to_id_map.get('<UNK>')
    if unk_id is not None:
        # A private generator yields the same values as np.random.seed(42)
        # followed by np.random.randn(...), without resetting the global RNG.
        embeddings[unk_id] = np.random.RandomState(42).randn(embedding_size)

    for index, (line_number, line) in enumerate(numbered_lines, start=first_file_id):
        # Split from the right so that only the trailing values are separated;
        # whatever precedes them is the symbol.
        fields = line.rsplit(None, embedding_size)
        if len(fields) != embedding_size + 1:
            raise ValueError(
                "{}: line {} does not contain a symbol followed by {} values".format(
                    embedding_file, line_number, embedding_size))
        symbol, values = fields[0], fields[1:]
        try:
            embeddings[index] = np.array(values, dtype=np.float32)
        except ValueError as exc:
            raise ValueError(
                "{}: line {} contains a non-numeric embedding value".format(
                    embedding_file, line_number)) from exc
        id_to_symbol_map[index] = symbol
        symbol_to_id_map[symbol] = index

    return id_to_symbol_map, symbol_to_id_map, embeddings

In [4]:
# --- Character embeddings: parse the raw text file ---------------------------
char_embedding_file = "../embeddings/datagrand-char-300d.txt"
print("[INFO] Load character embeddings...")
id_to_char_map, char_to_id_map, char_embeddings = load_embedding(char_embedding_file)
print("[INFO] Finished!")

# --- Character embeddings: write the lookup tables and the matrix ------------
id_to_char_file = "../embeddings/id2char.pkl"
char_to_id_file = "../embeddings/char2id.pkl"
char_embedding_resave_file = "../embeddings/char-embedding-300d.npy"

print("[INFO] Save character embeddings...")
# Both lookup tables are pickled the same way, id->char first, then char->id.
for pickle_path, mapping in (
    (id_to_char_file, id_to_char_map),
    (char_to_id_file, char_to_id_map),
):
    with open(pickle_path, 'wb') as fout:
        pickle.dump(mapping, fout)
# The matrix is stored in numpy's own .npy format.
np.save(char_embedding_resave_file, char_embeddings)
print("[INFO] Finish!")

[INFO] Load character embeddings...
[INFO] Finished!
[INFO] Save character embeddings...
[INFO] Finish!


In [8]:
# --- Word embeddings: parse the raw text file ---------------------------------
word_embedding_file = "../embeddings/datagrand-word-300d-mc5.txt"
print("[INFO] Load word embeddings...")
id_to_word_map, word_to_id_map, word_embeddings = load_embedding(word_embedding_file)
print("[INFO] Finished!")

# --- Word embeddings: write the lookup tables and the matrix ------------------
id_to_word_file = "../embeddings/id2word.pkl"
word_to_id_file = "../embeddings/word2id.pkl"
word_embedding_resave_file = "../embeddings/word-embedding-300d-mc5.npy"

print("[INFO] Save word embeddings...")
# Both lookup tables are pickled the same way, id->word first, then word->id.
for pickle_path, mapping in (
    (id_to_word_file, id_to_word_map),
    (word_to_id_file, word_to_id_map),
):
    with open(pickle_path, 'wb') as fout:
        pickle.dump(mapping, fout)
# The matrix is stored in numpy's own .npy format.
np.save(word_embedding_resave_file, word_embeddings)
print("[INFO] Finished!")

[INFO] Load word embeddings...
[INFO] Finished!
[INFO] Save word embeddings...
[INFO] Finished!


In [9]:
# Spot-check the first vector read from the word-embedding file: rows 0 and 1
# of the matrix belong to the special symbols, so file entries start at row 2.
word_embeddings[2]

array([ 1.2212521 , -0.06116424, -0.19192894,  0.51793087,  0.1848969 ,
        0.9527769 , -0.5778866 ,  0.11118562,  0.11691932,  0.25371534,
        0.21012627,  0.62569594,  0.17096189, -0.78364635,  0.5207289 ,
       -0.49645147, -0.5726804 ,  0.05635048, -0.08855154,  0.59203625,
        0.7423577 ,  0.69640523,  0.9022525 ,  0.35679343,  0.05574984,
       -0.69267005,  0.82577163, -0.36317262,  0.9017204 , -0.8380331 ,
        0.03586778, -0.65469474, -0.08629435, -0.31037307,  0.23852949,
        0.40245318, -0.03340949, -0.22538517,  0.46168897,  0.09116583,
        0.39175525, -0.434022  ,  0.24586189,  0.89946675, -0.73305327,
       -0.5462546 ,  1.1267072 ,  0.4467849 , -0.25153682, -0.2693534 ,
       -0.31919602, -0.05951995, -0.48350054, -0.6665287 , -0.02056065,
        0.40230462,  0.06258436,  0.54501843, -0.46241704, -0.10373703,
       -0.43801343,  0.08152869, -0.34052062, -0.03037879,  0.01073134,
        1.0435497 , -0.25409582, -1.0748522 ,  1.0755184 ,  0.26