## Requirements

In [None]:
!pip3 install fasttext pandas

## generateEmbeddingDatabase.py

In [None]:
import sqlite3
from contextlib import closing

import fasttext
import pandas as pd

In [None]:
"""
    Load the pretrained model
"""

# Load the binary fastText model from the working directory.
# NOTE(review): presumably trained on Japanese Wikipedia, judging by the file name — confirm.
model = fasttext.load_model("jawiki_fasttext.bin")
# Bare expression: as the last statement of the cell, the notebook echoes the model object.
model

In [None]:
"""
    Embedding Matrix
"""

# Input-side weight matrix of the model; later cells index its rows by word ID and subword ID.
input_matrix = model.get_input_matrix()

In [None]:
"""
    Word -> Embedding
"""

# Export one row per vocabulary word to table WORD_EMBED of EMBEDDINGS.db:
# the word goes in the WORD index column and its input-matrix row is spread
# over columns DIM_0, DIM_1, ...

# Word -> ID
# Fetch the vocabulary once and reuse it; a word's ID is taken to be its
# position in this list.
vocab_words = model.get_words()
word_map = pd.Series(range(len(vocab_words)), index=vocab_words)
print(word_map)

# Word -> Embedding
word_embed = pd.DataFrame(input_matrix[word_map.values], index=word_map.index)
word_embed.index.name = "WORD"
word_embed.columns = "DIM_" + word_embed.columns.astype(str)

# Write to the database
# sqlite3's connection context manager only commits or rolls back, so
# closing() is what actually releases the connection; the inner `con`
# keeps the commit-on-success / rollback-on-error behaviour.
with closing(sqlite3.connect('EMBEDDINGS.db')) as con, con:
    word_embed.to_sql(
        name='WORD_EMBED',
        con=con,
        if_exists='replace',
        index=True,
        # The default insert method (one row per statement) is used on purpose:
        # method='multi' binds chunksize * (columns + 1) parameters in a single
        # INSERT, which overruns SQLite's default bound-parameter limit for wide
        # embedding tables ("too many SQL variables"). The rows written are the same.
        chunksize=1000,
    )

# Free the large intermediates before the next cell allocates more.
del vocab_words, word_map, word_embed

In [None]:
# Write an empty CSV file. The file name ("完了" means "done") suggests it acts as a
# completion marker for the word-embedding export — presumably so progress can be
# checked from outside the notebook; confirm with the author.
pd.DataFrame().to_csv("完了1.csv")

In [None]:
"""
    SubWord -> Embedding
"""

# Export one row per distinct subword to table SUBWORD_EMBED of EMBEDDINGS.db:
# the subword goes in the SUBWORD index column and its input-matrix row is
# spread over columns DIM_0, DIM_1, ...

# SubWord -> ID
# get_subwords() yields a (subword strings, row IDs) pair for each word;
# zip(*...) turns that pair into (subword, ID) items that dict.update()
# consumes directly, so no throwaway dict is built per word.
# NOTE(review): get_subwords() appears to return the full word itself next to
# its n-grams, so this table likely overlaps WORD_EMBED — confirm that is intended.
subwords = {}
for word in model.get_words():
    subwords.update(zip(*model.get_subwords(word)))
subword_map = pd.Series(subwords)
print(subword_map)

# SubWord -> Embedding
subword_embed = pd.DataFrame(input_matrix[subword_map.values], index=subword_map.index)
subword_embed.index.name = "SUBWORD"
subword_embed.columns = "DIM_" + subword_embed.columns.astype(str)

# Write to the database
# sqlite3's connection context manager only commits or rolls back, so
# closing() is what actually releases the connection; the inner `con`
# keeps the commit-on-success / rollback-on-error behaviour.
with closing(sqlite3.connect('EMBEDDINGS.db')) as con, con:
    subword_embed.to_sql(
        name='SUBWORD_EMBED',
        con=con,
        if_exists='replace',
        index=True,
        # The default insert method (one row per statement) is used on purpose:
        # method='multi' binds chunksize * (columns + 1) parameters in a single
        # INSERT, which overruns SQLite's default bound-parameter limit for wide
        # embedding tables ("too many SQL variables"). The rows written are the same.
        chunksize=1000,
    )

# Free the large intermediates; `subwords` is left defined, as in the original cell.
del subword_map, subword_embed

In [None]:
# Write an empty CSV file. The file name ("完了" means "done") suggests it acts as a
# completion marker for the subword-embedding export — presumably so progress can be
# checked from outside the notebook; confirm with the author.
pd.DataFrame().to_csv("完了2.csv")

## 検証

In [None]:
# Open a connection to the generated database for the spot checks below;
# it stays open across cells and is closed explicitly in the final cell.
con = sqlite3.connect('EMBEDDINGS.db')

In [None]:
# Spot check: read the first five rows of WORD_EMBED into a DataFrame.
query = "SELECT * FROM WORD_EMBED LIMIT 5"
# Last expression of the cell, so the notebook renders the resulting DataFrame.
pd.read_sql_query(query, con)

In [None]:
# Spot check: read the first five rows of SUBWORD_EMBED into a DataFrame.
query = "SELECT * FROM SUBWORD_EMBED LIMIT 5"
# Last expression of the cell, so the notebook renders the resulting DataFrame.
pd.read_sql_query(query, con)

In [None]:
# Release the connection opened for the verification queries.
con.close()