# Word Embeddings in MongoDB

This example uses PyMongo, the official MongoDB driver for Python 3, to store and retrieve varying numbers of word embeddings.

We will use a local Mongo database running as a Docker Container for testing purposes. To start the database run:

```
docker run -it --rm --name ohmongo -e MONGO_INITDB_ROOT_USERNAME=mikolov -e MONGO_INITDB_ROOT_PASSWORD=word2vec -p 27017:27017 mongo:4
```

In [None]:
import pymongo
import bson
import io
import time
import numpy
import plotly

# Dummy Embeddings

For testing purposes we will use randomly generated NumPy arrays as dummy embeddings.

In [None]:
def embeddings(n=1000, dim=300):
    """
    Yield *n* dummy embeddings as ``(key, vector)`` tuples.

    The key is the running index (``0`` .. ``n - 1``) converted to a string;
    the vector is a fresh ``numpy.random.rand(dim)`` array. Nothing is
    yielded when *n* is zero or negative.

    :param int n: number of tuples to yield
    :param int dim: length of every generated vector
    :return: generator of ``(str, numpy.ndarray)`` tuples
    """
    for idx in range(n):
        yield (str(idx), numpy.random.rand(dim))

# Conversion Functions

Since we can't just save a NumPy array into the database, we will convert it into a bson.binary.Binary.

In [None]:
def adapt_array(array):
    """
    Serialize a NumPy array into a BSON binary value.

    The array is written with numpy.save into an in-memory io.BytesIO buffer,
    and the buffer's bytes are wrapped in a bson.binary.Binary.

    :param numpy.array array: NumPy array to turn into BLOB
    :return: NumPy array as bson.binary.Binary
    :rtype: bson.binary.Binary
    """
    # Only the ``io`` module is imported in this notebook, so the class has to
    # be reached through it; a bare ``BytesIO`` raises NameError.
    out = io.BytesIO()
    numpy.save(out, array)

    # getvalue() returns the complete buffer regardless of stream position.
    return bson.binary.Binary(out.getvalue())


def convert_array(blob):
    """
    Deserialize a binary blob back into a NumPy array.

    The blob is wrapped in an in-memory io.BytesIO stream and read back with
    numpy.load.

    :param bytes blob: BLOB containing a NumPy array
    :return: One steaming hot NumPy array
    :rtype: numpy.array
    """
    # Only the ``io`` module is imported in this notebook, so the class has to
    # be reached through it; a bare ``BytesIO`` raises NameError.
    # A freshly created BytesIO already starts at position 0, so no seek is
    # needed before loading.
    return numpy.load(io.BytesIO(blob))

In [None]:
# Credentials and port match the `docker run` command shown at the top of
# this notebook (user "mikolov", password "word2vec", port 27017).
uri = 'mongodb://mikolov:word2vec@localhost:27017'
client = pymongo.MongoClient(uri)

# Handles for the database and the collection used by all cells below.
database = client.get_database('embedding_db')
embeddings_collection = database.get_collection('embeddings')

In [None]:
%%time
# Store the default batch of dummy embeddings, one document per vector.
for key, emb in embeddings():
    # Convert the array to a BSON binary value before building the document.
    arr = adapt_array(emb)
    obj = dict(key=key, emb=arr)
    embeddings_collection.insert_one(obj)

In [None]:
%%time
# Read every stored embedding back by key and check that it decodes to a
# NumPy array. Only the keys from embeddings() are used here; the freshly
# generated vectors are discarded.
for key, _ in embeddings():
    obj = embeddings_collection.find_one({'key': key})
    if obj is None:
        # find_one returns None when nothing matches; fail with a clear
        # message instead of a TypeError on None['emb'].
        raise KeyError('no embedding stored for key %r' % key)
    emb = convert_array(obj['emb'])
    assert isinstance(emb, numpy.ndarray)

# Sample some data

To test the I/O we will write and read some data from the database. This may take a while.

In [None]:
# Total seconds spent writing / reading for each entry in `counts`
# (same order as `counts`); consumed by the plotting cells below.
write_times = []
read_times = []
counts = [500, 1000, 5000, 10000, 50000, 100000]

for c in counts:
    # Start every round from an empty collection so only `c` documents exist.
    embeddings_collection.delete_many({})

    # perf_counter() is the clock intended for measuring short intervals;
    # time.time() follows the wall clock and can jump if it is adjusted.
    start_time_write = time.perf_counter()
    for key, emb in embeddings(c):
        arr = adapt_array(emb)
        obj = {'key': key, 'emb': arr}
        embeddings_collection.insert_one(obj)
    write_times.append(time.perf_counter() - start_time_write)

    # NOTE(review): no index on 'key' is created anywhere in the visible
    # code, so each find_one presumably scans the collection - confirm that
    # this is what the read benchmark is meant to measure.
    start_time_read = time.perf_counter()
    for key, _ in embeddings(c):
        obj = embeddings_collection.find_one({'key': key})
        if obj is None:
            # find_one returns None when nothing matches; fail with a clear
            # message instead of a TypeError on None['emb'].
            raise KeyError('no embedding stored for key %r' % key)
        emb = convert_array(obj['emb'])
        assert isinstance(emb, numpy.ndarray)
    read_times.append(time.perf_counter() - start_time_read)

print('DONE')

# Results

In [None]:
# Scatter plot of the write benchmark: elapsed seconds on the x axis,
# number of embeddings on the y axis.
plotly.offline.init_notebook_mode(connected=True)

trace = plotly.graph_objs.Scatter(x=write_times, y=counts, mode='markers')
layout = plotly.graph_objs.Layout(
    title="MongoDB Write Times",
    xaxis={'title': 'Time in Seconds'},
    yaxis={'title': 'Embedding Count'},
)
data = [trace]
fig = plotly.graph_objs.Figure(data=data, layout=layout)
plotly.offline.iplot(fig, filename='jupyter-basic-scatter')

In [None]:
# Scatter plot of the read benchmark: elapsed seconds on the x axis,
# number of embeddings on the y axis.
plotly.offline.init_notebook_mode(connected=True)

trace = plotly.graph_objs.Scatter(x=read_times, y=counts, mode='markers')
layout = plotly.graph_objs.Layout(
    title="MongoDB Read Times",
    xaxis={'title': 'Time in Seconds'},
    yaxis={'title': 'Embedding Count'},
)
data = [trace]
fig = plotly.graph_objs.Figure(data=data, layout=layout)
plotly.offline.iplot(fig, filename='jupyter-basic-scatter')