# Word Embeddings in MySQL

This example uses the official MySQL Connector for Python 3 to store and retrieve varying numbers of word embeddings.

We will use a local MySQL database running as a Docker Container for testing purposes. To start the database run:

```
docker run -ti --rm --name ohmysql -e MYSQL_ROOT_PASSWORD=mikolov -e MYSQL_DATABASE=embeddings -p 3306:3306 mysql:5.7
```

In [None]:
import mysql.connector
import io
import time
import numpy
import plotly

# Dummy Embeddings

For testing purposes we will use randomly generated NumPy arrays as dummy embeddings.

In [None]:
def embeddings(n=1000, dim=300):
    """
    Lazily generate *n* dummy embeddings.

    Each yielded item is a ``(key, vector)`` tuple: *key* is the running
    index (starting at 0) rendered as a string, and *vector* is a
    one-dimensional numpy array of *dim* random floats produced by
    ``numpy.random.rand``.

    Nothing is yielded when *n* is zero or negative.
    """
    for idx in range(n):
        yield (str(idx), numpy.random.rand(dim))

# Conversion Functions

Since we can't just save a NumPy array into the database, we will convert it into a BLOB.

In [None]:
def adapt_array(array):
    """
    Serialise a numpy array into raw bytes suitable for a BLOB column.

    The array is written with numpy.save into an in-memory BytesIO buffer,
    and the complete buffer content is returned.
    """
    buffer = io.BytesIO()
    numpy.save(buffer, array)
    return buffer.getvalue()

def convert_array(blob):
    """
    Rebuild a numpy array from the bytes previously produced by numpy.save.

    The blob is wrapped in an in-memory BytesIO stream (which starts at
    offset 0) and handed to numpy.load.
    """
    return numpy.load(io.BytesIO(blob))

In [None]:
# Open a session on the local MySQL server; the credentials and database name
# match the ones passed to the `docker run` command shown above.
connection = mysql.connector.connect(
    host='127.0.0.1',
    user='root',
    password='mikolov',
    database='embeddings',
)

# Make sure the key/BLOB table exists before any later cell writes to it.
cursor = connection.cursor()
cursor.execute(
    'CREATE TABLE IF NOT EXISTS `embeddings` (`key` TEXT, `embedding` BLOB);'
)
connection.commit()

In [None]:
%%time
# Store each dummy embedding as one row, committing after every single INSERT.
insert_sql = 'INSERT INTO `embeddings` (`key`, `embedding`) VALUES (%s, %s);'
for key, emb in embeddings():
    cursor.execute(insert_sql, (key, adapt_array(emb)))
    connection.commit()

In [None]:
%%time
# Look every key up again and decode the stored BLOB back into a numpy array.
# Only the keys produced by the generator are used; its vectors are ignored.
select_sql = 'SELECT embedding FROM `embeddings` WHERE `key`=%s;'
for key, emb in embeddings():
    cursor.execute(select_sql, (key,))
    row = cursor.fetchone()
    arr = convert_array(row[0])

# Sample some data

To test the I/O we will write and read some data from the database. This may take a while.

In [None]:
# Elapsed seconds per run; index i of each list belongs to counts[i].
write_times = []
read_times = []
counts = [500, 1000, 5000, 10000, 50000, 100000]

for c in counts:
    # Recreate the table so every run starts empty and holds exactly c rows.
    cursor.execute('DROP TABLE IF EXISTS `embeddings`;')
    cursor.execute('CREATE TABLE IF NOT EXISTS `embeddings` (`key` TEXT, `embedding` BLOB);')
    connection.commit()

    # Time writing c embeddings (one commit per INSERT). The generator must be
    # asked for c items; its default of 1000 would make every run identical.
    # perf_counter() is used because it is monotonic, unlike time.time().
    start_time_write = time.perf_counter()
    for key, emb in embeddings(n=c):
        arr = adapt_array(emb)
        cursor.execute('INSERT INTO `embeddings` (`key`, `embedding`) VALUES (%s, %s);', (key, arr))
        connection.commit()
    write_times.append(time.perf_counter() - start_time_write)

    # Time reading the same c keys back and decoding each BLOB.
    # Only the keys from the generator are used here; the vectors are ignored.
    start_time_read = time.perf_counter()
    for key, emb in embeddings(n=c):
        cursor.execute('SELECT embedding FROM `embeddings` WHERE `key`=%s;', (key,))
        data = cursor.fetchone()
        arr = convert_array(data[0])
    read_times.append(time.perf_counter() - start_time_read)

print('DONE')

# Results

In [None]:
plotly.offline.init_notebook_mode(connected=True)

# Scatter plot: elapsed write time on the x axis, embedding count on the y axis.
fig = plotly.graph_objs.Figure(
    data=[
        plotly.graph_objs.Scatter(x=write_times, y=counts, mode='markers'),
    ],
    layout=plotly.graph_objs.Layout(
        title="MySQL Write Times",
        xaxis=dict(title='Time in Seconds'),
        yaxis=dict(title='Embedding Count'),
    ),
)
plotly.offline.iplot(fig, filename='jupyter-basic-scatter')

In [None]:
plotly.offline.init_notebook_mode(connected=True)

# Scatter plot: elapsed read time on the x axis, embedding count on the y axis.
fig = plotly.graph_objs.Figure(
    data=[
        plotly.graph_objs.Scatter(x=read_times, y=counts, mode='markers'),
    ],
    layout=plotly.graph_objs.Layout(
        title="MySQL Read Times",
        xaxis=dict(title='Time in Seconds'),
        yaxis=dict(title='Embedding Count'),
    ),
)
plotly.offline.iplot(fig, filename='jupyter-basic-scatter')