In [3]:
from mars.db import collections
from mars.sentence_embeddings import embedd_sentences, EmbeddingType

In [4]:
%%time

from typing import Iterator

EMBEDDING = "embedding"


def fetch_batches_until_empty(
    collection, filter: dict, batch_size=1000
) -> Iterator[list]:
    """Yield batches of documents matching ``filter`` until none are left.

    Every round issues the same ``collection.fetchByExample`` query (there is
    no offset), so the generator only makes progress if the documents it
    yields stop matching ``filter`` before the next round, e.g. because the
    consumer updates them.

    Args:
        collection: object exposing ``fetchByExample(example, batchSize, limit=...)``.
        filter: example dict passed straight through to ``fetchByExample``.
        batch_size: used both as the query batch size and as the ``limit``.

    Yields:
        Non-empty lists of the documents returned by one query.
    """
    while True:
        print("Fetching next batch...")
        batch = list(
            collection.fetchByExample(filter, batch_size, limit=batch_size)
        )
        print("Fetched!")

        # An empty result means nothing matches the filter any more.
        if not batch:
            print("Finished")
            return

        yield batch

def fetch_in_batches(collection, batch_size=100) -> Iterator[list]:
    """Yield all documents of ``collection`` in consecutive batches.

    Each round calls ``collection.fetchAll(limit=batch_size, skip=skip)`` and
    then advances ``skip`` by the number of documents received, so successive
    batches cover successive windows of the collection. Iteration stops at
    the first empty batch.

    Args:
        collection: object exposing ``fetchAll(limit=..., skip=...)`` that
            returns an iterable of documents.
        batch_size: maximum number of documents requested per round.

    Yields:
        Non-empty lists of documents, at most ``batch_size`` long.
    """
    skip = 0
    while True:
        print("Fetching next batch...")
        results = list(collection.fetchAll(limit=batch_size, skip=skip))
        print("Fetched!")

        if not results:
            return

        yield results
        # Move the window forward; without this the same first batch would be
        # fetched on every round and the generator would never finish.
        skip += len(results)

CPU times: user 40 µs, sys: 0 ns, total: 40 µs
Wall time: 44.1 µs


In [None]:
%%time
# Compute LABSE embeddings for sentence documents matching {EMBEDDING: None}
# and write each vector back onto its document.
for sents_docs in fetch_batches_until_empty(collections.sentences, {EMBEDDING: None}, 5000):
    sents = [sent_doc["sentence"] for sent_doc in sents_docs]
    # All sentences of the batch are embedded in a single call.
    embeddings = embedd_sentences(sents, EmbeddingType.LABSE)

    # Pair each embedding with its document by position.
    for embedding, sent_doc in zip(embeddings, sents_docs):
        sent_doc[EMBEDDING] = list(embedding.numpy())
        sent_doc.patch()
    # NOTE(review): this `break` stops after the first batch, so at most 5000
    # documents are processed per run — looks like a trial-run leftover; confirm.
    break

Fetching next batch...
Fetched!


2021-10-27 02:11:53.601121: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 3932160000 exceeds 10% of free system memory.
2021-10-27 02:11:58.265466: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 3932160000 exceeds 10% of free system memory.
2021-10-27 02:12:12.774280: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 7864320000 exceeds 10% of free system memory.
2021-10-27 02:12:32.832580: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 7864320000 exceeds 10% of free system memory.
2021-10-27 02:13:19.095794: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 3932160000 exceeds 10% of free system memory.


In [None]:
%%time

# Compute LABSE embeddings for sentence documents fetched without any filter
# (via fetch_in_batches) and write each vector back onto its document.
for sents_docs in fetch_in_batches(collections.sentences, 10000):
    sents = [sent_doc["sentence"] for sent_doc in sents_docs]
    # All sentences of the batch are embedded in a single call.
    embeddings = embedd_sentences(sents, EmbeddingType.LABSE)

    # Pair each embedding with its document by position.
    for embedding, sent_doc in zip(embeddings, sents_docs):
        sent_doc[EMBEDDING] = list(embedding.numpy())
        sent_doc.patch()
    # NOTE(review): this `break` stops after the first batch, so at most 10000
    # documents are processed per run — looks like a trial-run leftover; confirm.
    break

Fetching next batch...
Fetched!


2021-10-27 02:03:28.515598: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 3932160000 exceeds 10% of free system memory.
2021-10-27 02:03:28.860435: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 3932160000 exceeds 10% of free system memory.
2021-10-27 02:03:29.374074: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 3932160000 exceeds 10% of free system memory.
2021-10-27 02:03:31.069541: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 3932160000 exceeds 10% of free system memory.
2021-10-27 02:03:32.026142: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 3932160000 exceeds 10% of free system memory.
