In [65]:
from luminoso_api import V5LuminosoClient as LuminosoClient
from pack64 import unpack64

import json
import numpy as np

In [66]:
def get_all_docs(client, batch_size=25000):
    """Download every document available through ``client``, page by page.

    The 'docs' endpoint is queried repeatedly, each time asking for up to
    ``batch_size`` documents starting at the number already collected, until
    a page comes back with an empty 'result'.

    Parameters
    ----------
    client
        Object exposing ``get(path, **params)`` whose response for 'docs'
        is a mapping with a 'result' list.
    batch_size : int, optional
        Maximum number of documents requested per call (default 25000,
        the value this notebook has always used).

    Returns
    -------
    list
        All documents, in the order the endpoint returned them.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    docs = []
    while True:
        # The running total doubles as the offset of the next page.
        page = client.get('docs', limit=batch_size, offset=len(docs))['result']
        if not page:
            # An empty page is the only end-of-data signal relied upon here.
            return docs
        docs.extend(page)

In [67]:
# API path of the project this notebook exports.
project_path = 'projects/prrq724s'
client = LuminosoClient.connect(project_path)

# Pull the complete document list up front; the later cells only read it.
docs = get_all_docs(client)

In [68]:
# Build the topic list from the project's saved concepts: keep each name and
# unpack its vector into plain Python floats, then order the topics by
# case-insensitive name.
topics = []
for saved in client.get('concepts/saved', include_science=True):
    topics.append({
        'name': saved['name'],
        'vector': [float(x) for x in unpack64(saved['vector'])],
    })
topics.sort(key=lambda topic: topic['name'].lower())

In [69]:
# Build one output row per document. `terms` caches term_id -> concept name
# across documents so each distinct term is looked up through the API once.
table = []
terms = {}
for doc in docs:
    doc_vector = [float(x) for x in unpack64(doc['vector'])]
    record = {'text': doc['text'], 'title': doc['title']}

    # Resolve every term occurrence to a concept name, querying the API with
    # the matched slice of the document text the first time a term_id is seen.
    term_names = []
    for term in doc['terms']:
        surface = doc['text'][term['start']:term['end']]
        term_id = term['term_id']
        if term_id not in terms:
            selector = {"type": "specified", "concepts": [{"texts": [surface]}]}
            lookup = client.get('concepts', concept_selector=selector)
            terms[term_id] = lookup['result'][0]['name']
        term_names.append(terms[term_id])
    record['terms'] = term_names

    # Metadata entries become numbered "subset N" columns, ordered by name.
    ordered_metadata = sorted(doc['metadata'], key=lambda field: field['name'])
    for position, field in enumerate(ordered_metadata):
        record['subset %d' % position] = field

    # Score the document against every topic with a dot product. The best
    # topic stays None (score 0) unless some score is strictly positive;
    # ties keep the topic that comes first in `topics`.
    best_name, best_score = None, 0
    scored = []
    for topic in topics:
        similarity = np.dot(topic['vector'], doc_vector)
        if similarity > best_score:
            best_name, best_score = topic['name'], similarity
        scored.append({'name': topic['name'], 'score': similarity})
    record['topic'] = best_name
    record['topic score'] = best_score

    # Also emit every topic as a numbered "topic N" column, highest score first.
    scored.sort(key=lambda entry: entry['score'], reverse=True)
    for rank, entry in enumerate(scored):
        record['topic %d' % rank] = entry

    table.append(record)

In [70]:
# Write the assembled rows to disk. The context manager closes (and thereby
# flushes) the file as soon as the dump finishes, instead of leaving the
# handle open until it happens to be garbage-collected.
with open('boa_tableau_json_test.json', 'w') as out_file:
    json.dump(table, out_file)