In [1]:
import os
import json 
import glob
from pathlib import Path

In [2]:
n = 1
large_encoder = True

l = 'Large' if large_encoder else 'Small'
project_dir = Path(f'TestProject{l}{n:03d}/')
os.makedirs(project_dir, exist_ok=True)

In [3]:
def get_sents(file):
    with open(file, 'r') as f:
        for line in f:
            doc = json.loads(line)
            sents = list()
            if isinstance(doc['knowledge_graph']['title'][0]['value'], str):
                sents.append(doc['knowledge_graph']['title'][0]['value'])
                sents.extend(doc['split_sentences'])
                for sent in sents:
                    sent = str(sent).replace('\n', ' ').replace('\t', ' ')
                    while '  ' in sent:
                        sent = sent.replace('  ', ' ')
                    yield sent

In [4]:
jl_files = ['Data/sample_news_2019-03-04.jl', 'Data/sample_news_2019-03-05.jl', 'Data/sample_news_2019-03-06.jl']
jl_files

['Data/sample_news_2019-03-04.jl',
 'Data/sample_news_2019-03-05.jl',
 'Data/sample_news_2019-03-06.jl']

In [5]:
for jl in jl_files:
    fname = Path(jl).stem
    dump_tsv = project_dir/f'{fname}.tsv'
    with open(dump_tsv, 'x') as dt:
        sent_gen = get_sents(jl)
        for i, sent in enumerate(sent_gen):
            dt.write(f'{i}\t{sent}\n')

In [6]:
tsv_files = glob.glob(f'{project_dir}/*.tsv')
tsv_files

['TestProjectLarge001/sample_news_2019-03-04.tsv',
 'TestProjectLarge001/sample_news_2019-03-06.tsv',
 'TestProjectLarge001/sample_news_2019-03-05.tsv']

In [7]:
from SimSent.indexer.index_builder import IndexBuilder
from SimSent.vectorizer.sentence_vectorizer import SentenceVectorizer

In [8]:
sv = SentenceVectorizer(large=large_encoder)

Loading model: /home/lukas/GitHub/SimSent/SimSent/vectorizer/model/96e8f1d3d4d90ce86b2db128249eb8143a91db73/
Initializing TF Session...


In [9]:
ibdr = IndexBuilder(project_dir, sentence_vectorizer=sv)

In [10]:
for tsv in tsv_files:
    ibdr.tsv_to_index(tsv)

In [11]:
from SimSent.server.service_handler import QueryHandler

from SimSent.vectorizer.sentence_vectorizer import DockerVectorizer
from SimSent.indexer.deploy_handler import RangeShards

In [12]:
# Ensure correct docker is running the vectorizer container in background 
dv = DockerVectorizer(large=large_encoder)
rs = RangeShards(project_dir, nprobe=8, get_nested=True)

In [13]:
qp = QueryHandler(dv, rs, project_dir=project_dir, get_nested=True)

In [14]:
keys = list(rs.shards.keys())
keys

['sample_news_2019-03-04', 'sample_news_2019-03-05', 'sample_news_2019-03-06']

In [24]:
query = 'Tesla has revolutionized the automotive industry'

In [25]:
results = qp.query_corpus(query, keys)

  Query vectorized in --- 0.0296s
  Index searched in ----- 0.0273s
  Payload formatted in -- 0.0035s



In [26]:
for Key, Vals in results.items():
    print(f'\n{Key}:')
    for v in Vals:
        print(f'   * ID:    {v[1]}\n'
              f'   * Score: {v[0]}\n'
              f'   * Text:  {v[2]}\n')


sample_news_2019-03-04:
   * ID:    7041
   * Score: 0.4920949637889862
   * Text:  With a recent Bloomberg report forecasting electric and internal combustion engine price parity by 2024, ALYI is positioning to expand into electric cars. 

   * ID:    19359
   * Score: 0.494681179523468
   * Text:  AUDI AG is among the most successful luxury automotive brands, delivering about 1.812 million vehicles globally in 2018.

   * ID:    7038
   * Score: 0.5557064414024353
   * Text:  The targeted acquisition is one of several potential acquisitions the company is developing to expand from its current electric motorcycle business into producing electric cars.

   * ID:    10468
   * Score: 0.6566527485847473
   * Text:  The Company revolutionized solar with its microinverter technology and produces the world's only truly integrated solar plus storage solution.

   * ID:    6178
   * Score: 0.6658201217651367
   * Text:  About HydrogenicsHydrogenics Corporation is a world leader in engineerin