In [2]:
import os
import json 
import glob
from pathlib import Path

In [3]:
n = 1
large_encoder = True

l = 'Large' if large_encoder else 'Small'
project_dir = Path(f'TestProject{l}{n:3d}/')
os.makedirs(project_dir, exist_ok=True)

In [3]:
def get_sents(file):
    with open(file, 'r') as f:
        for line in f:
            doc = json.loads(line)
            sents = list()
            if isinstance(doc['knowledge_graph']['title'][0]['value'], str):
                sents.append(doc['knowledge_graph']['title'][0]['value'])
                sents.extend(doc['split_sentences'])
                for sent in sents:
                    sent = str(sent).replace('\n', ' ').replace('\t', ' ')
                    while '  ' in sent:
                        sent = sent.replace('  ', ' ')
                    yield sent

In [40]:
jl_files = ['Data/sample_news_2019-03-04.jl', 'Data/sample_news_2019-03-05.jl', 'Data/sample_news_2019-03-06.jl']
jl_files

['Data/sample_news_2019-03-04.jl',
 'Data/sample_news_2019-03-05.jl',
 'Data/sample_news_2019-03-06.jl']

In [5]:
for jl in jl_files:
    fname = Path(jl).stem
    dump_tsv = project_dir/f'{fname}.tsv'
    with open(dump_tsv, 'x') as dt:
        sent_gen = get_sents(jl)
        for i, sent in enumerate(sent_gen):
            dt.write(f'{i}\t{sent}\n')

In [6]:
tsv_files = glob.glob(f'{project_dir}/*.tsv')
tsv_files

['TestProject005small/sample_news_2019-03-05.tsv',
 'TestProject005small/sample_news_2019-03-04.tsv',
 'TestProject005small/sample_news_2019-03-06.tsv']

In [25]:
from SimSent.indexer.index_builder import IndexBuilder
from SimSent.vectorizer.sentence_vectorizer import SentenceVectorizer

In [8]:
sv = SentenceVectorizer(large=large_encoder)

Loading model: /Users/lukasferrer/Documents/SimSent/SimSent/vectorizer/model/1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47/
Initializing TF Session...


In [9]:
ibdr = IndexBuilder(project_dir, sentence_vectorizer=sv)

In [10]:
for tsv in tsv_files:
    ibdr.tsv_to_index(tsv)

In [4]:
from SimSent.server.service_handler import QueryHandler

from SimSent.vectorizer.sentence_vectorizer import DockerVectorizer
from SimSent.indexer.deploy_handler import RangeShards

W0425 17:09:09.200798 4507502016 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [5]:
# Ensure correct docker is running the vectorizer container in background 
dv = DockerVectorizer(large=large_encoder)
rs = RangeShards(project_dir, nprobe=8, get_nested=True)

In [6]:
qp = QueryHandler(dv, rs, project_dir=project_dir, get_nested=True)

In [7]:
keys = list(rs.shards.keys())
keys

['sample_news_2019-03-04', 'sample_news_2019-03-05', 'sample_news_2019-03-06']

In [14]:
query = 'Tesla increases its value every year'

In [15]:
results = qp.query_corpus(query, keys)

  Query vectorized in --- 0.0386s
  Index searched in ----- 0.0071s
  Payload formatted in -- 0.0089s



In [16]:
for Key, Vals in results.items():
    print(f'\n{Key}:')
    for v in Vals:
        print(f'   * ID:    {v[1]}\n'
              f'   * Score: {v[0]}\n'
              f'   * Text:  {v[2]}\n')


sample_news_2019-03-04:
   * ID:    7040
   * Score: 0.4747331440448761
   * Text:  The global electric vehicle market is expected to reach over $500 billion by 2025.  

   * ID:    7041
   * Score: 0.48845478892326355
   * Text:  With a recent Bloomberg report forecasting electric and internal combustion engine price parity by 2024, ALYI is positioning to expand into electric cars. 

   * ID:    19359
   * Score: 0.7073149681091309
   * Text:  AUDI AG is among the most successful luxury automotive brands, delivering about 1.812 million vehicles globally in 2018.

   * ID:    7223
   * Score: 0.7136830687522888
   * Text:  The Corporation and Ion Energy will issue a press release announcing closing of the Initial Financing which is anticipated to occur on or about March 19, 2019. 

   * ID:    7858
   * Score: 0.7144827842712402
   * Text:  January's realized condensate pricing, including its hedging gains was almost three times higher than December, and more consistent with realized 