In [1]:
import glob
from pathlib import Path


In [1]:
from SimSent.indexer.index_builder import IndexBuilder
from SimSent.vectorizer.sentence_vectorizer import SentenceVectorizer

In [2]:
sv = SentenceVectorizer()

Loading model: /Users/lukasferrer/Documents/SimSent/SimSent/vectorizer/model/1fb57c3ffe1a38479233ee9853ddd7a8ac8a8c47/
Initializing TF Session...


In [2]:
project_dir = Path('TestProject002/')

In [5]:
ibdr = IndexBuilder(project_dir, sentence_vectorizer=sv)

In [8]:
files = glob.glob('TestProject002/*.tsv')
files

['TestProject002/sample_news_2019-03-05.tsv',
 'TestProject002/sample_news_2019-03-04.tsv',
 'TestProject002/sample_news_2019-03-06.tsv']

In [9]:
for f in files:
    ibdr.tsv_to_index(f)

In [1]:
import os.path as p
import glob
from time import time
from pathlib import Path
from sqlitedict import SqliteDict
from typing import Dict, List, Tuple, Union

import numpy as np

from SimSent.vectorizer.sentence_vectorizer import DockerVectorizer
from SimSent.indexer.deploy_handler import RangeShards
from SimSent.indexer.faiss_cache import faiss_cache

W0424 16:24:33.640244 4672722368 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [3]:
class QueryProcessor:
    QueryReturn = np.array
    DiffScores = List[np.float32]
    VectorIDs = List[np.int64]
    FaissSearch = Tuple[DiffScores, VectorIDs]
    FormattedSearch = List[Tuple[np.int64, np.float32, str]]
    FormattedMultiSearch = Dict[str, FormattedSearch]

    def __init__(self, query_vectorizer: object, index_handler: object, 
                 project_dir: Path, nested: bool = False):
        super().__init__()
        self.vectorizer = query_vectorizer
        self.indexer = index_handler
        
        # Get id-to-sent maps
        get = '*/*.sqlite' if nested else '*.sqlite'
        db_files = glob.glob(p.abspath(project_dir/get))
        self.sent_dbs = dict()
        for f in db_files:
            self.sent_dbs[Path(f).stem] = SqliteDict(f)

    @faiss_cache(32)
    def query_corpus(self, query_str: str, keys: List[str], 
                     k: int = 5, radius: float = 1.0, verbose: bool = True
                     ) -> FormattedMultiSearch:
        """
        Vectorize query -> Search faiss index handler -> Format doc payload.
        Expects to receive only one query per call.
        """
        # Vectorize
        t_v = time()
        query_vector = self.vectorize(query_str)

        # Search
        t_s = time()
        results = self.indexer.search(query_vector, keys, radius=radius)

        t_p = time()
        top_hits = list()
        similar_docs = dict()
        for source, result_set in results.items():
            sorted_set = self.format_results(source, result_set, k)
            top_hits.extend(sorted_set)
            similar_docs[source] = sorted_set
        similar_docs['top_hits'] = sorted(top_hits)[:k]

        t_r = time()
        if verbose:
            print(f'  Query vectorized in --- {t_s - t_v:0.4f}s')
            print(f'  Index searched in ----- {t_p - t_s:0.4f}s')
            print(f'  Payload formatted in -- {t_r - t_p:0.4f}s\n')

        return similar_docs

    def vectorize(self, query: Union[str, List[str]]) -> QueryReturn:
        """
        Use DockerVectorizer for fast Query Vectorization.
        :param query: Text to vectorize
        :return: Formatted query embedding
        """
        if not isinstance(query, list):
            query = [query]
        if len(query) > 1:
            query = query[:1]

        query_vector = self.vectorizer.make_vectors(query)

        if isinstance(query_vector[0], list):
            query_vector = np.array(query_vector, dtype=np.float32)
        return query_vector

    def format_results(self, source: str, result_set: FaissSearch, k: int
                       ) -> FormattedSearch:
        scores, hit_ids = result_set
        sents = list()
        for sent_id in hit_ids:
            sents.append(self.sent_dbs[source][str(sent_id)])
        
        return sorted(zip(scores, hit_ids, sents))[:k]

In [4]:
dv = DockerVectorizer()
rs = RangeShards(project_dir, nprobe=32, get_nested=True)

In [5]:
qp = QueryProcessor(dv, rs, project_dir=project_dir, nested=True)

In [6]:
keys = list(rs.shards.keys())
# keys = [keys[0].replace('_mmap', '_id-sent-map')]
keys

['sample_news_2019-03-04', 'sample_news_2019-03-05', 'sample_news_2019-03-06']

In [9]:
qp.query_corpus('This is a good thing', keys)

  Query vectorized in --- 0.0139s
  Index searched in ----- 0.0217s
  Payload formatted in -- 0.9474s



{'sample_news_2019-03-04': [(0.44618067, 12448, 'GOOD)'),
  (0.44618067, 12462, ': GOOD)'),
  (0.44843227, 6714, '"This has to change.'),
  (0.5184382, 6750, 'Not So Fast!'),
  (0.57668424, 8074, '"It is great to be back.\xa0 ')],
 'sample_news_2019-03-05': [(0.38179156, 17447, 'This need not be so.'),
  (0.45797464, 21663, 'This is simply absurd.'),
  (0.46639562, 21146, "Well I wouldn't go that far."),
  (0.47100002, 20566, 'We probably should look at it.'),
  (0.4732894, 20585, 'Obviously, there is a little way to go')],
 'sample_news_2019-03-06': [(0.3150266,
   15989,
   "It makes things interesting, and that's a good thing."),
  (0.3150266, 16177, "It makes things interesting, and that's a good thing."),
  (0.35274342, 4748, "It's a shame."),
  (0.36258426, 13807, 'Even if it is a little soon.'),
  (0.3919842, 6115, "I mean, it's really...")],
 'top_hits': [(0.3150266,
   15989,
   "It makes things interesting, and that's a good thing."),
  (0.3150266, 16177, "It makes things int

In [None]:
qp.sent_dbs.items()

In [None]:
rs.shards[keys[0]]