In [1]:
from datasets import load_dataset

dataset = 'lifestyle'
datasplit = 'dev'

collection_dataset = load_dataset("colbertv2/lotte_passages", dataset)
collection = [x['text'] for x in collection_dataset[datasplit + '_collection']]

queries_dataset = load_dataset("colbertv2/lotte", dataset)
queries = [x['query'] for x in queries_dataset['search_' + datasplit]]

f'Loaded {len(queries)} queries and {len(collection):,} passages'

  from .autonotebook import tqdm as notebook_tqdm
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


'Loaded 417 queries and 268,893 passages'

In [2]:
nbits = 2   # encode each dimension with 2 bits
doc_maxlen = 300 # truncate passages at 300 tokens
max_id = 10000

index_name = f'{dataset}.{datasplit}.{nbits}bits'

In [3]:
answer_pids = [x['answers']['answer_pids'] for x in queries_dataset['search_' + datasplit]]
filtered_queries = [q for q, apids in zip(queries, answer_pids) if any(x < max_id for x in apids)]

f'Filtered down to {len(filtered_queries)} queries'

'Filtered down to 20 queries'

In [4]:
from vespa.application import Vespa

VESPA_URL = "http://localhost:8080/"
vespa_app = Vespa(url=VESPA_URL)

In [6]:
def _upload(schema: str, data_id: str, fields: dict, groupname: str = "all"):

    app = vespa_app

    app.feed_data_point(
        schema=schema,
        namespace="all",
        data_id=data_id,
        fields=fields,
        groupname=groupname
    )

In [9]:
import unicodedata
from uuid import uuid4
import time

doc_id = str(uuid4()) # random id

for i, e in enumerate(collection[:100]):

    chunk = "".join(
        ch for ch in e if unicodedata.category(ch)[0] != "C"
    )  # remove control characters

    chunk_id = str(uuid4()) # random id

    fields = {
        "id" : chunk_id, 
        "document_id" : doc_id, # document id from path
        "access_group" : "", # not yet implemented
        "chunk_text" : chunk,
        "chunking_strategy" : "lotte",
        "chunk_no" : i,
        "embedding" : [0],
        "last_updated" : int(time.time()) # current time in long int
    }

    _upload(schema="text_chunk", data_id=chunk_id, fields=fields)

In [17]:
def query(
    query: str,
    ):

    response = vespa_app.query(
    body={
            "yql": 'select * from sources * where userQuery();',
            "hits": 1,
            "query": query,
            "type": "any",
            "ranking": "default"
        }
    )
        
    if not response.is_successful():
        raise ValueError(f"Query failed with status code {response.status_code}, url={response.url} response={response.json}")
    
    return response

In [18]:
filtered_queries[13]

'are some cats just skinny?'

In [19]:
query(filtered_queries[13]).json

{'root': {'id': 'toplevel',
  'relevance': 1.0,
  'fields': {'totalCount': 792},
  'coverage': {'coverage': 100,
   'documents': 1190,
   'full': True,
   'nodes': 3,
   'results': 3,
   'resultsFull': 3},
  'children': [{'id': 'id:all:text_chunk:g=all:15b981ae-be9f-4356-ba87-d4efcb1b52b5',
    'relevance': 0.23449445118865728,
    'source': 'search_content',
    'fields': {'sddocname': 'text_chunk',
     'documentid': 'id:all:text_chunk:g=all:15b981ae-be9f-4356-ba87-d4efcb1b52b5',
     'id': '15b981ae-be9f-4356-ba87-d4efcb1b52b5',
     'document_id': '234c012d-36b3-45eb-9a0e-f32f033ca015',
     'chunk_text': 'Some cats are horizontal scratchers instead of vertical ones, they like pulling and yanking at carpet and stuff rather than, say, a couch or something, if they dont have something appropriate. You can get scratchers that are just corrugated cardboard stuck together in flatter box shapes that you set on the floor - I had a cat who was determined to yank up all my carpets, and then