### prepare data

In [1]:
from datasets import load_dataset, load_from_disk

# Prefer a local copy of the dataset; fall back to downloading it otherwise.
try:
    dataset = load_from_disk("ms_marco")
except FileNotFoundError:
    # Only a missing local directory triggers the download. A bare `except:`
    # would also swallow KeyboardInterrupt and hide genuine loading errors.
    dataset = load_dataset("ms_marco", "v2.1")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Flatten the per-query passage lists of the test split into one list of texts.
passages = [
    passage_text
    for passage_group in dataset["test"]["passages"]
    for passage_text in passage_group["passage_text"]
]

In [3]:
# Total number of passage texts collected from the test split.
len(passages)

1008943

In [4]:
# Materialise the query column of the test split as a plain list.
queries = list(dataset["test"]["query"])

In [5]:
# Number of queries in the test split.
len(queries)

101092

### store in vespa

In [6]:
from vespa.application import Vespa

# Base URL of the Vespa instance used for feeding.
VESPA_URL = "http://localhost:8080/"
# Module-level client; `_upload` looks this global up each time it is called.
vespa_app = Vespa(url=VESPA_URL)

In [7]:
def _upload(schema: str, data_id: str, fields: dict, groupname: str = "all"):

    app = vespa_app

    app.feed_data_point(
        schema=schema,
        namespace="all",
        data_id=data_id,
        fields=fields,
        groupname=groupname
    )

In [8]:
# Cap on how many queries the evaluation loop scores; set to all queries here.
upperbound = len(queries)

In [9]:
import unicodedata
from uuid import uuid4
import time

# Every passage is fed as a chunk of one synthetic document, so a single
# random document id is generated once and shared by all chunks.
doc_id = str(uuid4())

num_passages = len(passages)

for i, e in enumerate(passages):

    # Drop every character whose Unicode category starts with "C"
    # (control, format, ...) before feeding the text.
    chunk = "".join(
        ch for ch in e if unicodedata.category(ch)[0] != "C"
    )

    chunk_id = str(uuid4())  # random id, also used as the Vespa document id

    fields = {
        "id": chunk_id,
        "document_id": doc_id,  # shared random id (not derived from a path)
        "access_group": "",  # not yet implemented
        "chunk_text": chunk,
        # NOTE(review): the tag says "lotte" but the data is ms_marco - confirm.
        "chunking_strategy": "lotte",
        "chunk_no": i,
        "embedding": [0],  # presumably a placeholder vector - TODO confirm
        "last_updated": int(time.time()),  # current time as integer seconds
    }

    _upload(schema="text_chunk", data_id=chunk_id, fields=fields)

    # `i` is zero-based and this runs after the upload, so i + 1 passages
    # have been fed at this point.
    print(f"\rProgress: {i + 1}/{num_passages}", end='')

Progress: 304/1008943

KeyboardInterrupt: 

### top k accuracy; k = 1

In [10]:
# Number of hits requested per query; `query` reads this global.
k = 1

In [16]:
# Rebinds the global client: code that reads `vespa_app` after this cell
# talks to port 8082 instead of 8080.
VESPA_URL = "http://localhost:8082/" # new endpoint
vespa_app = Vespa(url=VESPA_URL)

In [17]:
def query(
    query: str,
    hits: "int | None" = None,
    app=None,
    ):
    """Run a ``userQuery()`` search against Vespa and return the response.

    Args:
        query: Query text sent as the ``query`` request parameter.
        hits: Number of hits to request. Defaults to the module-level ``k``,
            resolved at call time, so existing ``query(q)`` calls still work.
        app: Client object exposing ``query(body=...)``. Defaults to the
            module-level ``vespa_app``, resolved at call time.

    Returns:
        The response object returned by ``app.query``.

    Raises:
        ValueError: If the response reports that the query was not successful.
    """
    # Fall back to the notebook globals only when the caller gave no override,
    # so the function no longer depends on hidden state when arguments are passed.
    if hits is None:
        hits = k
    if app is None:
        app = vespa_app

    response = app.query(
        body={
            "yql": 'select * from sources * where userQuery();',
            "hits": hits,
            "query": query,
            "type": "any",
            "ranking": "default"
        }
    )

    if not response.is_successful():
        raise ValueError(f"Query failed with status code {response.status_code}, url={response.url} response={response.json}")

    return response

In [21]:
# Imported here so this cell also works when the feeding cell was skipped.
import unicodedata

correct = 0
total = 0

for q, p in zip(queries, dataset["test"]["passages"]):
    resp = query(q)

    # Fed chunks had every Unicode "C*" character removed, so apply the same
    # clean-up to the gold passages before comparing for exact equality.
    gold_passages = {
        "".join(ch for ch in text if unicodedata.category(ch)[0] != "C")
        for text in p["passage_text"]
    }

    # A response without "children" means no hits were returned.
    returned_hits = resp.json["root"].get("children", [])

    # Top-k accuracy: the query counts as correct when ANY returned hit is
    # one of its gold passages (checking only the first hit is wrong for k > 1).
    # Hits without a `chunk_text` field are treated as misses.
    if any(
        hit.get("fields", {}).get("chunk_text") in gold_passages
        for hit in returned_hits
    ):
        correct += 1

    total += 1

    print(f"\rProgress: {total}/{upperbound}", end='')

    # Stop once the configured number of queries has been scored.
    if total >= upperbound:
        break

Progress: 2631/101092

KeyboardInterrupt: 

In [22]:
# Number of queries scored by the evaluation loop so far.
total

2631

In [23]:
# Number of scored queries whose returned hit matched a gold passage.
correct

0

In [24]:
# Accuracy as a percentage; reports 0.0 instead of raising ZeroDivisionError
# when no query has been scored yet.
(correct / total) * 100 if total else 0.0

0.0