### prepare data

In [1]:
from datasets import load_dataset, load_from_disk

try:
    dataset = load_from_disk("ms_marco")
except:
    dataset = load_dataset("ms_marco", "v2.1")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
passages = [text for x in dataset["test"]["passages"] for text in x["passage_text"]]

In [3]:
len(passages)

1008943

In [4]:
queries = [x for x in dataset["test"]["query"]]

In [5]:
len(queries)

101092

### store in vespa

In [23]:
from vespa.application import Vespa

VESPA_URL = "http://localhost:8080/"
vespa_app = Vespa(url=VESPA_URL)

In [24]:
def _upload(schema: str, data_id: str, fields: dict, groupname: str = "all"):

    vespa_app.feed_data_point(
                schema=schema,
                namespace="all",
                data_id=data_id,
                fields=fields,
                groupname= "all"
            )

In [25]:
upperbound = len(queries)
upperbound = 10000

In [26]:
import unicodedata
from uuid import uuid4
import time

doc_id = str(uuid4()) # random id

num_passages = len(passages)

for i, e in enumerate(passages):

    chunk = "".join(
        ch for ch in e if unicodedata.category(ch)[0] != "C"
    )  # remove control characters

    chunk_id = str(uuid4()) # random id

    fields = {
        "id" : chunk_id, 
        "document_id" : doc_id, # document id from path
        "access_group" : "", # not yet implemented
        "chunk_text" : chunk,
        "chunking_strategy" : "lotte",
        "chunk_no" : i,
        "last_updated" : int(time.time()) # current time in long int
    }

    _upload(schema="text_chunk", data_id=chunk_id, fields=fields)

    print(f"\rProgress: {i}/{num_passages}", end='')

Progress: 125287/1008943

KeyboardInterrupt: 

### top k accuracy; k = 1

In [27]:
k = 1

In [28]:
VESPA_URL = "http://localhost:8082/" # new enp
vespa_app = Vespa(url=VESPA_URL)

In [29]:
def query(
    query: str = None,
    yql: str = "select id,chunk_text from text_chunk where userQuery() or ({targetHits:10}nearestNeighbor(embedding,q))",
    ranking: str = "colbert"
    ):

    # eg yql
    # yql = f'select * from sources * where chunk_text contains "{query}"'
    # yql = 'select * from sources * where sddocname contains "text_chunk" limit 10;'

    response = vespa_app.query(
        yql=yql,
        groupname="all",
        ranking=ranking,
        query=query,
        body={
            "presentation.format.tensors": "short-value",
            f"input.query(q)": 'embed(e5, "{query}")',
            f"input.query(qt)": 'embed(colbert, "{query}")',
        },
    )
        
    if not response.is_successful():
        raise ValueError(f"Query failed with status code {response.status_code}, url={response.url} response={response.json}")
    
    return response.json

In [30]:
correct = 0
total = 0

for q, p in zip(queries, dataset["test"]["passages"]):
    resp = query(q)

    if "children" in resp["root"].keys():
        if resp["root"]["children"][0]["fields"]["chunk_text"] in p["passage_text"]:
            correct += 1
            print()
    
    total += 1
    
    print(f"\rProgress: {total}/{upperbound}", end='')

    if total == upperbound:
        break

Progress: 257/10000
Progress: 10000/10000

In [31]:
total

10000

In [32]:
correct

1

In [33]:
(correct / total) * 100

0.01

In [19]:
query("What is GQA?")

{'root': {'id': 'toplevel',
  'relevance': 1.0,
  'fields': {'totalCount': 37},
  'coverage': {'coverage': 100,
   'documents': 174,
   'full': True,
   'nodes': 2,
   'results': 1,
   'resultsFull': 1},
  'children': [{'id': 'index:search_content/1/a181a60355df655127f968ad',
    'relevance': 63.606340408325195,
    'source': 'search_content',
    'fields': {'matchfeatures': {'cos_sim': 0.607392739468672,
      'max_sim': 63.606340408325195},
     'id': 'defbf989-1cac-43da-9549-a08a0948f457',
     'chunk_text': 'Mistral 7B leverages grouped-query attention (GQA) [1], and sliding window attention (SWA) [6, 3]. GQA signiﬁcantly accelerates the inference speed, and also reduces the memory requirement during decoding, allowing for higher batch sizes hence higher throughput, a crucial factor for real-time applications. In addition, SWA is designed to handle longer sequences more effectively at a reduced computational cost, thereby alleviating a common limitation in LLMs. These attention mec