In [1]:
# See also the Vespa "msmarco-ranking" sample application:
# https://github.com/vespa-engine/sample-apps/tree/master/msmarco-ranking

In [2]:
from vespa.package import Schema, Document, Field, FieldSet
# Schema "pdf": one Vespa document per text chunk of a PDF, in streaming mode.

# Fields carried by each chunk record.
chunk_fields = [
    Field(name="id", type="string", indexing=["summary"]),
    Field(name="title", type="string", indexing=["summary", "index"]),
    Field(name="url", type="string", indexing=["summary", "index"]),
    Field(name="authors", type="array<string>", indexing=["summary", "index"]),
    Field(name="metadata", type="map<string,string>", indexing=["summary", "index"]),
    Field(name="page", type="int", indexing=["summary", "attribute"]),
    Field(name="chunkno", type="int", indexing=["summary", "attribute"]),
    Field(name="chunk", type="string", indexing=["summary", "index"]),
]

# Tensor fields declared with is_document_field=False. Their indexing
# expressions build the input text from `title` and `chunk` and pass it to the
# embedder components with ids "e5" and "colbert".
embedding_field = Field(
    name="embedding",
    type="tensor<bfloat16>(x[384])",
    indexing=['"passage: " . (input title || "") . " " . (input chunk || "")', "embed e5", "attribute"],
    attribute=["distance-metric: angular"],
    is_document_field=False,
)
colbert_field = Field(
    name="colbert",
    type="tensor<int8>(dt{}, x[16])",
    indexing=['(input title || "") . " " . (input chunk || "")', "embed colbert", "attribute"],
    is_document_field=False,
)

pdf_schema = Schema(
    name="pdf",
    mode="streaming",
    document=Document(fields=chunk_fields + [embedding_field, colbert_field]),
    # The "default" fieldset covers the title and chunk text.
    fieldsets=[FieldSet(name="default", fields=["title", "chunk"])],
)

In [3]:
from vespa.package import ApplicationPackage, Component, Parameter

vespa_app_name = "search"

# Embedder component with id "e5"; the schema's `embedding` field refers to it
# via "embed e5".
e5_embedder = Component(
    id="e5",
    type="hugging-face-embedder",
    parameters=[
        Parameter("transformer-model", {"url": "https://huggingface.co/intfloat/e5-small-v2/resolve/main/model.onnx"}),
        Parameter("tokenizer-model", {"url": "https://huggingface.co/intfloat/e5-small-v2/raw/main/tokenizer.json"}),
    ],
)

# Embedder component with id "colbert"; the schema's `colbert` field refers to
# it via "embed colbert".
colbert_embedder = Component(
    id="colbert",
    type="colbert-embedder",
    parameters=[
        Parameter("transformer-model", {"url": "https://huggingface.co/colbert-ir/colbertv2.0/resolve/main/model.onnx"}),
        Parameter("tokenizer-model", {"url": "https://huggingface.co/colbert-ir/colbertv2.0/raw/main/tokenizer.json"}),
    ],
)

# Application package: the pdf schema plus the two embedder components.
vespa_application_package = ApplicationPackage(
    name=vespa_app_name,
    schema=[pdf_schema],
    components=[e5_embedder, colbert_embedder],
)

In [4]:
from vespa.package import RankProfile, Function, FirstPhaseRanking, SecondPhaseRanking

# Query-side tensors declared for the "colbert" rank profile.
query_inputs = [
    ("query(q)", "tensor<float>(x[384])"),
    ("query(qt)", "tensor<float>(qt{}, x[128])"),
]

# unpack: applies unpack_bits to the stored int8 `colbert` attribute
# (presumably expanding x[16] int8 cells to the x[128] used by query(qt) -
# TODO confirm against the Vespa unpack_bits documentation).
unpack_fn = Function(name="unpack", expression="unpack_bits(attribute(colbert))")

# cos_sim: closeness rank feature for the `embedding` field.
cos_sim_fn = Function(name="cos_sim", expression="closeness(field, embedding)")

# max_sim: for each query token (qt) take the max over document tokens (dt)
# of the dot product along x, then sum those maxima over qt.
max_sim_fn = Function(
    name="max_sim",
    expression="""
                sum(
                    reduce(
                        sum(
                            query(qt) * unpack() , x
                        ),
                        max, dt
                    ),
                    qt
                )
            """,
)

# Two-phase ranking: cos_sim first, max_sim in the second phase; both values
# are also returned as match features.
colbert = RankProfile(
    name="colbert",
    inputs=query_inputs,
    functions=[unpack_fn, cos_sim_fn, max_sim_fn],
    first_phase=FirstPhaseRanking(expression="cos_sim"),
    second_phase=SecondPhaseRanking(expression="max_sim"),
    match_features=["max_sim", "cos_sim"],
)
pdf_schema.add_rank_profile(colbert)

In [5]:
from vespa.deployment import VespaDocker

# Deploy the application package through VespaDocker. `app` is the handle the
# later feed and query cells call (`app.feed_iterable`, `app.query`).
vespa_docker = VespaDocker()
app = vespa_docker.deploy(application_package=vespa_application_package)



Waiting for configuration server, 0/300 seconds...
Waiting for configuration server, 5/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 0/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Waiting for application status, 5/300 seconds...
Using plain http against endpoint http://localhost:8080/ApplicationStatus
Application is up!
Finished deployment.


In [6]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Splitter applied to each loaded PDF page (via `transform_documents`) to cut
# it into the text chunks that become individual Vespa documents.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1024, #chars, not llm tokens
    chunk_overlap  = 0,
    length_function = len,
    is_separator_regex = False,
)

In [7]:
def sample_pdfs():
    """Return the sample papers to index.

    Returns:
        A list of dicts, one per paper, each with the keys ``title``, ``url``
        (link to the PDF) and ``authors`` (a single comma-separated string).
    """
    return [
        {
            "title": "ColBERTv2: Effective and Efficient Retrieval via Lightweight Late Interaction",
            "url": "https://arxiv.org/pdf/2112.01488.pdf",
            "authors": "Keshav Santhanam, Omar Khattab, Jon Saad-Falcon, Christopher Potts, Matei Zaharia"
        },
        {
            "title": "ColBERT: Efficient and Effective Passage Search via Contextualized Late Interaction over BERT",
            "url": "https://arxiv.org/pdf/2004.12832.pdf",
            "authors": "Omar Khattab, Matei Zaharia"
        },
        {
            "title": "On Approximate Nearest Neighbour Selection for Multi-Stage Dense Retrieval",
            "url": "https://arxiv.org/pdf/2108.11480.pdf",
            "authors": "Craig Macdonald, Nicola Tonellotto"
        },
        {
            "title": "A Study on Token Pruning for ColBERT",
            "url": "https://arxiv.org/pdf/2112.06540.pdf",
            # The name was previously stored mis-encoded (UTF-8 bytes decoded as
            # Latin-1); it is written here with the intended "é".
            "authors": "Carlos Lassance, Maroua Maachou, Joohee Park, Stéphane Clinchant"
        },
        {
            "title": "Pseudo-Relevance Feedback for Multiple Representation Dense Retrieval",
            "url": "https://arxiv.org/pdf/2106.11251.pdf",
            "authors": "Xiao Wang, Craig Macdonald, Nicola Tonellotto, Iadh Ounis"
        },
    ]

In [8]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# NOTE(review): this cell repeats, with identical arguments, the
# `text_splitter` definition from the earlier splitter cell; one of the two
# copies can be dropped.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1024, #chars, not llm tokens
    chunk_overlap  = 0,
    length_function = len,
    is_separator_regex = False,
)

In [9]:
import hashlib
import unicodedata
def remove_control_characters(s):
    """Return ``s`` without characters whose Unicode category starts with "C".

    That covers the control, format, surrogate, private-use and unassigned
    categories, so newlines and tabs are removed as well.
    """
    kept = []
    for char in s:
        if not unicodedata.category(char).startswith("C"):
            kept.append(char)
    return "".join(kept)

# Build one feed record per text chunk of every sample PDF.
my_docs_to_feed = []
for pdf in sample_pdfs():
    url = pdf['url']
    loader = PyPDFLoader(url)
    pages = loader.load_and_split()
    # NOTE(review): `page_number` below is the 1-based position of the element
    # in `pages`, not `page.metadata['page']`; confirm that load_and_split()
    # yields exactly one element per PDF page before relying on it as a page
    # number. It is left unchanged because it is part of the document id.
    for index, page in enumerate(pages):
        chunks = text_splitter.transform_documents([page])
        # Strip control characters from each chunk's text before feeding.
        text_chunks = [remove_control_characters(chunk.page_content) for chunk in chunks]
        page_number = index + 1
        for chunkno, chunk in enumerate(text_chunks):
            # The id is the SHA-1 hex digest of "<url>#<page>#<chunkno>", so it
            # is the same on every run for the same chunk position.
            vespa_id = f"{url}#{page_number}#{chunkno}"
            hash_value = hashlib.sha1(vespa_id.encode()).hexdigest()
            fields = {
                "title" : pdf['title'],
                # Store the source URL; it was previously fed as an empty
                # string although the schema declares and indexes `url`.
                "url" : url,
                "page": page_number,
                "id": hash_value,
                "authors": [a.strip() for a in pdf['authors'].split(",")],
                "chunkno": chunkno,
                "chunk": chunk,
                "metadata": page.metadata
            }
            my_docs_to_feed.append(fields)

In [12]:
# Inspection only: `page` is the loop variable left over from the chunking
# loop, so this shows the metadata of the last page that was processed.
page.metadata

{'source': 'https://arxiv.org/pdf/2106.11251.pdf', 'page': 9}

In [None]:
# Inspection only: the id (SHA-1 hex digest) of the first prepared record.
my_docs_to_feed[0]["id"]

In [10]:
from typing import Iterable
def vespa_feed(user:str) -> Iterable[dict]:
    """Lazily yield one feed operation per prepared record.

    Args:
        user: value placed under the ``groupname`` key of every operation.

    Yields:
        Dicts with the keys ``fields`` (the record itself), ``id`` (the
        record's ``id`` value) and ``groupname``.
    """
    for record in my_docs_to_feed:
        operation = {"fields": record, "id": record["id"], "groupname": user}
        yield operation

In [11]:
from vespa.io import VespaResponse

def callback(response:VespaResponse, id:str):
    """Print a diagnostic line for a feed operation that did not succeed.

    Args:
        response: result of a single feed operation.
        id: id of the document the operation was for.
    """
    if response.is_successful():
        return
    print(f"Document {id} failed to feed with status code {response.status_code}, url={response.url} response={response.json}")

# Feed all prepared chunks into the "pdf" schema under namespace "personal",
# using "jo-bergum" as the group name; failures are reported through `callback`.
app.feed_iterable(
    schema="pdf",
    iter=vespa_feed("jo-bergum"),
    namespace="personal",
    callback=callback,
)

KeyboardInterrupt: 

In [None]:
from vespa.io import VespaQueryResponse
import json

# Request parameters: tensor output format plus the two query tensors, each
# given as an embed(...) expression naming one of the embedder components.
query_body = {
    "presentation.format.tensors": "short-value",
    "input.query(q)": "embed(e5, \"why is colbert effective?\")",
    "input.query(qt)": "embed(colbert, \"why is colbert effective?\")",
}

# Query the "jo-bergum" group: match on the user query text OR the nearest
# neighbours of query(q) in `embedding`, ranked with the "colbert" profile.
response:VespaQueryResponse = app.query(
    yql="select id,title,page,chunkno,chunk from pdf where userQuery() or ({targetHits:10}nearestNeighbor(embedding,q))",
    groupname="jo-bergum",
    ranking="colbert",
    query="why is colbert effective?",
    body=query_body,
)
assert response.is_successful()

# Show the first hit.
top_hit = response.hits[0]
print(json.dumps(top_hit, indent=2))

{
  "id": "id:personal:pdf:g=jo-bergum:ca45229d483b107430faa63e7e3e30e51a5ff5c8",
  "relevance": 113.11049222946167,
  "source": "search_content.pdf",
  "fields": {
    "matchfeatures": {
      "cos_sim": 0.6499400230790054,
      "max_sim": 113.11049222946167
    },
    "id": "ca45229d483b107430faa63e7e3e30e51a5ff5c8",
    "title": "ColBERT: Efficient and Effective Passage Search via Contextualized Late Interaction over BERT",
    "page": 6,
    "chunkno": 1,
    "chunk": "lishes a new state-of-the-art for non-BERT models on MS MARCO(Dev); however, the best non-ensemble MRR@10 it achieves is 31%while ColBERT reaches up to 36%. Moreover, due to indexing docu-ment representations o\ufb04ine and employing a MaxSim-based lateinteraction mechanism, ColBERT is much more scalable, enablingend-to-end retrieval which is not supported by TK.3 COLBERTColBERT prescribes a simple framework for balancing the qualityand cost of neural IR, particularly deep language models like BERT.As introduced ear

In [None]:
# Match-all query for the "jo-bergum" group, used only to read the response's
# `number_documents_indexed` value.
response:VespaQueryResponse = app.query(yql="select * from sources * where true", groupname="jo-bergum")

indexed_total = response.number_documents_indexed
print(json.dumps(indexed_total, indent=2))

301
