In [4]:
pip install datasets fastembed qdrant-client pyarrow==8.0.0

Note: you may need to restart the kernel to use updated packages.


In [5]:
# Disabled alternative without the pyarrow pin: pip install datasets fastembed qdrant-client

In [6]:
import pyarrow
from datasets import load_dataset

# Load the "corpus" configuration / "corpus" split of the BeIR/scifact dataset.
dataset = load_dataset(
    path="BeIR/scifact",
    name="corpus",
    split="corpus",
)

# Show a single record so its fields are visible in the cell output.
dataset[1]

{'_id': '5836',
 'title': 'Induction of myelodysplasia by myeloid-derived suppressor cells.',
 'text': 'Myelodysplastic syndromes (MDS) are age-dependent stem cell malignancies that share biological features of activated adaptive immune response and ineffective hematopoiesis. Here we report that myeloid-derived suppressor cells (MDSC), which are classically linked to immunosuppression, inflammation, and cancer, were markedly expanded in the bone marrow of MDS patients and played a pathogenetic role in the development of ineffective hematopoiesis. These clonally distinct MDSC overproduce hematopoietic suppressive cytokines and function as potent apoptotic effectors targeting autologous hematopoietic progenitors. Using multiple transfected cell models, we found that MDSC expansion is driven by the interaction of the proinflammatory molecule S100A9 with CD33. These 2 proteins formed a functional ligand/receptor pair that recruited components to CD33’s immunoreceptor tyrosine-based inhibit

In [7]:
# Number of records in the loaded corpus split.
len(dataset)

5183

In [8]:
from fastembed.embedding import TextEmbedding

# Dense model; embed only the first document as a smoke test.
dense_embedding_model = TextEmbedding("sentence-transformers/all-MiniLM-L6-v2")
dense_embeddings = [
    vector
    for vector in dense_embedding_model.passage_embed(dataset["text"][:1])
]

# One input text was passed in, so one embedding is expected back.
len(dense_embeddings)



Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

1

In [9]:
# Length of the first dense embedding; the same expression is later used as the
# vector size when the collection is created.
len(dense_embeddings[0])

384

In [10]:
from fastembed.sparse.bm25 import Bm25

# Sparse BM25 model; embed only the first document as a smoke test.
bm25_embedding_model = Bm25("Qdrant/bm25")
bm25_embeddings = [
    sparse_vector
    for sparse_vector in bm25_embedding_model.passage_embed(dataset["text"][:1])
]

# One input text was passed in, so one embedding is expected back.
len(bm25_embeddings)

Fetching 29 files:   0%|          | 0/29 [00:00<?, ?it/s]

1

In [11]:
from fastembed.late_interaction import LateInteractionTextEmbedding

# Late-interaction (ColBERT) model; embed only the first document as a smoke test.
late_interaction_embedding_model = LateInteractionTextEmbedding("colbert-ir/colbertv2.0")
late_interaction_embeddings = [
    multivector
    for multivector in late_interaction_embedding_model.passage_embed(dataset["text"][:1])
]

# One input text was passed in, so one embedding is expected back.
len(late_interaction_embeddings)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

1

In [12]:
# Size of the first document's late-interaction embedding along its outer axis
# (presumably one vector per token of the document — TODO confirm).
len(late_interaction_embeddings[0])

431

In [13]:
!docker run -d -p 6333:6333 -p 6334:6334 qdrant/qdrant:v1.10.0

58f58e1616f6e06643bbbfc0bec4b7bc33aa862dff066c4f1bd0fe4b09730e89


docker: Error response from daemon: driver failed programming external connectivity on endpoint intelligent_hoover (b030e8f1b8e68ed0c351b524532c5e90569c28c89aa124561e25b64163a77409): Bind for 0.0.0.0:6333 failed: port is already allocated.


In [14]:
import os

from qdrant_client import QdrantClient, models

# Connect to the Qdrant Cloud cluster.
# The API key is a secret and must never be stored in the notebook: it is read from
# the QDRANT_API_KEY environment variable. If the variable is not set, this raises
# KeyError before any request is sent.
client = QdrantClient(
    url="https://6bc8371e-95cb-4db6-a095-983bd238bf64.eu-west-1-0.aws.cloud.qdrant.io:6333",
    api_key=os.environ["QDRANT_API_KEY"],
    timeout=1200,  # 20 minutes timeout
)
# Local alternative (e.g. the Docker container), no API key needed:
#client = QdrantClient("http://localhost:6333", timeout=600)

# Create the "scifact" collection with one named vector space per representation.
client.create_collection(
    "scifact",
    vectors_config={
        # Dense vectors: size taken from the sample embedding, cosine distance.
        "all-MiniLM-L6-v2": models.VectorParams(
            size=len(dense_embeddings[0]),
            distance=models.Distance.COSINE,
        ),
        # Multivectors: size is the length of one inner vector of the sample
        # embedding; multivectors are compared with MAX_SIM.
        "colbertv2.0": models.VectorParams(
            size=len(late_interaction_embeddings[0][0]),
            distance=models.Distance.COSINE,
            multivector_config=models.MultiVectorConfig(
                comparator=models.MultiVectorComparator.MAX_SIM,
            )
        ),
    },
    sparse_vectors_config={
        # Sparse BM25 vectors with the IDF modifier enabled.
        "bm25": models.SparseVectorParams(
            modifier=models.Modifier.IDF,
        )
    }
)

True

In [17]:
import math

import tqdm

batch_size = 4
# Embed the corpus batch by batch with all three models and upload each batch.
# The progress-bar total is rounded UP: the final, partial batch is also iterated,
# so floor division under-counts by one whenever len(dataset) is not a multiple of
# batch_size and the bar overruns its total.
for batch in tqdm.tqdm(dataset.iter(batch_size=batch_size),
                       total=math.ceil(len(dataset) / batch_size)):
    dense_embeddings = list(dense_embedding_model.passage_embed(batch["text"]))
    bm25_embeddings = list(bm25_embedding_model.passage_embed(batch["text"]))
    late_interaction_embeddings = list(late_interaction_embedding_model.passage_embed(batch["text"]))

    client.upload_points(
        "scifact",
        points=[
            models.PointStruct(
                # The dataset's string "_id" is converted to an integer point id.
                id=int(batch["_id"][i]),
                # Keys must match the vector names declared on the collection.
                vector={
                    "all-MiniLM-L6-v2": dense_embeddings[i].tolist(),
                    "bm25": bm25_embeddings[i].as_object(),
                    "colbertv2.0": late_interaction_embeddings[i].tolist(),
                },
                payload={
                    "_id": batch["_id"][i],
                    "title": batch["title"][i],
                    "text": batch["text"][i],
                }
            )
            for i in range(len(batch["_id"]))
        ],
        # We send a lot of embeddings at once, so it's best to reduce the batch size.
        # Otherwise, we would have gigantic requests sent for each batch and we can
        # easily reach the maximum size of a single request.
        batch_size=batch_size,
    )

1296it [2:34:58,  7.18s/it]                                                                                            


In [16]:
# Ask the Qdrant server to rebuild the "scifact" collection from a hosted snapshot.
# The server fetches the URL itself, so it needs outbound access to that location.
client.recover_snapshot(
    collection_name="scifact",
    location="https://storage.googleapis.com/common-datasets-snapshots/scifact-multiple-representations.snapshot",
)

UnexpectedResponse: Unexpected Response: 500 (Internal Server Error)
Raw response content:
b'{"status":{"error":"Service internal error: Http request error: error sending request for url (https://storage.googleapis.com/common-datasets-snapshots/scifact-multiple-representations.snapshot)"}, ...'

In [None]:
# Fetch the collection's description so it is displayed as the cell output.
client.get_collection(collection_name="scifact")