This notebook shows how to create a Qdrant collection and upload data to it.

In [8]:
import sys, pathlib

# The project packages (`indexing`, `embeddings`) are imported relative to the
# parent of the current working directory, so that directory must be importable.
repo_root = pathlib.Path().resolve().parent

# Only append once so re-running this cell does not grow sys.path.
if str(repo_root) not in sys.path:
    sys.path.append(str(repo_root))

from indexing.qdrant import load_qdrant_client, create_qdrant_collection, delete_collection
from indexing.upload import upload_points
from qdrant_client import models
from langchain_qdrant import QdrantVectorStore, RetrievalMode, FastEmbedSparse
from embeddings.embedding import FastEmbedEmbeddings, output_supported_models

In [2]:
# Create the Qdrant client that the later cells reuse for collection creation,
# payload indexing and the vector store.
client = load_qdrant_client()

List the supported embedding models

In [3]:
# List the supported dense embedding models; each entry shows the model name,
# size, vector dimension and a short description.
output_supported_models("dense")

[['BAAI/bge-base-en',
  'Size:0.42',
  'Dim 768',
  'Desc Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: necessary, 2023 year.'],
 ['BAAI/bge-base-en-v1.5',
  'Size:0.21',
  'Dim 768',
  'Desc Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: not so necessary, 2023 year.'],
 ['BAAI/bge-large-en-v1.5',
  'Size:1.2',
  'Dim 1024',
  'Desc Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: not so necessary, 2023 year.'],
 ['BAAI/bge-small-en',
  'Size:0.13',
  'Dim 384',
  'Desc Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: necessary, 2023 year.'],
 ['BAAI/bge-small-en-v1.5',
  'Size:0.067',
  'Dim 384',
  'Desc Text embeddings, Unimodal (text), English, 512 input tokens truncation, Prefixes for queries/documents: not so necessary, 2023 year.'],
 ['BAAI/bge-small-zh-v1.5

In [4]:
# List the supported sparse embedding models; each entry shows the model name,
# size and a short description.
output_supported_models("sparse")

[['prithivida/Splade_PP_en_v1',
  'Size:0.532',
  'Desc Independent Implementation of SPLADE++ Model for English.'],
 ['prithvida/Splade_PP_en_v1',
  'Size:0.532',
  'Desc Independent Implementation of SPLADE++ Model for English.'],
 ['Qdrant/bm42-all-minilm-l6-v2-attentions',
  'Size:0.09',
  'Desc Light sparse embedding model, which assigns an importance score to each token in the text'],
 ['Qdrant/bm25',
  'Size:0.01',
  'Desc BM25 as sparse embeddings meant to be used with Qdrant'],
 ['Qdrant/minicoil-v1',
  'Size:0.09',
  'Desc Sparse embedding model, that resolves semantic meaning of the words, while keeping exact keyword match behavior. Based on jinaai/jina-embeddings-v2-small-en-tokens']]

In [5]:
# Dense model: "BAAI/bge-base-en-v1.5" is reported as 768-dimensional in the
# dense model listing earlier in this notebook.
# Sparse model: "Qdrant/bm42-all-minilm-l6-v2-attentions" comes from the sparse
# model listing earlier in this notebook.
# NOTE(review): the recorded output shows model files being fetched when this
# cell ran, so a first run presumably needs network access — confirm.
model_dense = FastEmbedEmbeddings(model_name="BAAI/bge-base-en-v1.5")
model_sparse = FastEmbedSparse(model_name="Qdrant/bm42-all-minilm-l6-v2-attentions")

Fetching 5 files: 100%|██████████| 5/5 [00:13<00:00,  2.60s/it]
Fetching 6 files: 100%|██████████| 6/6 [00:05<00:00,  1.16it/s]


In [None]:
# Create the "RAG" collection, handing both embedding models to the project
# helper so it can configure the collection for them.
create_qdrant_collection(
    client,
    collection_name="RAG",
    model_dense=model_dense,
    model_sparse=model_sparse,
)

Collection config updated with new model


For filtering purposes, I need to create a payload index.

In [None]:
# For now only `metadata.type` is indexed; more advanced filtering is planned
# for later.
# The payload index is created before any points are added so that filtering
# on this field is already set up when the upload happens.
client.create_payload_index(
    collection_name="RAG",
    field_name="metadata.type",
    field_schema=models.PayloadSchemaType.KEYWORD,
    wait=True,
)

UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [14]:
# Wrap the "RAG" collection in a LangChain vector store configured for hybrid
# retrieval, using the dense and sparse embedding models together.
vector_store = QdrantVectorStore(
    client=client,
    collection_name="RAG",
    retrieval_mode=RetrievalMode.HYBRID,
    embedding=model_dense,
    sparse_embedding=model_sparse,
)

In [8]:
# Upload the documents to the collection through the vector store.
# NOTE(review): `text` is not defined in any visible cell of this notebook, so
# this cell fails on a fresh kernel (Restart & Run All). TODO: add the cell
# that loads `text` before this one.
# The recorded run took roughly 27 minutes for 30 batches.
upload_points(vector_store, text)

Uploading batches: 100%|██████████| 30/30 [26:39<00:00, 53.31s/it]
