In [1]:
import pandas as pd
import numpy as np
import json
import re
import os

from chromadb.errors import UniqueConstraintError
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer

import torch
from sklearn.preprocessing import normalize

import chromadb

In [2]:
DATA_FOLDER = '../data'
MODELS_FOLDER = '../data/DocProperties/incore-exporter/Workflow.DTO/Models'
MODEL_ID = "BAAI/bge-m3"

In [3]:
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"Device count: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"Device name: {torch.cuda.get_device_name(0)}")

PyTorch version: 2.7.1+cu128
CUDA available: True
CUDA version: 12.8
Device count: 1
Device name: NVIDIA GeForce RTX 3060


## We recommend to use the following pipeline: hybrid retrieval + re-ranking.

- [ ] Hybrid retrieval leverages the strengths of various methods, offering higher accuracy and stronger generalization capabilities. A classic example: using both embedding retrieval and the BM25 algorithm. Now, you can try to use BGE-M3, which supports both embedding and sparse retrieval. This allows you to obtain token weights (similar to the BM25) without any additional cost when generate dense embeddings. To use hybrid retrieval, you can refer to Vespa and Milvus.

- [ ] As cross-encoder models, re-ranker demonstrates higher accuracy than bi-encoder embedding model. Utilizing the re-ranking model (e.g., bge-reranker, bge-reranker-v2) after retrieval can further filter the selected text.

In [4]:
model = SentenceTransformer(MODEL_ID)

In [27]:
class SentenceTransformerEmbeddingFunction:
    def __init__(self, model_name_or_path):
        self.model_name = model_name_or_path
        self.model = SentenceTransformer(model_name_or_path)

    def __call__(self, input):
        embeddings = self.model.encode(input, normalize_embeddings=True)
        return embeddings.tolist()

    def name(self):
        return self.model_name

In [28]:
embedding_func = SentenceTransformerEmbeddingFunction(MODEL_ID)

In [18]:
vectors = model.encode_document(f'{DATA_FOLDER}/Models_doc.jsonl', show_progress_bar=True, device='cuda:0', normalize_embeddings=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
# vectors = pd.DataFrame(vectors)
# vectors

Unnamed: 0,0
0,-0.026871
1,-0.032953
2,-0.057616
3,-0.030557
4,-0.030781
...,...
1019,-0.004025
1020,0.035294
1021,0.009350
1022,-0.011672


In [7]:
# vectors.shape

(1024, 1)

In [20]:
vectors_2 = model.encode_document(f'{DATA_FOLDER}/DocProperties_JSONL.jsonl', show_progress_bar=True, device='cuda:0', normalize_embeddings=True)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [9]:
# vectors_2 = pd.DataFrame(vectors_2)
# vectors_2

Unnamed: 0,0
0,-0.051448
1,-0.005251
2,-0.040826
3,-0.023159
4,-0.039081
...,...
1019,0.001224
1020,0.066554
1021,-0.028640
1022,-0.005577


In [10]:
# vectors_2.shape

(1024, 1)

In [11]:
chrome_persistent_client = chromadb.PersistentClient(path=f'{DATA_FOLDER}/chromadb')

In [23]:
def embedding_wrapper(text):
    return model.encode_document(text, show_progress_bar=True, normalize_embeddings=True).tolist()

In [30]:
try:
    collection = chrome_persistent_client.create_collection(name='my_rag_v3', embedding_function=embedding_func)
finally:
    collection = chrome_persistent_client.get_collection(name='my_rag_v3', embedding_function=embedding_func)

In [31]:
collection.upsert(
    documents=[
        f'{DATA_FOLDER}/DocProperties_JSONL.jsonl',
        f'{DATA_FOLDER}/Models_doc.jsonl'
    ],
    metadatas=[
        {'category':'code + explanation'},
        {'category':'classes'},
    ],
    ids=["id0", "id1"],
    embeddings=[vectors_2, vectors]
)

In [32]:
result = collection.query(
    query_texts=['write animal info'],
    n_results=1,
)
result

{'ids': [['id1']],
 'embeddings': None,
 'documents': [['../data/Models_doc.jsonl']],
 'uris': None,
 'included': ['metadatas', 'documents', 'distances'],
 'data': None,
 'metadatas': [[{'category': 'classes'}]],
 'distances': [[1.1367965936660767]]}

In [None]:
# TODO crack files to smaller pieces DocProperties_JSONL.jsonl and Models_doc.jsonl