In [3]:
import pandas as pd
import numpy as np

import os


from paths import DATA_FOLDER, PROCESSED_DATA_FOLDER, MODELS_FOLDER, ML_FOLDER
from BasicClasses import *

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

import chromadb


In [4]:
# DATA_FOLDER, PROCESSED_DATA_FOLDER, MODELS_FOLDER and ML_FOLDER are already
# bound by the `from paths import ...` line, so they are not re-assigned here.

# Model identifiers of the embedding model and of the re-ranking model.
MODEL_ID = "BAAI/bge-m3"
RERANKING_MODEL = "BAAI/bge-reranker-v2-m3"

In [5]:
# Report the PyTorch build and the CUDA devices visible to it.
cuda_ready = torch.cuda.is_available()
environment_report = [
    ("PyTorch version", torch.__version__),
    ("CUDA available", cuda_ready),
    ("CUDA version", torch.version.cuda),
    ("Device count", torch.cuda.device_count()),
]
if cuda_ready:
    # Only device index 0 is named.
    environment_report.append(("Device name", torch.cuda.get_device_name(0)))

for label, value in environment_report:
    print(f"{label}: {value}")

PyTorch version: 2.8.0+cu128
CUDA available: True
CUDA version: 12.8
Device count: 1
Device name: NVIDIA GeForce RTX 3060


## We recommend using the following pipeline: hybrid retrieval + re-ranking.

- [ ] Hybrid retrieval leverages the strengths of various methods, offering higher accuracy and stronger generalization capabilities. A classic example: using both embedding retrieval and the BM25 algorithm. Now you can try BGE-M3, which supports both dense-embedding and sparse retrieval. This lets you obtain token weights (similar to BM25) at no additional cost when generating dense embeddings. To use hybrid retrieval, you can refer to Vespa and Milvus.

- [ ] As cross-encoder models, re-rankers demonstrate higher accuracy than bi-encoder embedding models. Applying a re-ranking model (e.g., bge-reranker, bge-reranker-v2) after retrieval can further filter the selected text.

In [6]:
# Load the sentence-transformer from the local BGE-m3 copy under ML_FOLDER.
# Alternative kept from the original cell: SentenceTransformer(MODEL_ID).
model = SentenceTransformer(f'{ML_FOLDER}/BGE-m3')

# Tokenizer and sequence-classification model of the re-ranker. Both are
# addressed through RERANKING_MODEL so the identifier is declared only once,
# and both use the same cache directory under ML_FOLDER.
reranker_cache_dir = f'{ML_FOLDER}/bge-reranker-v2-m3'
tokenizer = AutoTokenizer.from_pretrained(RERANKING_MODEL, cache_dir=reranker_cache_dir)
reranker = AutoModelForSequenceClassification.from_pretrained(RERANKING_MODEL, cache_dir=reranker_cache_dir)

In [None]:
# model.save(f'{ML_FOLDER}/BGE-m3')


In [7]:
# Embedding callable built from the local BGE-m3 copy under ML_FOLDER. It is used
# to embed the processed files and is passed to the Chroma collections as
# `embedding_function`.
# NOTE(review): SentenceTransformerEmbeddingFunction is not imported by name in this
# notebook; presumably it comes from `from BasicClasses import *` — confirm.
# embedding_func = SentenceTransformerEmbeddingFunction(MODEL_ID)
embedding_func = SentenceTransformerEmbeddingFunction(f'{ML_FOLDER}/BGE-m3')

In [8]:
# Embed every file in the processed 'classes' folder, in os.listdir() order,
# and collect whatever vectors embedding_func returns for each file's text.
classes_dir = f'{PROCESSED_DATA_FOLDER}/classes'
class_vectors_v2 = []
for file_name in os.listdir(classes_dir):
    with open(f'{classes_dir}/{file_name}', 'r', encoding='utf-8') as handle:
        class_text = handle.read()
    class_vectors_v2.extend(embedding_func(class_text))

In [None]:
# Wrap the class embeddings in a DataFrame and display it as the cell output
# (presumably one row per embedding vector — confirm).
class_vectors_v3 = pd.DataFrame(class_vectors_v2)
class_vectors_v3

In [9]:
# Embed every file in the processed 'training' folder, in os.listdir() order.
# The variable name (including its spelling) is kept because later cells read it.
training_dir = f'{PROCESSED_DATA_FOLDER}/training'
trainig_vectors_v1 = []
for file_name in os.listdir(training_dir):
    with open(f'{training_dir}/{file_name}', 'r', encoding='utf-8') as handle:
        training_text = handle.read()
    trainig_vectors_v1.extend(embedding_func(training_text))

In [10]:
# Chroma client whose storage path is the 'chromadb' folder under DATA_FOLDER.
chrome_persistent_client = chromadb.PersistentClient(path=f'{DATA_FOLDER}/chromadb')

In [13]:
# Show which collections the client currently knows about.
chrome_persistent_client.list_collections()

[]

In [12]:
# chrome_persistent_client.delete_collection('classes')
# chrome_persistent_client.delete_collection('training')

In [14]:
# Open (or create on first use) the two collections, both bound to embedding_func.
collection_classes, collection_training = (
    chrome_persistent_client.get_or_create_collection(
        name=collection_name,
        embedding_function=embedding_func,
    )
    for collection_name in ('classes', 'training')
)

In [15]:
# Empty placeholders for the collection payload built in the following cells.
documents, ids = [], []
metadatas = ()

# File listings of the two processed folders, reused by the following cells.
class_folder, training_folder = (
    os.listdir(f'{PROCESSED_DATA_FOLDER}/{subfolder}')
    for subfolder in ('classes', 'training')
)

In [16]:
# Build the 'classes' payload. `documents` is rebuilt from scratch so that
# re-running this cell does not append the same files a second time.
documents = []
for file in class_folder:
    with open(f'{PROCESSED_DATA_FOLDER}/classes/{file}', 'r', encoding='utf-8') as f:
        documents.append(f.read())

# One metadata dict and one 1-based id per listed file, in listing order.
metadatas = [{'category': 'classes'} for _ in class_folder]
ids = [f'CLASSid{i}' for i, _ in enumerate(class_folder, start=1)]



In [17]:
# Write the class documents into the 'classes' collection, passing the embeddings
# computed earlier (class_vectors_v2) explicitly.
# NOTE(review): documents/ids and class_vectors_v2 come from two separate
# os.listdir() calls and are matched by position; this assumes both listings
# returned the files in the same order — confirm.
collection_classes.upsert(
    documents=documents,
    metadatas=metadatas,
    ids=ids,
    embeddings=class_vectors_v2,
)

In [18]:
# Clear the payload placeholders before the 'training' payload is built.
documents, ids, embeddings = [], [], []
metadatas = ()

In [19]:
# Build the 'training' payload. `documents` and `embeddings` are rebuilt from
# scratch so that re-running this cell does not duplicate their contents.
documents = []
for file in training_folder:
    with open(f'{PROCESSED_DATA_FOLDER}/training/{file}', 'r', encoding='utf-8') as f:
        documents.append(f.read())

# One metadata dict and one 1-based id per listed file, in listing order.
metadatas = [{'category': 'code + explanation'} for _ in training_folder]
ids = [f'TRAINid{i}' for i, _ in enumerate(training_folder, start=1)]

# Shallow copy of the precomputed training vectors.
embeddings = list(trainig_vectors_v1)

In [20]:
# Write the training documents into the 'training' collection, passing the
# embeddings computed earlier (trainig_vectors_v1) explicitly.
# NOTE(review): documents/ids and trainig_vectors_v1 come from two separate
# os.listdir() calls and are matched by position; this assumes both listings
# returned the files in the same order — confirm.
collection_training.upsert(
    documents=documents,
    metadatas=metadatas,
    ids=ids,
    embeddings=trainig_vectors_v1,
)


In [21]:
# Sample request used to exercise retrieval against the 'classes' collection.
text = 'Write a code for insurance with the document 207'
# Query with n_results=20; the raw result dict is displayed as the cell output.
result = collection_classes.query(
    query_texts=[text],
    n_results=20,
)
result

{'ids': [['CLASSid59',
   'CLASSid86',
   'CLASSid111',
   'CLASSid91',
   'CLASSid89',
   'CLASSid92',
   'CLASSid46',
   'CLASSid84',
   'CLASSid141',
   'CLASSid136',
   'CLASSid90',
   'CLASSid85',
   'CLASSid49',
   'CLASSid87',
   'CLASSid88',
   'CLASSid147',
   'CLASSid129',
   'CLASSid47',
   'CLASSid56',
   'CLASSid149']],
 'embeddings': None,
 'documents': [['{"class": "DocumentsToInsuranceObject", "members": {"DocumentsToInsuranceObject": []}}\n',
   '{"class": "InsuranceCompany", "members": {"InsuranceCompany": []}}\n',
   '{"class": "Product", "members": {"Product": [{"name": "PolisNumber", "type": "String", "description": "Номер полісу"}, {"name": "BranchGID", "type": "Guid", "description": "Унікальний ідентифікатор відділення продажу полісу"}, {"name": "BeginingDate", "type": "DateTime", "description": "Дата початку дії продукту"}, {"name": "EndingDate", "type": "DateTime", "description": "Дата завершення дії продукту"}, {"name": "RegisteredDate", "type": "DateTime", "d

In [22]:
def compute_score(pairs, normalize=True, batch_size=2):
    """Score (query, document) pairs with the re-ranker model.

    Uses the notebook-level ``tokenizer`` and ``reranker`` objects.

    Args:
        pairs: Sequence of two-item sequences ``(query, document)``.
        normalize: When True, a softmax is applied across ALL returned scores,
            so each value depends on the whole candidate set, not only on its
            own pair.
        batch_size: Number of pairs sent through the model per forward pass.

    Returns:
        With ``normalize=True`` a 1-D numpy array of softmax weights; with
        ``normalize=False`` a list of per-pair scores. Both follow the order
        of ``pairs``.
    """
    scores = []
    for start in range(0, len(pairs), batch_size):
        batch_pairs = pairs[start:start + batch_size]
        queries = [pair[0] for pair in batch_pairs]
        passages = [pair[1] for pair in batch_pairs]

        # Encode as text / text_pair so the tokenizer inserts its own separator
        # tokens; a literal " [SEP] " string is only special for tokenizers whose
        # vocabulary defines that exact token.
        tokenized = tokenizer(queries, passages, padding=True, truncation=True,
                              max_length=2048, return_tensors="pt")
        # Keep the inputs on the same device as the model.
        tokenized = tokenized.to(reranker.device)

        with torch.no_grad():
            outputs = reranker(**tokenized)
        batch_scores = outputs.logits.squeeze(-1).float().cpu().numpy()
        scores.extend(batch_scores)

    if normalize:
        scores = torch.softmax(torch.tensor(scores), dim=0).numpy()

    return scores

In [23]:
# Pair the query with every document that was actually returned, instead of
# assuming exactly 20 results (fewer results would raise IndexError).
retrieved_documents = result['documents'][0]
pairs = [[text, document] for document in retrieved_documents]

new_scores = compute_score(pairs, normalize=True)

# Rows are (id, metadata, document, score), ordered by score, best first.
new_result = sorted(
    zip(
        result['ids'][0],
        result['metadatas'][0],
        retrieved_documents,
        new_scores,
    ),
    key=lambda row: row[3],
    reverse=True,
)

In [24]:
# Document text (tuple index 2) of the first entry after sorting by score.
new_result[0][2]

'{"class": "Product", "members": {"Product": [{"name": "PolisNumber", "type": "String", "description": "Номер полісу"}, {"name": "BranchGID", "type": "Guid", "description": "Унікальний ідентифікатор відділення продажу полісу"}, {"name": "BeginingDate", "type": "DateTime", "description": "Дата початку дії продукту"}, {"name": "EndingDate", "type": "DateTime", "description": "Дата завершення дії продукту"}, {"name": "RegisteredDate", "type": "DateTime", "description": "Дата реєстрації продукту"}, {"name": "AgentReportNumber", "type": "String", "description": "Номер звіту агента, пов\'язаного з продуктом"}, {"name": "Comment", "type": "String", "description": "Страхова сума продукту в національній валюті"}, {"name": "Comment", "type": "String", "description": "Додаткові коментарі або примітки до продукту"}, {"name": "Blanks", "type": "List<Blank>", "description": "Дата створення продукту"}, {"name": "Blanks", "type": "List<Blank>", "description": "Список бланків, пов\'язаних з продуктом"}

In [None]:
# Each entry of new_result is an (id, metadata, document, score) tuple. Only the
# document text goes into the context; formatting the whole tuple would embed
# its repr (id, metadata dict and score included).
context = "\n\n".join(
    f'Document {i+1}: \n {doc}'
    for i, (_doc_id, _metadata, doc, _score) in enumerate(new_result)
)
print(context)