In [1]:
import pandas as pd
import numpy as np

import os


from paths import DATA_FOLDER, PROCESSED_DATA_FOLDER, MODELS_FOLDER, ML_FOLDER
from BasicClasses import *

from sentence_transformers import SentenceTransformer

import torch

from FlagEmbedding import FlagReranker
import chromadb


In [2]:
DATA_FOLDER = DATA_FOLDER
PROCESSED_DATA_FOLDER = PROCESSED_DATA_FOLDER
MODELS_FOLDER = MODELS_FOLDER
ML_FOLDER = ML_FOLDER
MODEL_ID = "BAAI/bge-m3"
RERANKING_MODEL = "BAAI/bge-reranker-v2-m3"

In [3]:
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"Device count: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"Device name: {torch.cuda.get_device_name(0)}")

PyTorch version: 2.7.1+cu128
CUDA available: True
CUDA version: 12.8
Device count: 1
Device name: NVIDIA GeForce RTX 3060


## We recommend to use the following pipeline: hybrid retrieval + re-ranking.

- [ ] Hybrid retrieval leverages the strengths of various methods, offering higher accuracy and stronger generalization capabilities. A classic example: using both embedding retrieval and the BM25 algorithm. Now, you can try to use BGE-M3, which supports both embedding and sparse retrieval. This allows you to obtain token weights (similar to the BM25) without any additional cost when generate dense embeddings. To use hybrid retrieval, you can refer to Vespa and Milvus.

- [ ] As cross-encoder models, re-ranker demonstrates higher accuracy than bi-encoder embedding model. Utilizing the re-ranking model (e.g., bge-reranker, bge-reranker-v2) after retrieval can further filter the selected text.

In [4]:
# model = SentenceTransformer(MODEL_ID)
model = SentenceTransformer(f'{ML_FOLDER}/BGE-m3')
reranker = FlagReranker('BAAI/bge-reranker-v2-m3', use_fp16=True, cache_dir=f'{ML_FOLDER}/bge-reranker-v2-m3')

In [26]:
# model.save(f'{ML_FOLDER}/BGE-m3')


In [5]:
# embedding_func = SentenceTransformerEmbeddingFunction(MODEL_ID)
embedding_func = SentenceTransformerEmbeddingFunction(f'{ML_FOLDER}/BGE-m3')

In [6]:
class_vectors_v2 = []
for file in os.listdir(f'{PROCESSED_DATA_FOLDER}/classes'):
    with open(f'{PROCESSED_DATA_FOLDER}/classes/{file}', 'r', encoding='utf-8') as f:
        line = f.read()
        results = embedding_func(line)
        class_vectors_v2.extend(results)

In [7]:
class_vectors_v3 = pd.DataFrame(class_vectors_v2)
class_vectors_v3

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,0.013050,-0.046113,-0.044793,-0.020217,-0.027509,-0.062423,0.056399,-0.013958,0.013406,0.021537,...,-0.006939,0.000181,-0.003332,-0.006464,0.007276,0.012404,0.027708,0.023567,0.001088,-0.058475
1,0.022854,-0.026327,-0.039258,0.006260,-0.031888,-0.040422,0.049038,0.009782,-0.002681,0.004728,...,-0.000326,0.033952,-0.015754,-0.000359,0.007099,-0.016202,0.044074,-0.008473,-0.032029,-0.013290
2,-0.020735,-0.031802,-0.033753,-0.010971,-0.004630,-0.059646,0.042915,-0.000403,0.001488,0.001282,...,-0.012531,0.019758,-0.006215,0.033143,-0.035400,-0.019717,0.025555,0.038884,-0.048065,-0.028586
3,-0.011113,-0.017807,-0.028216,-0.000887,-0.011821,-0.049197,0.059271,0.018424,0.007300,0.011133,...,-0.022455,0.041278,-0.008923,0.027074,-0.033528,-0.024717,0.018434,0.031478,-0.033236,-0.011449
4,-0.017301,-0.039470,-0.017234,-0.016681,-0.004020,-0.027047,0.054245,0.006750,-0.033397,0.016934,...,-0.012752,0.074418,-0.030674,-0.006302,-0.017996,-0.019397,0.027810,-0.000163,-0.011114,-0.039693
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,-0.035152,-0.020645,-0.040086,-0.005385,-0.018264,-0.076614,0.047513,-0.045873,0.016580,-0.019762,...,0.000220,0.042256,0.044803,0.011301,-0.000735,-0.013274,0.010037,0.005243,-0.005301,-0.036255
146,-0.061301,-0.064885,-0.012513,0.014143,-0.014113,-0.071864,0.021862,0.018736,0.008848,0.037936,...,0.017766,-0.023936,0.000420,0.015518,-0.021489,-0.000902,0.056634,0.034710,-0.037585,-0.039298
147,-0.038642,-0.056093,-0.029700,0.010163,-0.035860,-0.045055,0.071626,0.008817,0.008483,-0.004579,...,-0.011845,0.044878,0.017026,0.051305,0.000876,-0.030292,0.009970,0.051302,-0.027545,-0.000510
148,-0.059275,-0.007700,-0.002304,-0.002497,-0.020237,-0.052844,0.051197,0.005195,-0.002260,-0.001404,...,0.019614,0.047648,-0.005751,0.025689,-0.037365,-0.031524,0.048221,0.048494,0.001617,-0.012174


In [8]:
trainig_vectors_v1 = []
for file in os.listdir(f'{PROCESSED_DATA_FOLDER}/training'):
    with open(f'{PROCESSED_DATA_FOLDER}/training/{file}', 'r', encoding='utf-8') as f:
        line = f.read()
        results = embedding_func(line)
        trainig_vectors_v1.extend(results)

In [9]:
chrome_persistent_client = chromadb.PersistentClient(path=f'{DATA_FOLDER}/chromadb')

In [10]:
chrome_persistent_client.list_collections()

[Collection(name=classes), Collection(name=training)]

In [11]:
# chrome_persistent_client.delete_collection('classes')
# chrome_persistent_client.delete_collection('training')

In [12]:
collection_classes = chrome_persistent_client.get_or_create_collection(name='classes', embedding_function=embedding_func)
collection_training = chrome_persistent_client.get_or_create_collection(name='training', embedding_function=embedding_func)

In [13]:
documents = []
metadatas =()
ids = []


class_folder = os.listdir(f'{PROCESSED_DATA_FOLDER}/classes')
training_folder = os.listdir(f'{PROCESSED_DATA_FOLDER}/training')

In [14]:
for file in class_folder:
    with open(f'{PROCESSED_DATA_FOLDER}/classes/{file}', 'r', encoding='utf-8') as f:
        documents.append(f.read())

metadatas = (
    [{'category': 'classes'} for _ in class_folder]
)

ids = [f'CLASSid{i}' for i in range(1, len(class_folder)+ 1)]



In [15]:
collection_classes.upsert(
    documents=documents,
    metadatas=metadatas,
    ids=ids,
    embeddings=class_vectors_v2,
)

In [16]:
documents = []
metadatas =()
ids = []
embeddings = []

In [17]:
for file in training_folder:
    with open(f'{PROCESSED_DATA_FOLDER}/training/{file}', 'r', encoding='utf-8') as f:
        documents.append(f.read())

metadatas = (
    [{'category': 'code + explanation'} for _ in training_folder]
)

ids = [f'TRAINid{i}' for i in range(1, len(training_folder) + 1)]

embeddings.extend(trainig_vectors_v1)

In [18]:
collection_training.upsert(
    documents=documents,
    metadatas=metadatas,
    ids=ids,
    embeddings=trainig_vectors_v1,
)


In [19]:
text = 'Write a code for insurance with the document 207'
result = collection_classes.query(
    query_texts=[text],
    n_results=20,
)
result

{'ids': [['CLASSid59',
   'CLASSid86',
   'CLASSid111',
   'CLASSid91',
   'CLASSid89',
   'CLASSid92',
   'CLASSid46',
   'CLASSid84',
   'CLASSid141',
   'CLASSid136',
   'CLASSid90',
   'CLASSid85',
   'CLASSid49',
   'CLASSid87',
   'CLASSid88',
   'CLASSid147',
   'CLASSid129',
   'CLASSid47',
   'CLASSid56',
   'CLASSid149']],
 'embeddings': None,
 'documents': [['{"class": "DocumentsToInsuranceObject", "members": {"DocumentsToInsuranceObject": []}}\n',
   '{"class": "InsuranceCompany", "members": {"InsuranceCompany": []}}\n',
   '{"class": "Product", "members": {"Product": [{"name": "PolisNumber", "type": "String", "description": "Номер полісу"}, {"name": "BranchGID", "type": "Guid", "description": "Унікальний ідентифікатор відділення продажу полісу"}, {"name": "BeginingDate", "type": "DateTime", "description": "Дата початку дії продукту"}, {"name": "EndingDate", "type": "DateTime", "description": "Дата завершення дії продукту"}, {"name": "RegisteredDate", "type": "DateTime", "d

In [20]:
pairs = [[text, result['documents'][0][i]] for i in range(20)]
new_scores = []

new_scores = reranker.compute_score(pairs, normalize=True)


new_result = list(zip(
    result['ids'][0],
    result['metadatas'][0],
    result['documents'][0],
    new_scores
))

new_result.sort(key=lambda x: x[3], reverse=True)

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [21]:
new_result[0][2]

'{"class": "InsuranceObject", "members": {"InsuranceObject": [{"name": "Name", "type": "String", "description": "Страхова сума для об\'єкта страхування"}, {"name": "Name", "type": "String", "description": "Страховий платіж за об\'єктом страхування"}, {"name": "Name", "type": "String", "description": "Страховий тариф для об\'єкта страхування"}, {"name": "Name", "type": "String", "description": "Назва страхового об\'єкта"}, {"name": "Covers", "type": "List<Cover>", "description": "Покриття, що надаються для страхового об\'єкта"}, {"name": "Object", "type": "Object", "description": "Об\'єкт, що Виводить на друк страховий об\'єкт"}, {"name": "Coefficients", "type": "List<Coefficient>", "description": "Коефіцієнти, що застосовуються до страхового об\'єкта"}, {"name": "EntityAttributes", "type": "List<EntityAttribute>", "description": "Атрибути об\'єкта страхування, що описують його характеристики та властивості"}, {"name": "_InsuranceObjectTypeName", "type": "string", "description": "Курс в

In [22]:
context = "\n\n".join(
            f'Document {i+1}: \n {doc}' for i, doc in enumerate(new_result)
        )
print(context)

Document 1: 
 ('CLASSid87', {'category': 'classes'}, '{"class": "InsuranceObject", "members": {"InsuranceObject": [{"name": "Name", "type": "String", "description": "Страхова сума для об\'єкта страхування"}, {"name": "Name", "type": "String", "description": "Страховий платіж за об\'єктом страхування"}, {"name": "Name", "type": "String", "description": "Страховий тариф для об\'єкта страхування"}, {"name": "Name", "type": "String", "description": "Назва страхового об\'єкта"}, {"name": "Covers", "type": "List<Cover>", "description": "Покриття, що надаються для страхового об\'єкта"}, {"name": "Object", "type": "Object", "description": "Об\'єкт, що Виводить на друк страховий об\'єкт"}, {"name": "Coefficients", "type": "List<Coefficient>", "description": "Коефіцієнти, що застосовуються до страхового об\'єкта"}, {"name": "EntityAttributes", "type": "List<EntityAttribute>", "description": "Атрибути об\'єкта страхування, що описують його характеристики та властивості"}, {"name": "_InsuranceObje