In [1]:
import pandas as pd
import numpy as np
import json
import re
import os

from chromadb.errors import UniqueConstraintError
from importlib_metadata import metadata
from torch.cuda import device
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer

import torch
from sklearn.preprocessing import normalize

import chromadb
import joblib

In [2]:
# Relative data locations used throughout the notebook, plus the embedding
# model identifier. Every sub-folder is derived from DATA_FOLDER.
DATA_FOLDER = '../data'
PROCESSED_DATA_FOLDER = f'{DATA_FOLDER}/processed_data'
MODELS_FOLDER = f'{DATA_FOLDER}/DocProperties/incore-exporter/Workflow.DTO/Models'
ML_FOLDER = f'{DATA_FOLDER}/ML'
MODEL_ID = "BAAI/bge-m3"

In [3]:
# Environment report: collect the lines first, then print them in one go.
cuda_ready = torch.cuda.is_available()
environment_report = [
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {cuda_ready}",
    f"CUDA version: {torch.version.cuda}",
    f"Device count: {torch.cuda.device_count()}",
]
# The device name is only queried when a CUDA device is actually usable.
if cuda_ready:
    environment_report.append(f"Device name: {torch.cuda.get_device_name(0)}")
print("\n".join(environment_report))

PyTorch version: 2.7.1+cu128
CUDA available: True
CUDA version: 12.8
Device count: 1
Device name: NVIDIA GeForce RTX 3060


## We recommend using the following pipeline: hybrid retrieval + re-ranking.

- [ ] Hybrid retrieval leverages the strengths of several methods, offering higher accuracy and stronger generalization. A classic example is combining embedding retrieval with the BM25 algorithm. BGE-M3 supports both dense (embedding) and sparse retrieval, so you obtain token weights (similar to BM25) at no additional cost when generating dense embeddings. To use hybrid retrieval, you can refer to Vespa and Milvus.

- [ ] As a cross-encoder model, a re-ranker achieves higher accuracy than a bi-encoder embedding model. Applying a re-ranking model (e.g., bge-reranker, bge-reranker-v2) after retrieval can further filter the selected text.

In [4]:
# Load the locally saved copy of the model from ML_FOLDER.
# Alternative (hub id instead of local copy): SentenceTransformer(MODEL_ID)
# NOTE(review): SentenceTransformerEmbeddingFunction loads the same path again
# further down, so the weights appear to be loaded twice - confirm whether this
# standalone `model` is still needed.
local_model_dir = f'{ML_FOLDER}/BGE-m3'
model = SentenceTransformer(local_model_dir)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [5]:
class SentenceTransformerEmbeddingFunction:
    """Callable adapter that exposes a SentenceTransformer model as an embedding function.

    An instance is handed to ChromaDB as ``embedding_function`` and is also
    called directly in this notebook to embed individual texts.
    """

    def __init__(self, model_name_or_path, device=None):
        """Load the model and choose the device used for encoding.

        Args:
            model_name_or_path: Model id or local directory passed to
                ``SentenceTransformer``; also returned by ``name()``.
            device: Device string forwarded to ``encode``. When omitted,
                ``'cuda:0'`` is used if CUDA is available, otherwise ``'cpu'``,
                so the notebook still runs on machines without a GPU.
        """
        self.model_name = model_name_or_path
        self.model = SentenceTransformer(model_name_or_path)
        if device is None:
            device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        self.device = device

    def __call__(self, input):
        """Encode ``input`` with normalised embeddings and return plain Python lists.

        Args:
            input: Whatever ``SentenceTransformer.encode`` accepts; this
                notebook passes single strings, ChromaDB passes query texts.

        Returns:
            ``encode(...).tolist()`` - the nesting mirrors the array returned
            by ``encode`` for the given input.
        """
        embeddings = self.model.encode(input, normalize_embeddings=True, device=self.device)
        return embeddings.tolist()

    def name(self):
        """Return the model name or path this function was created with."""
        return self.model_name

In [6]:
# model.save(f'{ML_FOLDER}/BGE-m3')

In [7]:
# Build the embedding function from the locally saved model directory.
# Alternative (hub id instead of local copy): SentenceTransformerEmbeddingFunction(MODEL_ID)
embedding_model_dir = f'{ML_FOLDER}/BGE-m3'
embedding_func = SentenceTransformerEmbeddingFunction(embedding_model_dir)

In [8]:
# Embed every file in the `classes` folder as one whole text: one vector per
# file, in the order os.listdir returns the entries.
# NOTE(review): the ingestion cells further down list this folder again and
# pair documents with these vectors by position - both listings must agree.
classes_dir = f'{PROCESSED_DATA_FOLDER}/classes'
class_vectors_v2 = []
for file in os.listdir(classes_dir):
    with open(f'{classes_dir}/{file}', 'r', encoding='utf-8') as f:
        line = f.read()
    # The file handle is already closed while the text is being encoded.
    results = embedding_func(line)
    class_vectors_v2 += [results]

In [9]:
# Tabular view of the class embeddings: one row per embedded file.
class_vectors_v3 = pd.DataFrame(data=class_vectors_v2)
class_vectors_v3

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,0.013050,-0.046113,-0.044793,-0.020217,-0.027509,-0.062423,0.056399,-0.013958,0.013406,0.021537,...,-0.006939,0.000181,-0.003332,-0.006464,0.007276,0.012404,0.027708,0.023567,0.001088,-0.058475
1,0.022854,-0.026327,-0.039258,0.006260,-0.031888,-0.040422,0.049038,0.009782,-0.002681,0.004728,...,-0.000326,0.033952,-0.015754,-0.000359,0.007099,-0.016202,0.044074,-0.008473,-0.032029,-0.013290
2,-0.020735,-0.031802,-0.033753,-0.010971,-0.004630,-0.059646,0.042915,-0.000403,0.001488,0.001282,...,-0.012531,0.019758,-0.006215,0.033143,-0.035400,-0.019717,0.025555,0.038884,-0.048065,-0.028586
3,-0.011113,-0.017807,-0.028216,-0.000887,-0.011821,-0.049197,0.059271,0.018424,0.007300,0.011133,...,-0.022455,0.041278,-0.008923,0.027074,-0.033528,-0.024717,0.018434,0.031478,-0.033236,-0.011449
4,-0.017301,-0.039470,-0.017234,-0.016681,-0.004020,-0.027047,0.054245,0.006750,-0.033397,0.016934,...,-0.012752,0.074418,-0.030674,-0.006302,-0.017996,-0.019397,0.027810,-0.000163,-0.011114,-0.039693
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,-0.035152,-0.020645,-0.040086,-0.005385,-0.018264,-0.076614,0.047513,-0.045873,0.016580,-0.019762,...,0.000220,0.042256,0.044803,0.011301,-0.000735,-0.013274,0.010037,0.005243,-0.005301,-0.036255
145,-0.061301,-0.064885,-0.012513,0.014143,-0.014113,-0.071864,0.021862,0.018736,0.008848,0.037936,...,0.017766,-0.023936,0.000420,0.015518,-0.021489,-0.000902,0.056634,0.034710,-0.037585,-0.039298
146,-0.038642,-0.056093,-0.029700,0.010163,-0.035860,-0.045055,0.071626,0.008817,0.008483,-0.004579,...,-0.011845,0.044878,0.017026,0.051305,0.000876,-0.030292,0.009970,0.051302,-0.027545,-0.000510
147,-0.059275,-0.007700,-0.002304,-0.002497,-0.020237,-0.052844,0.051197,0.005195,-0.002260,-0.001404,...,0.019614,0.047648,-0.005751,0.025689,-0.037365,-0.031524,0.048221,0.048494,0.001617,-0.012174


In [10]:
# vectors = model.encode_document(f'{DATA_FOLDER}/Models_doc.jsonl', show_progress_bar=True, device='cuda:0', normalize_embeddings=True)

In [11]:
# vectors = pd.DataFrame(vectors)
# vectors

In [12]:
# vectors.shape

In [13]:
# Embed every file in the `training` folder as one whole text: one vector per
# file, in the order os.listdir returns the entries.
# NOTE(review): the ingestion cells further down list this folder again and
# pair documents with these vectors by position - both listings must agree.
training_dir = f'{PROCESSED_DATA_FOLDER}/training'
trainig_vectors_v1 = []
for file in os.listdir(training_dir):
    with open(f'{training_dir}/{file}', 'r', encoding='utf-8') as f:
        line = f.read()
    # The file handle is already closed while the text is being encoded.
    results = embedding_func(line)
    trainig_vectors_v1 += [results]

In [14]:
# vectors_2 = model.encode_document(f'{DATA_FOLDER}/DocProperties_JSONL.jsonl', show_progress_bar=True, device='cuda:0', normalize_embeddings=True)

In [15]:
# vectors_2 = pd.DataFrame(vectors_2)
# vectors_2

In [16]:
# vectors_2.shape

In [17]:
# Persistent ChromaDB client stored in the `chromadb` directory under DATA_FOLDER.
chroma_store_path = f'{DATA_FOLDER}/chromadb'
chrome_persistent_client = chromadb.PersistentClient(path=chroma_store_path)

In [18]:
# chrome_persistent_client.list_collections()

In [19]:
# chrome_persistent_client.delete_collection('my_rag_v1')

In [20]:
# Fetch the collection, creating it when it does not exist yet, so the notebook
# also runs against a fresh database without un-commenting a separate
# create_collection call first.
collection = chrome_persistent_client.get_or_create_collection(
    name='my_rag_v1',
    embedding_function=embedding_func,
)

In [21]:
# Parallel inputs for the collection upsert. All four start as empty lists
# (previously `metadatas` was a tuple, unlike its siblings); later cells fill
# them position by position.
documents = []
metadatas = []
ids = []
embeddings = []

# File names of the two corpora; later cells read and label them in this order.
class_folder = os.listdir(f'{PROCESSED_DATA_FOLDER}/classes')
training_folder = os.listdir(f'{PROCESSED_DATA_FOLDER}/training')

In [22]:
def _read_text(path):
    """Return the full UTF-8 decoded content of the file at ``path``."""
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()


# Rebuild the list from scratch (class files first, then training files) so
# that re-running this cell cannot append the same documents a second time.
documents = (
    [_read_text(f'{PROCESSED_DATA_FOLDER}/classes/{file}') for file in class_folder]
    + [_read_text(f'{PROCESSED_DATA_FOLDER}/training/{file}') for file in training_folder]
)

In [23]:
# One metadata dict per document, labelled by the folder it came from:
# class files first, then training files.
class_metadatas = [{'category': 'classes'} for _ in range(len(class_folder))]
training_metadatas = [{'category': 'code + explanation'} for _ in range(len(training_folder))]
metadatas = class_metadatas + training_metadatas

In [24]:
# Sequential string ids 'id1' .. 'idN', one per file across both folders.
total_records = len(class_folder) + len(training_folder)
ids = [f'id{i}' for i in range(1, total_records + 1)]

In [25]:
# Class vectors first, then training vectors. The list is rebuilt instead of
# extended in place, so re-running this cell cannot append the same vectors
# twice and misalign them with the ids and documents.
embeddings = [*class_vectors_v2, *trainig_vectors_v1]

In [26]:
# The four lists are paired by position, so check that they line up before
# anything is written to the persistent collection.
record_counts = {
    'documents': len(documents),
    'metadatas': len(metadatas),
    'ids': len(ids),
    'embeddings': len(embeddings),
}
if len(set(record_counts.values())) != 1:
    raise ValueError(f'Parallel upsert inputs differ in length: {record_counts}')

collection.upsert(
    documents=documents,
    metadatas=metadatas,
    ids=ids,
    embeddings=embeddings,
)

In [28]:
# Smoke-test retrieval: fetch the 7 nearest entries for a sample request.
# NOTE(review): 'codefor' looks like a typo for 'code for'; it is left unchanged
# because the text is embedded as-is and the recorded output belongs to it.
query_text = 'Write a codefor insurance with the document 207'
result = collection.query(query_texts=[query_text], n_results=7)
result

{'ids': [['id390', 'id331', 'id260', 'id991', 'id215', 'id288', 'id456']],
 'embeddings': None,
 'documents': [['{"id": 651, "name": "721LetterSubject", "messages": [{"role": "user", "content": "Пояснення:\\nnan"}, {"role": "assistant", "content": "C# code:\\n__Result = \\"Договір страхування \\\\\\"Калібр\\\\\\" номер \\" + __Product.PolisNumber;\\n"}]}\n',
   '{"id": 594, "name": "716LetterSubject", "messages": [{"role": "user", "content": "Пояснення:\\nnan"}, {"role": "assistant", "content": "C# code:\\n__Result = \\"Договір страхування \\\\\\"Медичне страхування фізичних осіб\\\\\\" номер \\" + __Product.PolisNumber;\\n"}]}\n',
   '{"id": 722, "name": "702MiniCasko", "messages": [{"role": "user", "content": "Пояснення:\\nnan"}, {"role": "assistant", "content": "C# code:\\n__Result = \\"Договір страхування МІНІ КАСКО номер \\" + __Product.PolisNumber;\\n"}]}\n',
   '{"id": 653, "name": "SMS721", "messages": [{"role": "user", "content": "Пояснення:\\nnan"}, {"role": "assistant", "con