In [1]:
import pandas as pd
import numpy as np
import json
import re
import os
from paths import DATA_FOLDER, PROCESSED_DATA_FOLDER, MODELS_FOLDER, ML_FOLDER

from chromadb.errors import UniqueConstraintError
from importlib_metadata import metadata
from torch.cuda import device
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer

import torch
from sklearn.preprocessing import normalize

import chromadb
import joblib

In [None]:
# DATA_FOLDER, PROCESSED_DATA_FOLDER, MODELS_FOLDER and ML_FOLDER are already bound by
# the `from paths import ...` line in the imports cell; re-assigning each name to itself
# was a no-op, so those statements are omitted.

# Model identifier; in this notebook it is only referenced by commented-out lines that
# load the model by id instead of from the local ML_FOLDER copy.
MODEL_ID = "BAAI/bge-m3"

In [None]:
# Environment report: PyTorch build, CUDA availability/version and visible device count.
# Emitted through a single print call, one item per line.
print(
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    f"CUDA version: {torch.version.cuda}",
    f"Device count: {torch.cuda.device_count()}",
    sep="\n",
)
# The device name is only queried when CUDA is actually usable.
if torch.cuda.is_available():
    print(f"Device name: {torch.cuda.get_device_name(0)}")

## We recommend using the following pipeline: hybrid retrieval + re-ranking.

- [ ] Hybrid retrieval leverages the strengths of various methods, offering higher accuracy and stronger generalization capabilities. A classic example is using both embedding retrieval and the BM25 algorithm. Now you can try BGE-M3, which supports both embedding and sparse retrieval. This allows you to obtain token weights (similar to BM25) at no additional cost when generating dense embeddings. To use hybrid retrieval, you can refer to Vespa and Milvus.

- [ ] As cross-encoder models, re-rankers demonstrate higher accuracy than bi-encoder embedding models. Applying a re-ranking model (e.g., bge-reranker, bge-reranker-v2) after retrieval can further filter the selected text.

In [None]:
# model = SentenceTransformer(MODEL_ID)
# Load the model from the local 'BGE-m3' folder under ML_FOLDER; the commented-out line
# above is the alternative that loads it by MODEL_ID.
# NOTE(review): SentenceTransformerEmbeddingFunction builds its own SentenceTransformer
# from the same path, and `model` is only referenced by a commented-out save call in this
# notebook - confirm whether this second copy of the weights is still needed.
model = SentenceTransformer(f'{ML_FOLDER}/BGE-m3')

In [None]:
class SentenceTransformerEmbeddingFunction:
    """Callable wrapper that turns text into normalized SentenceTransformer embeddings.

    An instance is handed to Chroma as ``embedding_function`` and is also called
    directly on raw file contents elsewhere in this notebook.
    """

    def __init__(self, model_name_or_path, device=None):
        """Load the underlying model.

        Args:
            model_name_or_path: Value forwarded to ``SentenceTransformer``; also what
                ``name()`` returns.
            device: Device string passed to ``encode`` on every call. ``None`` (the
                default) selects ``'cuda:0'`` when CUDA is available and ``'cpu'``
                otherwise.
        """
        self.model_name = model_name_or_path
        self.device = self._resolve_device(device)
        self.model = SentenceTransformer(model_name_or_path)

    @staticmethod
    def _resolve_device(device):
        """Return ``device`` unchanged, or choose a default when it is ``None``."""
        if device is not None:
            return device
        # 'cuda:0' used to be hard-coded in __call__, which fails on CPU-only machines;
        # fall back to the CPU when no CUDA device is usable.
        return 'cuda:0' if torch.cuda.is_available() else 'cpu'

    def __call__(self, input):
        """Embed ``input`` and return the result as plain Python lists.

        Args:
            input: Text or texts, forwarded unchanged to ``SentenceTransformer.encode``.

        Returns:
            ``encode(...).tolist()``; embeddings are requested with
            ``normalize_embeddings=True``.
        """
        embeddings = self.model.encode(input, normalize_embeddings=True, device=self.device)
        return embeddings.tolist()

    def name(self):
        """Return the model name or path this function was created with."""
        return self.model_name

In [None]:
# model.save(f'{ML_FOLDER}/BGE-m3')

In [None]:
# embedding_func = SentenceTransformerEmbeddingFunction(MODEL_ID)
# Build the embedding callable from the local 'BGE-m3' folder under ML_FOLDER. It is used
# both for the per-file embedding loops and as the collection's embedding_function.
embedding_func = SentenceTransformerEmbeddingFunction(f'{ML_FOLDER}/BGE-m3')

In [None]:
# Embed every file in the processed "classes" folder, one result per file, in the
# order os.listdir yields the entries.
class_vectors_v2 = []
for class_file in os.listdir(f'{PROCESSED_DATA_FOLDER}/classes'):
    with open(f'{PROCESSED_DATA_FOLDER}/classes/{class_file}', 'r', encoding='utf-8') as handle:
        class_text = handle.read()
    # The whole file content is passed as a single string, not as a list of texts.
    class_vectors_v2.append(embedding_func(class_text))

In [None]:
# Tabular view of the class embeddings for inspection.
# Presumably one row per class file and one column per embedding dimension - verify
# against the shape returned by embedding_func.
class_vectors_v3 = pd.DataFrame(class_vectors_v2)
class_vectors_v3

In [None]:
# Embed every file in the processed "training" folder, one result per file, in the
# order os.listdir yields the entries.
trainig_vectors_v1 = []
for training_file in os.listdir(f'{PROCESSED_DATA_FOLDER}/training'):
    with open(f'{PROCESSED_DATA_FOLDER}/training/{training_file}', 'r', encoding='utf-8') as handle:
        training_text = handle.read()
    # The whole file content is passed as a single string, not as a list of texts.
    trainig_vectors_v1.append(embedding_func(training_text))

In [None]:
# Persistent Chroma client whose storage path is the 'chromadb' folder under DATA_FOLDER.
chrome_persistent_client = chromadb.PersistentClient(path=f'{DATA_FOLDER}/chromadb')

In [None]:
# chrome_persistent_client.list_collections()

In [None]:
# chrome_persistent_client.delete_collection('my_rag_v1')

In [None]:
# Get the 'my_rag_v1' collection, creating it if it does not exist yet, and attach
# embedding_func as its embedding function.
collection = chrome_persistent_client.get_or_create_collection(name='my_rag_v1', embedding_function=embedding_func)

In [None]:
# Containers for the records sent to the collection; later cells populate them.
# All four start as lists (metadatas was previously initialised as a tuple).
documents = []
metadatas = []
ids = []
embeddings = []

# File listings for the two processed sub-folders.
# NOTE(review): os.listdir returns entries in arbitrary order, and the embedding loops
# call os.listdir separately. Documents, metadatas, ids and embeddings are paired purely
# by position, so both listings must come back in the same order - confirm.
class_folder = os.listdir(f'{PROCESSED_DATA_FOLDER}/classes')
training_folder = os.listdir(f'{PROCESSED_DATA_FOLDER}/training')

In [None]:
# Start from an empty list so that re-running this cell does not append every document
# a second time, which would break the positional pairing with metadatas/ids/embeddings.
documents = []

# Read the class files first, then the training files; the order matters because the
# other record lists are assembled in the same classes-then-training order.
for folder_name, file_names in (('classes', class_folder), ('training', training_folder)):
    for file in file_names:
        with open(f'{PROCESSED_DATA_FOLDER}/{folder_name}/{file}', 'r', encoding='utf-8') as f:
            documents.append(f.read())

In [None]:
# One metadata dict per file: every class file is tagged 'classes', every training file
# 'code + explanation'. Class entries come first; each entry is its own dict object.
metadatas = [
    {'category': category}
    for category, listing in (
        ('classes', class_folder),
        ('code + explanation', training_folder),
    )
    for _ in listing
]

In [None]:
# Positional ids 'id1', 'id2', ... - one per listed file, class files counted first.
ids = [f'id{position}' for position, _ in enumerate(class_folder + training_folder, start=1)]

In [None]:
# Class vectors first, then training vectors - the same order used for documents and
# metadatas. Built by assignment rather than extend(), so re-running this cell cannot
# grow the list and break length parity with ids.
embeddings = [*class_vectors_v2, *trainig_vectors_v1]

In [None]:
# Write all records to the collection (upsert: insert-or-update keyed by id).
# The four lists are matched by position, so they must have the same length and ordering.
# Embeddings are supplied explicitly from the vectors computed earlier in the notebook.
collection.upsert(
    documents=documents,
    metadatas=metadatas,
    ids=ids,
    embeddings=embeddings,
)

In [None]:
# Smoke-test retrieval: fetch the 7 closest records for a sample request.
# The query text previously read "codefor" (missing space), which is what got embedded.
# Presumably the text is embedded via the collection's embedding_function - confirm.
result = collection.query(
    query_texts=['Write a code for insurance with the document 207'],
    n_results=7,
)
result