In [None]:
from sentence_transformers import SentenceTransformer

# Hugging Face model id of the sentence-embedding model.
embedding_model_name__ = "intfloat/multilingual-e5-small"

# Instantiate the embedding model once when this cell runs; embed() below reuses this instance.
model__ = SentenceTransformer(embedding_model_name__)

def embed(texts, device = 'cpu'):
    """Encode texts into normalized embedding vectors with the shared model.

    :param texts: The text(s) to embed; forwarded as ``sentences`` to ``model__.encode``.
    :param device: Device on which the encoding is computed (defaults to ``'cpu'``).
    :return: The embeddings produced by ``model__.encode`` with ``normalize_embeddings=True``.
    """
    return model__.encode(
        sentences=texts,
        normalize_embeddings=True,
        device=device,
    )



In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

import chromadb
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import copy

# Chroma client persisted under the local "vectordb" directory; the "gaia" collection
# is fetched or created with its "hnsw:space" metadata set to "cosine".
CHROMA_CLIENT = chromadb.PersistentClient(path="vectordb")
COLLECTION = CHROMA_CLIENT.get_or_create_collection(
    name="gaia",
    metadata={"hnsw:space": "cosine"},
)

# Character-length based splitter used to chunk texts before they are embedded.
# NOTE(review): chunk_size=1014 may be a typo for 1024 — confirm.
TEXT_SPLITTER = RecursiveCharacterTextSplitter(
    length_function=len,
    is_separator_regex=False,
    chunk_size=1014,
    chunk_overlap=256,
)

# Reranker model, loaded when this cell runs.
RERANKER = AutoModelForSequenceClassification.from_pretrained(
    "jinaai/jina-reranker-v2-base-multilingual",
    trust_remote_code=True,
    torch_dtype="auto",
)


# Add a list of texts and their associated metadata to a vector database.
# Each text is split into chunks with TEXT_SPLITTER; every chunk is embedded and stored
# with a copy of its text's metadata plus a 'sentence_window_document' entry that holds
# the chunk joined (by '\n') with its previous and next chunk, when those exist.
# Texts that produce no chunks are skipped.
#   Parameters:
#     texts (list of str): A list of texts to be added to the vector database.
#     metadatas (list of dict): A list of metadata dictionaries corresponding to each text. Each dictionary should
#                               include at least a 'file_id' key.
#     device (str): The device used for embedding computation (e.g., "cpu" or "cuda").
#   Raises:
#     ValueError: If fewer metadata dictionaries than texts are supplied.
#     KeyError: If a metadata dictionary has no 'file_id' key.
def add_texts_to_vectordb(texts, metadatas, device):

    # Fail before writing anything instead of part-way through the loop.
    if len(metadatas) < len(texts):
        raise ValueError(
            f"Expected one metadata dict per text, got {len(texts)} texts and {len(metadatas)} metadatas"
        )

    for text, metadata in zip(texts, metadatas):

        # Split text into chunks
        splited_texts = TEXT_SPLITTER.split_text(text)
        file_id = metadata['file_id']

        if not splited_texts:
            # Nothing to embed or store for this text.
            print(f"\n===== SKIP TEXT WITHOUT CHUNKS (file_id={file_id}) ===============")
            continue

        ids = [f"{file_id}_{j}" for j in range(len(splited_texts))]

        # Get embeddings from chunks
        embeddings = embed(texts=splited_texts, device=device)

        # Apply sentence window to save bigger context in a chunk.
        # The slice clamps at both ends, so a single-chunk text gets just that chunk
        # and the first/last chunks get themselves plus their one neighbour.
        file_metadatas = []
        for idx in range(len(splited_texts)):
            chunk_metadata = copy.deepcopy(metadata)
            window = splited_texts[max(idx - 1, 0):idx + 2]
            chunk_metadata['sentence_window_document'] = '\n'.join(window)
            file_metadatas.append(chunk_metadata)

        # Add data to vector DB
        COLLECTION.add(
            ids = ids,
            documents = splited_texts,
            embeddings = embeddings,
            metadatas = file_metadatas
        )

        print(f"\n===== ADD {len(splited_texts)} DOCUMENTS TO VECTORDB ===============")
        print("Length original text:", len(text))
        print("Metadatas:", file_metadatas[0])
        

# Repository the reranker tokenizer is loaded from (same id as the module-level RERANKER).
_RERANKER_MODEL_NAME = 'jinaai/jina-reranker-v2-base-multilingual'
_RERANKER_TOKENIZER = None


# Return the reranker tokenizer, loading it on first use so repeated queries do not
# reload it from disk every time.
def _get_reranker_tokenizer():
    global _RERANKER_TOKENIZER
    if _RERANKER_TOKENIZER is None:
        _RERANKER_TOKENIZER = AutoTokenizer.from_pretrained(_RERANKER_MODEL_NAME, trust_remote_code=True)
    return _RERANKER_TOKENIZER


# Query a vector database to retrieve and rank documents based on similarity to the given query.
# Candidates are fetched from COLLECTION by embedding similarity, scored against the query with the
# module-level RERANKER, and the best ones are returned as their sentence-window documents.
#   Parameters:
#     query (str): The text query to search for in the vector database.
#     device (str): The device on which the embedding and reranking should be performed.
#     n_vectordb (int, optional): The number of top similar documents to retrieve from the vector database (default is 10).
#     n_rerank (int, optional): The number of top-ranked documents to return after reranking (default is 3).
#   Returns:
#     dict: A dictionary containing:
#         - 'documents' (list of str): The sentence-window documents of the top-ranked chunks
#                                      (the chunk itself when no window was stored).
#         - 'scores' (list of float): The reranker scores of the top-ranked chunks, highest first.
#         - 'file_names' (list): The 'file_name' metadata of the top-ranked chunks (None when absent).
def query_vectordb(query, device, n_vectordb = 10, n_rerank = 3):

    # Imported here because torch is not imported at the top of this cell.
    import torch

    # Get embedding from user's query
    query_embedding = embed(texts=[query], device=device)

    # Query in vector DB
    vectordb_results = COLLECTION.query(
        query_embeddings = query_embedding,
        n_results = n_vectordb
    )

    result_metadatas = vectordb_results['metadatas'][0]
    sim_sentences = vectordb_results['documents'][0]
    sim_scores = vectordb_results['distances'][0]

    # Entries stored without the expected metadata fall back to the raw chunk / None
    # instead of raising.
    sentence_window_documents = [
        (metadata or {}).get('sentence_window_document', sentence)
        for metadata, sentence in zip(result_metadatas, sim_sentences)
    ]
    file_names = [(metadata or {}).get('file_name') for metadata in result_metadatas]

    print("\n===== VECTORDB QUERY RESULTS ==============")
    for sentence, score in zip(sim_sentences, sim_scores):
        print(f"\n=== Documents ({score}):", sentence)

    # Reuse the reranker loaded at module level rather than reloading it on every call.
    tokenizer = _get_reranker_tokenizer()
    RERANKER.to(device)
    RERANKER.eval()

    # Compute rerank scores, one query-document pair at a time
    rerank_scores = []
    with torch.no_grad():
        for sentence in sim_sentences:
            tokenized_input = tokenizer(query, sentence, return_tensors="pt", padding=True, truncation=True)
            outputs = RERANKER(**tokenized_input.to(device))
            rerank_scores.append(outputs.logits.squeeze().item())

    # Sort by rerank scores and keep the best n_rerank (a negative value keeps none)
    sorted_indices = sorted(range(len(rerank_scores)), key=lambda i: rerank_scores[i], reverse=True)
    top_indices = sorted_indices[:max(n_rerank, 0)]

    final_results = {
        'documents': [sentence_window_documents[i] for i in top_indices],
        'scores': [rerank_scores[i] for i in top_indices],
        "file_names": [file_names[i] for i in top_indices]
    }

    return final_results

# Fetch everything stored in the collection, asking for documents and metadatas.
#   Returns:
#     The result of COLLECTION.get for the whole collection.
def get_all_from_vectordb():
    return COLLECTION.get(include=['documents', 'metadatas'])

# Delete every stored entry whose metadata 'file_id' equals delete_id.
#   Parameters:
#     delete_id: The value compared against each entry's 'file_id' metadata.
def delete_from_vectordb(delete_id):
    # Only ids and metadatas are needed to find the matching entries.
    all_data = COLLECTION.get(include=['metadatas'])

    # Entries without metadata (or without a 'file_id') never match.
    ids_to_delete = [
        doc_id
        for doc_id, metadata in zip(all_data['ids'], all_data['metadatas'] or [])
        if metadata and metadata.get('file_id') == delete_id
    ]

    # Skip the call when nothing matched.
    # NOTE(review): Chroma versions differ in how they treat an empty id list — confirm.
    if ids_to_delete:
        COLLECTION.delete(ids = ids_to_delete)

    print("\n===== DELETE FROM VECTORDB:")
    print("Number of deleted docs:", len(ids_to_delete))
    

In [None]:
import uuid
from werkzeug.datastructures import FileStorage

texts = []
metadatas = []

# Read the source file as UTF-8 text; the FileStorage wrapper provides read() and the filename.
with open("./dataset/rag_data.csv", "rb") as f:
    file = FileStorage(f)
    file_content = file.read()
    file_text = file_content.decode("utf-8")

    texts.append(file_text)
    metadatas.append({'file_name': file.filename, 'file_id': str(uuid.uuid4())})

# Print to verify: show sizes and a short preview rather than dumping whole files into the output
print("Text lengths:", [len(t) for t in texts])
print("Text previews:", [t[:200] for t in texts])
print(metadatas)

In [None]:
# Chunk, embed and store the texts loaded above; embeddings are computed on the 'cuda' device.
add_texts_to_vectordb(texts, metadatas, device='cuda')

In [None]:
# Try a retrieval: fetch 5 candidates from the vector DB and keep the top 3 after reranking.
query_vectordb("What causes Alstrom syndrome?", device='cuda', n_vectordb=5, n_rerank=3)

In [16]:
# Prompt template for keyword extraction; the three {} slots are filled positionally with
# the instruction, the user text, and the response (left empty so the model generates it).
rag_prompt = """
    Make sure you extract all the relevant keywords from the user's query.
    Return output only as keywords separated by commas, without any explanations or additional text.

    ### Instruction:
    {}
    
    ### Input:
    {}
    
    ### Response:
    {}
"""

def extract_keyword(model, tokenizer, text):
    """
    Ask the language model to extract keywords from a text.

    :param model: Model exposing ``generate`` (and, optionally, ``device`` and ``config``).
    :param tokenizer: Tokenizer used to encode the prompt and decode the generated tokens.
    :param text: The user text to extract keywords from.
    :return: The generated text only (the prompt is not included), stripped of
             surrounding whitespace.
    """
    prompt = rag_prompt.format(
        "Extract all the relevant keywords to store in vector database.",
        text,
        "", # output
    )

    # Put the inputs on the model's own device instead of assuming CUDA is available.
    inputs = tokenizer([prompt], return_tensors="pt").to(getattr(model, "device", "cuda"))
    outputs = model.generate(**inputs, max_new_tokens=128)

    generated = outputs[0]
    # Decoder-only models return the prompt tokens followed by the new tokens; drop the
    # prompt so the caller gets just the keywords. Encoder-decoder outputs are left as is.
    model_config = getattr(model, "config", None)
    if not getattr(model_config, "is_encoder_decoder", False):
        prompt_length = inputs["input_ids"].shape[1]
        generated = generated[prompt_length:]

    return tokenizer.decode(generated, skip_special_tokens=True).strip()

In [6]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

def rerank(device, query, candidates):
    """
    Perform reranking of candidates based on cosine similarity with the query.

    :param device: Device on which the embeddings are computed.
    :param query: The query sentence.
    :param candidates: List of sentences to rerank.
    :return: List of candidates reranked by similarity, most similar first
             (an empty list when there are no candidates).
    """
    # Nothing to embed or rank.
    if len(candidates) == 0:
        return []

    # Create embeddings for the query and candidates
    query_embedding = embed([query], device)
    candidate_embeddings = embed(candidates, device)

    # Calculate cosine similarity between the query and every candidate
    cosine_scores = util.pytorch_cos_sim(query_embedding, candidate_embeddings)[0]

    # Sort indices by descending similarity scores; .cpu() keeps the conversion to
    # numpy working when the score tensor is not on the CPU.
    ranked_indices = np.argsort(cosine_scores.cpu().numpy())[::-1]

    # Return the candidates sorted by similarity
    return [candidates[idx] for idx in ranked_indices]

In [None]:
from unsloth import FastLanguageModel
# Loading options passed to FastLanguageModel.from_pretrained below.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Load the model and tokenizer from the local "../lora_model" directory.
# `model` and `tokenizer` are notebook globals used by later cells.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "../lora_model",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
# Prepare the loaded model for generation (presumably Unsloth's inference mode — confirm in its docs).
FastLanguageModel.for_inference(model)

In [8]:
# Prompt template for the final answer; the three {} slots are filled positionally with
# the instruction, the input (source documents), and the response (left empty for generation).
final_prompt = """
    You are Gaia - an AI assistant and answer or reply a whole conversation with your boss. 
    Your answer will include the answer to the user's question, the title of the document, and an exact citation of the text from that document,
    or simply, reply your boss with the most relevant information from the document.
    ### Instruction:
    {}
    
    ### Input:
    {}
    
    ### Response:
    {}
"""

In [None]:
# End-to-end run: extract keywords from the question, retrieve and rerank supporting
# chunks with them, then build the tokenized final prompt.
inp = "What causes Alstrom syndrome?"
keywords = extract_keyword(model, tokenizer, inp)
print(keywords)
print('This is extremely slow')
# The extracted keywords (not the raw question) are used as the retrieval query;
# n_vectordb and n_rerank keep their defaults (10 and 3).
vectordb_results = query_vectordb(query=keywords, device="cuda")

sim_sentences = vectordb_results['documents']
sim_scores = vectordb_results['scores']
source_file_names = vectordb_results['file_names']

print("\n===== EXTRACTED SIM SENTENCES ============")
for sim_sentence, score in zip(sim_sentences, sim_scores):
    print(f"\n== Sim sentences ({score}) == :", sim_sentence)

# One "- Document <file name>:<text>" entry per retrieved document, separated by blank lines.
source_documents = '\n\n'.join([f"- Document {f}:" + s for f, s in zip(source_file_names, sim_sentences)])

# Fill the final prompt with the original question and the retrieved documents,
# tokenize it and move the tensors to the GPU.
inputs = tokenizer(
            [
                final_prompt.format(
                    "Answer my question or chat with me: " + inp,  # instruction
                    "The whole source documents which used in this conversation: " + source_documents,  # input
                    "",  # output - leave this blank for generation!
                )
            ],
            return_tensors="pt",
        ).to("cuda")