#### RAG system architecture
This file outlines our basic system structure.
* Initializing a vector database and storing our chunks
* Creating a simple retriever 
* Adding a reranker
* Initializing our LLM
* Testing the system on the expert-dataset
* Testing the baseline (without context) on the expert-dataset

In [None]:
from typing import Iterable, List
from  langchain.schema import Document
import json
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

#https://stackoverflow.com/questions/3173320/text-progress-bar-in-terminal-with-block-characters
def printProgressBar(iteration, total, prefix='', suffix='', decimals=1, length=100, fill='█', printEnd="\r"):
    """
    Call in a loop to draw a single-line terminal progress bar.

    An empty job (total == 0) is drawn as a full bar at 100% instead of
    raising ZeroDivisionError.

    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : prefix string (Str)
        suffix      - Optional  : suffix string (Str)
        decimals    - Optional  : positive number of decimals in percent complete (Int)
        length      - Optional  : character length of bar (Int)
        fill        - Optional  : bar fill character (Str)
        printEnd    - Optional  : end character passed to print() (Str)
    """
    if total == 0:
        # Nothing to do counts as done; this also avoids dividing by zero below.
        fraction = 1.0
        filledLength = length
    else:
        fraction = iteration / float(total)
        filledLength = int(length * iteration // total)

    percent = f"{100 * fraction:.{decimals}f}"
    bar = fill * filledLength + '-' * (length - filledLength)

    # The leading carriage return moves back to column 0, so successive calls
    # overwrite the same terminal line.
    print(f'\r{prefix} |{bar}| {percent}% {suffix}', end=printEnd)

    # Move to a fresh line once the final iteration has been drawn.
    if iteration == total:
        print()

#https://github.com/langchain-ai/langchain/issues/3016
def load_docs_from_jsonl(file_path)->Iterable[Document]:
    """Read a JSON Lines file and build one Document per non-empty line.

    Args:
        file_path: path of the .jsonl file; each line holds one JSON object
            whose keys are passed to ``Document(**data)``.

    Returns:
        A list with the Documents in file order.

    Raises:
        json.JSONDecodeError: if a non-empty line is not valid JSON.
    """
    documents = []
    # Decode explicitly as UTF-8: the platform default encoding differs between
    # systems and would garble non-ASCII text where it is not UTF-8.
    with open(file_path, 'r', encoding='utf-8') as jsonl_file:
        for line in jsonl_file:
            if not line.strip():
                # Tolerate blank lines, e.g. an extra newline at the end of the file.
                continue
            documents.append(Document(**json.loads(line)))
    return documents

# Where the FAISS index is written to, and where the local embedding model is read from.
dir__vectorstore =  "./vectorstore"
dir__embeddings =  "../3_embeddings/models/RAG-multilingual-e5-small"

# Load the pre-computed chunks (one Document per JSONL line).
_chunks = load_docs_from_jsonl("../1_preproc/chunks.jsonl")

# Embedding model used for the chunks; the query side must use the same one.
embeddings = HuggingFaceEmbeddings(model_name = dir__embeddings)

# Build the FAISS vector store from the chunks using that embedding model.
db = FAISS.from_documents(_chunks, embeddings)

# Save the store under dir__vectorstore so it can be loaded again without rebuilding.
db.save_local(dir__vectorstore)

  from tqdm.autonotebook import tqdm, trange





In [2]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Re-create the embedding model from the same local path used when the index was built.
embeddings = HuggingFaceEmbeddings(model_name = dir__embeddings)
# Load the FAISS store saved under dir__vectorstore.
# NOTE(review): allow_dangerous_deserialization=True opts in to loading pickled data.
# Per the LangChain docs this is only safe for files you created yourself; confirm the
# vectorstore directory never comes from an untrusted source.
db = FAISS.load_local(dir__vectorstore, embeddings, allow_dangerous_deserialization= True)

def retrieveDocs(query: str, k: int, type="similarity") -> List[Document]:
    """Fetch documents for a query from the module-level vector store ``db``.

    Args:
        query: the search text.
        k: forwarded to the retriever as ``search_kwargs={"k": k}``.
        type: forwarded to the retriever as ``search_type``.

    Returns:
        Whatever the retriever's ``invoke`` returns for ``query``.
    """
    search_options = {"k": k}
    return db.as_retriever(search_type=type, search_kwargs=search_options).invoke(query)

In [3]:
import re
def processText(text):
    """Clean a chunk's text: strip image markup and figure captions, tidy whitespace.

    The substitutions run in the listed order; the result has no leading or
    trailing whitespace.
    """
    substitutions = (
        # An embedded icon ![Icon](source) is reduced to the placeholder "[Icon]".
        (r"!\[(?:Icon)]\([^\)]+\)", "[Icon]"),
        # Any remaining ![Image](source) / ![Icon](source) markup is dropped.
        (r"!\[(?:Image|Icon)]\([^\)]+\)", ""),
        # Figure captions of the form "Abbildung <number>/<number>" are dropped.
        (r"Abbildung\s\d+/\d+", ""),
        # Every run of whitespace collapses into a single space.
        (r"\s+", " "),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

def processDocument(data):
    """Format one hit as a text snippet: headers, cleaned text, source and score.

    Args:
        data: dict with the keys 'text' and 'meta' and, optionally,
            'cross-encoder_score'. 'meta' may carry "Header 1" .. "Header 4"
            and 'source-document'.

    Returns:
        The present headers joined by blank lines, then the cleaned text, a
        "Quelle: ..." line and a "Relevance Score" line. A hit without a score
        is reported as "n/a" instead of raising TypeError.
    """
    # A missing or empty 'meta' is treated as "no headers, unknown source".
    meta = data.get('meta') or {}

    # Collect the headers "Header 1" to "Header 4" that are present, in order.
    header_keys = (f"Header {i}" for i in range(1, 5))
    headers = [meta[key] for key in header_keys if key in meta]

    text = processText(data.get('text', ''))
    source_document = meta.get('source-document', 'Unbekannte Quelle')

    # The score is shown as a percentage; it only exists on scored hits.
    score = data.get("cross-encoder_score")
    relevance = "n/a" if score is None else f"{round(score*100,2)}%"

    return (
        "\n\n".join(headers)
        + "\n\n" + text
        + "\n\nQuelle: " + source_document
        + f"\n | Relevance Score: {relevance}\n\n"
    )

In [4]:
import json
from sentence_transformers import CrossEncoder
from torch.nn import Sigmoid
_CROSS_ENCODER_NAME = "svalabs/cross-electra-ms-marco-german-uncased"
_cross_encoder_model = None


def _get_cross_encoder():
    """Return the shared cross-encoder, loading it on first use only."""
    global _cross_encoder_model
    if _cross_encoder_model is None:
        _cross_encoder_model = CrossEncoder(_CROSS_ENCODER_NAME, default_activation_function=Sigmoid())
    return _cross_encoder_model


def rerank(query, docs):
    """Score each document against the query and return them best-first.

    Args:
        query: the question text.
        docs: iterable of objects exposing ``page_content`` and ``metadata``.

    Returns:
        A list of dicts with the keys 'text', 'meta' and 'cross-encoder_score',
        sorted by descending score. Empty input yields an empty list without
        loading the model.
    """
    hits = [{'text': doc.page_content, 'meta': doc.metadata} for doc in docs]
    if not hits:
        return []

    # The model is cached: loading it on every call made each question in an
    # evaluation loop pay the full model start-up cost again.
    sentence_pairs = [[query, hit["text"]] for hit in hits]
    similarity_scores = _get_cross_encoder().predict(sentence_pairs)

    for hit, score in zip(hits, similarity_scores):
        hit["cross-encoder_score"] = score

    return sorted(hits, key=lambda hit: hit["cross-encoder_score"], reverse=True)

In [5]:
from llama_cpp import Llama

class Nemo:
    """Wrapper around a local Mistral-Nemo-Instruct GGUF model loaded through llama_cpp."""

    # Path of the quantized GGUF weights, relative to the working directory.
    model_path = "./models/Mistral-Nemo-Instruct-2407-Q4_K_M.gguf"

    def __init__(self, use_gpu:bool):
        """Load the model.

        Args:
            use_gpu: when True, n_gpu_layers is set to -1, otherwise to 0
                (no GPU offloading).
        """
        self.use_gpu = use_gpu

        n_gpu_layers = -1 if use_gpu else 0

        self.model = Llama(
            model_path=self.model_path,  # path to GGUF file
            n_ctx=8000,  # The max sequence length to use - note that longer sequence lengths require much more resources
            n_gpu_layers=n_gpu_layers, # The number of layers to offload to GPU, if you have GPU acceleration available. Set to 0 if no GPU acceleration is available on your system.
            )

    def query(self, query: str, ctx: str, debug: bool):
        """Generate an answer to ``query``, optionally grounded in ``ctx``.

        Args:
            query: the user's question.
            ctx: context text to answer from. ``None`` or an empty string means
                "no context": the prompt then holds only the question instead
                of asking the model to use a context that is not there.
            debug: when True, print the prompt and stream the answer to stdout.

        Returns:
            The generated text, concatenated from the streamed chunks.
        """
        if debug: print("Starting Generation...\n")

        if ctx:
            user_prompt = f"""
Nutze den Kontext um die Frage zu beantworten:
{ctx}

Frage:
{query}
"""
        else:
            # Without this branch user_prompt was unbound for ctx=None and the
            # method failed with UnboundLocalError.
            user_prompt = f"""
Frage:
{query}
"""

        if debug: print(f"User prompt: {user_prompt}\n")

        # Prompt creation
        system_message = """Dies ist eine Unterhaltung zwischen einem intelligenten, hilfsbereitem KI-Assistenten und einem Nutzer.
                                Der Assistent gibt ausführliche, hilfreiche und ehrliche Antworten."""

        prompt = f"""[INST]{system_message} {user_prompt} [/INST]"""

        # Streamed generation: at most 500 tokens, cut off at an instruction marker.
        out = self.model(prompt, max_tokens= 500, stop=["[INST]","[/INST]"], echo=True, stream=True)

        print("\n")
        if debug: print("Answer:")

        # Collect the pieces and join once rather than growing a string with +=.
        pieces = []
        for chunk in out:
            piece = chunk["choices"][0]["text"]
            if debug: print(piece, end='', flush=True)
            pieces.append(piece)

        return "".join(pieces)

In [6]:
# Load the LLM without GPU offloading (use_gpu=False).
llm = Nemo(False)

# Read the expert question/answer set.
with open("../2_datasets/expert-dataset.json", 'r', encoding='utf-8') as file:
    data = json.load(file)

# Keep only the fields the evaluation needs. A comprehension is used so that no
# loop variable named `id` is left behind to shadow the builtin in later cells.
dataset_QA = [
    {
        "id": item["Id"],
        "question": item["question"],
        "answer": item["answer"],
        "relevance": item["relevance"],
        "complexity": item["complexity"],
    }
    for item in data
]

llama_model_loader: loaded meta data with 35 key-value pairs and 363 tensors from ./models/Mistral-Nemo-Instruct-2407-Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Mistral Nemo Instruct 2407
llama_model_loader: - kv   3:                            general.version str              = 2407
llama_model_loader: - kv   4:                           general.finetune str              = Instruct
llama_model_loader: - kv   5:                           general.basename str              = Mistral-Nemo
llama_model_loader: - kv   6:                         general.size_label str              = 12B
llama_model_loader: - kv   7:  

In [7]:
# Model outputs and gold answers, index-aligned.
predictions = []
references = []

# One record per question: inputs, the context that was used, and the model output.
js = []

for q_a in dataset_QA:
    question = q_a["question"]
    answer = q_a["answer"]

    # Retrieve 10 candidates, rerank them and keep the 3 best as context.
    documents = retrieveDocs(question, 10)
    reranked = rerank(question, documents)
    cut = reranked[:3]
    context = "".join([processDocument(el) for el in cut])
    generation = llm.query(question, context, False)

    predictions.append(generation)
    references.append(answer)

    obj = {
        "id": q_a["id"],
        "prediction": generation,
        "reference": answer,
        "complexity": q_a["complexity"],
        "relevance": q_a["relevance"],
        "question": question,
        "contexts": context,
    }
    # Keep the record so the run can be inspected or saved afterwards.
    js.append(obj)
    print(obj)





In [None]:
# Model outputs and gold answers, index-aligned.
predictions = []
references = []

# One record per question: inputs and the model output.
js = []

# The baseline run gives the model no retrieved context.
baseline_context = ""

for q_a in dataset_QA:
    question = q_a["question"]
    answer = q_a["answer"]

    generation = llm.query(question, baseline_context, False)

    predictions.append(generation)
    references.append(answer)

    obj = {
        "id": q_a["id"],
        "prediction": generation,
        "reference": answer,
        "complexity": q_a["complexity"],
        "relevance": q_a["relevance"],
        "question": question,
        # Record the context that was really used (none), not a `context`
        # variable left over from another cell.
        "contexts": baseline_context,
    }
    # Keep the record so the run can be inspected or saved afterwards.
    js.append(obj)
    print(obj)