See tutorial on https://huggingface.co/blog/ngxson/make-your-own-rag

# Loading the dataset - PDF files from folder

In [None]:
from pathlib import Path
import pdfplumber   ### better result
import re


# Import Folder containing PDF files (Windows path)
DATASET_DIR = Path(r"C:\STUDY\MALD\RAG\dataset")

# Parallel lists filled by the reading loop: documents[i] holds the cleaned
# pages of the file whose name is document_names[i].
documents, document_names = [], []

# Sorted so the PDFs are always processed in the same, name-based order.
pdf_files = sorted(DATASET_DIR.glob("*.pdf"))
if len(pdf_files) == 0:
    raise FileNotFoundError(f"No .pdf files found in: {DATASET_DIR}")


#########   PDF CLEANING    #########
# PDF cleanup after reading: removes extraction artefacts (line breaks, column breaks, multiple spaces, footnotes, references, ...)
def clean_pdf_text(text: str) -> str:
    """Clean raw text extracted from a scientific PDF page.

    The function is conservative: it removes structural noise (hyphenation
    at line ends, page headers/footers, footnote markers, the reference
    list, layout whitespace) while preserving semantic content.

    Args:
        text: Raw text as returned by the PDF extractor. May be empty.

    Returns:
        The cleaned text. Lines inside a paragraph are joined with single
        spaces; paragraphs (blank-line separated in the input) stay
        separated by exactly one blank line ("\\n\\n").
    """
    if not text:
        return ""

    # Fix hyphenation across line breaks, e.g. "emo-\n tional" -> "emotional"
    text = re.sub(r"-\n\s*", "", text)

    # Remove page headers / footers (heuristic). Typical patterns: journal
    # names, page numbers. These must run before the footnote-marker removal
    # below because they rely on the "(YYYY)" year still being present.
    header_footer_patterns = (
        r"^\s*\d+\s*$",  # page numbers
        r"^.*\/\s*(Journal|Proceedings|Conference|Review).*?\(\d{4}\).*$",
        r"^.*et al\.\s*/\s*.*\(\d{4}\).*$",
    )
    kept_lines = [
        line
        for line in text.split("\n")
        if not any(re.match(p, line.strip()) for p in header_footer_patterns)
    ]
    text = "\n".join(kept_lines)

    # Remove footnote markers and symbols
    text = re.sub(r"\[\d+\]", "", text)        # [1], [2]
    # Limited to 1-3 digits so that publication years such as "(2020)" are
    # kept; they are semantic content, not footnote markers.
    text = re.sub(r"\(\d{1,3}\)", "", text)    # (1)
    # Only the asterisks are removed; the whitespace after them is left for
    # the normalisation below so that "results* were" does not become
    # "resultswere".
    text = re.sub(r"\*+", "", text)            # *, **

    # Stop processing at References / Bibliography. The heading must stand
    # alone on its line; it may also be the first or the last line.
    references_heading = re.search(
        r"(?:\A|\n)\s*(?:References|Bibliography)[^\S\n]*(?:\n|\Z)",
        text,
        flags=re.IGNORECASE,
    )
    if references_heading:
        text = text[:references_heading.start()]

    # Collapse horizontal whitespace only, so newlines (and therefore
    # paragraph breaks) survive this step.
    text = re.sub(r"[^\S\n]+", " ", text)
    text = re.sub(r" ?\n ?", "\n", text)

    # Any run of blank lines becomes exactly one paragraph break.
    text = re.sub(r"\n{3,}", "\n\n", text)

    # Normalize line breaks inside paragraphs (single newline -> space)
    text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)

    return text.strip()
#########   END PDF CLEANING    #########


#########   READING PDF #########
#### using pdfplumber; faster, cleaner reading for complex text
# Read every PDF page by page; a page contributes only if text could be extracted.
for fp in pdf_files:
    with pdfplumber.open(fp) as pdf:
        pages = [
            {"page": number, "text": clean_pdf_text(raw)}
            for number, raw in (
                # page numbers are 1-based
                (n, pg.extract_text()) for n, pg in enumerate(pdf.pages, start=1)
            )
            if raw
        ]

    # Files without any extractable text are left out entirely.
    if not pages:
        continue
    documents.append(pages)
    document_names.append(fp.name)

print(f"Loaded {len(documents)} PDF documents from {DATASET_DIR}")
print("Documents:")
for name in document_names:
    print(" -", name)
#########   END READING PDF #########


### SET Minimum chunk length to omit noise, bad chunks,...
# Chunks whose stripped text is shorter than this many characters are skipped
# when the chunk datasets are built.
MIN_CHARS = 100

#########   CHUNKING    ########

# 1 ##############################################################################################
### --- BASIC --- Chunking WITH OVERLAP: turn each document into multiple smaller chunks for embeddings ---

def chunk_text_with_pages(pages, chunk_size: int = 500, overlap: int = 150):
    """Split a document into overlapping character windows with page ranges.

    Args:
        pages: list of dicts -> {"page": int, "text": str}. Pages whose text
            is empty or whitespace-only are ignored.
        chunk_size: maximum number of characters per chunk; must be positive.
        overlap: number of characters shared by consecutive chunks; must not
            be negative. If it is >= chunk_size the window still advances by
            one character per step.

    Returns:
        list of (chunk_text, start_page, end_page) tuples. chunk_text is
        stripped; the page range covers only pages that contribute visible
        text to the chunk.

    Raises:
        ValueError: if chunk_size <= 0 or overlap < 0.
    """
    if chunk_size <= 0:
        # A negative size would slice from the end of the text and silently
        # produce meaningless chunks.
        raise ValueError("chunk_size must be a positive integer")
    if overlap < 0:
        # A negative overlap would make the window skip text between chunks.
        raise ValueError("overlap must not be negative")

    separator = "\n\n"

    # Concatenate pages while keeping page boundaries
    segments = []
    page_map = []  # list of (char_start, char_end, page_number); separator excluded
    cursor = 0
    for p in pages:
        text = p["text"].strip()
        if not text:
            continue
        page_map.append((cursor, cursor + len(text), p["page"]))
        segments.append(text)
        cursor += len(text) + len(separator)

    if not segments:
        return []

    full_text = separator.join(segments)
    total = len(full_text)

    chunks = []
    step = max(1, chunk_size - overlap)
    start_char = 0

    while start_char < total:
        end_char = min(total, start_char + chunk_size)
        window = full_text[start_char:end_char]
        chunk_text = window.strip()

        if chunk_text:
            # Offsets of the stripped text inside full_text, so that leading or
            # trailing separator whitespace does not pull in a neighbouring page.
            content_start = start_char + (len(window) - len(window.lstrip()))
            content_end = content_start + len(chunk_text)

            # determine page range for this chunk
            pages_covered = [
                page
                for s, e, page in page_map
                if s < content_end and e > content_start
            ]

            if pages_covered:
                chunks.append(
                    (
                        chunk_text,
                        min(pages_covered),
                        max(pages_covered)
                    )
                )

        if end_char >= total:
            # The window reached the end of the text: any further window would
            # be a pure suffix of this chunk, i.e. a duplicate.
            break

        start_char += step

    return chunks

### END Chunking with overlap

### Preparing dataset for Chunking with overlap 
CHUNK_SIZE = 500
CHUNK_OVERLAP = 150

# One entry per kept chunk: a "[SOURCE: ...]" header line followed by the chunk text.
dataset_basic = []

for doc_pages, doc_name in zip(documents, document_names):
    for text, p_start, p_end in chunk_text_with_pages(
        doc_pages, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP
    ):
        # Only chunks with at least MIN_CHARS characters are kept.
        if len(text.strip()) >= MIN_CHARS:
            # Single page -> "PAGE: n", several pages -> "PAGES: a-b".
            page_info = (
                f"PAGE: {p_start}"
                if p_start == p_end
                else f"PAGES: {p_start}-{p_end}"
            )
            dataset_basic.append(f"[SOURCE: {doc_name} | {page_info}]\n{text}")

print(
    f"Prepared {len(dataset_basic)} basic chunks "
    f"(CHUNK_SIZE={CHUNK_SIZE}, OVERLAP={CHUNK_OVERLAP})"
)
### END of Preparing dataset for Chunking with overlap 
###############################################################################################

'''

# 2 ###########################################################################################
### --- ADVANCED --- Set chunking as paragraph

# -------- Paragraph-aware, token-safe chunking --------

# Split text into paragraphs based on empty lines.
def split_into_paragraphs(text: str):
    text = re.sub(r'\r\n', '\n', text)
    text = re.sub(r'\n{3,}', '\n\n', text)
    return [p.strip() for p in text.split('\n\n') if len(p.strip()) > 50]   ## ">50" remove noise: headers, short lines, numbers,...

# Split very long paragraphs into fixed-size parts.
def hard_split(text: str, max_chars: int):   ## If paragraf size is mode that defined max_chars
    return [text[i:i+max_chars] for i in range(0, len(text), max_chars)]

# Create chunks from a list of pages while preserving page ranges. Each page is a dict with keys: {'page': int, 'text': str}
def chunk_by_paragraphs_with_pages(pages, max_chars=1000):
    chunks = []
    current_text = ""
    start_page = None
    last_page = None

    for page in pages:
        page_num = page["page"]
        paragraphs = split_into_paragraphs(page["text"])

        for p in paragraphs:
            if len(p) > max_chars:
                if current_text:
                    chunks.append((current_text.strip(), start_page, last_page))
                    current_text = ""
                    start_page = None
                    last_page = None

                for part in hard_split(p, max_chars):
                    chunks.append((part.strip(), page_num, page_num))
                continue

            if not current_text:
                current_text = p
                start_page = page_num
                last_page = page_num
            elif len(current_text) + len(p) <= max_chars:
                current_text += "\n\n" + p
                last_page = page_num
            else:
                chunks.append((current_text.strip(), start_page, last_page))
                current_text = p
                start_page = page_num
                last_page = page_num

    if current_text:
        chunks.append((current_text.strip(), start_page, last_page))

    return chunks
    
### END Chunking as paragraph


### Preparing dataset for Chunking with overlap 
# Conservative chunk size, token-safe size for bge-base / similar embedding models
CHUNK_SIZE = 1000

# Build dataset from loaded documents for chunking with paragraphs
dataset_paragraph = []
for doc_pages, doc_name in zip(documents, document_names):
    page_chunks = chunk_by_paragraphs_with_pages(doc_pages, max_chars=CHUNK_SIZE)

    for text, p_start, p_end in page_chunks:
        if len(text.strip()) < MIN_CHARS:       # skip short chunks without meaning 
            continue
        
        if p_start == p_end:
            page_info = f"PAGE: {p_start}"
        else:
            page_info = f"PAGES: {p_start}-{p_end}"

        dataset_paragraph.append(
            f"[SOURCE: {doc_name} | {page_info}]\n{text}"
        )


print(f"Prepared {len(dataset_paragraph)} paragraph-based chunks (CHUNK_SIZE={CHUNK_SIZE})")

### END of Preparing dataset for Chunking with overlap 
###############################################################################################
'''

# Implement the vector database 

#### We will use the embedding model from ollama to convert each chunk into an embedding vector, then store the chunk and its corresponding vector in a list.

###### EMBEDDING_MODEL = 'hf.co/CompendiumLabs/bge-base-en-v1.5-gguf'
###### LANGUAGE_MODEL = 'hf.co/bartowski/Llama-3.2-1B-Instruct-GGUF'

In [48]:
import ollama

# Ollama model identifiers. EMBEDDING_MODEL is passed to ollama.embed for chunks
# and queries; LANGUAGE_MODEL is passed to ollama.chat for answer generation.
EMBEDDING_MODEL = 'hf.co/CompendiumLabs/bge-base-en-v1.5-gguf'
LANGUAGE_MODEL = 'hf.co/bartowski/Llama-3.2-1B-Instruct-GGUF'

# Each element in the VECTOR_DB will be a tuple (chunk, embedding)
# The embedding is a list of floats, for example: [0.1, 0.04, -0.34, 0.21, ...]
# The list lives in memory only; it starts empty and is filled by add_chunk_to_database.
VECTOR_DB = []

def add_chunk_to_database(chunk):
    """Embed one chunk and append (chunk, embedding) to VECTOR_DB.

    Best effort: if anything fails, the chunk is skipped and a message with
    the first 120 characters of the error is printed instead of raising.
    """
    try:
        response = ollama.embed(model=EMBEDDING_MODEL, input=chunk)
        # The response is read either as a mapping or through attributes,
        # depending on what the client returned.
        if isinstance(response, dict):
            vector = response['embeddings'][0]
        else:
            vector = response.embeddings[0]
        VECTOR_DB.append((chunk, vector))
    except Exception as err:
        print("⚠️ Skipped chunk due to embedding error:", str(err)[:120])



#### Setting chunks in database
##### NOTE: 
##### dataset_basic = chunking with overlap
##### dataset_paragraph = chunking with paragraphs

In [None]:
# CHANGE variable "dataset_basic" OR "dataset_paragraph" per used chunking
# Embed every prepared chunk and report progress after each one.
for position, chunk in enumerate(dataset_basic, start=1):
    add_chunk_to_database(chunk)
    print(f'Added chunk {position}/{len(dataset_basic)} to the database')



# Implement the retrieval function 


#### Implementation of retrieval function that takes a query and returns the top N most relevant chunks based on cosine similarity. We can imagine that the higher the cosine similarity between the two vectors, the "closer" they are in the vector space. This means they are more similar in terms of meaning.

In [58]:
# -----------------------------------------------------
# Cosine similarity using NumPy library
# -----------------------------------------------------

import numpy as np

def cosine_similarity(q, d):
    """Return the cosine similarity of two vectors.

    Args:
        q: first vector (sequence of numbers or NumPy array).
        d: second vector, same length as q.

    Returns:
        A float in [-1, 1]. If either vector has zero length the similarity
        is undefined; 0.0 is returned in that case so that callers sorting
        by score never have to deal with NaN.
    """
    q = np.asarray(q, dtype=float)
    d = np.asarray(d, dtype=float)
    denominator = np.linalg.norm(q) * np.linalg.norm(d)
    if denominator == 0:
        return 0.0
    return np.dot(q, d) / denominator


#### Implementation of retrieval function:
###### Query embedding and retrieval were implemented as separate stages. Queries were embedded once and subsequently matched against the document embedding space using cosine similarity to ensure a controlled and reproducible evaluation.

In [59]:
# -----------------------------------------------------
# Embed query
# -----------------------------------------------------
def embed_query(query: str):
    """Return the embedding of *query* produced by EMBEDDING_MODEL via ollama.embed."""
    response = ollama.embed(model=EMBEDDING_MODEL, input=query)
    # The response is read either as a mapping or through attributes.
    if isinstance(response, dict):
        return response["embeddings"][0]
    return response.embeddings[0]


## TOP-K 
# 5-10 for test
# 3 for GENERATION

# -----------------------------------------------------
# Retrieve top-k chunks
# -----------------------------------------------------
def retrieve_top_k(query_embedding, vector_db, k=3):
    """Return the k (chunk, score) pairs most similar to the query embedding.

    vector_db is an iterable of (chunk, embedding) pairs; the result is
    ordered by descending cosine similarity.
    """
    ranked = sorted(
        (
            (chunk, cosine_similarity(query_embedding, emb))
            for chunk, emb in vector_db
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:k]

# Generation phase 


### In this phase, the chatbot generates a response based on the knowledge retrieved in the step above. This is done by simply adding the chunks to the prompt that is given as input to the chatbot.


In [None]:
input_query = input('Ask me a question: ')

# Embed the question once, then rank the stored chunks against it.
q_emb = embed_query(input_query)
top_chunks = retrieve_top_k(q_emb, VECTOR_DB, k=3)
print('Retrieved knowledge:')
for i, (chunk, score) in enumerate(top_chunks, start=1):
    print(f'\n[{i}] similarity={score:.3f}')
    print(f'\n{chunk}')

## To prevent context overflow during generation, retrieved document chunks are added one by one, best match first, and collection stops at the first chunk that would exceed the character limit.
MAX_CONTEXT_CHARS = 3500

context_blocks = []
current_len = 0

for candidate, _similarity in top_chunks:
    projected_len = current_len + len(candidate)
    if projected_len > MAX_CONTEXT_CHARS:
        break
    context_blocks.append(candidate)
    current_len = projected_len

context_text = "\n\n".join(context_blocks)
instruction_prompt = (
    "You are a helpful chatbot.\n"
    "Use ONLY the following context to answer the question.\n"
    "If the answer is not contained in the context, say so explicitly.\n\n"
    f"Context:\n{context_text}"
    f"\n\nQuestion: {input_query}\nAnswer:"
)

### Generating response with LLM - Ollama. Instruction prompt is used as system message:

In [None]:
# The instruction prompt goes in as the system message, the raw question as the user message.
chat_messages = [
    {'role': 'system', 'content': instruction_prompt},
    {'role': 'user', 'content': input_query},
]
stream = ollama.chat(model=LANGUAGE_MODEL, messages=chat_messages, stream=True)

# print the response from the chatbot in real-time
print('Chatbot response:')
for part in stream:
    print(part['message']['content'], end='', flush=True)


### ADDITIONAL:  function for asking a question

In [None]:
def ask(query, top_k=3, *, model="gemma3:4b", max_context_chars=3500):
    """Run an end-to-end RAG query and return the answer with its sources.

    Steps: embed the query, retrieve the top-k chunks from VECTOR_DB, build a
    size-limited context, and generate the answer with ollama.generate.

    Args:
        query: the question to answer.
        top_k: number of chunks to retrieve.
        model: name of the Ollama model used for generation.
        max_context_chars: character budget for the concatenated context.
            Chunks are added best match first and collection stops at the
            first chunk that would exceed the budget.

    Returns:
        dict with keys:
            "answer": the generated text.
            "sources": the "[SOURCE: ...]" headers of the chunks that were
                actually placed in the context, de-duplicated and in
                retrieval order (best match first).
    """
    # Embed query
    q_emb = embed_query(query)

    # Retrieve top-k chunks
    retrieved = retrieve_top_k(q_emb, VECTOR_DB, k=top_k)

    # Build context incrementally (safe with defined max chars)
    context_blocks = []
    current_len = 0

    for chunk, _score in retrieved:
        if current_len + len(chunk) > max_context_chars:
            break
        context_blocks.append(chunk)
        current_len += len(chunk)

    context = "\n\n".join(context_blocks)

    # Construct prompt
    prompt = (
        "You are a helpful assistant.\n"
        "Answer the question using ONLY the information provided in the context.\n"
        "If the answer cannot be found in the context, reply:\n"
        "\"The answer is not contained in the provided documents.\"\n\n"
        "Context:\n"
        f"{context}\n\n"
        f"Question: {query}\n"
        "Answer:"
    )

    # Generate answer
    response = ollama.generate(
        model=model,
        prompt=prompt
    )

    # Extract sources from the chunks the model actually saw. dict.fromkeys
    # removes duplicates while keeping retrieval order, so the result is the
    # same on every run (a set would give an arbitrary order).
    sources = list(dict.fromkeys(
        c.split("]")[0] + "]"
        for c in context_blocks
        if c.startswith("[SOURCE:")
    ))

    return {
        "answer": response["response"],
        "sources": sources
    }

# Automated PROMPT for prepared questions

In [None]:
# Example query run through ask(); the call returns a dict with "answer" and "sources".
ask("Can negative word tone in emails indicate on critical business processes?")




# Evaluation and generating results


In [None]:
# =======================
# Retrieval Evaluation (annotation-driven, interactive)
# =======================
# Evaluation questions are taken from annotation keys.
# For each question:
# 1) the question is printed
# 2) retrieval + similarity is computed
# 3) the researcher manually annotates relevant results

import numpy as np
import textwrap
import re
from contextlib import redirect_stdout
from datetime import datetime


# -----------------------------------------------------
# Evaluation output logging
# -----------------------------------------------------
# Local time with second resolution; it is embedded in the log file name and printed in the log header.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# Relative path, so the log file is created in the current working directory.
eval_log_path = f"retrieval_evaluation_log_{timestamp}.txt"


# -----------------------------------------------------
# Cosine similarity using NumPy library
# -----------------------------------------------------
def cosine_similarity(q, d):
    """Return the cosine similarity of two vectors.

    Args:
        q: first vector (sequence of numbers or NumPy array).
        d: second vector, same length as q.

    Returns:
        A float in [-1, 1]. If either vector has zero length the similarity
        is undefined; 0.0 is returned in that case so that callers sorting
        by score never have to deal with NaN.
    """
    q = np.asarray(q, dtype=float)
    d = np.asarray(d, dtype=float)
    denominator = np.linalg.norm(q) * np.linalg.norm(d)
    if denominator == 0:
        return 0.0
    return np.dot(q, d) / denominator


# -----------------------------------------------------
# Embed query
# -----------------------------------------------------
def embed_query(query: str):
    """Return the embedding of *query* produced by EMBEDDING_MODEL via ollama.embed."""
    response = ollama.embed(model=EMBEDDING_MODEL, input=query)
    # The response is read either as a mapping or through attributes.
    if isinstance(response, dict):
        return response["embeddings"][0]
    return response.embeddings[0]


# -----------------------------------------------------
# Retrieve top-k chunks
# -----------------------------------------------------
def retrieve_top_k(query_embedding, vector_db, k=10):
    """Return the k (chunk, score) pairs most similar to the query embedding.

    vector_db is an iterable of (chunk, embedding) pairs; the result is
    ordered by descending cosine similarity (hence reverse=True).
    """
    ranked = sorted(
        (
            (chunk, cosine_similarity(query_embedding, emb))
            for chunk, emb in vector_db
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:k]


# -----------------------------------------------------
# Annotation template (questions only, values filled interactively)
# -----------------------------------------------------
# Maps each evaluation question to the list of result positions judged relevant.
# The lists start empty and are overwritten by the interactive loop further down.
annotations = {
    "Can negative word tone in emails indicate on critical business processes?": [],
    "What can the long time between receiving an email and replying to it tell us": [],
    "Is there a connection between critical business processes and long email response times?": []
}

# Number of top-ranked chunks retrieved per question; also the cutoff used by the metrics.
k = 10
print("\nStarting INTERACTIVE retrieval evaluation (annotation-driven)...\n")


# -----------------------------------------------------
# Formatting output text
# -----------------------------------------------------
def format_print_chunk(text, width=100, max_chars=1200):    # max output chars limited to 1200 to omit flooding
    """Format chunk text for readable console output.

    Inserts a space where a lowercase letter is directly followed by an
    uppercase one and after '.', ',', ';' or ':' when a letter follows,
    then truncates to max_chars and wraps to lines of at most width
    characters. Empty or missing text yields an empty string.
    """
    if text:
        # Fix common PDF artifacts (missing spaces)
        spaced = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
        spaced = re.sub(r'([.,;:])([A-Za-z])', r'\1 \2', spaced)

        # Truncate first, then wrap what is left.
        return textwrap.fill(spaced[:max_chars], width=width)
    return ""

### formatting chunk for printout
def split_source_and_text(chunk: str):
    """Split a chunk into its "[SOURCE: ...]" header and the body text.

    Returns (header, body). A chunk that does not start with "[SOURCE:" is
    returned as ("", chunk); one that starts with it but has no closing "]"
    is returned as (chunk, "").
    """
    if not chunk.startswith("[SOURCE:"):
        return "", chunk

    header, bracket, body = chunk.partition("]")
    if not bracket:
        # No closing bracket: the whole chunk is treated as the header.
        return chunk, ""
    return header + "]", body.strip()


# -----------------------------------------------------
# Interactive evaluation loop
# -----------------------------------------------------
import sys


class _TeeWriter:
    """Minimal text sink that forwards every write to several streams."""

    def __init__(self, *streams):
        self._streams = streams

    def write(self, data):
        for stream in self._streams:
            stream.write(data)
        return len(data)

    def flush(self):
        for stream in self._streams:
            stream.flush()


def _parse_relevant_positions(raw, max_position):
    """Parse a comma-separated list of result positions.

    Returns (valid, rejected): valid holds the unique positions within
    1..max_position in the order entered; rejected holds every non-empty
    token that is not a number or is out of range.
    """
    valid = []
    rejected = []
    for token in raw.split(","):
        token = token.strip()
        if not token:
            continue
        # isdecimal() (unlike isdigit()) only accepts characters int() can parse.
        if token.isdecimal() and 1 <= int(token) <= max_position:
            position = int(token)
            if position not in valid:
                valid.append(position)
        else:
            rejected.append(token)
    return valid, rejected


# -----------------------------------------------------
# Evaluation metrics
# -----------------------------------------------------
def recall_at_k(relevant_positions, k):
    # Binary hit indicator: 1 if at least one relevant result is within the
    # top k, else 0. The total number of relevant chunks is not known, so
    # this is a hit rate rather than a fraction of relevant items found.
    return 1 if any(pos <= k for pos in relevant_positions) else 0


def precision_at_k(relevant_positions, k):
    # Share of the top-k results judged relevant; a set is used so that a
    # position entered twice is counted once.
    return len({pos for pos in relevant_positions if pos <= k}) / k


# -----------------------------------------------------
# Interactive evaluation loop
# -----------------------------------------------------
# Output is written to the console AND the log file: the annotator has to
# see the retrieved chunks in order to judge them.
with open(eval_log_path, "w", encoding="utf-8") as f, redirect_stdout(_TeeWriter(sys.stdout, f)):       ##### START LOGGING #####

    print("RETRIEVAL EVALUATION LOG")
    print(f"Timestamp: {timestamp}")
    print(f"Embedding model: {EMBEDDING_MODEL}")
    print(f"Language model: {LANGUAGE_MODEL}")
    print(f"Used Top-k: {k}")
    print("=" * 100)

    for question in annotations:

        print("\n" + "=" * 100)
        print("QUESTION:")
        print(question)
        print("=" * 100)

        # 1) Embed query
        q_emb = embed_query(question)

        # 2) Retrieve top-k
        top_chunks = retrieve_top_k(q_emb, VECTOR_DB, k=k)

        # 3) Print retrieved chunks with similarity
        for i, (chunk, score) in enumerate(top_chunks, start=1):
            print(f"\n[{i}] similarity={score:.3f}")
            source_header, body_text = split_source_and_text(chunk)

            # Source
            print(source_header)
            print("-" * 80)

            # Text
            print(format_print_chunk(body_text))
            print("\n" + "-" * 80)

        # 4) Manual relevance judgment
        print(
            "Enter positions of relevant results. Example: 1,3 (comma-separated, empty if none are relevant)"
        )

        user_input = input("Relevant positions: ")
        relevant_positions, rejected = _parse_relevant_positions(user_input, k)
        if rejected:
            print("Ignored invalid or out-of-range entries:", rejected)

        annotations[question] = relevant_positions

        print("Recorded relevant positions:", relevant_positions)
        input("Press ENTER to continue to the next question...")

    # -----------------------------------------------------
    # Compute aggregated metrics
    # -----------------------------------------------------
    recalls = [recall_at_k(relevant, k) for relevant in annotations.values()]
    precisions = [precision_at_k(relevant, k) for relevant in annotations.values()]

    print("\n" + "=" * 60)
    print("FINAL RETRIEVAL METRICS")
    print("=" * 60)
    print(f"Recall@{k}: {np.mean(recalls):.3f}")
    print(f"Precision@{k}: {np.mean(precisions):.3f}")
    print(f"Queries evaluated: {len(annotations)}")


print(f"\nEvaluation log written to: {eval_log_path}")          ##### END LOGGING #####