# RAG Pipeline using LlamaCPP and FAISS

In [1]:
import os
from unstructured.partition.auto import partition
from docx import Document as DocxDocument
import fitz
from llama_cpp import Llama
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import pickle
import nltk
import spacy

# Fetch the NLTK "punkt" tokenizer data at import time; NLTK reports the
# package as up-to-date and skips the download when it is already present.
# NOTE(review): nothing else in the visible code calls nltk — confirm this
# download is still needed.
nltk.download("punkt")

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /Users/hubboko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [20]:
def load_document(file_path):
    """Load a document as a list of text blocks, chosen by file extension.

    The extension is compared case-insensitively. ``.pdf`` goes to
    ``load_pdf``, ``.docx`` to ``load_docx``, and ``.txt``/``.md`` to
    ``load_txt``.

    Args:
        file_path: Path of the file to load.

    Returns:
        Whatever the selected loader returns (a list of stripped strings).

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    # Guard clauses: the first matching extension wins.
    if suffix == ".pdf":
        return load_pdf(file_path)
    if suffix == ".docx":
        return load_docx(file_path)
    if suffix in (".txt", ".md"):
        return load_txt(file_path)

    raise ValueError(f"Unsupported file type: {suffix}")

def load_pdf(path):
    """Extract the text of every non-empty page of a PDF.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        A list with one whitespace-stripped string per page, in page order.
        Pages whose extracted text is empty or whitespace-only are omitted.
    """
    # Open the document as a context manager so the underlying file is
    # released even if text extraction fails part-way through.
    with fitz.open(path) as doc:
        page_texts = [page.get_text().strip() for page in doc]
    return [text for text in page_texts if text]

def load_docx(path):
    """Return the non-empty paragraphs of a .docx file.

    Args:
        path: Filesystem path of the Word document.

    Returns:
        A list of whitespace-stripped paragraph texts in document order,
        skipping paragraphs that are empty or whitespace-only.
    """
    document = DocxDocument(path)
    return [
        para.text.strip()
        for para in document.paragraphs
        if para.text and para.text.strip()
    ]

def load_txt(path):
    """Read a UTF-8 text file and split it into paragraphs.

    Paragraphs are the pieces between blank-line separators (two consecutive
    newline characters).

    Args:
        path: Filesystem path of the text or Markdown file.

    Returns:
        A list of whitespace-stripped paragraphs in file order; pieces that
        are empty after stripping are dropped.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        raw = handle.read()

    paragraphs = []
    for piece in raw.split('\n\n'):
        cleaned = piece.strip()
        if cleaned:
            paragraphs.append(cleaned)
    return paragraphs

def load_fallback_unstructured(path):
    """Load a file through ``unstructured``'s automatic partitioner.

    Args:
        path: Filesystem path handed to ``partition`` as ``filename``.

    Returns:
        A list of the string form of each partitioned element, stripped of
        surrounding whitespace, with empty results left out.
    """
    texts = []
    for element in partition(filename=path):
        rendered = str(element).strip()
        if rendered:
            texts.append(rendered)
    return texts

In [21]:
# Load the small English spaCy pipeline used for sentence splitting,
# downloading it first if it is not installed yet.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spaCy signals a missing model package with OSError. Catching only that
    # (instead of a bare ``except``) lets Ctrl-C and unrelated failures
    # propagate instead of triggering a download.
    from spacy.cli import download
    download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
def spacy_semantic_chunk(text, max_tokens=100, overlap=1, sentences_per_chunk=5):
    """Split text into overlapping chunks of consecutive sentences.

    Sentence boundaries come from the module-level spaCy pipeline ``nlp``.
    A window of ``sentences_per_chunk`` sentences slides over the text,
    advancing by ``sentences_per_chunk - overlap`` sentences each time.

    Args:
        text: The text to split.
        max_tokens: Accepted for backward compatibility only. It is not
            used; chunk size is governed by ``sentences_per_chunk``.
        overlap: Number of sentences shared between consecutive chunks.
        sentences_per_chunk: Maximum number of sentences in one chunk.

    Returns:
        A list of chunk strings. Each chunk is its sentence texts joined by
        a single space and stripped. Empty input yields an empty list.

    Raises:
        ValueError: If ``sentences_per_chunk`` is less than 1, or if
            ``overlap`` is not smaller than ``sentences_per_chunk`` (the
            window could never advance and the loop would not terminate).
    """
    if sentences_per_chunk < 1:
        raise ValueError("sentences_per_chunk must be at least 1")
    step = sentences_per_chunk - overlap
    if step < 1:
        raise ValueError(
            "overlap must be smaller than sentences_per_chunk "
            f"(got overlap={overlap}, sentences_per_chunk={sentences_per_chunk})"
        )

    sents = list(nlp(text).sents)
    chunks = []
    for start in range(0, len(sents), step):
        window = sents[start:start + sentences_per_chunk]
        chunks.append(" ".join(sent.text for sent in window).strip())
        # Once a window reaches the last sentence, a further window would
        # contain only overlap sentences that were already emitted.
        if start + sentences_per_chunk >= len(sents):
            break

    return chunks


In [22]:
# Sentence-transformers model used for both indexing and querying.
EMBED_MODEL_NAME = "BAAI/bge-m3"

# Lower-case file extensions the indexer will attempt to load.
SUPPORTED_EXTS = [
    ".pdf",
    ".docx",
    ".txt",
    ".md",
]


In [None]:
def index_documents_in_folder(folder_path, index_path="faiss_index", store_path="doc_store.pkl"):
    """Chunk, embed and index every supported document in a folder.

    Each supported file is loaded with ``load_document``, split into chunks
    with ``spacy_semantic_chunk`` and embedded with the
    ``EMBED_MODEL_NAME`` sentence-transformers model. The embeddings are
    written to a flat L2 FAISS index and the chunk texts are pickled in the
    same order, so position ``i`` in the store matches vector ``i`` in the
    index.

    Args:
        folder_path: Directory whose direct entries are indexed.
        index_path: Destination file for the FAISS index.
        store_path: Destination file for the pickled list of chunk texts.

    Raises:
        RuntimeError: If no chunk from any file could be embedded.
    """
    model = SentenceTransformer(EMBED_MODEL_NAME)
    all_chunks = []
    all_embeddings = []

    # os.listdir order is arbitrary; sort so chunk ids are reproducible.
    for filename in sorted(os.listdir(folder_path)):
        ext = os.path.splitext(filename)[1].lower()
        if ext not in SUPPORTED_EXTS:
            print(f"Skipping unsupported file: {filename}")
            continue

        file_path = os.path.join(folder_path, filename)
        try:
            file_chunks = []
            for block in load_document(file_path):
                file_chunks.extend(spacy_semantic_chunk(block))
            if not file_chunks:
                # Encoding an empty list yields an array that cannot be
                # stacked with the others, so skip files without text.
                print(f"No text found in {filename}")
                continue
            # One encode call per file; FAISS requires float32 vectors.
            embeddings = np.asarray(model.encode(file_chunks), dtype="float32")
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole run.
            print(f"Error processing {filename}: {e}")
            continue

        # Extend both lists only after the whole file succeeded so chunk
        # texts and embedding rows stay aligned.
        all_chunks.extend(file_chunks)
        all_embeddings.append(embeddings)
        print(f"Indexed {len(file_chunks)} chunks from {filename}")

    if not all_embeddings:
        raise RuntimeError("No documents were indexed.")
    stacked_embeddings = np.vstack(all_embeddings)

    index = faiss.IndexFlatL2(stacked_embeddings.shape[1])
    index.add(stacked_embeddings)
    faiss.write_index(index, index_path)

    with open(store_path, "wb") as f:
        pickle.dump(all_chunks, f)

if __name__ == "__main__":
    # Build the index from the sample folder, using the default output paths.
    index_documents_in_folder(folder_path="./data/dummy")


Skipping unsupported file: .DS_Store
Indexed 1 chunks from dummy.docx


In [31]:

# Artifacts read back at query time.
INDEX_PATH = "faiss_index"
STORE_PATH = "doc_store.pkl"
# Location of the GGUF model file. Set the LLAMA_MODEL_PATH environment
# variable to run on another machine; the original path stays the default.
LLAMA_PATH = os.environ.get(
    "LLAMA_MODEL_PATH",
    "/Users/hubboko/Documents/rag_practice_local_model_llama/llama.cpp/models/llama3.1/Defne-llama3.1-8B.Q5_K_M.gguf",
)

embedder = SentenceTransformer(EMBED_MODEL_NAME)
index = faiss.read_index(INDEX_PATH)
# SECURITY: unpickling executes code embedded in the file. Only load a
# store that this pipeline wrote itself, never one from an untrusted source.
with open(STORE_PATH, "rb") as f:
    doc_chunks = pickle.load(f)
import contextlib
import sys
@contextlib.contextmanager
def suppress_stderr():
    """Context manager that points ``sys.stderr`` at the null device.

    Inside the ``with`` body ``sys.stderr`` is a writable handle on
    ``os.devnull``. On exit, including when the body raises, the previous
    ``sys.stderr`` object is put back and the handle is closed. Only the
    Python-level ``sys.stderr`` object is swapped.
    """
    with open(os.devnull, 'w') as sink, contextlib.redirect_stderr(sink):
        yield
# Load the GGUF model once at import time; every query reuses this instance.
llm = Llama(
    verbose=False,         # keep llama-cpp-python's own logging off
    n_gpu_layers=16,       # layers offloaded to the GPU backend
    n_threads=12,          # CPU threads used for inference
    n_ctx=8192,            # context window, in tokens
    model_path=LLAMA_PATH,
)

def rag_query(user_query, top_k=5):
    """Answer a question from the indexed chunks closest to it.

    The query is embedded with the module-level ``embedder``. The ``top_k``
    nearest chunks are looked up in the FAISS ``index`` and joined into a
    context block, and the module-level ``llm`` completes a Llama 3 chat
    prompt built from that context and the question.

    Args:
        user_query: The question to answer.
        top_k: Number of nearest chunks to retrieve from the index. The
            sampler's own ``top_k`` passed to the model is fixed at 5.

    Returns:
        The stripped text of the first completion choice.
    """
    # FAISS requires float32 query vectors.
    query_vec = np.asarray(embedder.encode([user_query]), dtype="float32")
    _, neighbor_ids = index.search(query_vec, top_k)

    # When the index holds fewer than top_k vectors FAISS pads the result
    # with -1. Indexing a list with -1 would silently repeat the last chunk,
    # so keep only ids that point at a real chunk.
    hits = [doc_chunks[i] for i in neighbor_ids[0] if 0 <= i < len(doc_chunks)]
    context = "\n".join(hits)

    system_prompt = (
        'You are a helpful and concise AI assistant embedded in a web application. '
        'Use the provided context to answer each user question as accurately and briefly as possible. '
        'If the context does not contain the answer, respond with: "Sorry, I cannot assist you." '
        'Always return only the single most relevant answer.\n'
        "Answer the queries with the same user's input language no matter the context's language."
    )

    # Llama 3 chat format: a blank line follows every header and every
    # message ends with <|eot_id|>. Building the prompt from pieces keeps
    # source-code indentation out of the text sent to the model.
    prompt = (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        f"{system_prompt}<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>\n\n"
        f"Context:\n{context}\n\nQuestion:\n{user_query}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )

    output = llm(
        prompt,
        max_tokens=250,
        # Near-zero temperature to keep answers as repeatable as possible.
        temperature=0.0000000000000001,
        top_p=0.9,
        top_k=5,
        # "\n" keeps the answer to a single line; <|eot_id|> ends the turn.
        stop=["Q:", "\n", "<|eot_id|>"],
        repeat_penalty=1.1
    )
    return output['choices'][0]['text'].strip()

if __name__ == "__main__":
    # Interactive check: read one question from stdin and print the answer.
    question = input("Ask a question: ")
    answer = rag_query(user_query=question)
    print("\nAnswer:", answer)

llama_context: n_ctx_per_seq (8192) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
ggml_metal_init: skipping kernel_get_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_1row              (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_l4                (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_bf16                  (not supported)
ggml_metal_init: skipping kernel_mul_mv_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_mul_mm_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mm_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h64           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h80           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h96


Answer: Sorry, I cannot assist you. The provided context does not contain any information that could be used to answer a question.
