# RAG Pipeline using LlamaCPP and FAISS

In [1]:
import os
from unstructured.partition.auto import partition
from docx import Document as DocxDocument
import fitz
from llama_cpp import Llama
import faiss
import numpy as np
import nltk
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.docstore.in_memory import InMemoryDocstore


# Make sure the NLTK "punkt" tokenizer data is present locally (downloads it if missing).
# NOTE(review): no code visible in this notebook calls nltk directly — presumably one of
# the document-parsing dependencies needs it; confirm before removing.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /Users/hubboko/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
def load_document(file_path):
    """Load a document and return its text as a list of strings.

    The loader is chosen from the (case-insensitive) file extension:
    ``.pdf`` -> ``load_pdf``, ``.docx`` -> ``load_docx``,
    ``.txt`` / ``.md`` -> ``load_txt``.

    Args:
        file_path: Path of the file to load.

    Returns:
        Whatever the selected loader returns for ``file_path``.

    Raises:
        ValueError: If the extension is not one of the supported types.
    """
    _, suffix = os.path.splitext(file_path)
    suffix = suffix.lower()

    # Guard clauses: each supported type returns immediately.
    if suffix == ".pdf":
        return load_pdf(file_path)
    if suffix == ".docx":
        return load_docx(file_path)
    if suffix in (".txt", ".md"):
        return load_txt(file_path)

    raise ValueError(f"Unsupported file type: {suffix}")

def load_pdf(path):
    """Extract the text of a PDF, one string per page.

    Args:
        path: Path of the PDF file to open.

    Returns:
        list[str]: The stripped text of each page, in page order. Pages whose
        text is empty or whitespace-only are omitted.
    """
    # Use the document as a context manager so it is closed even if text
    # extraction raises; the previous version never closed it.
    with fitz.open(path) as doc:
        page_texts = [page.get_text().strip() for page in doc]
    return [text for text in page_texts if text]

def load_docx(path):
    """Return the non-empty paragraphs of a .docx file.

    Args:
        path: Path of the Word document to open.

    Returns:
        list[str]: Stripped paragraph texts in document order; paragraphs
        whose text is missing, empty or whitespace-only are left out.
    """
    return [
        para.text.strip()
        for para in DocxDocument(path).paragraphs
        if para.text and para.text.strip()
    ]

def load_txt(path):
    """Read a UTF-8 text file and split it into paragraphs.

    A paragraph is a run of consecutive lines that contain non-whitespace
    text. Any empty or whitespace-only line ends the current paragraph, so a
    "blank" separator line holding stray spaces or tabs still separates
    paragraphs (splitting on the literal ``'\\n\\n'`` missed that case).

    Args:
        path: Path of the text / Markdown file to read.

    Returns:
        list[str]: Stripped paragraphs in file order. Line breaks inside a
        paragraph are preserved. An empty or all-blank file yields ``[]``.
    """
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

    paragraphs = []
    current = []  # lines of the paragraph being accumulated
    for line in lines:
        if line.strip():
            current.append(line)
        elif current:
            # Blank line: flush the paragraph collected so far.
            paragraphs.append('\n'.join(current).strip())
            current = []
    if current:
        # File did not end with a blank line; flush the trailing paragraph.
        paragraphs.append('\n'.join(current).strip())
    return paragraphs

def load_fallback_unstructured(path):
    """Load a file through ``partition`` and return its non-empty text pieces.

    Args:
        path: Path of the file, passed to ``partition`` as ``filename``.

    Returns:
        list[str]: ``str()`` of each returned element, stripped, in the order
        returned; elements whose text is empty after stripping are dropped.
    """
    cleaned = []
    for element in partition(filename=path):
        text = str(element).strip()
        if text:
            cleaned.append(text)
    return cleaned

In [5]:
# Embedding model shared by the semantic splitter here and by the indexing cell that follows.
embedding_model = HuggingFaceEmbeddings(model_name="BAAI/bge-m3")
# The splitter is given the same embedding model to decide where chunks begin and end.
text_splitter = SemanticChunker(embeddings=embedding_model)

# load_document dispatches on the extension; for .docx it returns a list of paragraph strings.
# NOTE(review): the input path is hard-coded relative to the notebook's working directory.
data = load_document("./dummy2.docx")

# Split the loaded texts into chunk documents; later cells read each chunk's .page_content.
semantic_chunks = text_splitter.create_documents(data)


In [6]:
# 1. Embed semantic chunks
#    One embedding per chunk, in the same order as semantic_chunks.
texts = [doc.page_content for doc in semantic_chunks]
embeddings = embedding_model.embed_documents(texts)

# 2. Create FAISS index
#    The dimensionality is taken from the first embedding, so this raises
#    IndexError when semantic_chunks is empty. Vectors are cast to float32 before adding.
index = faiss.IndexFlatL2(len(embeddings[0]))
index.add(np.array(embeddings).astype("float32"))

# 3. Map index IDs to docstore IDs
#    Both the mapping keys and the docstore keys are the chunk position as a *string*.
#    rag_query (later in this notebook) looks entries up with str(i), so it depends on that.
#    NOTE(review): LangChain's own FAISS search helpers presumably look this mapping up by
#    integer position — verify before calling vector_store.similarity_search on this store.
index_to_docstore_id = {str(i): str(i) for i in range(len(semantic_chunks))}
docstore = InMemoryDocstore({str(i): doc for i, doc in enumerate(semantic_chunks)})

# 4. Create LangChain FAISS vector store
#    Wraps the raw index, the docstore and the id mapping built above.
vector_store = FAISS(
    embedding_function=embedding_model,
    index=index,
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id
)

# 5. Save vector store to disk
#    The query cell loads the store back from this same "./index_faiss" directory.
vector_store.save_local("./index_faiss")


In [7]:
# --- Configuration ---------------------------------------------------------
# Directory the indexing cell wrote with vector_store.save_local(...). The constant
# previously held "faiss_index", which matched neither the saved path nor any use.
INDEX_PATH = "./index_faiss"
# GGUF model file for llama.cpp. Set the LLAMA_MODEL_PATH environment variable to
# run on another machine; the original local path is kept as the default.
LLAMA_PATH = os.environ.get(
    "LLAMA_MODEL_PATH",
    "/Users/hubboko/Documents/rag_practice_local_model_llama/llama.cpp/models/llama3.1/Defne-llama3.1-8B.Q5_K_M.gguf",
)

# Query-time embedder; same model name as the one used to build the index.
embedder = HuggingFaceEmbeddings(model_name="BAAI/bge-m3")

# Load the persisted store with this cell's own embedder so the cell does not depend
# on `embedding_model` still being alive from an earlier cell.
# SECURITY: allow_dangerous_deserialization=True permits unpickling while loading —
# only load index directories you created yourself.
vector_store = FAISS.load_local(
    INDEX_PATH,
    embeddings=embedder,
    allow_dangerous_deserialization=True
)


# Raw pieces used directly by rag_query: the FAISS index, the document store, and the
# position -> docstore-id mapping (string keys, as written by the indexing cell).
index = vector_store.index
docstore = vector_store.docstore
index_to_id = vector_store.index_to_docstore_id

# Local llama.cpp model. The load log reports that n_ctx=8192 is below the model's
# training context, so prompt plus completion must fit in 8192 tokens.
llm = Llama(
    model_path=LLAMA_PATH,
    n_ctx=8192,
    n_threads=12,
    n_gpu_layers=16,
    verbose=False
)

def rag_query(user_query, top_k=5):
    """Answer a question with the local LLM, grounded in retrieved chunks.

    Uses the notebook-level objects ``embedder``, ``index``, ``docstore``,
    ``index_to_id`` and ``llm``.

    Args:
        user_query: The user's question.
        top_k: Maximum number of nearest chunks to retrieve as context.

    Returns:
        str: The stripped text of the first completion choice.
    """
    # FAISS works on float32 matrices; cast explicitly, like the indexing cell does,
    # instead of handing it numpy's default float64.
    query_vec = np.asarray([embedder.embed_query(user_query)], dtype="float32")
    _distances, neighbor_ids = index.search(query_vec, top_k)

    # When the index holds fewer than top_k vectors FAISS pads the result with -1.
    # Skip those slots; index_to_id["-1"] would raise KeyError.
    context_chunks = [
        docstore.search(index_to_id[str(i)]).page_content
        for i in neighbor_ids[0]
        if i >= 0
    ]

    context = "\n".join(context_chunks)

    # NOTE(review): the prompt has no <|eot_id|> turn terminators, which the Llama 3 chat
    # format normally uses — confirm against the model card before changing the template.
    prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful and concise AI assistant embedded in a web application. Use the provided context to answer each user question as accurately and briefly as possible. If the context does not contain the answer, respond with: "Sorry, I cannot assist you." Always return only the single most relevant answer.
Answer the queries with the same user's input language no matter the context's language.
<|start_header_id|>user<|end_header_id|>
Context:
{context}

Question:
{user_query}

<|start_header_id|>assistant<|end_header_id|>
"""

    # The "\n" stop sequence cuts the completion at the first line break, so answers are
    # single-line. top_k here is the sampler setting, unrelated to the retrieval top_k.
    output = llm(
        prompt,
        max_tokens=250,
        temperature=0,
        top_p=0.9,
        top_k=5,
        stop=["Q:", "\n"],
        repeat_penalty=1.1
    )
    return output['choices'][0]['text'].strip()

# Optional CLI mode
# Reads one question from standard input, answers it with rag_query, and prints the result.
# NOTE(review): the captured "Answer:" output below shows this guard is also true inside the
# notebook kernel, so the cell waits on input() during "Run All" — confirm that is intended.
if __name__ == "__main__":
    question = input("Ask a question: ")
    answer = rag_query(question)
    print("\nAnswer:", answer)


llama_context: n_ctx_per_seq (8192) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
ggml_metal_init: skipping kernel_get_rows_bf16                     (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_1row              (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_f32_l4                (not supported)
ggml_metal_init: skipping kernel_mul_mv_bf16_bf16                  (not supported)
ggml_metal_init: skipping kernel_mul_mv_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_mul_mm_bf16_f32                   (not supported)
ggml_metal_init: skipping kernel_mul_mm_id_bf16_f32                (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h64           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h80           (not supported)
ggml_metal_init: skipping kernel_flash_attn_ext_bf16_h96


Answer: Parham Jabari is the founder of PJ Logistik GmbH. He started his career as a parcel delivery driver for various companies, including DHL, DPD, Austrian Post AG, and UPS.
