In [28]:
import fitz  # PyMuPDF
from pathlib import Path
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import requests
import os
import sys
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [35]:
# Notebook configuration: everything a reader might want to tune lives here.
# BASE_DIR is the absolute form of the current working directory.
BASE_DIR = Path().resolve()
# Location of the PDF to index, relative to BASE_DIR.
PDF_PATH = BASE_DIR / "rag" / "data" / "KonstantinaCV.pdf"
# Number of chunks retrieved as context for each question.
TOP_K = 3


In [36]:
# Turns a PDF into a list of text chunks
def extract_text_from_pdf(path, chunk_size=300, chunk_overlap=30):
    """Read all text from a PDF and split it into overlapping chunks.

    Args:
        path: Location of the PDF file, passed straight to ``fitz.open``.
        chunk_size: ``chunk_size`` handed to the text splitter.
        chunk_overlap: ``chunk_overlap`` handed to the text splitter.

    Returns:
        The list of chunks produced by
        ``RecursiveCharacterTextSplitter.split_text``.
    """
    # Open the document as a context manager so it is closed even if text
    # extraction raises.
    with fitz.open(path) as doc:
        all_text = "\n".join(page.get_text() for page in doc)

    # Separators are tried in order: paragraph break, line break, sentence
    # end, then single space.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " "]
    )
    return text_splitter.split_text(all_text)


In [37]:
# Embeds the chunks and indexes the resulting vectors
def build_faiss_index(chunks, model):
    """Embed ``chunks`` with ``model`` and store the vectors in a flat L2 index.

    Args:
        chunks: Sequence of text chunks to embed.
        model: Object exposing ``encode(list_of_texts)`` that returns one
            vector per text.

    Returns:
        A ``(index, embeddings)`` tuple: the populated ``faiss.IndexFlatL2``
        and the float32 embedding matrix that was added to it.

    Raises:
        ValueError: If ``chunks`` is empty, since there is nothing to infer
            the embedding dimension from.
    """
    if len(chunks) == 0:
        raise ValueError("cannot build a FAISS index from an empty list of chunks")

    # FAISS's Python bindings expect C-contiguous float32 matrices; coerce
    # explicitly rather than relying on what the model happens to return.
    embeddings = np.ascontiguousarray(model.encode(chunks), dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, embeddings
    
# Retrieves the top_k chunks most relevant to an input query
def retrieve_relevant_chunks(query, model, chunks, index, top_k=3):
    """Return the chunks whose embeddings are nearest to the query embedding.

    Args:
        query: The question text to embed.
        model: Object exposing ``encode(list_of_texts)``.
        chunks: The chunks that were indexed, in insertion order.
        index: Index exposing ``search(vectors, k)`` that returns
            ``(distances, labels)``.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunks, in the order the index returned them.
        Empty if ``chunks`` is empty or ``top_k`` is not positive.
    """
    # Never ask for more neighbours than there are chunks.
    k = min(top_k, len(chunks))
    if k <= 0:
        return []

    query_embedding = np.asarray(model.encode([query]), dtype=np.float32)
    _, neighbor_ids = index.search(query_embedding, k)

    # FAISS pads missing results with label -1; indexing chunks[-1] would
    # silently return the last chunk, so drop those labels.
    return [chunks[i] for i in neighbor_ids[0] if i >= 0]


In [38]:
## Calls an Ollama-hosted model (mistral by default) using a prompt
def ask_ollama(context, question, model_name="mistral", timeout=300):
    """Ask a local Ollama model to answer ``question`` using ``context``.

    Args:
        context: Text inserted into the prompt as the context.
        question: The question inserted into the prompt.
        model_name: Value sent as the ``model`` field of the request.
        timeout: Seconds passed to ``requests.post`` as its timeout, so a
            stalled server cannot hang the notebook indefinitely.

    Returns:
        The ``response`` field of the JSON body returned by the server.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the JSON body has no ``response`` field.
    """
    prompt = f"Answer playfully the question based on the following context:\n\n{context}\n\nQuestion: {question}\nAnswer:"
    response = requests.post(
        "http://localhost:11434/api/generate",
        # stream=False asks for one JSON object instead of a chunked stream.
        json={"model": model_name, "prompt": prompt, "stream": False},
        timeout=timeout,
    )
    # Surface HTTP failures directly rather than as a KeyError on an error payload.
    response.raise_for_status()
    return response.json()["response"]


In [39]:
## Load the PDF and split it into chunks
chunks = extract_text_from_pdf(PDF_PATH)

## Embed the chunks and build the search index
embed_model = SentenceTransformer('all-MiniLM-L6-v2')
# The embeddings are not needed here. They are bound to a named throwaway
# rather than `_`, because assigning `_` in a notebook shadows IPython's
# "last output" shortcut for the rest of the session.
index, _chunk_embeddings = build_faiss_index(chunks, embed_model)

print("ready to ask questions")


ready to ask questions


In [40]:
# Ask one question: fetch the TOP_K best-matching chunks, join them into a
# single context string, and let the model answer from that context.
question = "What languages does she speak and what level"
relevant_chunks = retrieve_relevant_chunks(
    question, embed_model, chunks, index, TOP_K
)
context = "\n\n".join(relevant_chunks)
answer = ask_ollama(context, question)
print(answer)


 Konstantina Antonopoulou speaks English (C2), Greek (C2), Spanish (C1) and German (B2-C1). She's a multilingual NLP expert!
