In [29]:
### Python 3.12.3
import requests 
import json
import fitz # PyMuPDF
import sys
from Prompts.prompt_LLM import *
from langchain_community.llms import Ollama



: 

In [30]:
# Example user question that drives the pipeline; it is passed to
# process_pdf_with_ollama in the final cell of the notebook.
user_query = "I am at the beginning of CSRD reporting. I don't know the slightest thing about it"

### Preprocessing

In [31]:
# Path to the PDF that is read and searched for context. Despite the plural
# name this is a single file path, handed to process_pdf_with_ollama below.
corpus_of_documents = "Regulations/Celesia_CSRD.pdf"

In [32]:
def extract_text_from_pdf(pdf_path):
    """Return the plain text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path to the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string made of ``page.get_text()`` for each page, with no extra
        separator inserted between pages.
    """
    # The context manager closes the document (and its underlying file handle)
    # even if text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        # Join once instead of repeatedly growing a string inside the loop.
        return "".join(page.get_text() for page in doc)

# 2. Preprocess the extracted text into smaller chunks
def split_text_into_chunks(text, chunk_size=500):
    """Cut ``text`` into consecutive pieces of at most ``chunk_size`` words.

    Words are the tokens obtained by splitting on a single space character, so
    newlines stay inside tokens and consecutive spaces yield empty tokens.

    Args:
        text: The text to split.
        chunk_size: Maximum number of space-separated tokens per chunk.

    Returns:
        A list of strings, each the tokens of one chunk re-joined with a
        single space.
    """
    tokens = text.split(" ")
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(" ".join(window))
    return pieces


In [33]:
def jaccard_similarity(query, document):
    """Return the Jaccard similarity between the word sets of two strings.

    Both strings are lower-cased and split on any run of whitespace (spaces,
    tabs, newlines), so words separated by line breaks -- common in text
    extracted from PDFs -- are compared individually and repeated spaces do
    not produce an empty-string token that would count as shared vocabulary.

    Args:
        query: The query text.
        document: The document (or chunk) text to compare against.

    Returns:
        ``len(intersection) / len(union)`` of the two word sets as a float in
        ``[0.0, 1.0]``. Returns ``0.0`` when neither string contains a word.
    """
    query_tokens = set(query.lower().split())
    document_tokens = set(document.lower().split())

    union = query_tokens | document_tokens
    if not union:
        # Nothing to compare: avoid dividing by zero.
        return 0.0
    return len(query_tokens & document_tokens) / len(union)

def return_response(query, corpus):
    """Return the most relevant document chunk based on Jaccard similarity.

    Args:
        query: The user question.
        corpus: A non-empty sequence of text chunks to search.

    Returns:
        The chunk with the highest ``jaccard_similarity`` to ``query``. When
        several chunks tie, the first one in ``corpus`` is returned.

    Raises:
        ValueError: If ``corpus`` is empty (for example, when no text could be
            extracted from the source document).
    """
    if len(corpus) == 0:
        raise ValueError("corpus is empty: there are no document chunks to search")

    # max() with a key returns the first maximal element, matching the
    # tie-breaking of looking up the index of the first maximum score.
    return max(corpus, key=lambda doc: jaccard_similarity(query, doc))



### Ollama

In [34]:
# LangChain Ollama client for the "llama3.2" model. base_url points at a
# server on the local machine (127.0.0.1, port 11434), which must be reachable
# when query_ollama_model is called.
ollama_model = Ollama(model="llama3.2", base_url="http://127.0.0.1:11434")


In [35]:
def query_ollama_model(query, context, prompt=PROMPT1):
    """Query the Ollama model with the given query and context.

    Args:
        query: The user question; substituted for ``{query}`` in the prompt.
        context: Supporting text; substituted for ``{context}`` in the prompt.
        prompt: Template string formatted with ``context`` and ``query``
            keyword arguments. Defaults to ``PROMPT1`` (presumably provided by
            the ``Prompts.prompt_LLM`` star import -- confirm).

    Returns:
        The model's reply as a string. If the Ollama server cannot be reached,
        the error is printed and a fixed fallback message string is returned
        instead, so callers always receive a string rather than ``None``.
    """
    formatted_prompt = prompt.format(context=context, query=query)

    print(f"formatted prompt: {formatted_prompt}")

    try:
        # invoke() is the supported replacement for the deprecated predict().
        return ollama_model.invoke(formatted_prompt)
    except requests.ConnectionError as e:
        print(f"Failed to connect to Ollama server: {e}")
        # Return the fallback explicitly; previously it was assigned but never
        # returned, so the function silently yielded None on failure.
        return "Connection failed, please check server."


def process_pdf_with_ollama(pdf_path, user_query):
    """Answer ``user_query`` using the best-matching chunk of a PDF as context.

    The PDF text is extracted, split into chunks, the chunk selected by
    ``return_response`` is chosen, and that chunk plus the question are sent to
    the model through ``query_ollama_model``. The full chunk list and the
    selected chunk are printed along the way.

    Args:
        pdf_path: Path of the PDF to read.
        user_query: The question to answer.

    Returns:
        Whatever ``query_ollama_model`` returns for the question and chunk.
    """
    # Pull the raw text out of the PDF and cut it into chunks.
    chunks = split_text_into_chunks(extract_text_from_pdf(pdf_path))
    print("document chunks: ", chunks)

    # Keep only the chunk picked as most relevant to the question.
    best_chunk = return_response(user_query, chunks)
    print("relevant chunk: ", best_chunk)

    return query_ollama_model(user_query, best_chunk)

### Launching the Model

In [None]:
# Run the whole pipeline (extract -> chunk -> retrieve -> generate) on the
# configured PDF path and user question.
response = process_pdf_with_ollama(corpus_of_documents, user_query)
# Show whatever the pipeline returned.
print("Model Response:", response)