In [None]:
import argparse
import os
import shutil
import pandas
import re
import chromadb.config
import chromadb.types



from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain_community.vectorstores.chroma import Chroma
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain_community.llms.ollama import Ollama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.document_loaders import TextLoader

from PIL import Image
from pytesseract import image_to_string
from pdf2image import convert_from_path

### Embedding Function

In [None]:

def get_embedding_function():
    """Return the Ollama embedding model used to vectorize document chunks.

    A new ``OllamaEmbeddings`` instance configured for the
    ``mxbai-embed-large`` model is created on every call.
    """
    return OllamaEmbeddings(model="mxbai-embed-large")


### Data and Model

In [None]:
def pdf_to_text(pdf_file, output_file):
    """Extract the text of a PDF via OCR and write it to a text file.

    Every page of ``pdf_file`` is rendered to an image with
    ``convert_from_path`` and passed to ``image_to_string``; the per-page
    results are concatenated in page order without any separator.

    Args:
        pdf_file: Path of the PDF to read.
        output_file: Path of the text file to create or overwrite (UTF-8).
    """
    page_texts = [image_to_string(page) for page in convert_from_path(pdf_file)]
    with open(output_file, "w", encoding="utf-8") as handle:
        handle.write("".join(page_texts))
# PDF documents that are passed to process_documents further down.
data_paths =["example_invoice1.pdf", "example_invoice2.pdf", "example_invoice3.pdf"]
# Expected invoice numbers, one inner list per document; used as ground truth
# by the evaluation cell. Presumably index-aligned with data_paths — confirm.
invoice_numbers=[['1234567'], ['R/1243737'], ['RE22000188', '203857593']]
# Module-level defaults. process_documents defines its own local `output_file`
# and `invoice_numbers_detected`, so these two values are not used by it.
output_file = "extracted_text.txt"
invoice_numbers_detected=[]
# LangChain Ollama LLM that answers the extraction prompt (near-zero temperature).
model = Ollama(model="llama3-chatqa", temperature=0.01)
# Prompt template with two placeholders: {context} (retrieved chunks) and
# {query} (the task text). The string is sent to the model as-is, so its tags
# and whitespace are part of the runtime behavior.
# NOTE(review): "<|System|>>" has a doubled ">" and "Kontext" / "Assistent"
# are German spellings — confirm these are intended for the chosen model.
template = """
        <|System|>>
    You are a very helpful AI assistant who follows instructions very well.
    Use the following context to answer the question.

    Think step-by-step and carefully review the document provided before responding. You will receive $100 if you answer the question correctly.

    Make sure to respond only with the invoice number(s) and no additional text!


    Kontext: {context}
    </s>
    <|user|>
    {query}
    </s>
    <|Assistent|>
    """
# Module-level prompt and parser built from the template above;
# process_documents creates its own local instances of both.
prompt = ChatPromptTemplate.from_template(template)
output_parser = StrOutputParser()

### Invoice Number Identification

In [None]:
def process_documents(data_paths_subset):
    """Extract invoice numbers from PDF documents with a hybrid retrieval chain.

    For every PDF the text is OCR-extracted into ``extracted_text_part{i}.txt``,
    split into overlapping chunks and indexed twice: in a Chroma vector store
    and in a BM25 keyword retriever. Both retrievers are combined in an
    ensemble (weights 0.1 dense / 0.9 BM25) that supplies the context for the
    module-level ``template`` and ``model``.

    Args:
        data_paths_subset: Iterable of PDF file paths to process.

    Returns:
        list: One raw model answer per input path, in input order. A document
        whose extracted text yields no chunks contributes an empty string.
    """
    # Everything that does not depend on the current document is built once.
    output_parser = StrOutputParser()
    prompt = ChatPromptTemplate.from_template(template)
    embedding_function = get_embedding_function()
    splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=100)

    # Task text sent as the query; kept verbatim (including its whitespace).
    task_query = """
            Task: A reminder needs to be analyzed. The goal is to identify and extract the invoice numbers. 
            The invoice number is often a number of 6–12 numeric or alphanumeric digits. Also look for letters such as "RE" or "AR" as part of the number or 
            separated with the numbers by "/" or "-".  
            
            Look for terms like "Invoice Number", "Invoice No.", "Beleg", "Belegnummer", or similar nearby or in a table.                         
            There may be multiple invoice numbers in a document, and you should recognize all of them. 
            
            If there is more than one invoice number in a document, make sure to split them with a comma.                                
            Make sure to respond only with the invoice number(s) and avoid any additional text or "None" as an answer. 
            
            Which invoice numbers do you find in the document?
        """

    # Collected model answers, one per processed document
    invoice_numbers_detected = []

    for i, data_path in enumerate(data_paths_subset):
        output_file = f"extracted_text_part{i}.txt"

        # Convert PDF to plain text and save to file
        pdf_to_text(data_path, output_file)

        # pdf_to_text writes UTF-8, so read the file back with the same
        # encoding instead of the platform default.
        docs = TextLoader(output_file, encoding="utf-8").load()

        # Split text into overlapping chunks for retrieval
        chunks = splitter.split_documents(docs)
        print(f"{data_path}: {len(chunks)} chunk(s)")

        # Without any chunk neither retriever can be built; record an empty
        # answer so the result list stays aligned with the input paths.
        if not chunks:
            invoice_numbers_detected.append("")
            continue

        # Create a vectorstore (Chroma) from the chunks using embedding function
        vectorstore = Chroma.from_documents(chunks, embedding_function)
        try:
            # Dense (vector-based) and sparse (BM25) retrievers, one hit each
            vectorstore_retriever = vectorstore.as_retriever(search_kwargs={"k": 1})
            keyword_retriever = BM25Retriever.from_documents(chunks)
            keyword_retriever.k = 1

            # Combine both retrievers into an ensemble with custom weights
            ensemble_retriever = EnsembleRetriever(
                retrievers=[vectorstore_retriever, keyword_retriever],
                weights=[0.1, 0.9]
            )

            # Processing chain: retrieval -> prompt -> model -> output parsing
            chain = (
                {"context": ensemble_retriever, "query": RunnablePassthrough()}
                | prompt
                | model
                | output_parser
            )

            response = chain.invoke(task_query)
        finally:
            # Always remove this document's entries, even when the chain
            # failed — presumably so its chunks cannot be retrieved for a
            # later document.
            ids = vectorstore.get(where={'source': output_file})['ids']
            if ids:
                vectorstore.delete(ids)
            del vectorstore

        # Store the response
        invoice_numbers_detected.append(response)

    return invoice_numbers_detected

# Run the extraction for every PDF listed in data_paths. invoices_results
# holds one raw model answer per document and is read by the evaluation code.
invoices_results = process_documents(data_paths)



### Evaluation

In [None]:
def flatten_list(nested_list):
    """Flatten an arbitrarily nested list into a single flat list.

    Only ``list`` instances are expanded (recursively); every other element,
    including tuples and strings, is kept as-is. Element order is preserved.
    """
    result = []
    for entry in nested_list:
        # Nested lists are expanded recursively, anything else is one item.
        result += flatten_list(entry) if isinstance(entry, list) else [entry]
    return result

def split_and_flatten(numbers_list):
    """Split comma-separated strings into individual, trimmed entries.

    Every string in ``numbers_list`` is split on ``','`` and each piece is
    stripped of surrounding whitespace. Empty pieces are kept, and the pieces
    of all strings are returned in one flat list in their original order.
    """
    return [
        piece.strip()
        for entry in numbers_list
        for piece in entry.split(',')
    ]

# Flatten the expected invoice numbers and split the LLM answers into
# individual numbers.
flat_invoice_numbers = flatten_list(invoice_numbers)
processed_output = split_and_flatten(invoices_results)

# Compare as sets: duplicates and ordering do not affect the score.
set_actual_numbers = set(flat_invoice_numbers)
set_detected_numbers = set(processed_output)
intersection = set_actual_numbers.intersection(set_detected_numbers)

# Share of expected invoice numbers that were detected, in percent. The guard
# reports 0% for an empty ground truth instead of raising ZeroDivisionError.
if set_actual_numbers:
    accuracy = len(intersection) / len(set_actual_numbers) * 100
else:
    accuracy = 0.0
print(f"Anzahl der korrekt erkannten Rechnungsnummern: {len(intersection)}")
print("Korrekt erkannte Rechnungsnummern:", intersection)
print(f"Genauigkeit der Erkennung: {accuracy:.2f}%")





In [None]:
import matplotlib.pyplot as plt

# NOTE(review): both tallies are hard-coded rather than computed in this
# notebook — presumably taken from a separate, larger evaluation run; confirm.
correct_classifications = 47
incorrect_classifications = 51

# Bar labels and their heights, in matching order
labels = ['Correctly Classified', 'Incorrectly Classified']
counts = [correct_classifications, incorrect_classifications]

# One green and one red bar; axis label and title are set in a single call
fig, ax = plt.subplots()
ax.bar(labels, counts, color=['green', 'red'])
ax.set(ylabel='Number of Invoice Numbers', title='RAG Model Classification Results')

# Display the chart
plt.show()

### Use LlamaParse to Improve Performance

In [None]:
# Load environment variables (including those defined in a local .env file)
import os
from dotenv import load_dotenv
load_dotenv()

# Patch asyncio to work inside notebooks or nested environments
import nest_asyncio
nest_asyncio.apply()


from llama_parse import LlamaParse
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# Read the LlamaParse API key from the environment and fail early with a clear
# message when it is missing. Never paste the key into the notebook itself.
llamaparse_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
if not llamaparse_api_key:
    raise ValueError(
        "LLAMA_CLOUD_API_KEY is not set; define it in the environment or in a .env file."
    )

# Configure the PDF parser
parser = LlamaParse(
    api_key=llamaparse_api_key,
    result_type="markdown"  # formats: "markdown" or "text"
)

# Parse the input document(s) using the parser
file_extractor = {".pdf": parser}
documents = SimpleDirectoryReader(
    input_files=["example_invoice1.pdf"],  # update with your PDF filename(s)
    file_extractor=file_extractor
).load_data()

# === Embedding and LLM Setup ===
from llama_index.embeddings.ollama import OllamaEmbedding
# NOTE(review): this import rebinds the name `Ollama`, which the import cell
# at the top of the notebook bound to the LangChain class. Re-running an
# earlier cell that calls `Ollama(...)` after this one would pick up the
# LlamaIndex class instead — restart the kernel before re-running from the top.
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings

# Use Ollama for both embedding and language model
embed_model = OllamaEmbedding(model_name="llama3")
llm = Ollama(model="llama3", request_timeout=30.0)

# Apply settings globally
Settings.embed_model = embed_model
Settings.llm = llm

# === Indexing and Query Engine Setup ===

# Create a vector-based index from the parsed documents
index = VectorStoreIndex.from_documents(documents)

# Create a query engine based on the index
query_engine = index.as_query_engine()


# Prompt to extract invoice numbers
query = (
    "Please analyze the full document, including all text sections and tables. "
    "Extract every invoice number mentioned in the document. "
    "Invoice numbers are usually 6–12 digits, sometimes including slashes or dashes (e.g. RE-2023/0921). "
    "Use keywords like 'Invoice Number', 'Rechnungsnummer', 'Beleg', etc. "
    "List all invoice numbers found, separated by a comma. Do not include any other text."
)

# Run the query against the document index
response = query_engine.query(query)

# Display extracted invoice numbers
print("Extracted Invoice Numbers:")
print(response)

# === (Optional) Ground Truth Comparison ===

# Define the correct/expected invoice numbers for this document
ground_truth = {"RE-2023/0921", "INV-001237", "AB123456"}

# Convert the model output into a set of strings for comparison. All
# whitespace (spaces, newlines, tabs) is removed from every comma-separated
# piece so that line breaks in the answer do not prevent a match, and empty
# pieces (e.g. from an empty answer or a trailing comma) are dropped.
predicted_numbers = {
    "".join(piece.split())
    for piece in str(response).split(",")
}
predicted_numbers.discard("")

# Evaluate model performance
true_positives = predicted_numbers & ground_truth
false_positives = predicted_numbers - ground_truth
false_negatives = ground_truth - predicted_numbers
