In [None]:
# Importing required libraries for subprocess, concurrent tasks, and PDF processing
import base64  # Import base64 for image encoding
import os
import subprocess
import time
import uuid
from concurrent.futures import ThreadPoolExecutor, as_completed

import fitz  # Import PyMuPDF
from PyPDF2 import PdfReader  # For handling PDF files
from tqdm import tqdm  # For progress bars

# Importing LangChain specific components for document processing and retrieval
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.retrievers.multi_vector import MultiVectorRetriever, SearchType
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain.document_loaders import PDFPlumberLoader
from langchain.text_splitter import NLTKTextSplitter
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from BCEmbedding import RerankerModel
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_core.messages import HumanMessage, SystemMessage

In [None]:
def june_run_nougat(file_path, output_dir):
    """
    Run the Nougat command-line tool on the given file.

    Args:
        file_path (str): Path to the input PDF file.
        output_dir (str): Directory to store the output.

    Returns:
        int: 0 if operation is successful, 1 if failed (non-zero exit status
            or the executable could not be launched).
    """
    cmd = ["nougat.exe", file_path, "-o", output_dir, "-m", "0.1.0-base", "--no-skipping"]
    try:
        res = subprocess.run(cmd)
    except OSError as e:
        # Raised when the executable is missing or cannot be started; report
        # it as a normal failure instead of crashing the calling worker.
        print(f"Error when running Nougat on {file_path}: {e}")
        return 1

    if res.returncode != 0:
        print(f"Error when running Nougat on {file_path}.")
        # Normalize every non-zero exit status to 1: the documented contract
        # is 0/1, and the caller in this file compares the result against 1.
        return 1

    print(f"Operation completed for {file_path}!")
    return 0

def june_get_tables_from_mmd(mmd_path):
    """
    Extract tables from an MMD file generated by Nougat.

    A table is the text from a ``\\begin{table}`` line through the matching
    ``\\end{table}`` line, plus the single line that follows it (treated as
    the caption). Marker lines are matched after stripping surrounding
    whitespace, so a final line without a trailing newline is recognized.

    Args:
        mmd_path (str): Path to the MMD file.

    Returns:
        list: List of tables (str) extracted from the MMD file, in file order.
            Tables that are opened but never closed are not returned.
    """
    begin_marker = "\\begin{table}"
    end_marker = "\\end{table}"

    tables = []
    current = []
    # None: outside a table; "TABLE": collecting table lines;
    # "CAPTION": table closed, the next line is taken as its caption.
    state = None

    with open(mmd_path, encoding='utf-8') as f:
        for line in f:
            marker = line.strip()
            if marker == begin_marker:
                if state == "CAPTION":
                    # A new table starts right after the previous one closed:
                    # emit the previous table instead of merging the two.
                    tables.append(''.join(current))
                    current = []
                state = "TABLE"
                current.append(line)
            elif marker == end_marker:
                current.append(line)
                state = "CAPTION"
            elif state == "TABLE":
                current.append(line)
            elif state == "CAPTION":
                current.append(line)
                tables.append(''.join(current))
                current = []
                state = None

    # The file ended right after \end{table}; keep the table without a caption.
    if state == "CAPTION":
        tables.append(''.join(current))

    return tables

def process_pdf(file_path, output_dir):
    """
    Process a PDF file by extracting text, tables, and images, and run Nougat tool.

    Text is read with PyPDF2 and split with NLTKTextSplitter, embedded images
    are read with PyMuPDF and stored as base64 data URIs, and tables are taken
    from the ``.mmd`` file Nougat writes into ``output_dir``.

    Args:
        file_path (str): Path to the input PDF file.
        output_dir (str): Directory to store the output.

    Returns:
        tuple | None: ``None`` if Nougat fails, otherwise
        (texts_with_metadata, tables_with_metadata, images_with_metadata, pdf_id, elapsed_time)
            - texts_with_metadata: List of documents containing the extracted text.
            - tables_with_metadata: List of documents containing the extracted tables.
            - images_with_metadata: List of documents containing extracted image base64 URIs.
            - pdf_id: Unique identifier for the PDF file.
            - elapsed_time: Time taken to process the PDF.
    """
    start_time = time.time()

    pdf_id = str(uuid.uuid4())
    texts_with_metadata = []
    tables_with_metadata = []
    images_with_metadata = []

    # Load PDF and extract text
    reader = PdfReader(file_path)
    pages = []

    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            pages.append(Document(page_content=page_text, metadata={"pdf_id": pdf_id}))

    # Split text using NLTKTextSplitter
    text_splitter = NLTKTextSplitter()
    texts = text_splitter.split_documents(pages)

    # Add PDF ID to each text document's metadata
    texts_with_metadata.extend([Document(page_content=text.page_content, metadata={"pdf_id": pdf_id}) for text in texts])

    # --- Image extraction (best effort: a failure here is logged, not fatal) ---
    try:
        doc = fitz.open(file_path)
        try:
            for page_num in range(len(doc)):
                page_images = doc.load_page(page_num).get_images(full=True)
                for img in page_images:
                    xref = img[0]
                    base_image = doc.extract_image(xref)
                    image_bytes = base_image["image"]
                    image_ext = base_image["ext"]

                    # Encode as base64
                    img_base64 = base64.b64encode(image_bytes).decode('utf-8')
                    img_uri = f"data:image/{image_ext};base64,{img_base64}"

                    images_with_metadata.append(Document(
                        page_content=img_uri,
                        metadata={"pdf_id": pdf_id, "type": "image", "page": page_num}
                    ))
        finally:
            # Always release the document handle, even if a page fails midway.
            doc.close()
        print(f"Extracted {len(images_with_metadata)} images from {file_path}")
    except Exception as e:
        print(f"Error extracting images from {file_path}: {e}")
    # --- End of image extraction ---

    # Run Nougat tool; any non-zero status is a failure, not only the value 1.
    if june_run_nougat(file_path, output_dir) != 0:
        print(f"Failed to process {file_path}")
        return None

    # Extract tables from MMD file generated by Nougat
    mmd_path = os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + ".mmd")
    if os.path.isfile(mmd_path):
        tables = june_get_tables_from_mmd(mmd_path)
    else:
        # Nougat reported success but wrote no output; keep the extracted
        # text and images rather than raising inside the worker thread.
        print(f"No Nougat output found at {mmd_path}; skipping table extraction")
        tables = []
    tables_with_metadata.extend([Document(page_content=table, metadata={"pdf_id": pdf_id}) for table in tables])

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Processed {file_path} in {elapsed_time:.2f} seconds")

    return texts_with_metadata, tables_with_metadata, images_with_metadata, pdf_id, elapsed_time

def process_pdfs_in_batches(input_dir, output_dir, batch_size=10, max_workers=4):
    """
    Process multiple PDFs in batches with parallel execution.

    Files are submitted in sorted name order, but results are collected in
    completion order. ``pdf_ids`` and ``processing_times`` are therefore
    aligned with each other, not with the directory listing. PDFs that fail
    (``process_pdf`` returns ``None`` or raises) are logged and skipped.

    Args:
        input_dir (str): Directory containing PDF files.
        output_dir (str): Directory to store the output.
        batch_size (int): Number of PDFs to process in each batch.
        max_workers (int): Maximum number of concurrent workers.

    Returns:
        tuple: (all_texts, all_tables, all_images, pdf_ids, processing_times)
            - all_texts: List of all extracted texts from PDFs.
            - all_tables: List of all extracted tables from PDFs.
            - all_images: List of all extracted images from PDFs.
            - pdf_ids: List of unique PDF IDs.
            - processing_times: List of processing times for each PDF.
    """
    # Sorted for a deterministic submission order; the extension match is
    # case-insensitive so files such as "REPORT.PDF" are not skipped.
    pdf_files = sorted(
        os.path.join(input_dir, f)
        for f in os.listdir(input_dir)
        if f.lower().endswith('.pdf')
    )

    all_texts = []
    all_tables = []
    all_images = []
    pdf_ids = []
    processing_times = []

    for i in range(0, len(pdf_files), batch_size):
        batch_files = pdf_files[i:i + batch_size]

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_file = {
                executor.submit(process_pdf, pdf_file, output_dir): pdf_file
                for pdf_file in batch_files
            }

            # Show progress using tqdm
            for future in tqdm(as_completed(future_to_file), total=len(batch_files), desc=f"Processing batch {i // batch_size + 1}"):
                try:
                    result = future.result()
                except Exception as e:
                    # One unreadable PDF must not abort the remaining files.
                    print(f"Error processing {future_to_file[future]}: {e}")
                    continue
                if result is None:
                    continue

                texts, tables, images, pdf_id, elapsed_time = result
                all_texts.extend(texts)
                all_tables.extend(tables)
                all_images.extend(images)
                pdf_ids.append(pdf_id)
                processing_times.append(elapsed_time)

    return all_texts, all_tables, all_images, pdf_ids, processing_times

# Example usage
input_dir = "/path/to/pdf/directory"
output_dir = "/path/to/output/directory"

# Process PDFs in batches
texts, tables, images, pdf_ids, processing_times = process_pdfs_in_batches(input_dir, output_dir)

# Print processing times for each successfully processed PDF.
# pdf_ids and processing_times are appended together per result, so they are
# index-aligned. The directory listing is not: it may contain non-PDF files,
# has no guaranteed order, and still lists PDFs that failed and were skipped.
for pdf_id, processing_time in zip(pdf_ids, processing_times):
    print(f"PDF {pdf_id} processed in {processing_time:.2f} seconds")

In [None]:
# Define the table summarization prompt.
# `{element}` is the template variable the summarization chain fills in; the
# trailing backslash joins the two source lines into one line of prompt text.
table_prompt_text = """You are an assistant tasked with summarizing tables. \
Give a concise summary of the table by forming logical and corresponding relationships rather than broad summaries. Table chunk: {element}"""
table_prompt = ChatPromptTemplate.from_template(table_prompt_text)

# Define the text summarization prompt (same `{element}` variable as above)
text_prompt_text = """You are an assistant tasked with summarizing text. \
Give a concise summary of the text chunk. Text chunk: {element}"""
text_prompt = ChatPromptTemplate.from_template(text_prompt_text)

# --- New image summarization prompt ---
def create_image_summarization_prompt(image_uri):
    """Build the two-message multimodal prompt used to summarize one image.

    Args:
        image_uri (str): URL placed in the ``image_url`` content part.

    Returns:
        list: ``[SystemMessage, HumanMessage]``; the human message carries a
            text part followed by the image part.
    """
    system_instructions = SystemMessage(
        content="You are an expert assistant tasked with summarizing images from a research paper. Describe the key elements, data, methodology, and conclusions shown in this image. Be concise and accurate."
    )
    request_parts = [
        {"type": "text", "text": "Please summarize this image from a research paper:"},
        {"type": "image_url", "image_url": {"url": image_uri}},
    ]
    return [system_instructions, HumanMessage(content=request_parts)]
# --- End of image prompt ---


# Create the model instance shared by the three summarization chains below
model = ChatOpenAI(temperature=0, model="gpt-4o-mini") # Original author's note: this model already supports vision

# Table summarization chain: the raw input is mapped to the prompt's
# `element` variable, sent to the model, and parsed to a plain string.
table_summarize_chain = {"element": lambda x: x} | table_prompt | model | StrOutputParser()

# Text summarization chain (same shape as the table chain)
text_summarize_chain = {"element": lambda x: x} | text_prompt | model | StrOutputParser()

# --- New image summarization chain ---
# Unlike the two chains above, this one expects a dict input with an
# "element" key and builds the message list directly instead of a template.
image_summarize_chain = (
    RunnableLambda(lambda x: create_image_summarization_prompt(x["element"]))
    | model 
    | StrOutputParser()
)
# --- End of image chain ---

def summarize_tables(tables, max_concurrency=5):
    """
    Summarize every table with the table summarization chain.

    Args:
        tables (list): Document objects whose ``page_content`` holds a table.
        max_concurrency (int): Upper bound on concurrent chain requests.

    Returns:
        list: The chain's batch output (one summary per table), or an empty
            list if anything raises while building inputs or calling the chain.
    """
    batch_config = {"max_concurrency": max_concurrency}
    try:
        # The chain takes the raw string, so unwrap each Document first.
        table_payloads = [doc.page_content for doc in tables]
        summaries = table_summarize_chain.batch(table_payloads, batch_config)
    except Exception as e:
        print(f"Error summarizing tables: {e}")
        summaries = []
    return summaries

def summarize_texts(texts, max_concurrency=5):
    """
    Summarize every text chunk with the text summarization chain.

    Args:
        texts (list): Document objects whose ``page_content`` holds a text chunk.
        max_concurrency (int): Upper bound on concurrent chain requests.

    Returns:
        list: The chain's batch output (one summary per chunk), or an empty
            list if anything raises while building inputs or calling the chain.
    """
    batch_config = {"max_concurrency": max_concurrency}
    try:
        # The chain takes the raw string, so unwrap each Document first.
        text_payloads = [doc.page_content for doc in texts]
        summaries = text_summarize_chain.batch(text_payloads, batch_config)
    except Exception as e:
        print(f"Error summarizing texts: {e}")
        summaries = []
    return summaries

# --- New image summarization function ---
def summarize_images(images, max_concurrency=5):
    """
    Summarize every image with the image summarization chain.

    Args:
        images (list): Document objects whose ``page_content`` holds an image
            base64 URI.
        max_concurrency (int): Upper bound on concurrent chain requests.

    Returns:
        list: The chain's batch output (one summary per image), or an empty
            list if anything raises while building inputs or calling the chain.
    """
    batch_config = {"max_concurrency": max_concurrency}
    try:
        # The image chain reads x["element"], so wrap each URI in a dict.
        image_payloads = [{"element": doc.page_content} for doc in images]
        summaries = image_summarize_chain.batch(image_payloads, batch_config)
    except Exception as e:
        print(f"Error summarizing images: {e}")
        summaries = []
    return summaries
# --- End of image function ---


# Assuming `tables`, `texts`, and `images` are already obtained
# NOTE(review): each summarize_* helper returns [] on any error, so a summary
# list can be shorter than its input list; later code indexes them in lockstep.
# Get table summaries
table_summaries = summarize_tables(tables, max_concurrency=5)

# Get text summaries
text_summaries = summarize_texts(texts, max_concurrency=5)

# --- Get image summaries ---
image_summaries = summarize_images(images, max_concurrency=5)
# --- --- --- --- --- --- --- ---

# Print the summaries (numbered from 1)
for i, table_summary in enumerate(table_summaries):
    print(f"Table {i+1} Summary: {table_summary}")

for i, text_summary in enumerate(text_summaries):
    print(f"Text {i+1} Summary: {text_summary}")

# --- Print image summaries ---
for i, image_summary in enumerate(image_summaries):
    print(f"Image {i+1} Summary: {image_summary}")
# --- --- --- --- --- --- --- ---

In [None]:
# Set persistent directory
# NOTE(review): the empty string is a placeholder; confirm how Chroma treats
# it before relying on persistence.
persist_directory = ''  # Specify your persistent directory here

# Initialize the Chroma vector store with OpenAI embeddings
vectorstore = Chroma(collection_name="summaries", embedding_function=OpenAIEmbeddings(), persist_directory=persist_directory)
# NOTE(review): this runs before any document is added; check whether the
# installed Chroma version still needs (or supports) an explicit persist().
vectorstore.persist()

# Create an in-memory document store (contents are not persisted to disk)
store = InMemoryStore()
# Metadata key that links a vector-store entry to its docstore entry
id_key = "doc_id"

# Create the MultiVectorRetriever for handling vector and document retrieval
MultiVector_retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
    search_kwargs={"k": 10}  # Adjust the number of results to return from the search
)

# Function to add documents (text or tables) with summaries to the retriever
def add_documents_to_retriever(contents, summaries, documents_type='text'):
    """
    Adds text, table, or image documents along with summaries to the vector store and document store.

    For 'text' and 'table' the original content is embedded and the summary
    is kept in metadata. For 'image' the summary is embedded instead, because
    the content is a base64 data URI, which carries no meaningful text for a
    text embedding model. In every case the docstore receives the original
    ``page_content`` string under the generated id.

    Args:
        contents (list): A list of Document objects (text, table, or image).
        summaries (list): A list of summaries corresponding to the documents.
        documents_type (str): The type of document being added ('text', 'table', or 'image').

    Raises:
        ValueError: If there are fewer summaries than documents.
    """
    # Nothing to index (e.g. a corpus with no images); skip the store calls.
    if not contents:
        return

    if len(summaries) < len(contents):
        raise ValueError(
            f"Expected at least {len(contents)} summaries for {documents_type} "
            f"documents, got {len(summaries)}"
        )

    embed_summary = documents_type == 'image'
    doc_ids = [str(uuid.uuid4()) for _ in contents]

    # Create the vector-store documents with content and metadata
    documents = [
        Document(
            page_content=summary if embed_summary else content.page_content,
            metadata={
                id_key: doc_id,
                "pdf_id": content.metadata["pdf_id"],
                "summary": summary,
                "type": documents_type,
            },
        )
        for doc_id, content, summary in zip(doc_ids, contents, summaries)
    ]

    # Add documents to the vector store and the raw content to the docstore
    MultiVector_retriever.vectorstore.add_documents(documents)
    MultiVector_retriever.docstore.mset(
        [(doc_id, content.page_content) for doc_id, content in zip(doc_ids, contents)]
    )

# Add text and table documents to the retriever
add_documents_to_retriever(texts, text_summaries, documents_type='text')
add_documents_to_retriever(tables, table_summaries, documents_type='table')
# --- Add images to the retriever ---
add_documents_to_retriever(images, image_summaries, documents_type='image')
# --- --- --- --- --- --- --- ---

# Set the search type to MMR (Maximal Marginal Relevance)
MultiVector_retriever.search_type = SearchType.mmr

In [None]:
# Create unique IDs for each text, table, and image.
# NOTE(review): these ids are generated independently of the doc ids created
# inside add_documents_to_retriever, so the two id spaces do not match.
text_ids = [str(uuid.uuid4()) for _ in texts]
table_ids = [str(uuid.uuid4()) for _ in tables]
image_ids = [str(uuid.uuid4()) for _ in images] 

# Initialize mappings for content and summaries
id_to_content = {}
id_to_summary = {}

# Build the mapping for texts
for i, text in enumerate(texts):
    text_id = text_ids[i]
    id_to_content[text_id] = text.page_content # Use .page_content to get the string out of the Document
    id_to_summary[text_id] = text_summaries[i]

# Build the mapping for tables
for i, table in enumerate(tables):
    table_id = table_ids[i]
    id_to_content[table_id] = table.page_content # Use .page_content to get the string out of the Document
    id_to_summary[table_id] = table_summaries[i]

# --- Add image mapping (summary only) ---
# Build the mapping for images
for i, image in enumerate(images):
    image_id = image_ids[i]
    # We do NOT add image.page_content (base64) to id_to_content
    # because it is noise for BM25 text search.
    id_to_summary[image_id] = image_summaries[i]
# --- --- --- --- --- --- --- ---

# Combine original content and summaries for BM25 input
combined_texts_with_ids = []
# Add texts and tables (content + summary): two entries per id
for content_id in id_to_content.keys():
    combined_texts_with_ids.append((content_id, id_to_content[content_id]))  
    combined_texts_with_ids.append((content_id, id_to_summary[content_id]))  

# --- Add image summaries (summary only) ---
for image_id in image_ids:
    if image_id in id_to_summary: # Make sure the summary exists
        combined_texts_with_ids.append((image_id, id_to_summary[image_id])) 
# --- --- --- --- --- --- --- ---

# Initialize BM25Retriever
# Only the text half of each (id, text) pair is passed on; the ids are
# discarded here, so BM25 results carry no link back to the id mappings.
# NOTE(review): confirm BM25Retriever.from_texts copes with an empty list.
bm25_retriever = BM25Retriever.from_texts(
    [text for _, text in combined_texts_with_ids], k=10  # Top 10 results
)

# Initialize the EnsembleRetriever with BM25 and MultiVector retrievers
# NOTE(review): the weights do not sum to 1; confirm that is intended.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, MultiVector_retriever], weights=[0.3, 1]
)

# Function to handle retrieval and ranking using EnsembleRetriever
def retrieve_with_ensemble(query, k=10):
    """
    Retrieves documents using the ensemble of retrievers (BM25 and MultiVector).

    Args:
        query (str): The search query to use for retrieval.
        k (int): The maximum number of results to return.

    Returns:
        List of documents retrieved based on the ensemble search, truncated
        to at most ``k`` entries.
    """
    # LangChain retrievers are called through the Runnable interface
    # (`invoke`); they expose no `retrieve` method. The ensemble takes no
    # per-call `k`, so the fused ranking is trimmed here.
    results = ensemble_retriever.invoke(query)
    return results[:k]

In [None]:
from langchain_core.messages import HumanMessage, SystemMessage

# LLM
# Switch the model to a vision model.
# This rebinds the module-level name `model`; chains built earlier keep the
# instance they were constructed with.
model = ChatOpenAI(temperature = 0, model = "gpt-4o-mini")

def create_multimodal_prompt(input_dict):
    """
    Build a multimodal (text and image) prompt from retrieved context and a question.

    Context items that are strings starting with ``data:image`` become image
    parts; other strings are grouped into text parts, with consecutive strings
    joined by a ``---`` separator line. Non-string items are ignored. The
    original order of text runs and images is preserved.

    Args:
        input_dict (dict): Must contain ``"context"`` (iterable of retrieved
            items) and ``"question"``.

    Returns:
        list: ``[SystemMessage, HumanMessage]`` ready to send to the chat model.
    """
    retrieved_items = input_dict["context"]
    question = input_dict["question"]

    system_message = SystemMessage(
        content="Answer the question based only on the following context, which can include text, tables, and images:"
    )

    # The human message opens with the question, then the context parts.
    parts = [{"type": "text", "text": f"Question: {question}\n\nContext:"}]
    pending_text = []

    def flush_pending_text():
        """Emit buffered text as one part so it stays ahead of the next image."""
        if pending_text:
            parts.append({"type": "text", "text": "\n---\n".join(pending_text)})
            pending_text.clear()

    for item in retrieved_items:
        if not isinstance(item, str):
            continue
        if item.startswith("data:image"):
            flush_pending_text()
            parts.append({"type": "image_url", "image_url": {"url": item}})
        else:
            pending_text.append(item)

    # Emit whatever text followed the last image (or all of it, if no images).
    flush_pending_text()

    return [system_message, HumanMessage(content=parts)]


# New chain that uses the multimodal prompt.
# The same input feeds both branches: "context" runs it through
# create_original_query then create_documents, "question" passes it through.
# NOTE(review): create_original_query and create_documents are not defined in
# the visible part of this file; they must exist before this cell runs.
chain = (
    {
        "context": RunnableLambda(create_original_query) | RunnableLambda(create_documents), 
        "question": RunnablePassthrough()
    }
    | RunnableLambda(create_multimodal_prompt)  # Uses the new function to build the prompt
    | model  
    | StrOutputParser()  
)

In [None]:
from ragas.llms.base import LangchainLLMWrapper
from langchain_community.chat_models import ChatOpenAI  # Modify import
from ragas import evaluate
from ragas.metrics import (
    RubricsScoreWithReference,
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
    answer_correctness,
    answer_similarity
)

# Create Langchain LLM instance
# Switch the model here to a vision model.
# NOTE: `ChatOpenAI` here is the langchain_community class imported at the top
# of this cell, which rebinds the name imported from langchain_openai earlier.
langchain_llm = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)

# Wrap the Langchain LLM instance in LangchainLLMWrapper
wrapped_llm = LangchainLLMWrapper(langchain_llm)

# Define custom rubrics: one description per score level, 1 (worst) to 5 (best)
my_custom_rubrics = {
    "score1_description": "The response is incorrect, irrelevant, or does not align with the ground truth.",
    "score2_description": "The response partially matches the ground truth but includes significant errors, omissions, or irrelevant information.",
    "score3_description": "The response generally aligns with the ground truth but may lack detail, clarity, or have minor inaccuracies.",
    "score4_description": "The response is mostly accurate and aligns well with the ground truth, with only minor issues or missing details.",
    "score5_description": "The response is fully accurate, aligns completely with the ground truth, and is clear and detailed.",
}

# Use the dataset you previously constructed
# dataset = Dataset.from_dict(data)  # Constructed earlier
# NOTE(review): `dataset` is not defined in the visible part of this file; if
# it is missing, the NameError is caught below and only printed.

# Instantiate the metric class
metric_with_ref = RubricsScoreWithReference(rubrics=my_custom_rubrics)

# Perform the evaluation on the dataset
try:
    result = evaluate(
        dataset=dataset,
        metrics=[
            metric_with_ref,
            context_precision,
            faithfulness,
            answer_relevancy,
            context_recall,
            answer_correctness,
            answer_similarity
        ],
        llm=wrapped_llm  # Use the wrapped (vision-capable) LLM for evaluation
    )
    # Print the evaluation results
    print(result)
except Exception as e:
    # Any failure is reported and swallowed so the notebook cell still finishes.
    print(f"Error during evaluation: {e}")