# page by page

In [4]:
import json
import tempfile
import pandas as pd
from PyPDF2 import PdfReader, PdfWriter
from io import BytesIO
from pdf2image import convert_from_bytes
import pytesseract
from tabula.io import read_pdf

def convert_dataframe_to_strings(df):
    """Render every DataFrame row as a quoted ``"column: value, ..."`` string.

    Cells that are NA are skipped. Columns whose label contains "Unnamed"
    are rendered with an empty label, so the cell appears as ``": value"``.

    Args:
        df: The DataFrame to render.

    Returns:
        One double-quoted string per row, joined by single spaces. An empty
        DataFrame yields an empty string.
    """
    rendered_rows = []
    for _, row in df.iterrows():
        cells = []
        for col in df.columns:
            value = row[col]
            if pd.isna(value):
                continue
            # Column labels are not guaranteed to be strings (e.g. integer
            # labels on a header-less frame); normalize before the substring
            # test so it cannot raise TypeError.
            label = str(col)
            if "Unnamed" in label:
                label = ""
            cells.append(f"{label}: {value}")
        # strip(', ') trims stray separators/spaces at either end of the row.
        row_text = ", ".join(cells).strip(", ")
        rendered_rows.append(f'"{row_text}"')
    return " ".join(rendered_rows)

def parse_pdf_tables(page_binary):
    """Extract the tables on page 1 of a PDF and return them as one string.

    The bytes are written to a temporary ``.pdf`` file because ``read_pdf``
    is given a file path here. The temporary file is removed again once the
    tables have been read.

    Args:
        page_binary: Raw bytes of a PDF document. Only its first page is
            scanned (``pages="1"``).

    Returns:
        The tables formatted by ``convert_dataframe_to_strings``, joined by
        spaces and stripped of surrounding whitespace.
    """
    import os  # local import: the file's import cell does not bring in os

    # delete=False so the file can be closed before it is read by path;
    # cleanup therefore has to be done explicitly below.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_pdf:
        temp_pdf.write(page_binary)
        temp_pdf_path = temp_pdf.name

    try:
        tables = read_pdf(temp_pdf_path, pages="1", multiple_tables=True, stream=True)
    finally:
        # Previously the temp file was never removed, leaking one per call.
        os.remove(temp_pdf_path)

    return " ".join(convert_dataframe_to_strings(table) for table in tables).strip()

def process_document_without_ocr(content):
    """Extract text and tables from a PDF page by page, without OCR.

    Each page is re-serialized as a standalone single-page PDF so that
    ``parse_pdf_tables`` (which only reads page 1) sees exactly that page.

    Args:
        content: Raw bytes of the PDF document.

    Returns:
        A list with one dict per page, holding ``"page_number"`` (1-based),
        ``"content"`` (page text with newlines replaced by spaces) and
        ``"tables_content"`` (formatted table string).
    """
    pdf_reader = PdfReader(BytesIO(content))
    pages_content = []

    for page_num, page in enumerate(pdf_reader.pages, start=1):
        # extract_text() may yield nothing for a page; fall back to ''.
        page_text = page.extract_text() or ''

        # Build the single-page PDF in memory. The earlier version wrote it
        # to a delete=False temp file and read it back, leaking one file
        # per page.
        writer = PdfWriter()
        writer.add_page(page)
        page_buffer = BytesIO()
        writer.write(page_buffer)

        tables_content = parse_pdf_tables(page_buffer.getvalue())

        page_data = {
            "page_number": page_num,
            "content": page_text.strip().replace("\n", " "),
            "tables_content": tables_content
        }

        # Progress output kept for interactive inspection while running.
        print(page_data['tables_content'])
        print("-" * 50)
        pages_content.append(page_data)

    return pages_content

# Try the pipeline on a sample PDF
pdf_file_path = r"C:\Users\SaiRamPenjarla\Downloads\Cairn Monthly Production Report_Jan'24 1.pdf"

with open(pdf_file_path, "rb") as pdf_file:
    pdf_content = pdf_file.read()

# Run text + table extraction (no OCR) over every page
result_without_ocr = process_document_without_ocr(pdf_content)

# Dump each page's extracted data for a visual check
for page_result in result_without_ocr:
    print(
        f"Page {page_result['page_number']}:",
        f"Content: {page_result['content']}",
        f"Tables Content: {page_result['tables_content']}",
        "=" * 50,
        sep="\n",
    )


": Production (Net), : January 24, : Actual, : kboepd, Actual: 78.0, Business Plan Variance: 86.3 -9.7%" ": Year To Date, : Actual, : kboepd, Actual: 83.7, Business Plan Variance: 86.6 -3.4%" ": FY 24 Annual, : Forecast, : kboepd, Actual: 82.9, Business Plan Variance: 86.3 -4.0%" ": Production (Gross), : January 24, : Actual, : kboepd, Actual: 121.3, Business Plan Variance: 135.7 -10.6%" ": Year To Date, : Actual, : kboepd, Actual: 130.8, Business Plan Variance: 135.1 -3.2%" ": FY 24 Annual, : Forecast, : kboepd, Actual: 129.4, Business Plan Variance: 134.8 -4.0%" ": Year To Date, : Actual, : Million USD, Actual: 357, Business Plan Variance: 381 -6.2%" ": 2023 Annual, : Forecast, : Million USD, Actual: 433, Business Plan Variance: 460 -5.8%" ": Operating Cost" ": Year To Date - Unit, : Actual, : USD/boe, Actual: 14.0, Business Plan Variance: 14.4 -2.9%" ": 2023 Annual - Unit, : Forecast, : USD/boe, Actual: 14.3, Business Plan Variance: 14.6 -2.0%" ": LTIR, : TRIR, Actual: Million Manho

In [None]:

# Hand the whole report to the table parser directly (no per-page split)
with open(
    r"C:\Users\SaiRamPenjarla\Downloads\Cairn Monthly Production Report_Jan'24 1.pdf",
    "rb",
) as pdf_file:
    pdf_content = pdf_file.read()

parse_pdf_tables(pdf_content)

# old

In [7]:
import PyPDF2
import pytesseract
from io import BytesIO
from pdf2image import convert_from_bytes
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [8]:


class DocumentReader:
    """Turn the bytes of a PDF into a retriever over its text.

    On construction the text is extracted (embedded text first, OCR as a
    fallback), split into overlapping chunks, and indexed in a FAISS vector
    store that is exposed as an MMR retriever.
    """

    def __init__(self, content: bytes, embedding_model):
        """Extract, chunk and index ``content`` using ``embedding_model``."""
        self.chat_history = []
        self.text = self.extract_text(content)
        self.chunks = self.chunk_text(self.text)
        # The attribute keeps its original (misspelled) name so existing
        # callers continue to work; see the `retriever` property.
        self.retriver = self.vectorize_chunks(self.chunks, embedding_model)

    @property
    def retriever(self):
        """Correctly spelled, read-only alias for ``retriver``."""
        return self.retriver

    def extract_text(self, content: bytes) -> str:
        """Return the stripped text of the PDF, using OCR if needed.

        Embedded text is tried first. If that fails or yields only
        whitespace, every page is rendered to an image and run through
        Tesseract.

        Raises:
            ValueError: If the OCR fallback fails; the underlying exception
                is attached as the cause.
        """
        # NOTE(review): page texts are concatenated with no separator, so
        # words at page boundaries may run together — confirm this is wanted.
        try:
            pdf_reader = PyPDF2.PdfReader(BytesIO(content))
            text = ''.join(page.extract_text() or '' for page in pdf_reader.pages)
        except Exception as e:
            # Deliberate best-effort: report the problem and fall back to OCR.
            print(e)
        else:
            if text.strip():
                return text.strip()

        try:
            images = convert_from_bytes(content)
            return ''.join(pytesseract.image_to_string(image) for image in images).strip()
        except Exception as e:
            # Chain the original error so the traceback shows the real cause.
            raise ValueError(f"Error extracting text: {e}") from e

    def chunk_text(self, text: str):
        """Split ``text`` into chunks of 1500 characters with 230 overlap."""
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=230)
        return text_splitter.create_documents([text])

    def vectorize_chunks(self, chunks, embedding_model):
        """Index ``chunks`` in FAISS and return an MMR retriever over them."""
        vector_store = FAISS.from_documents(chunks, embedding=embedding_model)
        return vector_store.as_retriever(search_type="mmr")

In [None]:
# Project-local helper; its result is unpacked into two names below —
# presumably the chat LLM and the embedding model (verify against the helper).
from chat.rag_components.utils import load_gemini_model
gemini_llm, gemini_embedding_model = load_gemini_model()

In [None]:
# Load the raw bytes of the PDF, then build a DocumentReader over them
with open("Index.pdf", "rb") as pdf_file:
    content = pdf_file.read()
DocumentReader(content, gemini_embedding_model)

In [None]:
pip install -qU langchain-community faiss-cpu

In [1]:
from langchain.vectorstores.faiss import FAISS

In [None]:
# Embedding client configured with the "models/embedding-001" model name.
# NOTE(review): presumably needs Google API credentials in the environment — confirm.
from langchain_google_genai import GoogleGenerativeAIEmbeddings
embedding_model = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [None]:
# Load a FAISS index from the local 'faiss_index_testing' path using embedding_model.
# NOTE(review): newer langchain releases appear to require
# allow_dangerous_deserialization=True here because the stored index is
# pickle-based — confirm against the installed version, and only load
# index files from a trusted source.
vector_store = FAISS.load_local('faiss_index_testing', embedding_model)

In [2]:
# Give every dict in the list the same extra key; the bare name echoes the result
arr = [{"a": "b"} for _ in range(2)]
for i in arr:
    i.update(c="d")

arr

[{'a': 'b', 'c': 'd'}, {'a': 'b', 'c': 'd'}]