In [2]:
from pathlib import Path
# --- User Input for Company Folder ---
# Name of the company folder (e.g., "PVIAM", "PVI Insurance").
# Every later cell derives its input/output directories from this name, so an empty
# answer is rejected here: otherwise `company_base_path` would silently collapse to the
# project root and the following cells would create their output folders there.
company_folder_name = input("Enter the company folder name (e.g., PVIAM): ").strip()

if not company_folder_name:
    raise ValueError("No company folder name entered. Please run this cell again and provide a name.")
print(f"Company folder set to: {company_folder_name}")

# Construct the base path for the company's data
# This assumes your company folders are directly under 'Financial Statement Data Retriever'
company_base_path = Path(r"D:\Visual Studio Projects\Financial Statement Data Retriever") / company_folder_name

# --- User Input for Periods ---
# Enter the periods (e.g., years, quarters) you want to process, separated by commas.
# Example: "2021, 2022, 2023, 2024" or "2020_Q1, 2020_Q2, 2021"
periods_input = input("Enter periods to process (e.g., 2021, 2022, 2023): ")
# Split on commas, trim whitespace, drop empty entries (e.g. from a trailing comma) and
# remove duplicates while keeping the order typed by the user, so that no period is
# sent through the (slow, billed) extraction step twice.
periods_to_process = list(dict.fromkeys(p.strip() for p in periods_input.split(',') if p.strip()))

if not periods_to_process:
    raise ValueError("No periods entered. Please run this cell again and provide periods.")
print(f"Periods set for processing: {periods_to_process}")

# You can also hardcode the list here if you prefer not to use input() every time:
# periods_to_process = ["2021", "2022", "2023", "2024"]

Company folder set to: PVI Insurance
Periods set for processing: ['2020', '2021', '2022', '2023', '2024']


In [3]:
# NOTE: this OCR cell is currently disabled -- every line below is commented out.
# When enabled it reads `company_base_path` and `periods_to_process` from the first cell,
# renders each page of `financial_statements/<period>.pdf` to a PNG and runs Tesseract on it,
# and writes the text to `text_statements/<period>_ocr.txt`, which is the file name the
# extraction cell looks for. The output shown under this cell is presumably left over from
# a run made before the code was commented out. Uncomment the whole cell to run OCR again.

# import fitz         # PyMuPDF
# from PIL import Image
# import pytesseract
# import io
# from pathlib import Path

# # point to your tesseract exe if not in PATH
# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# # NOTE: the imports below repeat the ones at the top of this cell. They are redundant
# # and can be removed; they are kept only so the original cell body stays unchanged.
# import fitz         # PyMuPDF
# from PIL import Image
# import pytesseract
# import io
# from pathlib import Path

# # Use the dynamically set company_base_path
# base_pdf_dir = company_base_path / "financial_statements"
# ocr_dir = company_base_path / "text_statements"

# # Ensure OCR output directory exists
# ocr_dir.mkdir(parents=True, exist_ok=True)

# print("--- Starting OCR Process for Multiple Periods ---")

# for period in periods_to_process: # Changed 'year' to 'period'
#     pdf_path = base_pdf_dir / f"{period}.pdf" # Uses 'period'
#     out_txt = ocr_dir / f"{period}_ocr.txt" # Uses 'period'

#     if not pdf_path.exists():
#         print(f"Warning: PDF file not found for {period} at {pdf_path}. Skipping OCR for this period.")
#         continue

#     print(f"\nProcessing PDF for period: {period} ({pdf_path})")
#     doc = fitz.open(pdf_path)
#     with out_txt.open("w", encoding="utf-8") as fout:
#         for pageno in range(len(doc)):
#             page = doc.load_page(pageno)
#             pix = page.get_pixmap(dpi=650)                  # increase DPI for better OCR
#             img_bytes = pix.tobytes("png")                  # PNG bytes
#             img = Image.open(io.BytesIO(img_bytes))
#             # Ensure you have the 'vie' language pack installed for Tesseract
#             text = pytesseract.image_to_string(img, lang="vie", config="--psm 3")
#             fout.write(f"--- PAGE {pageno+1} ---\n")
#             fout.write(text + "\n\n")
#     doc.close()
#     print(f"OCR output for {period} saved to: {out_txt}") # Uses 'period'

# print("\n--- OCR Process Complete ---")

--- Starting OCR Process for Multiple Periods ---

Processing PDF for period: 2020 (D:\Visual Studio Projects\Financial Statement Data Retriever\PVI Insurance\financial_statements\2020.pdf)
OCR output for 2020 saved to: D:\Visual Studio Projects\Financial Statement Data Retriever\PVI Insurance\text_statements\2020_ocr.txt

Processing PDF for period: 2021 (D:\Visual Studio Projects\Financial Statement Data Retriever\PVI Insurance\financial_statements\2021.pdf)
OCR output for 2021 saved to: D:\Visual Studio Projects\Financial Statement Data Retriever\PVI Insurance\text_statements\2021_ocr.txt

Processing PDF for period: 2022 (D:\Visual Studio Projects\Financial Statement Data Retriever\PVI Insurance\financial_statements\2022.pdf)
OCR output for 2022 saved to: D:\Visual Studio Projects\Financial Statement Data Retriever\PVI Insurance\text_statements\2022_ocr.txt

Processing PDF for period: 2023 (D:\Visual Studio Projects\Financial Statement Data Retriever\PVI Insurance\financial_statement

In [4]:
import os
import json
import pandas as pd
from pathlib import Path
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# --- Configuration ---
# The Google API key is taken from the GOOGLE_API_KEY environment variable, which is
# where ChatGoogleGenerativeAI looks for it. Never paste the key into the notebook:
# notebooks are shared and committed together with their source, so any key that has
# ever been written in this cell must be considered leaked and should be revoked.
if not os.environ.get("GOOGLE_API_KEY"):
    from getpass import getpass  # local import: only needed for this interactive fallback

    # getpass does not echo the input, so the key stays out of the cell source and output.
    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google API key: ").strip()

if not os.environ["GOOGLE_API_KEY"]:
    # Fail here with a clear message instead of with an authentication error on the first request.
    raise RuntimeError("GOOGLE_API_KEY is not set. Set the environment variable or enter the key when prompted.")

# Base directories for the files, using the dynamically set company_base_path
json_dir = company_base_path / "json_statements"
excel_dir = company_base_path / "excel_statements"
# Folder holding the `<period>_ocr.txt` files that the extraction loop reads.
ocr_dir = company_base_path / "text_statements"

# Ensure output directories exist
json_dir.mkdir(parents=True, exist_ok=True)
excel_dir.mkdir(parents=True, exist_ok=True)

# The periods to process come from `periods_to_process`, set in the first cell.

# --- Initialize Gemini 2.5 Flash LLM ---
# temperature=0 keeps the extraction as repeatable as the model allows.
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0)

# --- Define Prompt for Financial Statement Extraction ---
# The system message is assembled from adjacent string literals, which Python joins
# without inserting anything between them: every fragment except the last must therefore
# end with a space, otherwise two sentences run together in the prompt sent to the model.
# `{text}` in the human message is the only template variable; it receives the OCR text.
prompt_template = ChatPromptTemplate.from_messages(
    [
        ("system", "You are an expert financial analyst. Your task is to extract key financial statement line items and their values from the provided text. "
                   "Identify items from Income Statements, Balance Sheets, and Cash Flow Statements. "
                   "Output the extracted data as a JSON array of objects, where each object has 'statement_type', 'item', 'year', and 'value'. "
                   "Ensure values are numeric (remove commas, currency symbols, etc.) or leave empty if not found. "
                   "Ensure that the line items, as well as the name of the statements are the same as the language being used in the text. "
                   "Sometimes there can be grammatical error and line item numbering error, make sure to fix it as well, don't be too rigid. "
                   "Don't take Footnotes unless specified, Only the 3 Statements that is 'Income Statement', 'Balance Sheet', 'Cash flow Statement'. "
                   "ONLY take the current year from this statement, not the last years. "
                   "Make sure that the line items are in proper form, that is no FULL CAPITALIZATION, and only First Letter Capitalization"),
        ("human", "Extract financial statements from the following text:\n\n{text}")
    ]
)

# --- Create LangChain Chain ---
# prompt -> model -> plain string, so `chain.invoke({"text": ...})` returns the raw model answer as text.
output_parser = StrOutputParser()
chain = prompt_template | llm | output_parser

# --- STEP 1: run the extraction chain once for every requested period ---
print("--- STEP 1: Extracting data using Gemini 2.5 Flash for Multiple Periods ---")

for period in periods_to_process:
    # Input (OCR text) and output (raw model answer) locations for this period.
    ocr_text_file_path = ocr_dir / f"{period}_ocr.txt"
    output_json_file_path = json_dir / f"{period}_financial_statements_raw.json"

    print(f"\nProcessing period: {period}")

    # Without an OCR text file there is nothing to send to the model for this period.
    if not ocr_text_file_path.exists():
        print(f"Error: OCR text file not found for {period} at {ocr_text_file_path}. Skipping LLM extraction for this period.")
        continue

    # Read the whole OCR text; a read failure only skips this period.
    try:
        ocr_content = ocr_text_file_path.read_text(encoding="utf-8")
    except Exception as read_error:
        print(f"Error reading OCR text file for {period} at {ocr_text_file_path}: {read_error}. Skipping LLM extraction.")
        continue

    # Invoke the chain and store its answer exactly as returned (expected to be JSON text).
    # The name is reset first so a failed call never leaves the previous period's answer in it.
    llm_response = None
    try:
        print(f"Sending text for {period} to Gemini 2.5 Flash for extraction...")
        llm_response = chain.invoke({"text": ocr_content})
        print(f"Received response from Gemini for {period}.")

        with output_json_file_path.open("w", encoding="utf-8") as raw_out:
            raw_out.write(llm_response)
        print(f"Successfully saved raw LLM output for {period} to: {output_json_file_path}")
    except Exception as llm_error:
        # Best effort: report the failure and move on to the remaining periods.
        print(f"An error occurred during LLM invocation for {period}: {llm_error}")

print("\n--- LLM Extraction Process Complete ---")

--- STEP 1: Extracting data using Gemini 2.5 Flash for Multiple Periods ---

Processing period: 2020
Sending text for 2020 to Gemini 2.5 Flash for extraction...
Received response from Gemini for 2020.
Successfully saved raw LLM output for 2020 to: D:\Visual Studio Projects\Financial Statement Data Retriever\PVI Insurance\json_statements\2020_financial_statements_raw.json

Processing period: 2021
Sending text for 2021 to Gemini 2.5 Flash for extraction...
Received response from Gemini for 2021.
Successfully saved raw LLM output for 2021 to: D:\Visual Studio Projects\Financial Statement Data Retriever\PVI Insurance\json_statements\2021_financial_statements_raw.json

Processing period: 2022
Sending text for 2022 to Gemini 2.5 Flash for extraction...
Received response from Gemini for 2022.
Successfully saved raw LLM output for 2022 to: D:\Visual Studio Projects\Financial Statement Data Retriever\PVI Insurance\json_statements\2022_financial_statements_raw.json

Processing period: 2023
Sendi

In [5]:
import json
import pandas as pd
from pathlib import Path

# Base directories for the files, using the dynamically set company_base_path
json_dir = company_base_path / "json_statements"
excel_dir = company_base_path / "excel_statements"
ocr_dir = company_base_path / "text_statements"

# Ensure output directories exist
json_dir.mkdir(parents=True, exist_ok=True)
excel_dir.mkdir(parents=True, exist_ok=True)

# The periods to process come from `periods_to_process`, set in the first cell.


def _strip_code_fence(text):
    """Return `text` without a surrounding Markdown code fence.

    Handles an opening fence with or without a `json` language tag (any letter case)
    and an optional closing fence; text without fences is returned stripped.
    """
    cleaned = text.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
        cleaned = cleaned.strip()
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3].strip()
    return cleaned


def _records_from_payload(payload):
    """Return the list of records held by a parsed JSON payload, or [] if there is none.

    A top-level list is returned as is. A dict is unwrapped through its
    "financial_statements" key, else its "data" key; the unwrapped value is only
    accepted when it is a list, so that no unexpected shape reaches pd.DataFrame.
    """
    if isinstance(payload, list):
        return payload
    if isinstance(payload, dict):
        for key in ("financial_statements", "data"):
            if key in payload:
                records = payload[key]
                return records if isinstance(records, list) else []
    return []


def _to_numeric_values(values):
    """Convert a Series of raw values to numbers; anything unparseable becomes NaN.

    Thousands commas are removed and a value wrapped in parentheses, the accounting
    notation for a negative amount, is turned into a negative number: "(1,234)" -> -1234.
    """
    value_text = (
        values.astype(str)
        .str.replace(',', '', regex=False)
        .str.strip()
        .str.replace(r'^\((.+)\)$', r'-\1', regex=True)
    )
    return pd.to_numeric(value_text, errors='coerce')


print("\n--- STEP 2: Loading JSON Files, Parsing, and Converting to Excel for Multiple Periods ---")

for period in periods_to_process:
    output_json_file_path = json_dir / f"{period}_financial_statements_raw.json"
    output_excel_file_path = excel_dir / f"{period}_financial_statements.xlsx"

    print(f"\nProcessing JSON for period: {period}")
    extracted_data = []

    if not output_json_file_path.exists():
        print(f"Error: Raw JSON file not found for {period} at {output_json_file_path}. Skipping Excel conversion for this period.")
        continue

    try:
        # Read the saved model answer and remove a Markdown code fence if it is wrapped in one
        json_string = output_json_file_path.read_text(encoding="utf-8")
        cleaned_json_string = _strip_code_fence(json_string)

        # Parse the cleaned JSON
        parsed_payload = json.loads(cleaned_json_string)

        if not isinstance(parsed_payload, list):
            print(f"Warning: Parsed JSON for {period} was not a simple array. Attempting to recover.")
        extracted_data = _records_from_payload(parsed_payload)

    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from file for {period}: {e}")
        print(f"Check {output_json_file_path} for formatting errors.")
        extracted_data = []  # Fallback to empty list
    except Exception as e:
        print(f"An unexpected error occurred during file loading or parsing for {period}: {e}")
        extracted_data = []

    # --- Convert to Pandas DataFrame and Save to Excel ---
    if extracted_data:
        df = pd.DataFrame(extracted_data)

        # Clean up values: remove commas, convert to numeric where possible
        if 'value' in df.columns:
            raw_values = df['value']
            numeric_values = _to_numeric_values(raw_values)

            # Report values that were present but could not be converted, so that the
            # resulting empty cells are not mistaken for items missing from the statement.
            had_text = raw_values.notna() & (raw_values.astype(str).str.strip() != '')
            unparsed_count = int((numeric_values.isna() & had_text).sum())
            if unparsed_count:
                print(f"Warning: {unparsed_count} value(s) for {period} could not be converted to a number and were left empty.")

            df['value'] = numeric_values

        df.to_excel(output_excel_file_path, index=False)
        print(f"Successfully extracted {len(df)} financial items for {period}, cleaned, and saved to: {output_excel_file_path}")
    else:
        print(f"No financial data was extracted or parsed successfully for {period}. Excel file not created.")
        print("Please check the JSON file and the parsing logic.")

print("\n--- Excel Conversion Process Complete ---")


--- STEP 2: Loading JSON Files, Parsing, and Converting to Excel for Multiple Periods ---

Processing JSON for period: 2020
Successfully extracted 130 financial items for 2020, cleaned, and saved to: D:\Visual Studio Projects\Financial Statement Data Retriever\PVI Insurance\excel_statements\2020_financial_statements.xlsx

Processing JSON for period: 2021
Successfully extracted 149 financial items for 2021, cleaned, and saved to: D:\Visual Studio Projects\Financial Statement Data Retriever\PVI Insurance\excel_statements\2021_financial_statements.xlsx

Processing JSON for period: 2022
Successfully extracted 148 financial items for 2022, cleaned, and saved to: D:\Visual Studio Projects\Financial Statement Data Retriever\PVI Insurance\excel_statements\2022_financial_statements.xlsx

Processing JSON for period: 2023
Successfully extracted 138 financial items for 2023, cleaned, and saved to: D:\Visual Studio Projects\Financial Statement Data Retriever\PVI Insurance\excel_statements\2023_fin