In [1]:
# ...existing code...
from pathlib import Path
# --- User Input for Company Folder ---
# Enter the name of the company folder (e.g., "PVIAM", "PVI Insurance")
company_folder_name = input("Enter the company folder name (e.g., PVIAM): ").strip()

if not company_folder_name:
    # Joining an empty name onto the root yields the root itself, so the later cells
    # would create their output folders directly under the projects directory.
    # Stop the cell instead of continuing with a wrong base path.
    raise ValueError("No company folder name entered. Please run this cell again and provide a name.")

print(f"Company folder set to: {company_folder_name}")

# Construct the base path for the company's data.
# This assumes the company folders sit directly under 'Financial Statement Data Retriever'.
# NOTE(review): the root below is an absolute, machine-specific path; adjust it when the
# notebook is run on another machine.
company_base_path = Path(r"D:\Visual Studio Projects\Financial Statement Data Retriever") / company_folder_name

# --- User Input for Periods ---
# Periods (years, quarters, ...) are typed on one line, separated by commas,
# for example "2021, 2022, 2023, 2024" or "2020_Q1, 2020_Q2, 2021".
periods_input = input("Enter periods to process (e.g., 2021, 2022, 2023): ")

# Trim every entry, then drop the empty strings left behind by stray commas.
periods_to_process = list(filter(None, map(str.strip, periods_input.split(','))))

if periods_to_process:
    print(f"Periods set for processing: {periods_to_process}")
else:
    print("No periods entered. Please run this cell again and provide periods.")

# --- User Input for PDF Text Extraction Method ---
# Two answers are accepted, case-insensitively: "OCR" (for scanned PDFs) or
# "Direct" (for text-searchable PDFs). The question is repeated until one is given.
while True:
    extraction_method_input = input("Choose PDF text extraction method (OCR / Direct): ").strip().lower()
    if extraction_method_input not in ("ocr", "direct"):
        print("Invalid input. Please enter 'OCR' or 'Direct'.")
        continue
    # Valid answer: store it in lowercase form and leave the loop.
    extraction_method = extraction_method_input
    print(f"PDF text extraction method set to: {extraction_method.upper()}")
    break

# --- User Input for Page Range ---
# Expected format is "START-END" (e.g. "50-90"); a blank answer means all pages.
page_range_input = input("Enter page range to extract (e.g., 50-90, leave blank for all pages): ").strip()

# start_page / end_page stay None whenever no valid range was supplied,
# which the rest of the notebook treats as "no page filter".
start_page = None
end_page = None

if not page_range_input:
    print("No specific page range entered. Processing all pages.")
elif '-' not in page_range_input:
    print("Invalid page range format. Processing all pages.")
else:
    try:
        # Unpacking raises ValueError when there is more than one '-', and int()
        # raises ValueError for non-numeric parts; both land in the handler below.
        start_str, end_str = page_range_input.split('-')
        # Parse into temporaries first: assigning start_page before end_str is
        # validated would leave a half-set range behind while the message claims
        # that all pages are processed.
        parsed_start, parsed_end = int(start_str), int(end_str)
    except ValueError:
        print("Invalid page number in range. Processing all pages.")
    else:
        if parsed_start > parsed_end:
            print("Warning: Start page is greater than end page. Processing all pages.")
        else:
            start_page, end_page = parsed_start, parsed_end
            print(f"Page range set to: {start_page}-{end_page}")


# You can also hardcode the list here if you prefer not to use input() every time:
# periods_to_process = ["2021", "2022", "2023", "2024"]
# extraction_method = "ocr" # or "direct"
# start_page = 50
# end_page = 90
# ...existing code...

Company folder set to: general_mills
Periods set for processing: ['2022']
PDF text extraction method set to: DIRECT
Page range set to: 3-90


## <span style="color:blue">Converting PDF to Text files (Direct or through the OCR method)</span>

In [2]:
# ...existing code...
import getpass
import json
import os
import re # Import re for regular expressions
from pathlib import Path

import pandas as pd
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI

# --- Configuration ---
# The credential is passed through the GOOGLE_API_KEY environment variable, which the
# Gemini client is expected to pick up. Never paste the key into the notebook: it is
# saved, and shared, together with the file. When the variable is not set yet, ask
# for it interactively without echoing it to the screen.
# NOTE(review): the key that used to be hardcoded here must be treated as leaked and revoked.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google API key: ")

# Base directories for the files, using the dynamically set company_base_path
json_dir = company_base_path / "json_statements"
excel_dir = company_base_path / "excel_statements"
ocr_dir = company_base_path / "text_statements"  # holds the "<period>_ocr.txt" input files

# Ensure output directories exist
json_dir.mkdir(parents=True, exist_ok=True)
excel_dir.mkdir(parents=True, exist_ok=True)

# The periods to process come from periods_to_process (set in the input cell);
# there is no hardcoded list of years here any more.

# --- Initialize Gemini 2.5 Flash LLM ---
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.05)

# --- Define Prompt for Financial Statement Extraction ---
# The system message is built from adjacent string literals, which Python joins with
# nothing in between. Every fragment except the last therefore ends with a space so
# that consecutive sentences do not run into each other.
prompt_template = ChatPromptTemplate.from_messages(
    [
        ("system", "You are an expert financial analyst. Your task is to extract various line items and their values from the provided text. "
                   "Output the extracted data as a JSON array of objects, where each object has 'item_number' (if there is item number, or else leave blank), 'statement_type', 'item', 'year', and 'value'. "
                   "Ensure values are numeric (remove commas, currency symbols, etc.) or leave empty if not found. "
                   "Ensure that the line items, as well as the name of the statements are the same as the language being used in the text. "
                   "Sometimes there can be grammatical error and line item numbering error, make sure to fix it as well, don't be too rigid. "
                   "ONLY take the current year from this statement, not the last years. "
                   "Make sure that the line items are in proper form, that is no FULL CAPITALIZATION, and only First Letter Capitalization"),
        # {text} is the only template variable; it is filled in by chain.invoke({"text": ...}).
        ("human", "Extract the 3 main statements from the Financial report :\n\n{text}")
    ]
)

# --- Create LangChain Chain ---
# prompt -> model -> string parser: invoking the chain yields the model reply as text.
output_parser = StrOutputParser()
chain = prompt_template | llm | output_parser

# --- Helper function to extract text from a specific page range (New) ---
def extract_pages(text_content, start_page=None, end_page=None):
    """Return only the pages of ``text_content`` that fall inside a page range.

    A page starts at a line beginning with ``--- PAGE <number> ---`` and runs
    until the next such line. Kept pages include their header line.

    Args:
        text_content: Full text, with pages separated by header lines.
        start_page: First page to keep (inclusive); ``None`` means no lower bound.
        end_page: Last page to keep (inclusive); ``None`` means no upper bound.

    Returns:
        ``text_content`` unchanged when both bounds are ``None``. Otherwise the
        kept lines joined with newlines; text before the first page header is
        dropped, and the result is empty when no page header is in range.
    """
    if start_page is None and end_page is None:
        return text_content

    # A missing bound becomes an infinite one, so a single chained comparison suffices.
    lower_bound = float("-inf") if start_page is None else start_page
    upper_bound = float("inf") if end_page is None else end_page

    kept_rows = []
    keeping = False
    for row in text_content.split('\n'):
        header = re.match(r'--- PAGE (\d+) ---', row)
        if header is not None:
            # Each header decides whether the lines up to the next header are kept.
            keeping = lower_bound <= int(header.group(1)) <= upper_bound
        if keeping:
            kept_rows.append(row)

    return "\n".join(kept_rows)


print("--- STEP 1: Extracting data using Gemini 2.5 Flash for Multiple Periods ---")

# One pass per requested period: read its text file, keep the requested pages,
# send them through the chain and store the reply exactly as received.
for period in periods_to_process:
    ocr_text_file_path = ocr_dir / f"{period}_ocr.txt"
    output_json_file_path = json_dir / f"{period}_financial_statements_raw.json"

    print(f"\nProcessing period: {period}")

    # A period without a text file is reported and skipped.
    if not ocr_text_file_path.exists():
        print(f"Error: OCR text file not found for {period} at {ocr_text_file_path}. Skipping LLM extraction for this period.")
        continue

    # Read the whole text file; a read failure skips this period only.
    try:
        ocr_content = ocr_text_file_path.read_text(encoding="utf-8")
    except Exception as read_error:
        print(f"Error reading OCR text file for {period} at {ocr_text_file_path}: {read_error}. Skipping LLM extraction.")
        continue

    # Restrict the text to the page range chosen in the input cell.
    filtered_ocr_content = extract_pages(ocr_content, start_page, end_page)

    if filtered_ocr_content.strip() == "":
        print(f"No content found in the specified page range ({start_page}-{end_page}) for {period}. Skipping LLM extraction.")
        continue

    ## STEP 1: Invoke LLM and Save Raw JSON Output
    llm_response = None
    try:
        print(f"Sending text for {period} (pages {start_page}-{end_page} if specified) to Gemini 2.5 Flash for extraction...")
        llm_response = chain.invoke({"text": filtered_ocr_content})
        print(f"Received response from Gemini for {period}.")

        # The reply is written out untouched; it is parsed as JSON in a later step.
        with output_json_file_path.open("w", encoding="utf-8") as raw_json_file:
            raw_json_file.write(llm_response)
        print(f"Successfully saved raw LLM output for {period} to: {output_json_file_path}")
    except Exception as llm_error:
        # A failure for one period is reported; the loop then moves on to the next one.
        print(f"An error occurred during LLM invocation for {period}: {llm_error}")

print("\n--- LLM Extraction Process Complete ---")

--- STEP 1: Extracting data using Gemini 2.5 Flash for Multiple Periods ---

Processing period: 2022
Sending text for 2022 (pages 3-90 if specified) to Gemini 2.5 Flash for extraction...
Received response from Gemini for 2022.
Successfully saved raw LLM output for 2022 to: D:\Visual Studio Projects\Financial Statement Data Retriever\general_mills\json_statements\2022_financial_statements_raw.json

--- LLM Extraction Process Complete ---


## <span style="color:blue">Converting JSON files into Excel Files</span>

In [3]:
import json
import pandas as pd
from pathlib import Path

# Folder layout below company_base_path (which is set in the input cell).
json_dir, excel_dir, ocr_dir = (
    company_base_path / folder_name
    for folder_name in ("json_statements", "excel_statements", "text_statements")
)

# Create the JSON and Excel folders when they are missing; ocr_dir is only assigned here.
json_dir.mkdir(parents=True, exist_ok=True)
excel_dir.mkdir(parents=True, exist_ok=True)

# Define the years to process (must match the previous cells) - REMOVED, now uses periods_to_process from above
# years = [2021, 2022, 2023, 2024]

def _strip_code_fence(raw_text):
    """Return raw_text without a surrounding Markdown code fence.

    Accepts an opening fence with or without a language tag ("```json",
    "```JSON" or a bare "```") and an optional closing "```". Text that has no
    fence is only stripped of leading and trailing whitespace.
    """
    cleaned = raw_text.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        # Drop the optional language tag that follows the opening fence.
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
        cleaned = cleaned.strip()
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3].strip()
    return cleaned


print("\n--- STEP 2: Loading JSON Files, Parsing, and Converting to Excel for Multiple Periods ---")

# For every period: load the raw model reply, parse it as JSON, normalise the
# 'value' column and write one Excel workbook.
for period in periods_to_process:
    output_json_file_path = json_dir / f"{period}_financial_statements_raw.json"
    output_excel_file_path = excel_dir / f"{period}_financial_statements.xlsx"

    print(f"\nProcessing JSON for period: {period}")
    extracted_data = []

    if not output_json_file_path.exists():
        print(f"Error: Raw JSON file not found for {period} at {output_json_file_path}. Skipping Excel conversion for this period.")
        continue

    try:
        # Read the saved JSON string from the file
        with output_json_file_path.open("r", encoding="utf-8") as f:
            json_string = f.read()

        # The reply may be wrapped in a Markdown code block; unwrap it before parsing.
        cleaned_json_string = _strip_code_fence(json_string)

        # Parse the cleaned JSON
        extracted_data = json.loads(cleaned_json_string)

        if not isinstance(extracted_data, list):
            print(f"Warning: Parsed JSON for {period} was not a simple array. Attempting to recover.")
            # Recovery logic for an array wrapped in an object under a known key
            if isinstance(extracted_data, dict) and "financial_statements" in extracted_data:
                extracted_data = extracted_data["financial_statements"]
            elif isinstance(extracted_data, dict) and "data" in extracted_data:
                extracted_data = extracted_data["data"]
            else:
                extracted_data = []  # Fallback

    except json.JSONDecodeError as e:
        print(f"Error decoding JSON from file for {period}: {e}")
        print(f"Check {output_json_file_path} for formatting errors.")
        extracted_data = []  # Fallback to empty list
    except Exception as e:
        print(f"An unexpected error occurred during file loading or parsing for {period}: {e}")
        extracted_data = []

    # --- Convert to Pandas DataFrame and Save to Excel ---
    if extracted_data:
        df = pd.DataFrame(extracted_data)

        # Clean up values: remove commas, convert to numeric where possible
        if 'value' in df.columns:
            value_text = df['value'].astype(str).str.replace(',', '', regex=False).str.strip()
            # Accounting notation writes negatives in parentheses: "(1234)" means -1234.
            # Without this step such values would be coerced to NaN below.
            value_text = value_text.str.replace(r'^\((.+)\)$', r'-\1', regex=True)
            # Anything still not numeric (including missing values) becomes NaN.
            df['value'] = pd.to_numeric(value_text, errors='coerce')

        try:
            df.to_excel(output_excel_file_path, index=False)
        except OSError as e:
            # For example the workbook is still open in Excel; report it and keep
            # going so the remaining periods are still converted.
            print(f"Error writing Excel file for {period} at {output_excel_file_path}: {e}")
            continue
        print(f"Successfully extracted {len(df)} financial items for {period}, cleaned, and saved to: {output_excel_file_path}")
    else:
        print(f"No financial data was extracted or parsed successfully for {period}. Excel file not created.")
        print("Please check the JSON file and the parsing logic.")

print("\n--- Excel Conversion Process Complete ---")


--- STEP 2: Loading JSON Files, Parsing, and Converting to Excel for Multiple Periods ---

Processing JSON for period: 2022
Successfully extracted 87 financial items for 2022, cleaned, and saved to: D:\Visual Studio Projects\Financial Statement Data Retriever\general_mills\excel_statements\2022_financial_statements.xlsx

--- Excel Conversion Process Complete ---
