In [8]:
from pathlib import Path
# --- User Input for Company Folder ---
# Root directory that holds one sub-folder per company.
# NOTE(review): this is a machine-specific absolute path; adjust it when running elsewhere.
DATA_ROOT = Path(r"D:\Visual Studio Projects\Financial Statement Data Retriever")

# Enter the name of the company folder (e.g., "PVIAM", "PVI Insurance")
company_folder_name = input("Enter the company folder name (e.g., PVIAM): ").strip()

# An empty name would make company_base_path resolve to DATA_ROOT itself, so the
# following cells would read from and write to the root folder. Stop here instead
# of continuing with a meaningless path.
if not company_folder_name:
    raise ValueError(
        "No company folder name entered. Please run this cell again and provide a name."
    )

print(f"Company folder set to: {company_folder_name}")

# Construct the base path for the company's data
# This assumes your company folders are directly under 'Financial Statement Data Retriever'
company_base_path = DATA_ROOT / company_folder_name

# A typo in the folder name would otherwise only surface later as "no files processed"
# (and as a freshly created, empty output folder), so flag it right away.
if not company_base_path.is_dir():
    print(f"Warning: company folder does not exist: {company_base_path}")

# The periods_input and periods_to_process are not needed for this notebook
# as it processes files based on statement type names from the final_statements folder.
# Keeping this section commented out for clarity if you ever need period context.
# --- User Input for Periods (Optional, but good for context if needed later) ---
# periods_input = input("Enter periods to process (e.g., 2021, 2022, 2023) or leave blank: ")
# periods_to_process = [p.strip() for p in periods_input.split(',') if p.strip()]
# if not periods_to_process:
#     print("No periods entered. Proceeding without period-specific filtering.")
# else:
#     print(f"Periods set for processing (for context): {periods_to_process}")
# periods_to_process = ["2021", "2022", "2023", "2024"]

Company folder set to: CALM


## <span style='color:blue'>Standardize the Excel Files</span>

In [9]:
import json
import os
from getpass import getpass
from pathlib import Path

import pandas as pd
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI

# --- Configuration ---
# The Google API key is a secret and must never be stored in the notebook.
# It is taken from the GOOGLE_API_KEY environment variable; when that variable is
# not set, the user is prompted for it and the typed value is not echoed to the output.
# SECURITY: a literal key used to be hardcoded in this cell. Treat that key as
# compromised and revoke/rotate it in the Google Cloud console.
if not os.environ.get("GOOGLE_API_KEY"):
    entered_api_key = getpass("Enter your Google API key: ").strip()
    if not entered_api_key:
        raise RuntimeError(
            "GOOGLE_API_KEY is not set. Export it in your environment or enter it when prompted."
        )
    os.environ["GOOGLE_API_KEY"] = entered_api_key

# Define input and output directories using the dynamically set company_base_path
input_dir = company_base_path / "final_statements"
output_dir = company_base_path / "final_statements_standardized"

# Ensure the output directory exists
output_dir.mkdir(parents=True, exist_ok=True)

# --- Initialize Gemini 2.5 Flash LLM ---
# NOTE(review): a non-zero temperature can make the item mapping differ between runs;
# consider temperature=0 if reproducible output matters - confirm with the notebook owner.
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.5)

# --- Define Prompt for Financial Item Standardization ---
# The system message is assembled from adjacent string literals, so every fragment
# must end with a space; otherwise two sentences are glued together in the prompt.
# The human message has a single template variable, `items_list_json`, which is
# filled with the JSON-encoded list of line items for one statement file.
prompt_template = ChatPromptTemplate.from_messages(
    [
        ("system", "You are an expert financial analyst specializing in financial statements. "
                   "Your task is to standardize financial statement line items. "
                   "You will be given a list of items, which may contain variations due to OCR errors, slightly different phrasing, or garbled numbering. "
                   "For each group of semantically similar items, identify them and propose a single, concise, and commonly accepted standardized name. "
                   "The standardized name should be in proper case (first letter of each word capitalized). "
                   "Prioritize standardized names that include a line item number if available among the original items. "
                   "If a 'total' type item, ensure its standardized name clearly reflects it as a total. "
                   "Make sure that the item that has the same name, isn't containing different values from each other. Since there could be sub-items that have the same name but bear different values for their parent item. "
                   "Output the mapping as a JSON array of objects. Each object in the array should represent a standardized item and contain two keys: "
                   "'standardized_item' (the proposed standardized name) and 'original_items' (a list of all original items that map to this standardized name). "
                   "The order of objects in the JSON array MUST represent the logical order of items in a financial statement (e.g., assets (and its total) before liabilities (and its total as well), short-term before long-term, and within sections, by line item number if present). "
                   "Ensure all original items from the input list are present in your output mapping under their respective standardized items."),
        ("human", "Standardize the following financial statement items:\n\n{items_list_json}")
    ]
)

# --- Create LangChain Chain ---
# prompt -> Gemini -> plain string; the string is parsed as JSON by the caller.
output_parser = StrOutputParser()
chain = prompt_template | llm | output_parser

def _strip_code_fence(raw_text: str) -> str:
    """Return ``raw_text`` without a surrounding Markdown code fence.

    Handles both a bare fence and a fence tagged with ``json``; text without a
    fence is returned stripped of leading and trailing whitespace.
    """
    text = raw_text.strip()
    if text.startswith("```"):
        text = text[3:]
        # Drop an optional language tag directly after the opening fence.
        if text[:4].lower() == "json":
            text = text[4:]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def _build_item_mapping(standardization_groups) -> tuple:
    """Turn the parsed LLM answer into ``(item_mapping, ordered_standardized_items)``.

    ``item_mapping`` maps each original item to its standardized name.
    ``ordered_standardized_items`` lists the standardized names in the order the LLM
    returned them, without repeats (repeated labels would create duplicate rows
    when the frame is reindexed).

    Raises:
        ValueError: if the answer is not a list of objects carrying the
            'standardized_item' and 'original_items' keys.
    """
    if not isinstance(standardization_groups, list):
        raise ValueError("LLM response is not a JSON array of standardization groups.")

    item_mapping = {}
    ordered_names = {}  # dict used as an insertion-ordered set
    for group in standardization_groups:
        if not isinstance(group, dict) or "standardized_item" not in group or "original_items" not in group:
            raise ValueError(f"Malformed standardization group in LLM response: {group!r}")
        standardized_name = str(group["standardized_item"])
        ordered_names.setdefault(standardized_name, None)
        for original_item in group["original_items"]:
            item_mapping[str(original_item)] = standardized_name
    return item_mapping, list(ordered_names)


def _apply_standardization(df_wide, item_mapping: dict, ordered_standardized_items: list) -> tuple:
    """Rename, aggregate and order the rows of ``df_wide``.

    Returns ``(df_standardized, unmapped_items)``. Rows whose label is missing from
    ``item_mapping`` keep their original label and are appended after the
    LLM-ordered rows instead of being dropped.
    """
    # The LLM was given the index labels as strings, so match on strings as well.
    df_labeled = df_wide.copy()
    df_labeled.index = df_labeled.index.astype(str)

    unmapped_items = [
        label for label in dict.fromkeys(df_labeled.index) if label not in item_mapping
    ]

    # Items that now share a standardized name are summed into a single row.
    # NOTE(review): sum() turns an all-NaN group into 0; use sum(min_count=1) if
    # missing values should stay missing - confirm what downstream notebooks expect.
    df_aggregated = df_labeled.rename(index=item_mapping).groupby(level=0).sum()

    # LLM order first, then anything the LLM left out, each label at most once.
    final_order = list(dict.fromkeys([*ordered_standardized_items, *unmapped_items]))
    return df_aggregated.reindex(final_order), unmapped_items


print("--- Starting Financial Statement Item Standardization ---")

if not input_dir.is_dir():
    print(f"  ERROR: input directory not found: {input_dir}")

# Process each Excel file in the input directory (sorted for a stable run order)
for file_path in sorted(input_dir.glob("*.xlsx")):
    # "~$name.xlsx" files are lock files Excel creates while a workbook is open.
    if file_path.name.startswith("~$"):
        continue

    print(f"\nProcessing file for standardization: {file_path.name}")
    try:
        # Read the reformatted Excel file (item is already the index)
        df_wide = pd.read_excel(file_path, index_col=0)

        if df_wide.empty:
            print(f"  Warning: {file_path.name} is empty. Skipping standardization.")
            continue

        # Get the list of unique items (index values) to send to the LLM
        items_to_standardize = df_wide.index.astype(str).unique().tolist()

        if not items_to_standardize:
            print(f"  No items found in {file_path.name} to standardize. Skipping.")
            continue

        print(f"  Found {len(items_to_standardize)} unique items. Sending to Gemini for standardization...")

        # Convert the list of items to a JSON string for the LLM prompt
        items_list_json = json.dumps(items_to_standardize, ensure_ascii=False, indent=2)

        # Invoke LLM for standardization mapping
        llm_response = chain.invoke({"items_list_json": items_list_json})

        print(f"  Received standardization mapping from Gemini for {file_path.name}.")

        # Clean and parse the LLM's JSON response
        standardization_groups = json.loads(_strip_code_fence(llm_response))
        item_mapping, ordered_standardized_items = _build_item_mapping(standardization_groups)

        df_standardized, unmapped_items = _apply_standardization(
            df_wide, item_mapping, ordered_standardized_items
        )
        if unmapped_items:
            print(
                f"  Warning: {len(unmapped_items)} item(s) were not mapped by Gemini and are "
                f"kept under their original names: {unmapped_items}"
            )

        # Define the output path in the final_statements_standardized directory
        output_file_path = output_dir / file_path.name

        # Save the standardized DataFrame to a new Excel file
        df_standardized.to_excel(output_file_path)
        print(f"  Successfully standardized and saved '{file_path.name}' to: {output_file_path}")
        print(f"  Final standardized DataFrame shape: {df_standardized.shape}")
        print(f"  Final standardized DataFrame head:\n{df_standardized.head()}")

    except json.JSONDecodeError as e:
        # llm_response is always bound here: json.loads only runs after chain.invoke.
        print(f"  ERROR: JSON decoding failed for LLM response for {file_path.name}: {e}")
        print(f"  LLM Response (raw):\n{llm_response}")
        continue
    except Exception as e:
        # Best effort: one bad file must not stop the remaining files.
        print(f"  ERROR processing {file_path.name}: {e}")
        continue

print("\n--- Financial Statement Item Standardization Complete ---")

--- Starting Financial Statement Item Standardization ---

Processing file for standardization: Balance Sheet.xlsx
  Found 91 unique items. Sending to Gemini for standardization...
  Received standardization mapping from Gemini for Balance Sheet.xlsx.
  Successfully standardized and saved 'Balance Sheet.xlsx' to: D:\Visual Studio Projects\Financial Statement Data Retriever\CALM\final_statements_standardized\Balance Sheet.xlsx
  Final standardized DataFrame shape: (61, 6)
  Final standardized DataFrame head:
                                         2020     2021      2023      2024  \
item                                                                         
Cash And Cash Equivalents                 0.0  57352.0  292824.0  237878.0   
Current Investment Securities        154163.0      0.0       0.0       0.0   
Total Current Investment Securities  154163.0      0.0       0.0       0.0   
Trade And Other Receivables            4935.0   5057.0    9267.0   13433.0   
Allowance For Doubt