In [1]:
from pathlib import Path
# --- User Input for Company Folder ---
# Enter the name of the company folder (e.g., "PVIAM", "PVI Insurance")
company_folder_name = input("Enter the company folder name (e.g., PVIAM): ").strip()

if not company_folder_name:
    print("No company folder name entered. Please run this cell again and provide a name.")
    # You might want to exit or raise an error here if a company name is mandatory
else:
    print(f"Company folder set to: {company_folder_name}")

# Construct the base path for the company's data
# This assumes your company folders are directly under 'Financial Statement Data Retriever'
company_base_path = Path(r"D:\Visual Studio Projects\Financial Statement Data Retriever") / company_folder_name

# The periods_input and periods_to_process are not needed for this notebook
# as it processes files based on statement type names from the final_statements folder.
# Keeping this section commented out for clarity if you ever need period context.
# --- User Input for Periods (Optional, but good for context if needed later) ---
# periods_input = input("Enter periods to process (e.g., 2021, 2022, 2023) or leave blank: ")
# periods_to_process = [p.strip() for p in periods_input.split(',') if p.strip()]
# if not periods_to_process:
#     print("No periods entered. Proceeding without period-specific filtering.")
# else:
#     print(f"Periods set for processing (for context): {periods_to_process}")
# periods_to_process = ["2021", "2022", "2023", "2024"]

Company folder set to: general_mills


## <span style = 'color:blue'> Format the Excel files 

In [None]:
# ...existing code...
import pandas as pd
from pathlib import Path


# Define the input and output directories using the dynamically set company_base_path
period_statements_dir = company_base_path / "period_statements"
final_statements_dir = company_base_path / "final_statements"

# Ensure the final_statements directory exists
final_statements_dir.mkdir(parents=True, exist_ok=True)

print("--- Starting Financial Statement Reformatting ---")

# Loop through each Excel file in the period_statements directory
for file_path in period_statements_dir.glob("*.xlsx"):
    print(f"\nProcessing file: {file_path.name}")
    try:
        # Read the Excel file into a DataFrame
        df_long = pd.read_excel(file_path)

        # Validate required columns
        required_columns = ['item', 'year', 'value', 'statement_type']
        if not all(col in df_long.columns for col in required_columns):
            print(f"Warning: Skipping {file_path.name} — missing required columns ({', '.join(required_columns)}).")
            continue

        # Normalize value column to numeric where possible
        # handle parentheses as negative, remove thousands separators and non-numeric chars
        def clean_value(x):
            s = str(x).strip()
            if s == 'nan' or s == '' or s.lower() == 'n/a':
                return pd.NA
            # parentheses => negative
            if s.startswith('(') and s.endswith(')'):
                s = '-' + s[1:-1]
            # remove common currency symbols and spaces
            s = s.replace(',', '').replace(' ', '')
            # remove any remaining non-numeric except - and .
            s = pd.Series([s]).replace(r'[^\d\.\-]', '', regex=True).iloc[0]
            return pd.to_numeric(s, errors='coerce')

        df_long['value'] = df_long['value'].apply(clean_value)

        # Drop rows without item or year
        df_long = df_long.dropna(subset=['item', 'year'])

        # Ensure item and year are strings for pivot stability
        df_long['item'] = df_long['item'].astype(str)
        df_long['year'] = df_long['year'].astype(str)

        # If there are duplicate item-year rows, keep the first non-null value
        # Group and take first non-null value per item/year
        df_grouped = (
            df_long
            .sort_values(['item', 'year'])
            .groupby(['item', 'year'], as_index=False)
            .agg({'value': 'first', 'statement_type': 'first'})
        )

        # Pivot to wide format using first as aggfunc (should be unique after grouping)
        df_wide = df_grouped.pivot_table(index='item', columns='year', values='value', aggfunc='first')

        # Clean column name
        df_wide.columns.name = None

        # Reorder columns to follow periods_to_process if available in this scope
        try:
            if 'periods_to_process' in globals() and isinstance(periods_to_process, (list, tuple)):
                # keep only columns that exist and follow the order in periods_to_process
                ordered = [str(p) for p in periods_to_process if str(p) in df_wide.columns]
                # follow with any other columns that weren't in periods_to_process
                remaining = [c for c in df_wide.columns if c not in ordered]
                df_wide = df_wide.reindex(columns=ordered + remaining)
        except Exception:
            pass

        # Define the output path in the final_statements directory
        # Use the statement-type name for the filename if available; fallback to original filename
        st_name = df_grouped['statement_type'].dropna().unique()
        if len(st_name) > 0:
            safe_name = str(st_name[0]).strip()
            output_file = final_statements_dir / f"{safe_name}.xlsx"
        else:
            output_file = final_statements_dir / file_path.name

        # Save the reformatted DataFrame to a new Excel file (item as index)
        df_wide.to_excel(output_file)
        print(f"Successfully reformatted and saved to: {output_file}")

    except Exception as e:
        print(f"Error processing {file_path.name}: {e}")
        continue

print("\n--- Financial Statement Reformatting Complete ---")
# ...existing code...

--- Starting Financial Statement Reformatting ---

Processing file: all_periods_concatenated.xlsx
Successfully reformatted and saved to: D:\Visual Studio Projects\Financial Statement Data Retriever\general_mills\final_statements\Consolidated Statements Of Cash Flows.xlsx

Processing file: Chi Phí Hoạt Động Kinh Doanh, Giá Vốn Bán Hàng.xlsx
Successfully reformatted and saved to: D:\Visual Studio Projects\Financial Statement Data Retriever\general_mills\final_statements\Chi Phí Hoạt Động Kinh Doanh, Giá Vốn Bán Hàng.xlsx

Processing file: Consolidated Balance Sheets.xlsx
Successfully reformatted and saved to: D:\Visual Studio Projects\Financial Statement Data Retriever\general_mills\final_statements\Consolidated Balance Sheets.xlsx

Processing file: Consolidated Statements Of Cash Flows.xlsx
Successfully reformatted and saved to: D:\Visual Studio Projects\Financial Statement Data Retriever\general_mills\final_statements\Consolidated Statements Of Cash Flows.xlsx

Processing file: Consoli