In [None]:
import pandas as pd
from datetime import datetime
from openpyxl import load_workbook
import hashlib
import os
import glob

# === Input / output locations ===
# Folder scanned for source workbooks, the consolidated result workbook,
# and the worksheet that is read from every source workbook.
source_folder = '/mnt/c/Bhavcopy/NSE_FundaMenal_Source/'
final_output_path = '/mnt/c/Bhavcopy/NSE_Funda/Temp_Yearly_Result.xlsx'
sheet_name = 'Data Sheet'

# Fiscal-year labels expected in each workbook ('Mar-20' .. 'Mar-25').
# Widen the range to expect more years.
expected_years = [f'Mar-{yy}' for yy in range(20, 26)]

# === Step 1: Collect every .xlsx workbook in the source folder ===
_xlsx_pattern = os.path.join(source_folder, '*.xlsx')
excel_files = glob.glob(_xlsx_pattern)

# Worksheet of the output workbook that receives the rows. 'Sheet1' is the
# pandas default for DataFrame.to_excel; naming it explicitly guarantees that
# the create path and the append path target (and measure) the same sheet.
output_sheet = 'Sheet1'


def read_transposed(skiprows, nrows):
    """Read one labelled block of the current workbook and transpose it.

    The first column of the block becomes the column labels of the result and
    the result's index is replaced by the P&L date index so that all blocks
    line up row-for-row when concatenated.

    Reads the module-level ``source_path``, ``sheet_name`` and ``df_pl`` at
    call time, so it must only be called after ``df_pl`` has been built for
    the file currently being processed.

    Args:
        skiprows: Number of sheet rows to skip before the block's header row.
        nrows: Number of data rows to read below the header row.

    Returns:
        The transposed block as a DataFrame indexed like ``df_pl``.

    Raises:
        ValueError: If the block has a different number of columns than the
            P&L block (the index lengths then do not match).
    """
    df = pd.read_excel(source_path, sheet_name=sheet_name, skiprows=skiprows, nrows=nrows)
    df.set_index(df.columns[0], inplace=True)
    df = df.T
    df.index = df_pl.index  # Align with P&L dates
    return df


def row_to_md5(row):
    """Return the hex MD5 digest of a row's values joined with '|'."""
    row_string = '|'.join(str(val) for val in row.values)
    return hashlib.md5(row_string.encode('utf-8')).hexdigest()


def _fiscal_year_label(value):
    """Render a header cell as a 'Mon-YY' label comparable to ``expected_years``.

    Date-typed cells (``datetime`` / ``pd.Timestamp``) are formatted with
    '%b-%y'; anything else is returned as its stripped string form, so
    headers that are already text such as 'Mar-20' pass through unchanged.
    """
    if isinstance(value, datetime) and not pd.isna(value):
        return value.strftime('%b-%y')
    return str(value).strip()


for source_path in excel_files:
    print(f"📄 Processing file: {source_path}")

    try:
        # === Step 2: Read metadata from B1, B2, B3 ===
        meta_values = pd.read_excel(
            source_path,
            sheet_name=sheet_name,
            usecols='B',
            nrows=3,
            header=None
        ).squeeze()

        company_name = meta_values.iloc[0]
        latest_version = meta_values.iloc[1]
        current_version = meta_values.iloc[2]

        # === Step 3: Read and transpose the P&L block ===
        # Header is taken from sheet row 16 (15 rows skipped), followed by 16 data rows.
        df_pl = pd.read_excel(source_path, sheet_name=sheet_name, skiprows=15, nrows=16)
        df_pl.set_index(df_pl.columns[0], inplace=True)
        df_pl = df_pl.T
        # Explicit format avoids pandas' per-element inference warning;
        # labels that do not parse become NaT.
        df_pl.index = pd.to_datetime(df_pl.index, format='%b-%y', errors='coerce')
        df_pl.index.name = 'Report_Date'

        # === Step 5: Read remaining blocks (aligned to the P&L dates) ===
        df2_t = read_transposed(55, 17)
        df3_t = read_transposed(80, 5)

        # === Step 6: PRICE and DERIVED rows using header for dates ===
        # Sheet row 81 is read raw (header=None) to obtain the date labels.
        header_row = pd.read_excel(source_path, sheet_name=sheet_name, skiprows=80, nrows=1, header=None)
        date_values = pd.to_datetime(header_row.iloc[0, 1:].values, format='%b-%y', errors='coerce')

        # Single raw row: first cell is the label, remaining cells are the values.
        df4 = pd.read_excel(source_path, sheet_name=sheet_name, skiprows=89, nrows=1, header=None)
        df4_t = pd.DataFrame([df4.iloc[0, 1:].values], columns=date_values).T
        df4_t.columns = [df4.iloc[0, 0]]
        df4_t.index = df_pl.index

        df5 = pd.read_excel(source_path, sheet_name=sheet_name, skiprows=92, nrows=1, header=None)
        df5_t = pd.DataFrame([df5.iloc[0, 1:].values], columns=date_values).T
        df5_t.columns = [df5.iloc[0, 0]]
        df5_t.index = df_pl.index

        # === Step 7: Merge all blocks ===
        df_combined = pd.concat([df_pl, df2_t, df3_t, df4_t, df5_t], axis=1)
        df_combined.reset_index(inplace=True)
        df_combined['Report_Date'] = pd.to_datetime(df_combined['Report_Date']).dt.date
        df_combined.insert(1, 'Company_Name', company_name)
        df_combined.insert(2, 'Latest_Version', latest_version)
        df_combined.insert(3, 'Current_Version', current_version)

        # === Step 8: Clean column names ===
        # NOTE(review): .strip() runs after spaces were already turned into
        # underscores, so leading/trailing spaces end up as underscores.
        # Left as-is because the output column names may be relied on
        # downstream; confirm before changing.
        df_combined.columns = [
            str(col).replace(' ', '_').replace('.', '').replace('__', '_').strip()
            for col in df_combined.columns
        ]

        # === Step 9: Remove rows where Report_Date and Sales are both empty ===
        if 'Sales' in df_combined.columns:
            df_combined = df_combined[~(df_combined['Report_Date'].isna() & df_combined['Sales'].isna())]
        else:
            print(f"⚠️ 'Sales' column missing in {os.path.basename(source_path)} — skipping file.")
            continue

        # Nothing left to write; skip explicitly instead of letting the hash
        # assignment below fail on an empty frame with an unrelated error.
        if df_combined.empty:
            print(f"⚠️ No data rows found in {os.path.basename(source_path)} — skipping file.")
            continue

        # === Step 10: Add MD5 hash column ===
        df_combined['Row_Hash_MD5'] = df_combined.apply(row_to_md5, axis=1)

        # === Step 11: Append to final Excel file ===
        if not os.path.exists(final_output_path):
            df_combined.to_excel(final_output_path, sheet_name=output_sheet, index=False)
        else:
            with pd.ExcelWriter(final_output_path, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
                # Use the workbook the writer already loaded rather than
                # opening the file a second time, and measure the very sheet
                # that is written to.
                target_ws = writer.sheets.get(output_sheet)
                if target_ws is None:
                    # Output sheet not present yet: start it with a header row.
                    df_combined.to_excel(writer, sheet_name=output_sheet, index=False)
                else:
                    # max_row is the 1-based last used row, which equals the
                    # 0-based index of the first free row.
                    df_combined.to_excel(
                        writer,
                        sheet_name=output_sheet,
                        index=False,
                        header=False,
                        startrow=target_ws.max_row,
                    )

    except Exception as e:
        # Best-effort diagnosis: check whether the P&L header row lacks any
        # expected fiscal year. Any failure here falls back to the generic message.
        try:
            header_row = pd.read_excel(source_path, sheet_name=sheet_name, skiprows=15, nrows=1, header=None)
            actual_years = [_fiscal_year_label(cell) for cell in header_row.iloc[0, 1:]]
            missing_years = [y for y in expected_years if y not in actual_years]
        except Exception:
            missing_years = []

        if missing_years:
            # Keep the underlying error visible: missing years are only a
            # likely cause, not a confirmed one.
            print(
                f"⚠️ Skipped file {os.path.basename(source_path)} — Missing fiscal years: "
                f"{', '.join(missing_years)} (error: {e})"
            )
        else:
            print(f"❌ Failed to process file {os.path.basename(source_path)}: {e}")

print("✅ All files processed and appended.")
