# Cleaning dataset

In [7]:
import pandas as pd
import re
from datetime import datetime

In [None]:
import pandas as pd
import re
from datetime import datetime

def find_layout_info(df_sheet):
    """Locate the table layout of a raw sheet from its 'Region / Province' cell.

    Only the top-left corner of the sheet (at most the first 10 rows and the
    first 10 columns) is scanned, row by row, for the first string cell that
    contains the text "Region / Province".

    Args:
        df_sheet: DataFrame holding the raw sheet contents.

    Returns:
        None when no anchor cell is found; otherwise a dict of positional
        indices derived from the anchor:
            'comm_row'        - the anchor's row,
            'date_row'        - the row directly below the anchor,
            'data_start_row'  - two rows below the anchor,
            'loc_col'         - the anchor's column,
            'data_start_col'  - the column directly right of the anchor.
    """
    row_limit = min(10, len(df_sheet))
    col_limit = min(10, len(df_sheet.columns))

    def is_anchor(value):
        return isinstance(value, str) and "Region / Province" in value

    # Row-major scan; the generator stops at the first matching cell.
    anchor = next(
        (
            (row_pos, col_pos)
            for row_pos in range(row_limit)
            for col_pos in range(col_limit)
            if is_anchor(df_sheet.iat[row_pos, col_pos])
        ),
        None,
    )
    if anchor is None:
        return None

    top, left = anchor
    return {
        'comm_row': top,
        'date_row': top + 1,
        'data_start_row': top + 2,
        'loc_col': left,
        'data_start_col': left + 1,
    }

def transform_sheet(df_raw):
    """Reshape one raw wide-format sheet into long-format price records.

    The layout (header rows, location column, first data column) comes from
    find_layout_info(). The commodity header row is forward-filled so every
    column is paired with the nearest commodity name to its left. The row
    below it is expected to hold text such as "First Phase May 2025" or
    "Second Phase May 2025", which maps to the 1st or the 15th of that month.

    Args:
        df_raw: DataFrame of the sheet as read, without a header row.

    Returns:
        A DataFrame with columns 'Date', 'Location', 'Commodity' and 'Price',
        or None when the anchor cell is missing or no record could be built.
        A warning is printed in both None cases.
    """
    layout = find_layout_info(df_raw)
    if layout is None:
        print("    - Warning: Could not find 'Region / Province' anchor. Skipping sheet.")
        return None

    # --- Step 1: Extract header and data sections based on found layout ---
    commodity_headers = df_raw.iloc[layout['comm_row']].copy().ffill()
    date_headers = df_raw.iloc[layout['date_row']]
    data_df = df_raw.iloc[layout['data_start_row']:]
    loc_col = layout['loc_col']

    # --- Step 2: Resolve each price column's commodity and date once ---
    # Headers are read positionally (.iloc), exactly like the price cells
    # below, so the result does not depend on how the columns are labelled.
    phase_pattern = re.compile(r'(First|Second)\s+Phase\s+([a-zA-Z]+)\s+(\d{4})')
    price_columns = []  # (column position, commodity name, parsed date)
    for col_idx in range(layout['data_start_col'], df_raw.shape[1]):
        commodity = commodity_headers.iloc[col_idx]
        date_phase = date_headers.iloc[col_idx]
        if pd.isna(commodity) or pd.isna(date_phase):
            continue

        match = phase_pattern.search(str(date_phase))
        if not match:
            continue

        phase, month, year = match.groups()
        day = 1 if phase == 'First' else 15
        try:
            parsed_date = datetime.strptime(f"{day} {month} {year}", "%d %B %Y").date()
        except ValueError:
            # Month text that is not a full month name: column is unusable.
            continue
        price_columns.append((col_idx, str(commodity).strip(), parsed_date))

    # --- Step 3: Iterate over data rows and build clean records ---
    has_letter = re.compile('[a-zA-Z]')
    records = []
    for _, row_data in data_df.iterrows():
        location = row_data.iloc[loc_col]

        # Skip rows without a valid location name (must be text with a letter).
        if not isinstance(location, str) or not has_letter.search(location):
            continue
        location = location.strip()

        for col_idx, commodity, parsed_date in price_columns:
            price = row_data.iloc[col_idx]
            # '-' is treated as "no price", the same as an empty cell.
            if pd.isna(price) or price == '-':
                continue
            try:
                price_value = float(price)
            except (ValueError, TypeError):
                # Non-numeric cell content: skip just this cell.
                continue
            records.append({
                'Date': parsed_date,
                'Location': location,
                'Commodity': commodity,
                'Price': price_value,
            })

    if not records:
        print("    - Warning: No valid data records found on this sheet after processing.")
        return None

    return pd.DataFrame.from_records(records)


# Ordered (group, keywords) pairs. A commodity is assigned to the first group
# that has a keyword occurring as a substring of its upper-cased name, so the
# order of this table matters.
_COMMODITY_GROUP_KEYWORDS = (
    ('Rice', ('RICE',)),
    ('Livestock & Poultry', ('PORK', 'BEEF', 'CHICKEN', 'CARABEEF', 'DRESSED', 'EGGS')),
    ('Fish', ('TILAPIA', 'GALUNGGONG', 'BANGUS', 'FISH')),
    ('Vegetables', ('CABBAGE', 'CARROT', 'TOMATO', 'POTATO', 'EGGPLANT', 'AMPALAYA',
                    'SITAW', 'PECHAY', 'ONION', 'GARLIC')),
    ('Fruits', ('BANANA', 'MANGO', 'CALAMANSI', 'PAPAYA', 'PINEAPPLE')),
)


def _get_commodity_group(commodity_name):
    """Return the commodity group for a name, or 'Other Condiments' if none match."""
    name_upper = str(commodity_name).upper()
    for group, keywords in _COMMODITY_GROUP_KEYWORDS:
        if any(keyword in name_upper for keyword in keywords):
            return group
    return 'Other Condiments'


def process_excel_workbook(excel_filepath):
    """
    Main function to read a multi-sheet Excel workbook, process each sheet,
    and combine them into a single master DataFrame.

    Sheets whose name contains 'list' or 'pivot' (case-insensitive) are
    skipped. Every other sheet is read without a header row and passed to
    transform_sheet(); an error on one sheet is printed and does not stop the
    remaining sheets from being processed.

    Args:
        excel_filepath: Path of the workbook to read.

    Returns:
        A DataFrame with columns 'Date', 'Location', 'Commodity_Group',
        'Commodity', 'Price_Type' and 'Price', sorted by location, group,
        commodity and date with a fresh index. Returns None when the file
        does not exist or when no sheet yielded any data.
    """
    print(f"--- Starting processing of Excel workbook: '{excel_filepath}' ---")
    try:
        xls = pd.ExcelFile(excel_filepath)
    except FileNotFoundError:
        print(f"Error: The file '{excel_filepath}' was not found.")
        return None

    all_sheets_data = []
    # The context manager closes the workbook's file handle once all sheets
    # have been read, including when an unexpected error propagates.
    with xls:
        for sheet_name in xls.sheet_names:
            if any(skip_word in sheet_name.lower() for skip_word in ['list', 'pivot']):
                print(f"  - Skipping non-data sheet: '{sheet_name}'")
                continue

            print(f"  - Processing sheet: '{sheet_name}'")
            try:
                df_raw = pd.read_excel(xls, sheet_name=sheet_name, header=None)
                cleaned_sheet = transform_sheet(df_raw)
                if cleaned_sheet is not None and not cleaned_sheet.empty:
                    all_sheets_data.append(cleaned_sheet)
            except Exception as e:
                # Best-effort: report the failing sheet and carry on.
                print(f"    - UNEXPECTED ERROR processing sheet '{sheet_name}': {e}")

    if not all_sheets_data:
        print("\n--- ERROR: No data could be extracted from any sheet. ---")
        return None

    master_df = pd.concat(all_sheets_data, ignore_index=True)

    # Every record produced here is labelled with the same price type.
    master_df['Price_Type'] = 'Retail_Price'
    master_df['Commodity_Group'] = master_df['Commodity'].apply(_get_commodity_group)

    final_cols = ['Date', 'Location', 'Commodity_Group', 'Commodity', 'Price_Type', 'Price']
    final_df = (
        master_df[final_cols]
        .sort_values(by=['Location', 'Commodity_Group', 'Commodity', 'Date'])
        .reset_index(drop=True)
    )

    print("\n--- Workbook processing complete! ---")
    return final_df

# --- Main execution block ---
# Runs the full pipeline on one workbook: clean it, preview the result,
# show the rows for 'Quezon', and write the combined dataset to CSV.
if __name__ == "__main__":
    input_file = 'Attachment_Statistical_Tables_on_Price_Situationer_2ndPhaseJun25.xlsx'

    # Set before any processing so it applies to every pandas call below.
    pd.set_option('future.no_silent_downcasting', True)

    cleaned_data = process_excel_workbook(input_file)

    if cleaned_data is None or cleaned_data.empty:
        # Nothing usable came back from the workbook.
        print("\nFailed to generate a dataset. The final DataFrame was empty.")
    else:
        # Preview the first 20 rows of the combined dataset.
        print("\n--- Sample of Final Combined Dataset ---")
        print(cleaned_data.head(20).to_string())

        # Spot-check a single location.
        print("\n--- Filtered Data for Quezon ---")
        quezon_data = cleaned_data[cleaned_data['Location'] == 'Quezon']
        if quezon_data.empty:
            print("No data found for 'Quezon'.")
        else:
            print(quezon_data.to_string())

        # Persist the combined dataset without the DataFrame index.
        output_file = 'cleaned_master_prices.csv'
        cleaned_data.to_csv(output_file, index=False)
        print(f"\nSuccessfully saved the final combined dataset to '{output_file}'")

--- Starting processing of Excel workbook: 'Attachment_Statistical_Tables_on_Price_Situationer_2ndPhaseJun25.xlsx' ---
  - Processing sheet: 'Table_All_SR (From Editing)'
  - Skipping non-data sheet: 'list'
  - Processing sheet: 'Table_All_SR'


  for idx, row in parser.parse():


  - Processing sheet: 'Table 1_rice'
  - Processing sheet: 'Table 2_meat'
  - Processing sheet: 'Table 3_fish'
  - Processing sheet: 'Table 4_veg'
  - Processing sheet: 'Table 5_condiments'
  - Processing sheet: 'Table 6_fruits'
  - Processing sheet: 'Table 7_other comm'

--- Workbook processing complete! ---

--- Sample of Final Combined Dataset ---
          Date Location      Commodity_Group                          Commodity    Price_Type  Price
0   2025-05-15     Abra                 Fish                             BANGUS  Retail_Price  245.0
1   2025-05-15     Abra                 Fish                         GALUNGGONG  Retail_Price  220.0
2   2025-05-15     Abra                 Fish                            TILAPIA  Retail_Price  180.0
3   2025-05-15     Abra               Fruits                    BANANA, LAKATAN  Retail_Price  140.0
4   2025-05-15     Abra               Fruits                   BANANA, LATUNDAN  Retail_Price   60.0
5   2025-05-15     Abra               Fru