# In [None]:
import pandas as pd
import os

# --- Paths and settings ---
# All paths are relative to the current working directory.
print("--- Starting Final Data Preparation ---")
csv_file_path = '../data/electricity/price_paid_records.csv'
processed_dir = '../data/electricity/processed/'
final_parquet_file_path = os.path.join(processed_dir, 'electricity_model_ready.parquet')

# Create the output directory up front; exist_ok avoids an error on re-runs.
os.makedirs(processed_dir, exist_ok=True)

# --- Load the raw CSV (reported as step 1 in the console output) ---
print(f"1. Loading raw CSV from: {csv_file_path}")
try:
    df = pd.read_csv(
        csv_file_path,
        # Ask pandas to parse this column as dates; the year is derived
        # from it later through the `.dt` accessor.
        parse_dates=['SETTLEMENT_DATE'],
        low_memory=False,
        engine='c',
    )
except FileNotFoundError:
    print(f"   -> ERROR: File not found. Path: {csv_file_path}. Aborting.")
    # `exit()` is an interactive helper injected by the `site` module and is
    # not meant for programs; it also reports success (status 0). Raise
    # SystemExit directly with a non-zero status so callers see the failure.
    raise SystemExit(1)
else:
    print(f"   -> Load successful. Raw records: {len(df):,}")

# --- Cleaning and feature engineering (step 2 in the console output) ---
print("\n2. Applying cleaning and feature engineering...")

# A. Column Renaming (Mandatory Assignment Requirement)
def clean_col_name(col: str) -> str:
    """Return *col* normalised to a lower-case, underscore-separated label.

    Surrounding whitespace is removed first, then the text is lower-cased and
    every remaining space is replaced by an underscore. Stripping has to come
    before the replacement: otherwise leading/trailing spaces are already
    turned into underscores and the strip has nothing left to remove
    (``' Price '`` would become ``'_price_'``).

    Args:
        col: The raw column label.

    Returns:
        The cleaned label, e.g. ``'Settlement Date'`` -> ``'settlement_date'``.
    """
    return col.strip().lower().replace(' ', '_')

# Normalise every column label (lower-case, spaces -> underscores).
df.columns = [clean_col_name(col) for col in df.columns]

# B. Feature engineering: derive the calendar year from the settlement date.
# The labels were lower-cased just above, so the date column has to be looked
# up as 'settlement_date'; the original upper-case label no longer exists and
# indexing with it raises KeyError.
# NOTE(review): the new column keeps the name 'Year' as originally written,
# which does not follow the lower-case convention applied above -- confirm
# what downstream consumers expect before renaming it.
df['Year'] = df['settlement_date'].dt.year

# C. Handle outliers (filtering out non-market sales) -- currently disabled.
# NOTE(review): steps C, D and F reference columns ('price',
# 'transaction_unique_identifier', ...) that are not otherwise used in this
# script; confirm they exist in this dataset before re-enabling.
# count_low_price = len(df[df['price'] <= 1])
# df = df[df['price'] > 1].copy()

# print(f"   -> Removed {count_low_price:,} low-value transactions (Price <= £1).")
# print(f"   -> Cleaned records remaining: {len(df):,}")

# D. Final column cleanup (drop the unique ID column) -- currently disabled.
# df.drop(columns=['transaction_unique_identifier'], inplace=True)
# print("   -> Columns renamed, sale_year created, and ID dropped.")

# F. Drop redundant columns -- currently disabled.
# cols_to_drop = ['ppd_category_type', 'record_status_-_monthly_file_only']
# df.drop(columns=cols_to_drop, inplace=True, errors='ignore')
# print(f"\nFinal number of columns: {len(df.columns)}")

# --- Final save (reported as step 3 in the console output) ---
print("\n3. Saving final model-ready file to Parquet...")
# index=False: the row index is not written to the file.
df.to_parquet(final_parquet_file_path, index=False)

print("\n--- Final Prep Complete! ---")
print(f"Model-ready data saved to: {final_parquet_file_path}")