In [2]:
import pandas as pd
import os
import numpy as np
# Import display for notebook-style output
try:
    from IPython.display import display
except ImportError:
    # Define a fallback display function if not in a notebook
    display = print

print("--- Starting Final Data Preparation ---")

# Folder that holds the processed electricity data. It is created up front
# (a no-op when it already exists) so the final write has somewhere to land.
processed_dir = '../data/electricity/processed/'
os.makedirs(processed_dir, exist_ok=True)

# Input: the combined demand file read by this cell.
input_parquet_file = '../data/electricity/processed/demanddata_combined.parquet'
# Output: the model-ready file written at the end of this cell.
final_parquet_file_path = os.path.join(processed_dir, 'demand_model_ready.parquet')

# --- 2. LOAD DATA ---
# Read the combined demand data into `df`. If the file is missing, print a
# hint about how to produce it and then re-raise the original error.
# Re-raising (instead of calling exit()) stops "Run All" with a normal
# traceback and does not ask the Jupyter kernel to shut down.
try:
    df = pd.read_parquet(input_parquet_file)
except FileNotFoundError:
    print(f"\nERROR: Input file not found at {input_parquet_file}. Please check path and run 01/02 notebooks.")
    raise


# --- 3. CORE CLEANING & TYPE FIXES ---

# Derive the calendar year of each settlement date as a compact int16 column.
# Assumes SETTLEMENT_DATE is already datetime from the previous notebook
# (the .dt accessor raises otherwise).
df['Year'] = df['SETTLEMENT_DATE'].dt.year.astype('int16')
print("Created 'Year' column:")
print(df[['SETTLEMENT_DATE', 'Year']].head())

# Columns whose missing values are replaced with 0.
# NOTE(review): this list includes TSD, a demand column; after the fill a 0
# there means "not recorded", not zero demand -- confirm downstream models
# handle that as intended.
zero_fill_columns = [
    'TSD',
    'EMBEDDED_WIND_GENERATION',
    'EMBEDDED_WIND_CAPACITY',
    'EMBEDDED_SOLAR_GENERATION',
    'EMBEDDED_SOLAR_CAPACITY',
    'SCOTTISH_TRANSFER',
    'IFA2_FLOW',
    'BRITNED_FLOW',
    'MOYLE_FLOW',
    'EAST_WEST_FLOW',
    'NEMO_FLOW',
    'NSL_FLOW',
    'ELECLINK_FLOW',
    'VIKING_FLOW',
    'GREENLINK_FLOW',
]

# Assign the filled columns back to the frame. The previous form,
# df[col].fillna(0, inplace=True), operates on a temporary Series: pandas 2.x
# warns about it and with Copy-on-Write enabled it leaves `df` unchanged.
# Selecting by list still raises KeyError if an expected column is absent.
df[zero_fill_columns] = df[zero_fill_columns].fillna(0)


# 1. Convert the 'ND' column to a numeric dtype; values that cannot be parsed
#    become NaN (errors='coerce').
df['ND'] = pd.to_numeric(df['ND'], errors='coerce')


# 2. Apply the log transform log(1 + ND). np.log1p is the dedicated NumPy
#    routine for this expression; invert it with np.expm1.
df['ND_log'] = np.log1p(df['ND'])



# --- 6. FINAL SAVE ---
print("\n2. Saving final model-ready file to Parquet...")
# Make sure the folder that will receive the output file exists.
os.makedirs(os.path.dirname(final_parquet_file_path), exist_ok=True)

# --- FINAL CHECK ---
# Lift the column-count limit so the preview shows every column, then render
# the first five rows. The option is set globally and stays in effect for any
# later cells.
print("\n--- Displaying First 5 Rows (Complete Check) ---")
pd.options.display.max_columns = None
display(df.head())
# -------------------

# Write the frame to Parquet, including its index.
# NOTE(review): the index printed by this cell is 0, 1, 2, ... and
# SETTLEMENT_DATE is used as a regular column, so the index does not look
# like a datetime index -- confirm whether index=True is still wanted.
df.to_parquet(final_parquet_file_path, index=True)

# Closing summary: where the file went and how large the frame is.
print(
    "\n--- Final Prep Complete! ---",
    f"Model-ready data saved to: {final_parquet_file_path}",
    f"Final DataFrame shape: {df.shape}",
    sep="\n",
)

--- Starting Final Data Preparation ---
Created 'Year' column:
  SETTLEMENT_DATE  Year
0      2001-01-01  2001
1      2001-01-01  2001
2      2001-01-01  2001
3      2001-01-01  2001
4      2001-01-01  2001

2. Saving final model-ready file to Parquet...

--- Displaying First 5 Rows (Complete Check) ---


Unnamed: 0,SETTLEMENT_DATE,SETTLEMENT_PERIOD,ND,TSD,ENGLAND_WALES_DEMAND,EMBEDDED_WIND_GENERATION,EMBEDDED_WIND_CAPACITY,EMBEDDED_SOLAR_GENERATION,EMBEDDED_SOLAR_CAPACITY,NON_BM_STOR,PUMP_STORAGE_PUMPING,SCOTTISH_TRANSFER,IFA_FLOW,IFA2_FLOW,BRITNED_FLOW,MOYLE_FLOW,EAST_WEST_FLOW,NEMO_FLOW,NSL_FLOW,ELECLINK_FLOW,VIKING_FLOW,GREENLINK_FLOW,Year,ND_log
0,2001-01-01,1,38631,0.0,34060.0,0.0,0.0,0.0,0.0,0,862,0.0,1495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2001,10.561836
1,2001-01-01,2,39808,0.0,35370.0,0.0,0.0,0.0,0.0,0,153,0.0,1496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2001,10.591848
2,2001-01-01,3,40039,0.0,35680.0,0.0,0.0,0.0,0.0,0,18,0.0,1511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2001,10.597634
3,2001-01-01,4,39339,0.0,35029.0,0.0,0.0,0.0,0.0,0,9,0.0,1958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2001,10.579997
4,2001-01-01,5,38295,0.0,34047.0,0.0,0.0,0.0,0.0,0,211,0.0,1996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2001,10.553101



--- Final Prep Complete! ---
Model-ready data saved to: ../data/electricity/processed/demand_model_ready.parquet
Final DataFrame shape: (434590, 24)
