In [2]:
import pandas as pd
import numpy as np
import os
import joblib 
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn Imports
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import OrdinalEncoder

# Set plotting style for consistent output
sns.set_style("whitegrid")

In [None]:
# --- 1. Settings and Data Loading (Defining the DataFrame) ---
print("--- Starting 05 Quick Baseline Model & Feature Importance (Scikit-learn DTR) ---")

# Define the relative path to the cleaned Parquet file created in 03_final_prep.ipynb
# This structure assumes the script is run from a notebook folder (e.g., /notebooks/05...)
PROCESSED_DATA_PATH = './../../data/electricity/processed/electricity_model_ready.parquet'
MODEL_PATH = './../../data/trained_models/electricity/'

try:
    # Use pandas (pd) to load the model-ready DataFrame. This defines the 'df' variable.
    resolved_path = os.path.normpath(PROCESSED_DATA_PATH)
    print(f"1. Loading cleaned data from resolved path: {resolved_path}")
    df = pd.read_parquet(resolved_path)
    print(f"   -> Data successfully loaded. Total records: {len(df):,}")
    
    # Check if the dataframe is empty after loading
    if df.empty:
        raise ValueError("Loaded DataFrame is empty. Check 03_final_prep.ipynb output.")

except FileNotFoundError:
    print(f"   -> ERROR: File not found at {PROCESSED_DATA_PATH}. Please run 03_final_prep.ipynb first.")
    exit()
except Exception as e:
    print(f"   -> An error occurred during data loading: {e}")
    exit()


--- Starting 05 Quick Baseline Model & Feature Importance (Scikit-learn DTR) ---
1. Loading cleaned data from resolved path: ../../data/electricity/processed/electricity_model_ready.parquet
   -> ERROR: File not found at ./../../data/electricity/processed/electricity_model_ready.parquet. Please run 03_final_prep.ipynb first.


: 

In [1]:
# --- 2. Feature and Target Preparation ---

# Target variable (y): We use the logarithm of the price to normalize the distribution.
y = np.log(df['ND'])

# Features (X): Select all relevant categorical and numerical features.
features = ['SETTLEMENT_DATE', 'SETTLEMENT_PERIOD', 'Year', 'ND', 'TSD', 'ENGLAND_WALES_DEMAND']
X = df[features].copy() # Use .copy() to avoid SettingWithCopyWarning

# Identify categorical features
categorical_features = [col for col in X.columns if X[col].dtype.name == 'category']

NameError: name 'np' is not defined