In [2]:
import sys
import os
# Run from the project root so that relative paths used throughout the notebook
# (Configs/, saved_models/, data/) resolve; presumably this is also what makes the
# project-local modules imported further down this cell importable.
# The location can be overridden with the ESC_PROJECT_ROOT environment variable;
# the fallback is the original author's local checkout.
PROJECT_ROOT = os.environ.get(
    "ESC_PROJECT_ROOT",
    "c:/Users/lackerman008/OneDrive - pwc/Outside/Code/Machine learning/Electricity Shortfall Challenge",
)
os.chdir(PROJECT_ROOT)

print(f"working directory: {os.getcwd()}")



from data_loading import load_data
from config_and_logging import load_config, generate_run_id, save_run_metadata, create_output_dir, log_to_mlflow
from model_pipeline import choose_best_model

import joblib

# Load the run configuration for the 'shallow4' experiment; the path is relative
# to the working directory set earlier in this cell.
config_path = os.path.join('Configs', 'shallow4.yaml')

config = load_config(config_path=config_path)
# Run identity: the name is read from the config, the id is produced by
# generate_run_id (the recorded output shows a timestamp-like "_20250722_125215").
run_name = config['run']['run_name']
run_id = generate_run_id(config)
# Presumably a per-run output directory; it is later handed to plot_predictions
# in the model-selection cell.
output_dir = create_output_dir(run_name, run_id)

target_column = config['data']['target_column']
print("Run name:", run_name)
print("Run ID:", run_id)
# The runtime message below is Dutch for "Loading data...".
print("Data laden...")
# load_data returns three values; only the first two (train and test data) are kept.
train_df, test_df, _ = load_data(config)


working directory: c:\Users\lackerman008\OneDrive - pwc\Outside\Code\Machine learning\Electricity Shortfall Challenge
Run name: shallow4
Run ID: _20250722_125215
Data laden...


In [19]:
from preprocessing import create_preprocessing_pipeline, StandardTransformerWrapper, TimeAwareKNNImputer
import pandas as pd

# Smoke test for the preprocessing pipeline on a tiny hand-made frame: build it once
# without time features and once with cyclical time features, then compare the shapes.
imputer = TimeAwareKNNImputer()  # a single instance, shared by both pipelines below

# Four consecutive hourly observations.
df = pd.DataFrame(
    {
        'temperature': [20, 21, 19, 22],
        'humidity': [30, 35, 40, 45],
        'time': pd.date_range("2023-01-01", periods=4, freq="h"),
    }
)

steps_no_dummies = create_preprocessing_pipeline(imputer=imputer, add_time_dummies=None)
pipeline_no_dummies = StandardTransformerWrapper(steps_no_dummies)
output1 = pipeline_no_dummies.fit_transform(df)

steps_with_dummies = create_preprocessing_pipeline(imputer=imputer, add_time_dummies='cyclical')
pipeline_with_dummies = StandardTransformerWrapper(steps_with_dummies)
output2 = pipeline_with_dummies.fit_transform(df)

# NOTE(review): the recorded run printed (2, 2) and (2, 11), i.e. only two of the
# four input rows came out; presumably the pipeline resamples with its default
# frequency — confirm this is intended.
for transformed in (output1, output2):
    print(transformed.shape)
print(pd.DataFrame(output2).head())



(2, 2)
(2, 11)
     0     1    2    3    4         5         6         7        8    9   \
0  20.0  30.0  0.0  6.0  1.0  0.000000  1.000000 -0.781831  0.62349  0.5   
1  22.0  45.0  3.0  6.0  1.0  0.707107  0.707107 -0.781831  0.62349  0.5   

         10  
0  0.866025  
1  0.866025  


In [58]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from plots import plot_predictions
from models import train_ar_diff_model, predict_ar_diff, get_model
from preprocessing import get_pipeline_for_model, create_preprocessing_pipeline, get_imputer
from model_pipeline import split_data

print("Choose best models...")

# Train/validation split setting, read from the preprocessing section of the config.
train_val_split = config['preprocessing']['train_val_split']

# Running "best so far" bookkeeping for the model-selection loop; the RMSE starts
# at +inf so that the first evaluated model always wins.
best_rmse, best_model, best_model_name = float("inf"), None, ""
best_X_train = best_pipeline = None

# Separate the features from the target column named in the config.
X_train = train_df.drop(columns=[config['data']['target_column']])
y_train = train_df[config['data']['target_column']]

def evaluate_model(y_true, y_pred):
    """Return the root mean squared error (RMSE) of a set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        The square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def get_pipeline_for_model(model, config):
    """Build the preprocessing pipeline for one entry of ``config['models']``.

    NOTE(review): this notebook-level definition shadows the
    ``get_pipeline_for_model`` imported from ``preprocessing`` at the top of
    this cell.

    Args:
        model: Model spec dict with a ``"type"`` key and a boolean
            ``"scaling"`` key (e.g. ``{'type': 'Ridge', 'scaling': True}``).
        config: Run configuration; ``config['preprocessing']`` must provide
            ``freq``, ``fill_method`` and ``add_time_dummies``.

    Returns:
        ``None`` for the model types ``"AR"``, ``"MA"`` and ``"SMA"`` (no
        pipeline exists for them yet); otherwise the object returned by
        ``create_preprocessing_pipeline``.

    Raises:
        ValueError: If the spec has no ``"scaling"`` entry (or it is ``None``).
    """
    if model["type"] in ("AR", "MA", "SMA"):
        return None  # TODO pipeline to be added

    # Check for a missing flag, not a falsy one: specs with ``scaling: False``
    # (tree-based models in the config) still need a pipeline, just unscaled.
    scaling = model.get('scaling', None)
    if scaling is None:
        raise ValueError(f"No preprocessing pipeline defined for model_type: {model['type']}")

    preprocessing_cfg = config['preprocessing']
    return create_preprocessing_pipeline(
        imputer=get_imputer(config),
        freq=preprocessing_cfg['freq'],
        fill_method=preprocessing_cfg['fill_method'],
        add_time_dummies=preprocessing_cfg['add_time_dummies'],
        scaling=scaling,
    )




Choose best models...


In [59]:
# Inspect the list of model specs from the loaded config; per the recorded output
# each entry is a dict with 'type', 'params' and 'scaling' keys.
print(config['models'])

[{'type': 'LinearRegression', 'params': {}, 'scaling': True}, {'type': 'RandomForest', 'params': {'n_estimators': 10, 'random_state': 42}, 'scaling': False}, {'type': 'Ridge', 'params': {'alpha': 10.0, 'random_state': 42}, 'scaling': True}, {'type': 'Lasso', 'params': {'alpha': 1.0, 'random_state': 42}, 'scaling': True}, {'type': 'ElasticNet', 'params': {'alpha': 1.0, 'l1_ratio': 0.5, 'random_state': 42}, 'scaling': True}, {'type': 'RandomForest', 'params': {'n_estimators': 100, 'max_depth': 10, 'random_state': 42}, 'scaling': False}, {'type': 'ExtraTreesRegressor', 'params': {'n_estimators': 100, 'max_depth': 10, 'random_state': 42}, 'scaling': False}, {'type': 'GradientBoostingRegressor', 'params': {'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42}, 'scaling': False}, {'type': 'XGBRegressor', 'params': {'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42}, 'scaling': False}, {'type': 'KNeighborsRegressor', 'params': {'n_neighbors

In [60]:
# Echo every configured model spec, followed on the next line by its "type" value.
for model in config['models']:
    print(model, model["type"], sep="\n")

{'type': 'LinearRegression', 'params': {}, 'scaling': True}
LinearRegression
{'type': 'RandomForest', 'params': {'n_estimators': 10, 'random_state': 42}, 'scaling': False}
RandomForest
{'type': 'Ridge', 'params': {'alpha': 10.0, 'random_state': 42}, 'scaling': True}
Ridge
{'type': 'Lasso', 'params': {'alpha': 1.0, 'random_state': 42}, 'scaling': True}
Lasso
{'type': 'ElasticNet', 'params': {'alpha': 1.0, 'l1_ratio': 0.5, 'random_state': 42}, 'scaling': True}
ElasticNet
{'type': 'RandomForest', 'params': {'n_estimators': 100, 'max_depth': 10, 'random_state': 42}, 'scaling': False}
RandomForest
{'type': 'ExtraTreesRegressor', 'params': {'n_estimators': 100, 'max_depth': 10, 'random_state': 42}, 'scaling': False}
ExtraTreesRegressor
{'type': 'GradientBoostingRegressor', 'params': {'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42}, 'scaling': False}
GradientBoostingRegressor
{'type': 'XGBRegressor', 'params': {'n_estimators': 100, 'max_depth': 4, 'learning_rate

In [63]:
from preprocessing import get_pipeline_for_model

# Model selection: preprocess, fit and validate every configured model spec and
# remember the one with the lowest validation RMSE.
# NOTE(review): the recorded run of this cell stopped in the first iteration with
# "TypeError: list indices must be integers or slices, not dict"; presumably the
# get_pipeline_for_model imported from preprocessing expects different arguments
# than the notebook-level definition — confirm before relying on this loop.
for model in config['models']:
    print(model['type'])
    pipeline = get_pipeline_for_model(model, config)

    # The notebook-level get_pipeline_for_model returns None for AR/MA/SMA specs;
    # skip those instead of failing on None.fit_transform below.
    if pipeline is None:
        print(f"Model: {model['type']}, skipped (no preprocessing pipeline)")
        continue

    # NOTE(review): the pipeline is fitted on the full training set before the
    # train/validation split, so anything it learns presumably also sees the
    # validation rows — confirm this is intended.
    X_train_processed, y_train_processed = pipeline.fit_transform(X_train, y_train)

    X_train_new, X_val, y_train_new, y_val = split_data(X_train_processed, y_train_processed, train_val_split)

    model_type = get_model(model['type'], model['params'])
    trained_model = model_type.fit(X_train_new, y_train_new)

    # Predict, evaluate and plot
    predictions = trained_model.predict(X_val)
    rmse = evaluate_model(y_val, predictions)
    plot_predictions(y_val, predictions, model['type'], output_dir, dataset_name="validation")

    print(f"Model: {model['type']}, RMSE: {rmse:.4f}")

    if rmse < best_rmse:
        best_rmse = rmse
        # NOTE(review): this stores the model *spec dict*, not `trained_model`,
        # and the raw (unprocessed) X_train — verify that is what later steps expect.
        best_model = model
        best_model_name = model['type']
        best_X_train = X_train
        best_pipeline = pipeline

LinearRegression


TypeError: list indices must be integers or slices, not dict

In [None]:
import joblib 
import os
# Run from the project root so the relative artifact path below resolves.
# The location can be overridden with the ESC_PROJECT_ROOT environment variable;
# the fallback is the original author's local checkout.
PROJECT_ROOT = os.environ.get(
    "ESC_PROJECT_ROOT",
    "c:/Users/lackerman008/OneDrive - pwc/Outside/Code/Machine learning/Electricity Shortfall Challenge",
)
os.chdir(PROJECT_ROOT)

# NOTE(review): joblib.load unpickles arbitrary objects — only load artifacts
# that come from a trusted source.
model = joblib.load("saved_models/best_model.pkl")

In [5]:
# Load the persisted preprocessing pipeline and the best model from saved_models/
# (paths are relative to the current working directory).
# Both names are also used as loop variables in the model-selection cell, so
# running this cell rebinds them; `model` is the same artifact already loaded in
# the preceding cell.
# NOTE(review): joblib.load unpickles arbitrary objects — only load artifacts
# that come from a trusted source.
pipeline = joblib.load("saved_models/preprocessing_pipeline.pkl")
model = joblib.load("saved_models/best_model.pkl")

In [7]:
from sklearn.pipeline import Pipeline

# Chain the loaded preprocessing step and the loaded estimator into one
# scikit-learn Pipeline so they can be persisted and called as a single object.
# NOTE(review): in the model-selection cell the preprocessing pipeline's
# fit_transform is unpacked into an (X, y) pair; if the loaded pipeline behaves
# the same way, confirm it is compatible with sklearn's Pipeline contract.
full_pipeline = Pipeline(
    steps=[
        ("preprocessing", pipeline),  # transformer loaded from saved_models/
        ("model", model),             # trained estimator loaded from saved_models/
    ]
)

In [None]:
# Save combined artifact
import yaml
def load_config(config_path="configs/shallow4.yaml"):
    """Read a YAML configuration file and return its parsed content.

    NOTE(review): this definition shadows the ``load_config`` imported from
    ``config_and_logging`` in the first cell; its default path is spelled
    ``configs/`` whereas that cell uses ``Configs/``.

    Args:
        config_path: Path to the YAML file, relative to the working directory
            unless absolute.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Read as UTF-8 explicitly: without it open() falls back to the locale
    # encoding (e.g. cp1252 on Windows), which can mis-decode non-ASCII values.
    with open(config_path, "r", encoding="utf-8") as config_file:
        return yaml.safe_load(config_file)

# Re-read the config through the load_config defined in this cell, using its
# default path; this rebinds the notebook-wide `config` variable.
config = load_config()

# Persist the combined preprocessing+model pipeline. Folder and file name come from
# the 'output' section of the config; the recorded run wrote
# 'saved_models/combined_model.joblib'. The target folder is not created here.
joblib.dump(full_pipeline, f"{config['output']['saved_models_folder']}/{config['output']['combined_model_filename']}")

['saved_models/combined_model.joblib']

In [3]:
import pandas as pd
import os
import json
# Run from the project root so the relative data path below resolves.
# The location can be overridden with the ESC_PROJECT_ROOT environment variable;
# the fallback is the original author's local checkout.
PROJECT_ROOT = os.environ.get(
    "ESC_PROJECT_ROOT",
    "c:/Users/lackerman008/OneDrive - pwc/Outside/Code/Machine learning/Electricity Shortfall Challenge",
)
os.chdir(PROJECT_ROOT)

# Load the raw test set and take its first row as a sample request.
df = pd.read_csv("data/data_raw/df_test.csv")

# Column name -> value for that single row (includes the 'Unnamed: 0' index
# column and the 'time' string, as seen in the recorded output).
row = df.iloc[0].to_dict()

# Wrap the row in the {"features": ...} envelope.
payload = {"features": row}

# Write the payload next to the notebook's working directory.
# NOTE(review): missing values (e.g. 'Valencia_pressure' is nan in the recorded
# output) are serialised by json.dump as the non-standard token NaN; strict JSON
# parsers reject that — confirm the consumer accepts it.
with open("test_input.json", "w") as f:
    json.dump(payload, f, indent=4)

print("Saved row as JSON in test_input.json")
print(payload)

Saved row as JSON in test_input.json
{'features': {'Unnamed: 0': 8763, 'time': '2018-01-01 00:00:00', 'Madrid_wind_speed': 5.0, 'Valencia_wind_deg': 'level_8', 'Bilbao_rain_1h': 0.0, 'Valencia_wind_speed': 5.0, 'Seville_humidity': 87.0, 'Madrid_humidity': 71.3333333333, 'Bilbao_clouds_all': 20.0, 'Bilbao_wind_speed': 3.0, 'Seville_clouds_all': 0.0, 'Bilbao_wind_deg': 193.3333333333, 'Barcelona_wind_speed': 4.0, 'Barcelona_wind_deg': 176.6666666667, 'Madrid_clouds_all': 0.0, 'Seville_wind_speed': 1.0, 'Barcelona_rain_1h': 0.0, 'Seville_pressure': 'sp25', 'Seville_rain_1h': 0.0, 'Bilbao_snow_3h': 0, 'Barcelona_pressure': 1017.3333333333, 'Seville_rain_3h': 0.0, 'Madrid_rain_1h': 0.0, 'Barcelona_rain_3h': 0.0, 'Valencia_snow_3h': 0, 'Madrid_weather_id': 800.0, 'Barcelona_weather_id': 800.0, 'Bilbao_pressure': 1025.6666666667, 'Seville_weather_id': 800.0, 'Valencia_pressure': nan, 'Seville_temp_max': 284.4833333333, 'Madrid_pressure': 1030.0, 'Valencia_temp_max': 287.4833333333, 'Valencia_

git add .
git commit -m "WIF fix"
git push