In [57]:
import sys
import os
os.chdir("c:/Users/lackerman008/OneDrive - pwc/Outside/Code/Machine learning/Electricity Shortfall Challenge")

print(f"working directory: {os.getcwd()}")



from data_loading import load_data
from config_and_logging import load_config, generate_run_id, save_run_metadata, create_output_dir, log_to_mlflow
from model_pipeline import choose_best_model

import joblib

config_path = os.path.join('Configs', 'shallow4.yaml')

config = load_config(config_path=config_path)
run_name = config['run']['run_name']
run_id = generate_run_id(config)
output_dir = create_output_dir(run_name, run_id)

target_column = config['data']['target_column']
print("Run name:", run_name)
print("Run ID:", run_id)
print("Data laden...")
train_df, test_df, _ = load_data(config)


working directory: c:\Users\lackerman008\OneDrive - pwc\Outside\Code\Machine learning\Electricity Shortfall Challenge
Run name: shallow4
Run ID: _20250721_214318
Data laden...


In [58]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from plots import plot_predictions
from models import train_ar_diff_model, predict_ar_diff, get_model
from preprocessing import get_pipeline_for_model, create_preprocessing_pipeline, get_imputer
from model_pipeline import split_data

print("Choose best models...")

train_val_split = config['preprocessing']['train_val_split']

best_rmse = float("inf")
best_model = None
best_model_name = ""
best_X_train = None
best_pipeline = None

y_train = train_df[config['data']['target_column']]
X_train = train_df.drop(columns=[config['data']['target_column']])

def evaluate_model(y_true, y_pred):
    """
    Bereken de Root Mean Squared Error
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))

def get_pipeline_for_model(model, config):

    if model["type"] in ["AR", "MA", "SMA"]:
        return None  # TODO pipeline to be added
    
    elif model.get('scaling', None):
        return create_preprocessing_pipeline(imputer=get_imputer(config), 
                                             freq=config['preprocessing']['freq'],
                                             fill_method=config['preprocessing']['fill_method'], 
                                             add_time_dummies=config['preprocessing']['add_time_dummies'], 
                                             scaling=model['scaling'])

    else:
        raise ValueError(f"No preprocessing pipeline defined for model_type: {model['type']}")




Choose best models...


In [59]:
print(config['models'])

[{'type': 'LinearRegression', 'params': {}, 'scaling': True}, {'type': 'RandomForest', 'params': {'n_estimators': 10, 'random_state': 42}, 'scaling': False}, {'type': 'Ridge', 'params': {'alpha': 10.0, 'random_state': 42}, 'scaling': True}, {'type': 'Lasso', 'params': {'alpha': 1.0, 'random_state': 42}, 'scaling': True}, {'type': 'ElasticNet', 'params': {'alpha': 1.0, 'l1_ratio': 0.5, 'random_state': 42}, 'scaling': True}, {'type': 'RandomForest', 'params': {'n_estimators': 100, 'max_depth': 10, 'random_state': 42}, 'scaling': False}, {'type': 'ExtraTreesRegressor', 'params': {'n_estimators': 100, 'max_depth': 10, 'random_state': 42}, 'scaling': False}, {'type': 'GradientBoostingRegressor', 'params': {'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42}, 'scaling': False}, {'type': 'XGBRegressor', 'params': {'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42}, 'scaling': False}, {'type': 'KNeighborsRegressor', 'params': {'n_neighbors

In [60]:
for model in config['models']:
    print(model)
    print(model["type"])

{'type': 'LinearRegression', 'params': {}, 'scaling': True}
LinearRegression
{'type': 'RandomForest', 'params': {'n_estimators': 10, 'random_state': 42}, 'scaling': False}
RandomForest
{'type': 'Ridge', 'params': {'alpha': 10.0, 'random_state': 42}, 'scaling': True}
Ridge
{'type': 'Lasso', 'params': {'alpha': 1.0, 'random_state': 42}, 'scaling': True}
Lasso
{'type': 'ElasticNet', 'params': {'alpha': 1.0, 'l1_ratio': 0.5, 'random_state': 42}, 'scaling': True}
ElasticNet
{'type': 'RandomForest', 'params': {'n_estimators': 100, 'max_depth': 10, 'random_state': 42}, 'scaling': False}
RandomForest
{'type': 'ExtraTreesRegressor', 'params': {'n_estimators': 100, 'max_depth': 10, 'random_state': 42}, 'scaling': False}
ExtraTreesRegressor
{'type': 'GradientBoostingRegressor', 'params': {'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.1, 'random_state': 42}, 'scaling': False}
GradientBoostingRegressor
{'type': 'XGBRegressor', 'params': {'n_estimators': 100, 'max_depth': 4, 'learning_rate

In [63]:
from preprocessing import get_pipeline_for_model

for model in config['models']:
        print(model['type'])
        pipeline = get_pipeline_for_model(model, config)
        
        X_train_processed, y_train_processed = pipeline.fit_transform(X_train, y_train)

        X_train_new, X_val, y_train_new, y_val = split_data(X_train_processed, y_train_processed, train_val_split)

        model_type = get_model(model['type'], model['params'])
        trained_model = model_type.fit(X_train_new, y_train_new)

        # Predict, evaluate and plot
        predictions = trained_model.predict(X_val)
        rmse = evaluate_model(y_val, predictions)
        plot_predictions(y_val, predictions, model['type'], output_dir, dataset_name="validation")
        
        print(f"Model: {model['type']}, RMSE: {rmse:.4f}")

        if rmse < best_rmse:
            best_rmse = rmse
            best_model = model
            best_model_name = model['type']
            best_X_train = X_train
            best_pipeline = pipeline

LinearRegression


TypeError: list indices must be integers or slices, not dict