In [1]:
from data_loading import load_data
from config_and_logging import load_config, generate_run_id, save_run_metadata, create_output_dir, log_to_mlflow
from model_pipeline import choose_best_model, train_full_model_predict_test_set
from models import get_model
from preprocessing import get_imputer, create_preprocessing_pipeline

import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from plots import plot_predictions
from models import train_ar_diff_model, predict_ar_diff

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


ModuleNotFoundError: No module named 'data_loading'

In [3]:
config_path = 'Configs/shallow2_scaling_timeseriessplit.yaml'

config = load_config(config_path=os.path.join(os.getcwd(), config_path))
run_name = config['run']['run_name']
run_id = generate_run_id(config)
output_dir = create_output_dir(run_name, run_id)

target_column = config['data']['target_column']
print("Run name:", run_name)
print("Run ID:", run_id)
print("Data laden...")
train_df, test_df, sample_submission = load_data(config)

print("Pipelines aanmaken...")

# Shared preprocessing config
imputer = get_imputer(config)
freq = config['preprocessing']['freq']
fill_method = config['preprocessing']['fill_method']
add_time_dummies = config['preprocessing']['add_time_dummies']

NameError: name 'load_config' is not defined

In [3]:
# Split X and y sets
y_train = train_df[target_column]
X_train = train_df.drop(columns=[target_column])

# Pipelines
pipeline_scaled = create_preprocessing_pipeline(imputer, freq, fill_method, add_time_dummies, scaling=True)
pipeline_no_scaling = create_preprocessing_pipeline(imputer, freq, fill_method, add_time_dummies, scaling=False)

# Fit both pipelines on training data
X_train_scaled = pipeline_scaled.fit_transform(X_train)
X_train_no_scaling = pipeline_no_scaling.fit_transform(X_train)

# Transform test set as well (will be needed later)
test_scaled = pipeline_scaled.transform(test_df)
test_no_scaling = pipeline_no_scaling.transform(test_df)




In [None]:
print(y_train.shape)
print(X_train_scaled.shape)
print(X_train_no_scaling.shape)

(8763,)
(8763, 51)
(8763, 51)


: 

In [56]:
# Convert Valencia_wind_deg to numerical values if it exists
if 'Valencia_wind_deg' in train_df.columns:
    print("Converting Valencia_wind_deg to numerical values...")
    train_df['Valencia_wind_deg_cat'] = train_df['Valencia_wind_deg'].astype(str).str.replace('level_', '').astype(float)
    train_df = train_df.drop(columns=['Valencia_wind_deg'])

# Convert Seville_pressure to numerical values if it exists
if 'Seville_pressure' in train_df.columns:
    print("Converting Seville_pressure to numerical values...")
    train_df['Seville_pressure_cat'] = train_df['Seville_pressure'].astype(str).str.replace('sp', '').astype(float)
    train_df = train_df.drop(columns=['Seville_pressure'])


print("before", train_df.isna().sum().sum())   
def set_datetime_as_index(df, fill_method='interpolate', freq='3H'):
        """
        Convert 'time' column to datetime index, reindex to regular intervals,
        and impute missing rows.
        """
        # Convert and sort time
        df['time'] = pd.to_datetime(df['time'])
        df = df.set_index('time')

        # Create complete datetime index
        full_index = pd.date_range(start=df.index.min(), end=df.index.max(), freq='3H')
        print([dat for dat in full_index if dat not in df.index])  # Check the difference between the first index and the full index
        # Reindex
        df = df.reindex(full_index)


        # Identify fully missing rows
        fully_missing_mask = df.isna().all(axis=1)
        print(df.index[fully_missing_mask])
           
        df = df.asfreq(freq)

        df.drop(index=df.index[fully_missing_mask], inplace=True)
        return df

# Ensure time is datetime type
if 'time' in train_df.columns:
    train_df_index = set_datetime_as_index(train_df)

print("after", train_df_index.isna().sum().sum()) 
print(train_df_index.shape)
print(y_train.shape)



before 2068
[Timestamp('2015-01-05 15:00:00'), Timestamp('2015-01-05 18:00:00'), Timestamp('2015-02-01 15:00:00'), Timestamp('2015-02-01 18:00:00')]
DatetimeIndex(['2015-01-05 15:00:00', '2015-01-05 18:00:00',
               '2015-02-01 15:00:00', '2015-02-01 18:00:00'],
              dtype='datetime64[ns]', freq=None)
after 2068
(8763, 48)
(8763,)


  full_index = pd.date_range(start=df.index.min(), end=df.index.max(), freq='3H')
  df = df.asfreq(freq)


In [4]:
# Load models
model_cfgs = config['models']
models_to_try = {}

for mc in model_cfgs:
    model_name = mc['type']
    scaling_needed = mc.get('scaling', False)

    if scaling_needed:
        X_train_transformed = X_train_scaled
        X_test_transformed = test_scaled
    else:
        X_train_transformed = X_train_no_scaling
        X_test_transformed = test_no_scaling

    model = get_model(model_name, mc['params'])

    models_to_try[model_name] = {
        'model': model,
        'X_train': X_train_transformed.copy(),
        'X_test': X_test_transformed.copy()
    }

In [11]:
from model_pipeline import evaluate_model

def train_and_evaluate_model(output_dir, model, model_name, X_train, y_train, X_val, y_val):
    """ Train en evalueer een model op de validatieset. """

    if model_name == "AR1":
        model_fit, last_value, lags = train_ar_diff_model(y_train)
        predictions = predict_ar_diff(model_fit, last_value, lags, steps=len(y_val), index=y_val.index)
        
    elif model_name in ["MA1", "MA2", "SMA"]:
        predictions = train_and_predict_ma(model_name, y_train, len(y_val), y_val.index if isinstance(y_val, pd.Series) else None)
        
    else:
        model.fit(X_train, y_train.values)
        predictions = model.predict(X_val)

    # Bereken RMSE
    rmse = evaluate_model(y_val, predictions)

    plot_predictions(y_val, predictions, model_name, output_dir, dataset_name="validation")

    return model, model_name, rmse, predictions

def train_and_predict_ma(model_name, y_train, prediction_steps, y_val_index=None):
    """
    Train and predict using Moving Average models

    Args:
        model_name: 'MA1', 'MA2', or 'SMA'
        y_train: Training time series
        prediction_steps: Number of steps to predict
        y_val_index: Index for predictions
    """
    predictions = []

    if model_name == 'MA1':
        # MA(1) - uses last residual
        # For simplicity, using naive implementation
        last_value = y_train.iloc[-1]
        for _ in range(prediction_steps):
            pred = last_value  # Simplified MA(1)
            predictions.append(pred)
            
    elif model_name == 'MA2':
        # MA(2) - uses last 2 residuals
        last_values = y_train.tail(2).mean()
        for _ in range(prediction_steps):
            pred = last_values  # Simplified MA(2)
            predictions.append(pred)
            
    elif model_name == 'SMA':
        # Simple Moving Average
        window = min(8, len(y_train))  # 24-hour window or available data
        sma_value = y_train.tail(window).mean()
        predictions = [sma_value] * prediction_steps

    # Convert to pandas Series with proper index
    if y_val_index is not None:
        return pd.Series(predictions, index=y_val_index)
    else:
        return np.array(predictions)

train_val_split = config['preprocessing']['train_val_split']

best_rmse = float("inf")
best_model = None
best_model_name = ""
best_X_train = None
best_X_test = None

for model_name, entry in models_to_try.items():
    if model_name in ['LinearRegression', 'RandomForest', 'Ridge', 'Lasso', 'ElasticNet', 'BayesianRidge', 
                      'SGDRegressor', 'ExtraTreesRegressor', 'GradientBoostingRegressor', 'XGBRegressor', 
                      'KNeighborsRegressor', 'SVR']:
        continue
        

    model = entry['model']
    X_train = entry['X_train']
    X_test = entry['X_test']
    
    # Define split point
    train_val_loc = int(len(X_train) * (1-train_val_split))

    # Split
    X_train_new = X_train[:train_val_loc]
    X_val = X_train[train_val_loc:]

    y_train_new = y_train.iloc[:train_val_loc]
    y_val = y_train.iloc[train_val_loc:]

    print(f"Training model: {model_name} with {X_train_new.shape[0]} training samples and {X_val.shape[0]} validation samples")

    # Here: simple train on X_train, evaluate on same data (adjust to proper CV or split if needed)
    model, model_name, rmse, predictions = train_and_evaluate_model(
        output_dir, model, model_name, X_train_new, y_train_new, X_val, y_val
    )
    
    print(f"Model: {model_name}, RMSE: {rmse:.4f}")

    if rmse < best_rmse:
        best_rmse = rmse
        best_model = model
        best_model_name = model_name
        best_X_train = X_train
        best_X_test = X_test

Training model: MA1 with 7010 training samples and 1753 validation samples
Model: MA1, RMSE: 5563.9103
Training model: MA2 with 7010 training samples and 1753 validation samples
Model: MA2, RMSE: 5602.8634
Training model: SMA with 7010 training samples and 1753 validation samples
Model: SMA, RMSE: 5517.9642


In [12]:
def train_full_model_predict_test_set(best_model, X_train, X_test, y_train):

    if best_model == "AutoReg":  # Voor AR1 modellen hebben we alleen y_train nodig
        model_fit, last_value, lags = train_ar_diff_model(y_train)
        test_predictions = predict_ar_diff(model_fit, last_value, lags, steps=len(X_test), index=X_test.index if isinstance(X_test, pd.DataFrame) else None)

    elif model_name in ["MA1", "MA2", "SMA"]:
        test_predictions = train_and_predict_ma(model_name, y_train, len(X_test), X_test.index if isinstance(X_test, pd.DataFrame) else None)

    else:  # Voor andere modellen gebruiken we zowel X_train als y_train
        best_model.fit(X_train, y_train.values)
        test_predictions = best_model.predict(X_test)

    return test_predictions

metrics = {"rmse_validation": best_rmse, "model": best_model_name}

save_run_metadata(output_dir, config, metrics)

# Log to MLflow

log_to_mlflow(config, output_dir, run_id, best_model_name, best_model, metrics, parameters=config.get("models", {}))

if True:
    # Train on full set and predict on test set
    test_predictions = train_full_model_predict_test_set(
        best_model, 
        best_X_train, 
        best_X_test, 
        y_train
    )

    submission_df = pd.DataFrame({
        'time': test_df.index,  # or test_df['time'] if that's your column
        'load_shortfall_3h': test_predictions
    })
    submission_df.to_csv('sample_submission.csv', index=False)
    print("\nVoorspellingen opgeslagen in 'sample_submission.csv'")




Voorspellingen opgeslagen in 'sample_submission.csv'
