In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import ElasticNet, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputRegressor
import pandas as pd
import numpy as np

In [23]:
df = pd.read_csv("../data/processed/combined.csv")
df.head()

Unnamed: 0,YYYYMMDD,HH,MWD,MWS,MWS10,WG,T,TD,SD,GR,P,NO2,O3
0,20160101,1,200,40,40,70,74,44,0,0,10212,36.53,20.24
1,20160101,2,200,40,30,70,65,44,0,0,10219,22.74,32.12
2,20160101,3,210,30,30,50,55,42,0,0,10225,24.28,29.87
3,20160101,4,210,30,30,40,55,46,0,0,10228,22.56,27.62
4,20160101,5,110,10,10,40,21,15,0,0,10233,23.67,25.62


In [42]:
X = df.iloc[:, 1:11]
y = df.iloc[:, 11:] 

In [51]:
split_idx = int(0.8 * len(X))
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

split_index = int(0.75* len(X_train))
X_train, X_val = X_train[:split_index], X_train[split_index:]
y_train, y_val = y_train[:split_index], y_train[split_index:]

print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_val shape: {X_val.shape}, y_val shape: {y_val.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

X_train shape: (26308, 10), y_train shape: (26308, 2)
X_val shape: (8770, 10), y_val shape: (8770, 2)
X_test shape: (8770, 10), y_test shape: (8770, 2)


In [52]:
sk = StandardScaler()
X_train_scaled = sk.fit_transform(X_train)
X_val_scaled = sk.transform(X_val)
X_test_scaled = sk.transform(X_test)

pca = PCA(4)
X_train_scaled = pca.fit_transform(X_train_scaled)
X_val_scaled = pca.transform(X_val_scaled)
X_test_scaled = pca.transform(X_test_scaled)

In [45]:
def split_sequences(trainig_window, prediction_window, X, y):

    X_windows = []
    y_windows = []
    summed_window = trainig_window + prediction_window

    for start in range(0, len(X) - summed_window):  

        X_window = X[ start: start+trainig_window ]

        y_window = y[start + trainig_window : start+ summed_window ]

        X_windows.append(X_window)
        y_windows.append(y_window)

    return np.array(X_windows), np.array(y_windows)

In [49]:
import itertools

def grid_search_validation(pipeline, param_grid, X_train, y_train, X_val, y_val):


    # Get all combinations of hyperparameters
    param_combinations = list(itertools.product(*param_grid.values()))
    best_params = None
    best_score = np.inf
    best_model = None
    

    for combination in param_combinations:
        params = {key: value for key, value in zip(param_grid.keys(), combination)}
        
        pipeline.set_params(**params)
        
        pipeline.fit(X_train, y_train)
        y_val_pred = pipeline.predict(X_val)
        
        score = mean_squared_error(y_val, y_val_pred)
        
        if score < best_score:
            best_score = score
            best_params = params
            best_model = pipeline
    
    return best_model, best_params, best_score

In [54]:
param_grid = {
     'regressor__estimator__alpha': [50],
     'regressor__estimator__l1_ratio': [0.01]
 }

pipeline = Pipeline(steps = [
         ('regressor', MultiOutputRegressor(ElasticNet(random_state = 0)))
         ])


windows = [240]
prediction_window = 72 # three days

for trainig_window in windows:

    X_train_windows, y_train_windows = split_sequences(trainig_window, prediction_window, X_train_scaled, y_train)
    X_val_windows, y_val_windows = split_sequences(trainig_window, prediction_window, X_val_scaled, y_val)
    X_test_windows, y_test_windows = split_sequences(trainig_window, prediction_window, X_test_scaled, y_test)


    X_train_w = X_train_windows.reshape(X_train_windows.shape[0], -1)
    X_test_w = X_test_windows.reshape(X_test_windows.shape[0], -1)
    X_val_w = X_val_windows.reshape(X_val_windows.shape[0], -1)


    y_train_w = y_train_windows.reshape(y_train_windows.shape[0], -1)
    y_test_w = y_test_windows.reshape(y_test_windows.shape[0], -1)
    y_val_w = y_val_windows.reshape(y_val_windows.shape[0], -1)

    best_model, best_params, best_score = grid_search_validation(pipeline, param_grid, X_train_w, y_train_w, X_val_w, y_val_w)

    print(f"For window {trainig_window}: ")
    print(f"The best parameters: {best_params}")
    print(f"Best validation score (MSE): {best_score}")

For window 240: 
The best parameters: {'regressor__estimator__alpha': 50, 'regressor__estimator__l1_ratio': 0.01}
Best validation score (MSE): 315.032631022548


In [58]:
y_pred = best_model.predict(X_test_w)

# Evaluate performance 
mae = mean_absolute_error(y_test_w, y_pred)
print(f"Mean Absolute Error: {mae}")

mse = mean_squared_error(y_test_w, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Absolute Error: 13.92451922509907
Mean Squared Error: 305.3729317828829
