In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import gc

In [6]:
hull_df = pd.read_csv('Hull15-22-final.csv')

In [7]:
def shift_columns(df):
    # Shift the PM2.5 column values by 1, 2, and 3 positions
    df['1 hour'] = df['PM2.5'].shift(-1)
    df['2 hour'] = df['PM2.5'].shift(-2)
    df['3 hour'] = df['PM2.5'].shift(-3)
    df = df.iloc[:-3]
    return df


In [8]:
hull_df = shift_columns(hull_df)

In [5]:
hull_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70125 entries, 0 to 70124
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Date                     70125 non-null  object 
 1   time                     70125 non-null  object 
 2   Ozone                    70125 non-null  float64
 3   Nitric oxide             70125 non-null  float64
 4   Nitrogen dioxide         70125 non-null  float64
 5   Nitrogen oxides          70125 non-null  float64
 6   Sulphur dioxide          70125 non-null  float64
 7   PM10                     70125 non-null  float64
 8   PM2.5                    70125 non-null  float64
 9   Wind Direction           70125 non-null  float64
 10  Wind Speed               70125 non-null  float64
 11  Temperature              70125 non-null  float64
 12  Date and time            70125 non-null  object 
 13  Humidity                 70125 non-null  float64
 14  Rainfall              

In [6]:
hull_df.head()

Unnamed: 0,Date,time,Ozone,Nitric oxide,Nitrogen dioxide,Nitrogen oxides,Sulphur dioxide,PM10,PM2.5,Wind Direction,...,Cloud Base Height,Total Cloud Amount,weekday,month,hour,season,Wind Cardinal Direction,1 hour,2 hour,3 hour
0,2015-01-01,01:00:00,41.84318,1.91668,13.99823,16.9371,0.52686,19.559712,14.0,190.0,...,666.666667,8.0,4,1,1,1,5,12.2,16.4,12.7
1,2015-01-01,02:00:00,34.99127,2.96234,25.92585,30.46804,0.922,21.14982,12.2,190.0,...,8333.333333,6.0,4,1,2,1,5,16.4,12.7,12.1
2,2015-01-01,03:00:00,39.06583,1.56817,18.59572,21.00021,1.1209,24.309648,16.4,190.0,...,8333.333333,0.0,4,1,3,1,5,12.7,12.1,16.7
3,2015-01-01,04:00:00,38.91615,1.69882,17.11879,19.72361,1.385,18.785991,12.7,190.0,...,1000.0,3.0,4,1,4,1,5,12.1,16.7,13.2
4,2015-01-01,05:00:00,38.71658,1.65548,17.63134,20.1697,1.78081,18.396321,12.1,180.0,...,1000.0,8.0,4,1,5,1,5,16.7,13.2,11.325


In [7]:
hull_df.tail()

Unnamed: 0,Date,time,Ozone,Nitric oxide,Nitrogen dioxide,Nitrogen oxides,Sulphur dioxide,PM10,PM2.5,Wind Direction,...,Cloud Base Height,Total Cloud Amount,weekday,month,hour,season,Wind Cardinal Direction,1 hour,2 hour,3 hour
70120,2022-12-31,17:00:00,41.21121,3.83409,25.51641,31.39527,0.61416,14.75,11.392,160.0,...,5000.0,8.0,6,12,17,1,5,5.259,5.59,5.991
70121,2022-12-31,18:00:00,45.05293,3.24661,20.06227,25.04034,0.1675,7.175,5.259,180.0,...,5000.0,8.0,6,12,18,1,5,5.59,5.991,6.722
70122,2022-12-31,19:00:00,42.70798,3.89593,25.51644,31.49012,0.335,7.7,5.59,190.0,...,3333.333333,8.0,6,12,19,1,5,5.991,6.722,7.005
70123,2022-12-31,20:00:00,38.36733,4.57617,28.21999,35.23669,0.44666,7.825,5.991,200.0,...,3333.333333,8.0,6,12,20,1,5,6.722,7.005,5.094
70124,2022-12-31,21:00:00,34.87486,4.76169,27.1293,34.43047,0.39083,8.6,6.722,200.0,...,3333.333333,8.0,6,12,21,1,5,7.005,5.094,3.019


In [9]:
scale_factors = {
   'Ozone':10, 'Hourly Total Radiation': 100,'Cloud Base Height':100, 'Humidity':10
}

# Apply scaling
for column, factor in scale_factors.items():
    if column in hull_df.columns:
        hull_df[column] = hull_df[column] / factor

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [11]:
 features = ['Nitric oxide', 'Nitrogen dioxide', 'Nitrogen oxides', 'PM10','PM2.5','Temperature', 'Wind Speed',
              'Humidity', 'Wind Cardinal Direction', 'Maximum Gust Speed', 'Total Cloud Amount',
           'Ozone', 'Hourly Total Radiation','Cloud Base Height','hour', 'season']

In [12]:
def clear_model_memory():
    #Clear the memory of the model
  
    # Clear memory for scikit-learn
    gc.collect()
    
    # Clear memory for Keras/TensorFlow
    tf.keras.backend.clear_session()
    gc.collect()

In [9]:
def predict_pm25(df, features, output_folder= 'results'):
    # Ensure 'Date and time' is of datetime type
    if not pd.api.types.is_datetime64_any_dtype(df['Date and time']):
        df['Date and time'] = pd.to_datetime(df['Date and time'])
    
    # Sorting by datetime to maintain order
    df = df.sort_values(by='Date and time')
    
    # Creating a 'time_index' column to represent the time step
    df['time_index'] = np.arange(len(df))
    
    metrics_table = []
    
    # List of target columns
    targets = ['1 hour', '2 hour', '3 hour']
    
    for target in targets:
        # Preparing the dataset
        X = df[features]
        y = df[target]
        
        # Splitting the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        # Creating and training the Linear Regression model
        model = LinearRegression()
        model.fit(X_train, y_train)
        
        # Predicting the PM2.5 values for the test set
        y_pred = model.predict(X_test)
        
        # Calculating performance metrics
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
        
        # Append metrics to the table
        metrics_table.append({
            'Target': target,
            'MSE': mse,
            'MAE': mae,
            'R-squared': r2,
            'MAPE': mape
        })
        
        # Predicting PM2.5 for each row in the DataFrame
        df[f'PM2.5_pre {target}'] = model.predict(X)
    
    # Convert metrics table to DataFrame for better visualization
    metrics_df = pd.DataFrame(metrics_table)
    
    # Save the metrics DataFrame and the updated dataset
    metrics_df.to_csv(f'{output_folder}/metrics_table_reg.csv', index=False)
    df.to_csv(f'{output_folder}/predicted_reg.csv', index=False)
    
    print("Metrics Table:")
    print(metrics_df)
    return df, metrics_df

In [13]:
predict_pm25(hull_df,features, output_folder='results' )

Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   8.060731  1.820636   0.888340  28.413883
1  2 hour  14.314799  2.451510   0.800247  38.924839
2  3 hour  18.622169  2.867509   0.736767  47.372570


(             Date      time     Ozone  Nitric oxide  Nitrogen dioxide  \
 23     2015-01-01  00:00:00  6.071917       1.39417           9.09728   
 0      2015-01-01  01:00:00  4.184318       1.91668          13.99823   
 1      2015-01-01  02:00:00  3.499127       2.96234          25.92585   
 2      2015-01-01  03:00:00  3.906583       1.56817          18.59572   
 3      2015-01-01  04:00:00  3.891615       1.69882          17.11879   
 ...           ...       ...       ...           ...               ...   
 70120  2022-12-31  17:00:00  4.121121       3.83409          25.51641   
 70121  2022-12-31  18:00:00  4.505293       3.24661          20.06227   
 70122  2022-12-31  19:00:00  4.270798       3.89593          25.51644   
 70123  2022-12-31  20:00:00  3.836733       4.57617          28.21999   
 70124  2022-12-31  21:00:00  3.487486       4.76169          27.12930   
 
        Nitrogen oxides  Sulphur dioxide       PM10   PM2.5  Wind Direction  \
 23            11.23498        

In [13]:
from sklearn.linear_model import Ridge

In [14]:
def predict_pm25_ridge(df, features, alpha=1.0, output_folder="results"):
    # Ensure 'Date and time' is of datetime type
    if not pd.api.types.is_datetime64_any_dtype(df['Date and time']):
        df['Date and time'] = pd.to_datetime(df['Date and time'])
    
    # Sorting by datetime to maintain order
    df = df.sort_values(by='Date and time')
    
    # Creating a 'time_index' column to represent the time step
    df['time_index'] = np.arange(len(df))
    
    # Features for the model
    features =  features
    
    metrics_table = []
    
    # List of target columns
    targets = ['1 hour', '2 hour', '3 hour']
    
    for target in targets:
        # Preparing the dataset
        X = df[features]
        y = df[target]
        
        # Splitting the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        # Creating and training the Ridge Regression model
        model = Ridge(alpha=alpha)
        model.fit(X_train, y_train)
        
        # Predicting the PM2.5 values for the test set
        y_pred = model.predict(X_test)
        
        # Calculating performance metrics
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
        
        # Append metrics to the table
        metrics_table.append({
            'Target': target,
            'MSE': mse,
            'MAE': mae,
            'R-squared': r2,
            'MAPE': mape
        })
        
        # Predicting PM2.5 for each row in the DataFrame
        df[f'PM2.5_pre_{target}_ridge'] = model.predict(X)
    
    # Convert metrics table to DataFrame for better visualization
    metrics_df = pd.DataFrame(metrics_table)
    
    # Save the metrics DataFrame and the updated dataset
    metrics_df.to_csv(f'{output_folder}/metrics_table_ridge.csv', index=False)
    df.to_csv(f'{output_folder}/predicted_pm25_ridge.csv', index=False)
    
    print("Metrics Table:")
    print(metrics_df)
    return df, metrics_df

In [16]:
predict_pm25_ridge(hull_df, features, alpha=1.0, output_folder="results")

Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   8.060731  1.820637   0.888340  28.413885
1  2 hour  14.314799  2.451510   0.800247  38.924841
2  3 hour  18.622168  2.867509   0.736767  47.372573


(             Date      time     Ozone  Nitric oxide  Nitrogen dioxide  \
 23     2015-01-01  00:00:00  6.071917       1.39417           9.09728   
 0      2015-01-01  01:00:00  4.184318       1.91668          13.99823   
 1      2015-01-01  02:00:00  3.499127       2.96234          25.92585   
 2      2015-01-01  03:00:00  3.906583       1.56817          18.59572   
 3      2015-01-01  04:00:00  3.891615       1.69882          17.11879   
 ...           ...       ...       ...           ...               ...   
 70120  2022-12-31  17:00:00  4.121121       3.83409          25.51641   
 70121  2022-12-31  18:00:00  4.505293       3.24661          20.06227   
 70122  2022-12-31  19:00:00  4.270798       3.89593          25.51644   
 70123  2022-12-31  20:00:00  3.836733       4.57617          28.21999   
 70124  2022-12-31  21:00:00  3.487486       4.76169          27.12930   
 
        Nitrogen oxides  Sulphur dioxide       PM10   PM2.5  Wind Direction  \
 23            11.23498        

In [16]:
from sklearn.ensemble import RandomForestRegressor

In [13]:
def predict_pm25_random_forest(df, features, n_estimators=100, random_state=42, output_folder="results"):
    # Ensure 'Date and time' is of datetime type
    if not pd.api.types.is_datetime64_any_dtype(df['Date and time']):
        df['Date and time'] = pd.to_datetime(df['Date and time'])
    
    # Sorting by datetime to maintain order
    df = df.sort_values(by='Date and time')
    
    # Creating a 'time_index' column to represent the time step
    df['time_index'] = np.arange(len(df))
    
    # Features for the model
    features = features
    
    metrics_table = []
    
    # List of target columns
    targets = ['1 hour', '2 hour', '3 hour']
    
    for target in targets:
        # Preparing the dataset
        X = df[features]
        y = df[target]
        
        # Splitting the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
        
        # Creating and training the Random Forest Regressor model
        model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
        model.fit(X_train, y_train)
        
        # Predicting the PM2.5 values for the test set
        y_pred = model.predict(X_test)
        
        # Calculating performance metrics
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
        
        # Append metrics to the table
        metrics_table.append({
            'Target': target,
            'MSE': mse,
            'MAE': mae,
            'R-squared': r2,
            'MAPE': mape
        })
        
        # Predicting PM2.5 for each row in the DataFrame
        df[f'PM2.5_pre_{target}_rf'] = model.predict(X)
    
    # Convert metrics table to DataFrame for better visualization
    metrics_df = pd.DataFrame(metrics_table)
    
    # Save the metrics DataFrame and the updated dataset
    metrics_df.to_csv(f'{output_folder}/metrics_table_rf.csv', index=False)
    df.to_csv(f'{output_folder}/predicted_pm25_rf.csv', index=False)
    
    print("Metrics Table:")
    print(metrics_df)
    return df, metrics_df

In [19]:
predict_pm25_random_forest(hull_df,features, n_estimators=100, random_state=42, output_folder="results")

Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   7.864190  1.795811   0.891063  29.788680
1  2 hour  13.352688  2.364682   0.813673  39.822868
2  3 hour  16.708049  2.701554   0.763824  46.979947


(             Date      time     Ozone  Nitric oxide  Nitrogen dioxide  \
 23     2015-01-01  00:00:00  6.071917       1.39417           9.09728   
 0      2015-01-01  01:00:00  4.184318       1.91668          13.99823   
 1      2015-01-01  02:00:00  3.499127       2.96234          25.92585   
 2      2015-01-01  03:00:00  3.906583       1.56817          18.59572   
 3      2015-01-01  04:00:00  3.891615       1.69882          17.11879   
 ...           ...       ...       ...           ...               ...   
 70120  2022-12-31  17:00:00  4.121121       3.83409          25.51641   
 70121  2022-12-31  18:00:00  4.505293       3.24661          20.06227   
 70122  2022-12-31  19:00:00  4.270798       3.89593          25.51644   
 70123  2022-12-31  20:00:00  3.836733       4.57617          28.21999   
 70124  2022-12-31  21:00:00  3.487486       4.76169          27.12930   
 
        Nitrogen oxides  Sulphur dioxide       PM10   PM2.5  Wind Direction  \
 23            11.23498        

In [17]:
import xgboost as xgb

In [18]:
def predict_pm25_xgboost(df, features, output_folder="results", n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42):
    # Ensure 'Date and time' is of datetime type
    if not pd.api.types.is_datetime64_any_dtype(df['Date and time']):
        df['Date and time'] = pd.to_datetime(df['Date and time'])
    
    # Sorting by datetime to maintain order
    df = df.sort_values(by='Date and time')
    
    # Creating a 'time_index' column to represent the time step
    df['time_index'] = np.arange(len(df))
    
    # Features for the model
    features = features
    
    metrics_table = []
    
    # List of target columns
    targets = ['1 hour', '2 hour', '3 hour']
    
    for target in targets:
        # Preparing the dataset
        X = df[features]
        y = df[target]
        
        # Splitting the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
        
        # Creating and training the XGBoost Regressor model
        model = xgb.XGBRegressor(n_estimators=n_estimators, learning_rate=learning_rate, 
                                 max_depth=max_depth, random_state=random_state)
        model.fit(X_train, y_train)
        
        # Predicting the PM2.5 values for the test set
        y_pred = model.predict(X_test)
        
        # Calculating performance metrics
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
        
        # Append metrics to the table
        metrics_table.append({
            'Target': target,
            'MSE': mse,
            'MAE': mae,
            'R-squared': r2,
            'MAPE': mape
        })
        
        # Predicting PM2.5 for each row in the DataFrame
        df[f'PM2.5_pre {target}_xgboost'] = model.predict(X)
    
    # Convert metrics table to DataFrame for better visualization
    metrics_df = pd.DataFrame(metrics_table)
    
    # Save the metrics DataFrame and the updated dataset
    metrics_df.to_csv(f'{output_folder}/metrics_table_xgboost.csv', index=False)
    df.to_csv(f'{output_folder}/predicted_pm25_xgboost.csv', index=False)
    
    print("Metrics Table:")
    print(metrics_df)
    return df, metrics_df

In [22]:
predict_pm25_xgboost(hull_df,features, output_folder="results",  n_estimators=100, learning_rate=0.01, max_depth=3, random_state=42)

Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour  17.161333  2.833664   0.762276  54.726192
1  2 hour  22.561755  3.247741   0.685167  62.518294
2  3 hour  26.360644  3.549309   0.627380  67.749393


(             Date      time     Ozone  Nitric oxide  Nitrogen dioxide  \
 23     2015-01-01  00:00:00  6.071917       1.39417           9.09728   
 0      2015-01-01  01:00:00  4.184318       1.91668          13.99823   
 1      2015-01-01  02:00:00  3.499127       2.96234          25.92585   
 2      2015-01-01  03:00:00  3.906583       1.56817          18.59572   
 3      2015-01-01  04:00:00  3.891615       1.69882          17.11879   
 ...           ...       ...       ...           ...               ...   
 70120  2022-12-31  17:00:00  4.121121       3.83409          25.51641   
 70121  2022-12-31  18:00:00  4.505293       3.24661          20.06227   
 70122  2022-12-31  19:00:00  4.270798       3.89593          25.51644   
 70123  2022-12-31  20:00:00  3.836733       4.57617          28.21999   
 70124  2022-12-31  21:00:00  3.487486       4.76169          27.12930   
 
        Nitrogen oxides  Sulphur dioxide       PM10   PM2.5  Wind Direction  \
 23            11.23498        

In [40]:
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler

In [17]:
def predict_pm25_elastic_net(df, features, output_folder="results", alpha=1.0, l1_ratio=0.5, test_size=0.2, random_state=42):
    #Predict PM2.5 using Elastic Net linear regression and save metrics and dataset.
    # Ensure 'Date and time' is of datetime type
    if not pd.api.types.is_datetime64_any_dtype(df['Date and time']):
        df['Date and time'] = pd.to_datetime(df['Date and time'])
    
    # Sorting by datetime to maintain order
    df = df.sort_values(by='Date and time')
    
    # Creating a 'time_index' column to represent the time step
    df['time_index'] = np.arange(len(df))
    
    metrics_table = []
    
    # List of target columns
    targets = ['1 hour', '2 hour', '3 hour']
    
    for target in targets:
        # Preparing the dataset
        X = df[features]
        y = df[target]
        

        # Standardize the features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        
        # Splitting the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=test_size, random_state=random_state)
        
        # Creating and training the Elastic Net model
        model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=random_state)
        model.fit(X_train, y_train)
        
        # Predicting the PM2.5 values for the test set
        y_pred = model.predict(X_test)
        
        # Calculating performance metrics
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
        
        # Append metrics to the table
        metrics_table.append({
            'Target': target,
            'MSE': mse,
            'MAE': mae,
            'R-squared': r2,
            'MAPE': mape
        })
        
        # Predicting PM2.5 for each row in the DataFrame
        y_pred_full = model.predict(X_scaled)
        df[f'PM2.5_pre_{target}_elastic_net'] = y_pred_full
    
    # Convert metrics table to DataFrame for better visualization
    metrics_df = pd.DataFrame(metrics_table)
    
    # Save the metrics DataFrame and the updated dataset
    metrics_df.to_csv(f'{output_folder}/metrics_table_elastic_net.csv', index=False)
    df.to_csv(f'{output_folder}/predicted_pm25_elastic_net.csv', index=False)
    
    print("Metrics Table:")
    print(metrics_df)
    return df, metrics_df


In [27]:
output_folder = 'results'

predict_pm25_elastic_net(hull_df, features, output_folder=output_folder, alpha=1.0, l1_ratio=0.01, test_size=0.2, random_state=42)

Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour  17.308282  2.734021   0.760241  45.187911
1  2 hour  22.329686  3.129920   0.688405  53.051389
2  3 hour  26.070060  3.439548   0.631487  59.960072


(             Date      time     Ozone  Nitric oxide  Nitrogen dioxide  \
 23     2015-01-01  00:00:00  6.071917       1.39417           9.09728   
 0      2015-01-01  01:00:00  4.184318       1.91668          13.99823   
 1      2015-01-01  02:00:00  3.499127       2.96234          25.92585   
 2      2015-01-01  03:00:00  3.906583       1.56817          18.59572   
 3      2015-01-01  04:00:00  3.891615       1.69882          17.11879   
 ...           ...       ...       ...           ...               ...   
 70120  2022-12-31  17:00:00  4.121121       3.83409          25.51641   
 70121  2022-12-31  18:00:00  4.505293       3.24661          20.06227   
 70122  2022-12-31  19:00:00  4.270798       3.89593          25.51644   
 70123  2022-12-31  20:00:00  3.836733       4.57617          28.21999   
 70124  2022-12-31  21:00:00  3.487486       4.76169          27.12930   
 
        Nitrogen oxides  Sulphur dioxide       PM10   PM2.5  Wind Direction  \
 23            11.23498        

In [19]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Dropout
from tensorflow.keras.optimizers import Adam,  RMSprop
from keras.callbacks import EarlyStopping




In [20]:
def predict_pm25_bidirectional_lstm(df,features,  output_folder="results", epochs=10, batch_size=32):
    # Ensure 'Date and time' is of datetime type
    if not pd.api.types.is_datetime64_any_dtype(df['Date and time']):
        df['Date and time'] = pd.to_datetime(df['Date and time'])
    
    # Sorting by datetime to maintain order
    df = df.sort_values(by='Date and time')
    
    # Creating a 'time_index' column to represent the time step
    df['time_index'] = np.arange(len(df))
    
    # Features for the model
    features =  features
    
    metrics_table = []
    
    # List of target columns
    targets = ['1 hour', '2 hour', '3 hour']
    
    for target in targets:
        # Preparing the dataset
        X = df[features]
        y = df[target]
        
        # Standardize the features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        
        # Reshape input to be 3D [samples, timesteps, features] for LSTM
        X_scaled = X_scaled.reshape((X_scaled.shape[0], 1, X_scaled.shape[1]))
        
        # Splitting the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
        
        model = Sequential()
        model.add(Bidirectional(LSTM(100, return_sequences=True), input_shape=(X_train.shape[1], X_train.shape[2])))
        model.add(Dropout(0.2))
        model.add(Bidirectional(LSTM(100)))
        model.add(Dropout(0.2))
        model.add(Dense(1))
        
        model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
        # Early stopping to avoid overfitting
        early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
        model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), verbose=1, shuffle=False, callbacks=[early_stopping])
        
        # Predicting the PM2.5 values for the test set
        y_pred = model.predict(X_test).flatten()
        
        # Calculating performance metrics
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
        
        # Append metrics to the table
        metrics_table.append({
            'Target': target,
            'MSE': mse,
            'MAE': mae,
            'R-squared': r2,
            'MAPE': mape
        })
        
        # Predicting PM2.5 for each row in the DataFrame
        X_full_scaled = scaler.transform(X)
        X_full_scaled = X_full_scaled.reshape((X_full_scaled.shape[0], 1, X_full_scaled.shape[1]))
        df[f'PM2.5_pre {target}_bilstm'] = model.predict(X_full_scaled).flatten()
    
    # Convert metrics table to DataFrame for better visualization
    metrics_df = pd.DataFrame(metrics_table)
    
    # Save the metrics DataFrame and the updated dataset
    metrics_df.to_csv(f'{output_folder}/metrics_table_bilstm.csv', index=False)
    df.to_csv(f'{output_folder}/predicted_pm25_bilstm.csv', index=False)
    
    print("Metrics Table:")
    print(metrics_df)
    return df, metrics_df

In [30]:
predict_pm25_bidirectional_lstm(hull_df,features,  output_folder="results", epochs=10, batch_size=32)


Epoch 1/10

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   7.687412  1.794497   0.893512  29.459873
1  2 hour  13.900716  2.436364   0.806025  40.823592
2  3 hour  17.408212  2.787014   0.753927  47.738161


(             Date      time     Ozone  Nitric oxide  Nitrogen dioxide  \
 23     2015-01-01  00:00:00  6.071917       1.39417           9.09728   
 0      2015-01-01  01:00:00  4.184318       1.91668          13.99823   
 1      2015-01-01  02:00:00  3.499127       2.96234          25.92585   
 2      2015-01-01  03:00:00  3.906583       1.56817          18.59572   
 3      2015-01-01  04:00:00  3.891615       1.69882          17.11879   
 ...           ...       ...       ...           ...               ...   
 70120  2022-12-31  17:00:00  4.121121       3.83409          25.51641   
 70121  2022-12-31  18:00:00  4.505293       3.24661          20.06227   
 70122  2022-12-31  19:00:00  4.270798       3.89593          25.51644   
 70123  2022-12-31  20:00:00  3.836733       4.57617          28.21999   
 70124  2022-12-31  21:00:00  3.487486       4.76169          27.12930   
 
        Nitrogen oxides  Sulphur dioxide       PM10   PM2.5  Wind Direction  \
 23            11.23498        

In [21]:
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, TimeDistributed

In [22]:
def create_sequences(data, target, time_steps=3):
    X, y = [], []
    for i in range(len(data) - time_steps):
        X.append(data[i:(i + time_steps)])
        y.append(target[i + time_steps])
    return np.array(X), np.array(y)

In [23]:
from keras.layers import Conv1D, MaxPooling1D, Flatten, TimeDistributed

In [24]:
def predict_pm25_cnn_lstm(df, features, epochs=50, batch_size=64, time_steps=3, output_folder="results"):
    # Ensure 'Date and time' is of datetime type
    if not pd.api.types.is_datetime64_any_dtype(df['Date and time']):
        df['Date and time'] = pd.to_datetime(df['Date and time'])
    
    # Sorting by datetime to maintain order
    df = df.sort_values(by='Date and time')
    
    metrics_table = []
    
    # List of target columns
    targets = ['1 hour', '2 hour', '3 hour']
    
    for target in targets:
        # Preparing the dataset
        X = df[features].values
        y = df[target].values
        
        # Standardize the features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        
        # Create sequences
        X_seq, y_seq = create_sequences(X_scaled, y, time_steps)
        
        # Reshape for Conv1D
        X_seq = X_seq.reshape((X_seq.shape[0], time_steps, X_seq.shape[2], 1))
        
        # Splitting the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, random_state=42)
        
        # Creating and training the CNN-LSTM model
        model = Sequential()
        model.add(TimeDistributed(Conv1D(filters=64, kernel_size=2, activation='relu'), input_shape=(time_steps, X_train.shape[2], 1)))
        model.add(TimeDistributed(MaxPooling1D(pool_size=2)))
        model.add(TimeDistributed(Flatten()))
        model.add(LSTM(50, return_sequences=True))
        model.add(Dropout(0.2))
        model.add(LSTM(50))
        model.add(Dropout(0.2))
        model.add(Dense(1))
        model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
        
        # Early stopping to avoid overfitting
        early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
        
        model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), verbose=1, shuffle=False, callbacks=[early_stopping])
        
        # Predicting the PM2.5 values for the test set
        y_pred = model.predict(X_test).flatten()  # Flatten to ensure y_pred is 1D
        
        # Calculating performance metrics
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
        
        # Append metrics to the table
        metrics_table.append({
            'Target': target,
            'MSE': mse,
            'MAE': mae,
            'R-squared': r2,
            'MAPE': mape
        })
        
        # Predicting PM2.5 for each row in the DataFrame
        X_full_seq, _ = create_sequences(X_scaled, y, time_steps)
        X_full_seq = X_full_seq.reshape((X_full_seq.shape[0], time_steps, X_full_seq.shape[2], 1))
        predictions = model.predict(X_full_seq).flatten()
        
        # Add NaN for the first time_steps values to match the original DataFrame length
        predictions = np.concatenate([np.full(time_steps, np.nan), predictions])
        df[f'PM2.5_pre_{target}_cnn_lstm'] = predictions
    
    # Convert metrics table to DataFrame for better visualization
    metrics_df = pd.DataFrame(metrics_table)
    
    # Save the metrics DataFrame and the updated dataset
    metrics_df.to_csv(f'{output_folder}/metrics_table_cnn_lstm.csv', index=False)
    df.to_csv(f'{output_folder}/predicted_pm25_cnn_lstm.csv', index=False)
    
    print("Metrics Table:")
    print(metrics_df)
    return df, metrics_df


In [35]:
predict_pm25_cnn_lstm(hull_df, features, epochs=50, batch_size=64 , output_folder="results")


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 

(             Date      time     Ozone  Nitric oxide  Nitrogen dioxide  \
 23     2015-01-01  00:00:00  6.071917       1.39417           9.09728   
 0      2015-01-01  01:00:00  4.184318       1.91668          13.99823   
 1      2015-01-01  02:00:00  3.499127       2.96234          25.92585   
 2      2015-01-01  03:00:00  3.906583       1.56817          18.59572   
 3      2015-01-01  04:00:00  3.891615       1.69882          17.11879   
 ...           ...       ...       ...           ...               ...   
 70120  2022-12-31  17:00:00  4.121121       3.83409          25.51641   
 70121  2022-12-31  18:00:00  4.505293       3.24661          20.06227   
 70122  2022-12-31  19:00:00  4.270798       3.89593          25.51644   
 70123  2022-12-31  20:00:00  3.836733       4.57617          28.21999   
 70124  2022-12-31  21:00:00  3.487486       4.76169          27.12930   
 
        Nitrogen oxides  Sulphur dioxide       PM10   PM2.5  Wind Direction  \
 23            11.23498        

In [57]:
winter_df = pd.read_csv('winter_data.csv')
spring_df = pd.read_csv('spring_data.csv')
summer_df = pd.read_csv('summer_data.csv')
fall_df = pd.read_csv('autumn_data.csv')

In [75]:
winter_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17328 entries, 0 to 17327
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Date                     17328 non-null  object 
 1   time                     17328 non-null  object 
 2   Ozone                    17328 non-null  float64
 3   Nitric oxide             17328 non-null  float64
 4   Nitrogen dioxide         17328 non-null  float64
 5   Nitrogen oxides          17328 non-null  float64
 6   PM10                     17328 non-null  float64
 7   PM2.5                    17328 non-null  float64
 8   Wind Direction           17328 non-null  float64
 9   Wind Speed               17328 non-null  float64
 10  Temperature              17328 non-null  float64
 11  Date and time            17328 non-null  object 
 12  Humidity                 17328 non-null  float64
 13  Maximum Gust Speed       17328 non-null  float64
 14  Sea Level Pressure    

In [58]:
defs = [winter_df, spring_df, summer_df, fall_df]

In [59]:
# Apply scaling
for df in defs:
    for column, factor in scale_factors.items():
        if column in df.columns:
            df[column] = df[column] / factor

In [60]:
def shift_column(df):
    # Shift the PM2.5 column values by 1, 2, and 3 positions
    df['1 hour'] = df['PM2.5'].shift(-1)
    df['2 hour'] = df['PM2.5'].shift(-2)
    df['3 hour'] = df['PM2.5'].shift(-3)
    return df

In [61]:
for i in range(len(defs)):
    defs[i] = shift_column(defs[i])
    

In [80]:
winter_df.head()

Unnamed: 0,Date,time,Ozone,Nitric oxide,Nitrogen dioxide,Nitrogen oxides,PM10,PM2.5,Wind Direction,Wind Speed,...,Cloud Base Height,Total Cloud Amount,weekday,month,hour,season,Wind Cardinal Direction,1 hour,2 hour,3 hour
0,2015-01-01,01:00:00,41.84318,1.91668,13.99823,16.9371,19.559712,14.0,190.0,6.0,...,666.666667,8.0,4,1,1,1,5,12.2,16.4,12.7
1,2015-01-01,02:00:00,34.99127,2.96234,25.92585,30.46804,21.14982,12.2,190.0,11.0,...,8333.333333,6.0,4,1,2,1,5,16.4,12.7,12.1
2,2015-01-01,03:00:00,39.06583,1.56817,18.59572,21.00021,24.309648,16.4,190.0,11.0,...,8333.333333,0.0,4,1,3,1,5,12.7,12.1,16.7
3,2015-01-01,04:00:00,38.91615,1.69882,17.11879,19.72361,18.785991,12.7,190.0,11.0,...,1000.0,3.0,4,1,4,1,5,12.1,16.7,13.2
4,2015-01-01,05:00:00,38.71658,1.65548,17.63134,20.1697,18.396321,12.1,180.0,8.0,...,1000.0,8.0,4,1,5,1,5,16.7,13.2,11.325


In [81]:
winter_df.tail()

Unnamed: 0,Date,time,Ozone,Nitric oxide,Nitrogen dioxide,Nitrogen oxides,PM10,PM2.5,Wind Direction,Wind Speed,...,Cloud Base Height,Total Cloud Amount,weekday,month,hour,season,Wind Cardinal Direction,1 hour,2 hour,3 hour
17323,2022-12-31,20:00:00,38.36733,4.57617,28.21999,35.23669,7.825,5.991,200.0,8.0,...,3333.333333,8.0,6,12,20,1,5,6.722,7.005,5.094
17324,2022-12-31,21:00:00,34.87486,4.76169,27.1293,34.43047,8.6,6.722,200.0,8.0,...,3333.333333,8.0,6,12,21,1,5,7.005,5.094,3.019
17325,2022-12-31,22:00:00,42.25895,3.33937,23.09751,28.21781,8.5,7.005,230.0,7.0,...,666.666667,8.0,6,12,22,1,6,5.094,3.019,
17326,2022-12-31,23:00:00,53.68433,1.3914,16.6468,18.78026,6.425,5.094,230.0,9.0,...,666.666667,8.0,6,12,23,1,6,3.019,,
17327,2022-12-31,00:00:00,49.54325,2.75189,27.83972,32.05922,3.975,3.019,210.0,11.0,...,3333.333333,7.0,6,12,0,1,6,,,


In [62]:
 # Drop the last 3 rows
for i in range(len(defs)):
    defs[i] =defs[i].iloc[:-3]

In [29]:
defs[0].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17325 entries, 0 to 17324
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Date                     17325 non-null  object 
 1   time                     17325 non-null  object 
 2   Ozone                    17325 non-null  float64
 3   Nitric oxide             17325 non-null  float64
 4   Nitrogen dioxide         17325 non-null  float64
 5   Nitrogen oxides          17325 non-null  float64
 6   PM10                     17325 non-null  float64
 7   PM2.5                    17325 non-null  float64
 8   Wind Direction           17325 non-null  float64
 9   Wind Speed               17325 non-null  float64
 10  Temperature              17325 non-null  float64
 11  Date and time            17325 non-null  object 
 12  Humidity                 17325 non-null  float64
 13  Maximum Gust Speed       17325 non-null  float64
 14  Sea Level Pressure    

In [84]:
defs[1].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17661 entries, 0 to 17660
Data columns (total 27 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Date                     17661 non-null  object 
 1   time                     17661 non-null  object 
 2   Ozone                    17661 non-null  float64
 3   Nitric oxide             17661 non-null  float64
 4   Nitrogen dioxide         17661 non-null  float64
 5   Nitrogen oxides          17661 non-null  float64
 6   PM10                     17661 non-null  float64
 7   PM2.5                    17661 non-null  float64
 8   Wind Direction           17661 non-null  float64
 9   Wind Speed               17661 non-null  float64
 10  Temperature              17661 non-null  float64
 11  Date and time            17661 non-null  object 
 12  Humidity                 17661 non-null  float64
 13  Maximum Gust Speed       17661 non-null  float64
 14  Sea Level Pressure    

In [31]:
def ensure_folder_exists(folder_path):
   
    #Ensure that the folder exists; if not, create it.
    
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

In [32]:
def process_seasonal_datasets(dfs, features, prediction_function, model_name, output_folder="results", **kwargs):
   
    #Process multiple seasonal datasets with a given prediction function.

    ensure_folder_exists(output_folder)

    all_metrics = []
    all_predictions = []

    for i, df in enumerate(dfs):
        season_output_folder = f"{output_folder}/season_{i+1}"
        ensure_folder_exists(season_output_folder)
        
        print(f"Processing dataset {i+1}")
        df_result, metrics_df = prediction_function(df, features, output_folder=season_output_folder, **kwargs)
        all_metrics.append(metrics_df)
        all_predictions.append(df_result)

        # Clear memory after each model to ensure a fresh start
        clear_model_memory()

    # Concatenate all metrics and calculate the average
    combined_metrics = pd.concat(all_metrics).groupby('Target').mean().reset_index()
    
    # Concatenate all predictions into one DataFrame
    combined_predictions = pd.concat(all_predictions).sort_values(by=['Date and time'])
    
    # Save the combined results
    combined_metrics.to_csv(f'{output_folder}/average_metrics_{model_name}.csv', index=False)
    combined_predictions.to_csv(f'{output_folder}/combined_predictions_{model_name}.csv', index=False)
    
    print(f"Average Metrics Table for {model_name}:")
    print(combined_metrics)
    
    return combined_predictions, combined_metrics

In [88]:
simple_reg_predictions, average_metrics_simple_Reg = process_seasonal_datasets(
    defs, features, predict_pm25, model_name='simple_reg', output_folder=output_folder
)


Processing dataset 1
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   9.409848  1.968126   0.891537  36.399932
1  2 hour  17.315833  2.722120   0.805601  45.831787
2  3 hour  22.355330  3.199862   0.746142  54.694634
Processing dataset 2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date and time'] = pd.to_datetime(df['Date and time'])


Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour  12.488138  2.113249   0.863152  30.726355
1  2 hour  20.760973  2.815776   0.774350  41.089238
2  3 hour  28.262143  3.329770   0.691050  48.048009
Processing dataset 3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date and time'] = pd.to_datetime(df['Date and time'])


Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   8.629627  1.804917   0.834956  28.249190
1  2 hour  14.857192  2.386301   0.714951  40.153152
2  3 hour  19.260507  2.754539   0.628432  43.444542
Processing dataset 4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date and time'] = pd.to_datetime(df['Date and time'])


Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   7.489408  1.769298   0.872270  30.274280
1  2 hour  12.986293  2.398138   0.771526  38.364967
2  3 hour  17.675304  2.843305   0.687774  49.865062
Average Metrics Table for simple_reg:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   9.504255  1.913897   0.865479  31.412439
1  2 hour  16.480073  2.580584   0.766607  41.359786
2  3 hour  21.888321  3.031869   0.688349  49.013062


In [89]:
combined_predictions_ridge, average_metrics_ridge = process_seasonal_datasets(
    defs, features, predict_pm25_ridge, model_name='ridge', output_folder=output_folder, alpha=1.0,
)



Processing dataset 1
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   9.409844  1.968127   0.891537  36.399957
1  2 hour  17.315810  2.722120   0.805601  45.831822
2  3 hour  22.355319  3.199862   0.746142  54.694653
Processing dataset 2
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour  12.488123  2.113250   0.863152  30.726382
1  2 hour  20.761006  2.815777   0.774350  41.089260
2  3 hour  28.262146  3.329768   0.691050  48.048038
Processing dataset 3
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   8.629605  1.804916   0.834956  28.249191
1  2 hour  14.857172  2.386300   0.714951  40.153151
2  3 hour  19.260474  2.754537   0.628433  43.444532
Processing dataset 4
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   7.489372  1.769295   0.872271  30.274283
1  2 hour  12.986232  2.398134   0.771527  38.364959
2  3 hour  17.675248  2.843302   0.687775  49.865045
Average 

In [90]:
combined_predictions_random_forest, average_metrics_random_forest = process_seasonal_datasets(
    defs, features, predict_pm25_random_forest, model_name='random_forest', output_folder=output_folder, n_estimators=100, random_state=42
)


Processing dataset 1
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   9.847222  1.996203   0.886496  40.460407
1  2 hour  16.155892  2.587698   0.818623  46.418823
2  3 hour  20.284595  2.993850   0.769656  53.957310
Processing dataset 2
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour  12.324000  2.121951   0.864951  34.471949
1  2 hour  19.920338  2.751655   0.783487  43.512347
2  3 hour  25.898336  3.169684   0.716890  48.094746
Processing dataset 3
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   8.466123  1.789969   0.838083  28.689879
1  2 hour  14.296489  2.323289   0.725708  39.794220
2  3 hour  17.635662  2.619733   0.659778  42.320944
Processing dataset 4
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   7.552478  1.792216   0.871194  31.913085
1  2 hour  12.674370  2.340992   0.777014  38.966513
2  3 hour  15.655369  2.672471   0.723455  48.627668
Average 

In [93]:
combined_predictions_xgboost, average_metrics_xgboost = process_seasonal_datasets(
    defs, features, predict_pm25_xgboost, model_name='xgboost',
    output_folder=output_folder, n_estimators=100, learning_rate=0.01, max_depth=3, random_state=42
)


Processing dataset 1
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour  21.164744  3.203536   0.756044  72.170303
1  2 hour  28.763751  3.709865   0.677079  75.961459
2  3 hour  32.901859  4.038544   0.626379  80.875858
Processing dataset 2
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour  23.999402  3.259655   0.737009  59.835558
1  2 hour  31.782212  3.729986   0.654561  66.951720
2  3 hour  38.273395  4.126064   0.581611  69.038901
Processing dataset 3
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour  15.021056  2.554344   0.712718  45.213232
1  2 hour  20.300138  2.931983   0.610523  54.447758
2  3 hour  24.053004  3.206057   0.535977  55.373880
Processing dataset 4
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour  15.060087  2.678392   0.743154  55.765062
1  2 hour  19.527535  3.066728   0.656443  58.838470
2  3 hour  23.843885  3.430031   0.578808  69.372989
Average 

In [94]:
combined_predictions_elastic_net, average_metrics_elastic_net = process_seasonal_datasets(
    defs, features, predict_pm25_elastic_net, model_name='elastic_net', output_folder=output_folder, alpha=1.0, l1_ratio=0.01, test_size=0.2, random_state=42
)


Processing dataset 1
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour  20.409264  3.006647   0.764752  57.605238
1  2 hour  27.924342  3.515849   0.686502  63.243496
2  3 hour  32.241645  3.837287   0.633877  68.867155
Processing dataset 2
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour  23.451182  3.151607   0.743017  49.662709
1  2 hour  30.944937  3.609380   0.663661  56.941247
2  3 hour  36.945827  3.971703   0.596123  61.092844
Processing dataset 3
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour  14.919752  2.455110   0.714655  38.804580
1  2 hour  19.793294  2.817567   0.620247  48.774235
2  3 hour  23.529884  3.095322   0.546069  50.030429
Processing dataset 4
Metrics Table:
   Target        MSE      MAE  R-squared       MAPE
0  1 hour  14.650955  2.55068   0.750132  45.465605
1  2 hour  18.859502  2.93910   0.668196  50.421694
2  3 hour  22.865832  3.28603   0.596085  60.885658
Average Metr

In [32]:
output_folder = 'results'

In [34]:
combined_predictions_bidirectional_lstm, average_metrics_bidirectional_lstm = process_seasonal_datasets(
    defs, features, predict_pm25_bidirectional_lstm, model_name='bidirectional_lstm', output_folder=output_folder, epochs=10, batch_size=32
)


Processing dataset 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date and time'] = pd.to_datetime(df['Date and time'])



Epoch 1/10

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   9.144235  1.942700   0.894599  39.494316
1  2 hour  16.473720  2.696261   0.815055  51.073656
2  3 hour  21.282574  3.179000   0.758323  60.602687
Processing dataset 2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date and time'] = pd.to_datetime(df['Date and time'])


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour  12.337785  2.121231   0.864800  32.402665
1  2 hour  20.239269  2.787096   0.780020  42.894916
2  3 hour  27.940279  3.347739   0.694568  51.222729
Processing dataset 3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date and time'] = pd.to_datetime(df['Date and time'])


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   8.501389  1.811570   0.837408  29.534780
1  2 hour  14.372674  2.398350   0.724247  42.858568
2  3 hour  18.539793  2.776714   0.642336  46.246271
Processing dataset 4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date and time'] = pd.to_datetime(df['Date and time'])


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   7.442004  1.782347   0.873079  31.710003
1  2 hour  12.704652  2.380347   0.776481  39.171306
2  3 hour  17.135144  2.825033   0.697315  50.326380
Average Metrics Table for bidirectional_lstm:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   9.356353  1.914462   0.867471  33.285441
1  2 hour  15.947579  2.565514   0.773951  43.999611
2  3 hour  21.224448  3.032122   0.698136  52.099517


In [35]:
combined_predictions_cnn_lstm, average_metrics_cnn_lstm = process_seasonal_datasets(
    defs, features, predict_pm25_cnn_lstm, model_name='cnn_lstm', output_folder=output_folder, epochs=50, batch_size=64
)


Processing dataset 1

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50


In [33]:
cor_Hull = hull_df [['PM10','PM2.5', 'Visibility', 'Ozone', 'Nitrogen dioxide', 'Nitrogen oxides', 'Nitric oxide', 'Date and time', '1 hour', '2 hour', '3 hour' ]]

In [34]:
features = ['PM10','PM2.5', 'Visibility', 'Ozone', 'Nitrogen dioxide', 'Nitrogen oxides', 'Nitric oxide']

In [35]:
def clear_model_memory():
    #Clear the memory of the model
  
    # Clear memory for scikit-learn
    gc.collect()
    
    # Clear memory for Keras/TensorFlow
    tf.keras.backend.clear_session()
    gc.collect()

In [40]:
predict_pm25(cor_Hull,features, output_folder='corr' )

Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   8.090371  1.824271   0.887930  28.437669
1  2 hour  14.459092  2.458598   0.798234  39.050018
2  3 hour  18.922358  2.884749   0.732523  47.678585


(            PM10   PM2.5  Visibility     Ozone  Nitrogen dioxide  \
 23      6.725407   1.100         7.0  6.071917           9.09728   
 0      19.559712  14.000         8.0  4.184318          13.99823   
 1      21.149820  12.200        13.0  3.499127          25.92585   
 2      24.309648  16.400        14.0  3.906583          18.59572   
 3      18.785991  12.700        14.0  3.891615          17.11879   
 ...          ...     ...         ...       ...               ...   
 70120  14.750000  11.392        10.0  4.121121          25.51641   
 70121   7.175000   5.259         8.0  4.505293          20.06227   
 70122   7.700000   5.590         6.0  4.270798          25.51644   
 70123   7.825000   5.991         8.0  3.836733          28.21999   
 70124   8.600000   6.722        10.0  3.487486          27.12930   
 
        Nitrogen oxides  Nitric oxide       Date and time  1 hour  2 hour  \
 23            11.23498       1.39417 2015-01-01 00:00:00   4.400   2.700   
 0             1

In [41]:
predict_pm25_ridge(cor_Hull, features, alpha=1.0, output_folder="corr")

Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   8.090370  1.824271   0.887930  28.437673
1  2 hour  14.459091  2.458598   0.798234  39.050022
2  3 hour  18.922357  2.884749   0.732523  47.678589


(            PM10   PM2.5  Visibility     Ozone  Nitrogen dioxide  \
 23      6.725407   1.100         7.0  6.071917           9.09728   
 0      19.559712  14.000         8.0  4.184318          13.99823   
 1      21.149820  12.200        13.0  3.499127          25.92585   
 2      24.309648  16.400        14.0  3.906583          18.59572   
 3      18.785991  12.700        14.0  3.891615          17.11879   
 ...          ...     ...         ...       ...               ...   
 70120  14.750000  11.392        10.0  4.121121          25.51641   
 70121   7.175000   5.259         8.0  4.505293          20.06227   
 70122   7.700000   5.590         6.0  4.270798          25.51644   
 70123   7.825000   5.991         8.0  3.836733          28.21999   
 70124   8.600000   6.722        10.0  3.487486          27.12930   
 
        Nitrogen oxides  Nitric oxide       Date and time  1 hour  2 hour  \
 23            11.23498       1.39417 2015-01-01 00:00:00   4.400   2.700   
 0             1

In [42]:
predict_pm25_random_forest(cor_Hull,features, n_estimators=100, random_state=42, output_folder="corr")

Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   7.974691  1.821894   0.889532  30.072499
1  2 hour  13.878677  2.423905   0.806333  40.911705
2  3 hour  18.299378  2.842385   0.741330  49.324268


(            PM10   PM2.5  Visibility     Ozone  Nitrogen dioxide  \
 23      6.725407   1.100         7.0  6.071917           9.09728   
 0      19.559712  14.000         8.0  4.184318          13.99823   
 1      21.149820  12.200        13.0  3.499127          25.92585   
 2      24.309648  16.400        14.0  3.906583          18.59572   
 3      18.785991  12.700        14.0  3.891615          17.11879   
 ...          ...     ...         ...       ...               ...   
 70120  14.750000  11.392        10.0  4.121121          25.51641   
 70121   7.175000   5.259         8.0  4.505293          20.06227   
 70122   7.700000   5.590         6.0  4.270798          25.51644   
 70123   7.825000   5.991         8.0  3.836733          28.21999   
 70124   8.600000   6.722        10.0  3.487486          27.12930   
 
        Nitrogen oxides  Nitric oxide       Date and time  1 hour  2 hour  \
 23            11.23498       1.39417 2015-01-01 00:00:00   4.400   2.700   
 0             1

In [43]:
predict_pm25_xgboost(cor_Hull,features, output_folder="corr",  n_estimators=100, learning_rate=0.01, max_depth=3, random_state=42)

Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour  17.161333  2.833664   0.762276  54.726192
1  2 hour  22.538939  3.246591   0.685485  62.534942
2  3 hour  26.336339  3.548427   0.627723  67.731656


(            PM10   PM2.5  Visibility     Ozone  Nitrogen dioxide  \
 23      6.725407   1.100         7.0  6.071917           9.09728   
 0      19.559712  14.000         8.0  4.184318          13.99823   
 1      21.149820  12.200        13.0  3.499127          25.92585   
 2      24.309648  16.400        14.0  3.906583          18.59572   
 3      18.785991  12.700        14.0  3.891615          17.11879   
 ...          ...     ...         ...       ...               ...   
 70120  14.750000  11.392        10.0  4.121121          25.51641   
 70121   7.175000   5.259         8.0  4.505293          20.06227   
 70122   7.700000   5.590         6.0  4.270798          25.51644   
 70123   7.825000   5.991         8.0  3.836733          28.21999   
 70124   8.600000   6.722        10.0  3.487486          27.12930   
 
        Nitrogen oxides  Nitric oxide       Date and time  1 hour  2 hour  \
 23            11.23498       1.39417 2015-01-01 00:00:00   4.400   2.700   
 0             1

In [44]:
predict_pm25_elastic_net(cor_Hull, features, output_folder='corr', alpha=1.0, l1_ratio=0.01, test_size=0.2, random_state=42)

Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour  17.063250  2.687249   0.763635  44.123561
1  2 hour  22.275829  3.108864   0.689156  52.662720
2  3 hour  26.279561  3.449634   0.628526  60.166215


(            PM10   PM2.5  Visibility     Ozone  Nitrogen dioxide  \
 23      6.725407   1.100         7.0  6.071917           9.09728   
 0      19.559712  14.000         8.0  4.184318          13.99823   
 1      21.149820  12.200        13.0  3.499127          25.92585   
 2      24.309648  16.400        14.0  3.906583          18.59572   
 3      18.785991  12.700        14.0  3.891615          17.11879   
 ...          ...     ...         ...       ...               ...   
 70120  14.750000  11.392        10.0  4.121121          25.51641   
 70121   7.175000   5.259         8.0  4.505293          20.06227   
 70122   7.700000   5.590         6.0  4.270798          25.51644   
 70123   7.825000   5.991         8.0  3.836733          28.21999   
 70124   8.600000   6.722        10.0  3.487486          27.12930   
 
        Nitrogen oxides  Nitric oxide       Date and time  1 hour  2 hour  \
 23            11.23498       1.39417 2015-01-01 00:00:00   4.400   2.700   
 0             1

In [45]:
predict_pm25_bidirectional_lstm(cor_Hull,features,  output_folder="corr", epochs=10, batch_size=32)


Epoch 1/10

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   7.637055  1.791375   0.894209  29.373417
1  2 hour  13.910570  2.438100   0.805888  41.451726
2  3 hour  18.195869  2.843637   0.742793  49.518864


(            PM10   PM2.5  Visibility     Ozone  Nitrogen dioxide  \
 23      6.725407   1.100         7.0  6.071917           9.09728   
 0      19.559712  14.000         8.0  4.184318          13.99823   
 1      21.149820  12.200        13.0  3.499127          25.92585   
 2      24.309648  16.400        14.0  3.906583          18.59572   
 3      18.785991  12.700        14.0  3.891615          17.11879   
 ...          ...     ...         ...       ...               ...   
 70120  14.750000  11.392        10.0  4.121121          25.51641   
 70121   7.175000   5.259         8.0  4.505293          20.06227   
 70122   7.700000   5.590         6.0  4.270798          25.51644   
 70123   7.825000   5.991         8.0  3.836733          28.21999   
 70124   8.600000   6.722        10.0  3.487486          27.12930   
 
        Nitrogen oxides  Nitric oxide       Date and time  1 hour  2 hour  \
 23            11.23498       1.39417 2015-01-01 00:00:00   4.400   2.700   
 0             1

In [46]:
predict_pm25_cnn_lstm(cor_Hull, features, epochs=50, batch_size=64 , output_folder="corr")


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/

(            PM10   PM2.5  Visibility     Ozone  Nitrogen dioxide  \
 23      6.725407   1.100         7.0  6.071917           9.09728   
 0      19.559712  14.000         8.0  4.184318          13.99823   
 1      21.149820  12.200        13.0  3.499127          25.92585   
 2      24.309648  16.400        14.0  3.906583          18.59572   
 3      18.785991  12.700        14.0  3.891615          17.11879   
 ...          ...     ...         ...       ...               ...   
 70120  14.750000  11.392        10.0  4.121121          25.51641   
 70121   7.175000   5.259         8.0  4.505293          20.06227   
 70122   7.700000   5.590         6.0  4.270798          25.51644   
 70123   7.825000   5.991         8.0  3.836733          28.21999   
 70124   8.600000   6.722        10.0  3.487486          27.12930   
 
        Nitrogen oxides  Nitric oxide       Date and time  1 hour  2 hour  \
 23            11.23498       1.39417 2015-01-01 00:00:00   4.400   2.700   
 0             1

In [36]:
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [37]:
from tensorflow.keras.backend import clear_session

def predict_pm25_bidirectional_lstm(df, features, output_folder="results", epochs=10, batch_size=32, learning_rate=0.001, lstm_units=100, dropout_rate=0.2, optimizer_type='adam', early_stopping_patience=10, lr_schedule=False):
    clear_session()  # Clear the session at the start
    
    # Ensure 'Date and time' is of datetime type
    if not pd.api.types.is_datetime64_any_dtype(df['Date and time']):
        df['Date and time'] = pd.to_datetime(df['Date and time'])
    
    # Sorting by datetime to maintain order
    df = df.sort_values(by='Date and time')
    
    # Creating a 'time_index' column to represent the time step
    df['time_index'] = np.arange(len(df))
    
    # Preparing the dataset for '1 hour' target
    X = df[features]
    y = df['1 hour']
    
    # Standardize the features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Reshape input to be 3D [samples, timesteps, features] for LSTM
    X_scaled = X_scaled.reshape((X_scaled.shape[0], 1, X_scaled.shape[1]))
    
    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
    
    # Building the Bidirectional LSTM model
    model = Sequential()
    model.add(Bidirectional(LSTM(lstm_units, return_sequences=True), input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(Dropout(dropout_rate))
    model.add(Bidirectional(LSTM(lstm_units)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1))
    
    # Selecting the optimizer
    if optimizer_type == 'adam':
        optimizer = Adam(learning_rate=learning_rate)
    elif optimizer_type == 'sgd':
        optimizer = SGD(learning_rate=learning_rate)
    elif optimizer_type == 'rmsprop':
        optimizer = RMSprop(learning_rate=learning_rate)
    else:
        raise ValueError("Unsupported optimizer type. Choose 'adam', 'sgd', or 'rmsprop'.")
    
    # Compiling the model
    model.compile(optimizer=optimizer, loss='mean_squared_error')
    
    # Early stopping to avoid overfitting
    early_stopping = EarlyStopping(monitor='val_loss', patience=early_stopping_patience, restore_best_weights=True)
    
    # Learning rate schedule
    callbacks = [early_stopping]
    if lr_schedule:
        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.00001)
        callbacks.append(reduce_lr)
    
    # Training the model
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), verbose=1, shuffle=False, callbacks=callbacks)
    
    # Predicting the PM2.5 values for the test set
    y_pred = model.predict(X_test).flatten()
    
    # Calculating performance metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    
    # Creating a metrics table
    metrics_table = [{
        'Target': '1 hour',
        'MSE': mse,
        'MAE': mae,
        'R-squared': r2,
        'MAPE': mape
    }]
    
    # Predicting PM2.5 for each row in the DataFrame
    X_full_scaled = scaler.transform(X)
    X_full_scaled = X_full_scaled.reshape((X_full_scaled.shape[0], 1, X_full_scaled.shape[1]))
    df['PM2.5_pre_1_hour_bilstm'] = model.predict(X_full_scaled).flatten()
    
    # Convert metrics table to DataFrame for better visualization
    metrics_df = pd.DataFrame(metrics_table)
    
    # Save the metrics DataFrame and the updated dataset
    metrics_df.to_csv(f'{output_folder}/metrics_table_bilstm.csv', index=False)
    df.to_csv(f'{output_folder}/predicted_pm25_bilstm.csv', index=False)
    
    print("Metrics Table:")
    print(metrics_df)
    return df, metrics_df


In [26]:
param_grid = {
    'learning_rate': [0.01, 0.001, 0.0001],
    'batch_size': [16, 32, 64],
    'lstm_units': [50, 100, 150],
    'dropout_rate': [0.2, 0.3, 0.4],
    'optimizer_type': ['adam', 'sgd', 'rmsprop']
}

In [38]:
import itertools

# Create all possible combinations of parameters
combinations = list(itertools.product(param_grid['learning_rate'],
                                      param_grid['batch_size'],
                                      param_grid['lstm_units'],
                                      param_grid['dropout_rate'],
                                      param_grid['optimizer_type']))


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

best_score = -np.inf  # Initialize to a very low value because we want to maximize R-squared
best_params = None
results = []

for combination in combinations:
    lr, batch_size, lstm_units, dropout_rate, optimizer_type = combination
    
    print(f"Testing combination: LR={lr}, Batch={batch_size}, LSTM Units={lstm_units}, Dropout={dropout_rate}, Optimizer={optimizer_type}")
    
    try:
        df, metrics_df = predict_pm25_bidirectional_lstm(
            df=cor_Hull,
            features=features,
            epochs=10,  # Fixed number of epochs to reduce processing time or you can include it in the grid as well
            batch_size=batch_size,
            learning_rate=lr,
            lstm_units=lstm_units,
            dropout_rate=dropout_rate,
            optimizer_type=optimizer_type
        )
    except Exception as e:
        print(f"Error occurred during combination {combination}: {e}")
        continue
    
    # Assuming R-squared (R2) is the metric of interest
    r2_value = metrics_df['R-squared'].mean()  # or any other metric you prefer
    
    results.append((combination, r2_value))
    
    if r2_value > best_score:  # For metrics like R2 where higher is better
        best_score = r2_value
        best_params = combination

print(f"Best parameters: {best_params} with score: {best_score}")


Testing combination: LR=0.01, Batch=16, LSTM Units=50, Dropout=0.2, Optimizer=adam
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Metrics Table:
   Target       MSE       MAE  R-squared       MAPE
0  1 hour  8.192275  1.850996   0.886518  30.747143
Testing combination: LR=0.01, Batch=16, LSTM Units=50, Dropout=0.2, Optimizer=sgd
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Metrics Table:
   Target       MSE       MAE  R-squared       MAPE
0  1 hour  8.250823  1.866773   0.885707  30.673115
Testing combination: LR=0.01, Batch=16, LSTM Units=50, Dropout=0.2, Optimizer=rmsprop
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Metrics Table:
   Target       MSE       MAE  R-squared       MAPE
0  1 hour  8.687612  1.928998   0.879657  32.313743
Testing combination: LR=0.01, Batch=16, LSTM Units=50, Drop

In [35]:

param_grid = {
    'learning_rate': [ 0.001, 0.0001],
    'batch_size': [32, 64],
    'lstm_units': [50, 100, 150],
    'dropout_rate': [0.2, 0.3, 0.4],
    'optimizer_type': ['adam', 'sgd', 'rmsprop']
}

# Create all possible combinations of parameters
combinations = list(itertools.product(param_grid['learning_rate'],
                                      param_grid['batch_size'],
                                      param_grid['lstm_units'],
                                      param_grid['dropout_rate'],
                                      param_grid['optimizer_type']))

In [36]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

best_score = -np.inf  # Initialize to a very low value because we want to maximize R-squared
best_params = None
results = []

for combination in combinations:
    lr, batch_size, lstm_units, dropout_rate, optimizer_type = combination

    # Skip the combination if learning_rate=0.001, batch_size=32, and lstm_units is 50 or 100
    if lr == 0.001 and batch_size == 32 and lstm_units in [50, 100]:
        continue  # Skip to the next combination
    print(f"Testing combination: LR={lr}, Batch={batch_size}, LSTM Units={lstm_units}, Dropout={dropout_rate}, Optimizer={optimizer_type}")
    
    try:
        df, metrics_df = predict_pm25_bidirectional_lstm(
            df=cor_Hull,
            features=features,
            epochs=10,  # Fixed number of epochs to reduce processing time or you can include it in the grid as well
            batch_size=batch_size,
            learning_rate=lr,
            lstm_units=lstm_units,
            dropout_rate=dropout_rate,
            optimizer_type=optimizer_type
        )
    except Exception as e:
        print(f"Error occurred during combination {combination}: {e}")
        continue
    
    # Assuming R-squared (R2) is the metric of interest
    r2_value = metrics_df['R-squared'].mean()  # or any other metric you prefer
    
    results.append((combination, r2_value))
    
    if r2_value > best_score:  # For metrics like R2 where higher is better
        best_score = r2_value
        best_params = combination

print(f"Best parameters: {best_params} with score: {best_score}")

Testing combination: LR=0.001, Batch=32, LSTM Units=150, Dropout=0.2, Optimizer=adam
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Metrics Table:
   Target       MSE       MAE  R-squared       MAPE
0  1 hour  7.689388  1.796869   0.893484  29.795302
Testing combination: LR=0.001, Batch=32, LSTM Units=150, Dropout=0.2, Optimizer=sgd
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Metrics Table:
   Target       MSE       MAE  R-squared       MAPE
0  1 hour  7.811844  1.813069   0.891788  30.799336
Testing combination: LR=0.001, Batch=32, LSTM Units=150, Dropout=0.2, Optimizer=rmsprop
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Metrics Table:
   Target       MSE       MAE  R-squared       MAPE
0  1 hour  8.224254  1.863667   0.886075  30.399374
Testing combination: LR=0.001, Batch=32, LSTM Units=1

In [37]:
# Sort results by score (ascending for errors, descending for metrics like R-squared)
sorted_results = sorted(results, key=lambda x: x[1], reverse=True)

for result in sorted_results:
    params, score = result
    print(f"Params: {params}, Score: {score}")


Params: (0.001, 64, 100, 0.2, 'adam'), Score: 0.8937515453897599
Params: (0.001, 64, 50, 0.3, 'adam'), Score: 0.8937282183770737
Params: (0.001, 32, 150, 0.3, 'adam'), Score: 0.8937236616049815
Params: (0.001, 64, 150, 0.2, 'adam'), Score: 0.8936010214939221
Params: (0.001, 32, 150, 0.2, 'adam'), Score: 0.8934843617598058
Params: (0.001, 64, 150, 0.3, 'adam'), Score: 0.8934473784058372
Params: (0.001, 64, 50, 0.2, 'adam'), Score: 0.8934168451605751
Params: (0.001, 64, 150, 0.4, 'adam'), Score: 0.8933839577637209
Params: (0.001, 64, 50, 0.4, 'adam'), Score: 0.8932918668429888
Params: (0.001, 64, 100, 0.3, 'adam'), Score: 0.8932537000210534
Params: (0.001, 64, 100, 0.4, 'adam'), Score: 0.8930947510423355
Params: (0.001, 32, 150, 0.4, 'adam'), Score: 0.8922809344678473
Params: (0.0001, 32, 150, 0.2, 'adam'), Score: 0.8919502157250294
Params: (0.001, 32, 150, 0.2, 'sgd'), Score: 0.8917880657352533
Params: (0.001, 64, 100, 0.4, 'rmsprop'), Score: 0.8917252997906564
Params: (0.001, 32, 150, 

In [1]:
def fine_tune_lstm_model(df, features, combinations, epochs=10):
    best_score = -np.inf  # Initialize to a very low value because we want to maximize R-squared
    best_params = None
    results = []

    for combination in combinations:
        lr, batch_size, lstm_units, dropout_rate, optimizer_type = combination

        print(f"Testing combination: LR={lr}, Batch={batch_size}, LSTM Units={lstm_units}, Dropout={dropout_rate}, Optimizer={optimizer_type}")

        try:
            df, metrics_df = predict_pm25_bidirectional_lstm(
                df=df,  # Pass the dataset dynamically
                features=features,  # Pass the features dynamically
                epochs=epochs,  # Fixed number of epochs to reduce processing time or you can include it in the grid as well
                batch_size=batch_size,
                learning_rate=lr,
                lstm_units=lstm_units,
                dropout_rate=dropout_rate,
                optimizer_type=optimizer_type
            )
        except Exception as e:
            print(f"Error occurred during combination {combination}: {e}")
            continue

        # Assuming R-squared (R2) is the metric of interest
        r2_value = metrics_df['R-squared'].mean()  # or any other metric you prefer

        results.append((combination, r2_value))

        if r2_value > best_score:  # For metrics like R2 where higher is better
            best_score = r2_value
            best_params = combination

    print(f"Best parameters: {best_params} with score: {best_score}")
    return best_params, best_score, results

In [41]:
param_grid = {
    'learning_rate': [ 0.001],
    'batch_size': [32],
    'lstm_units': [100],
    'dropout_rate': [0.2],
    'optimizer_type': ['adam']
}
    # Create all possible combinations of parameters
combinations = list(itertools.product(param_grid['learning_rate'],
                                      param_grid['batch_size'],
                                      param_grid['lstm_units'],
                                      param_grid['dropout_rate'],
                                      param_grid['optimizer_type']))

fine_tune_lstm_model(cor_Hull, features, combinations, epochs=50)

Testing combination: LR=0.001, Batch=32, LSTM Units=100, Dropout=0.2, Optimizer=adam
Epoch 1/50

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Metrics Table:
   Target       MSE       MAE  R-squared      MAPE
0  1 hour  7.552503  1.775688   0.895381  29.59045
Best parameters: (0.001, 32, 100, 0.2, 'adam') with score: 0.8953805333305394


((0.001, 32, 100, 0.2, 'adam'),
 0.8953805333305394,
 [((0.001, 32, 100, 0.2, 'adam'), 0.8953805333305394)])

In [43]:
 features = ['Nitric oxide', 'Nitrogen dioxide', 'Nitrogen oxides', 'PM10','PM2.5','Temperature', 'Wind Speed',
              'Humidity', 'Wind Cardinal Direction', 'Maximum Gust Speed', 'Total Cloud Amount',
           'Ozone', 'Hourly Total Radiation','Cloud Base Height','hour', 'season']

fine_tune_lstm_model(hull_df, features, combinations, epochs=50)

Testing combination: LR=0.001, Batch=32, LSTM Units=100, Dropout=0.2, Optimizer=adam
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Metrics Table:
   Target       MSE      MAE  R-squared       MAPE
0  1 hour  7.573015  1.76957   0.895096  29.156962
Best parameters: (0.001, 32, 100, 0.2, 'adam') with score: 0.8950963973944525


((0.001, 32, 100, 0.2, 'adam'),
 0.8950963973944525,
 [((0.001, 32, 100, 0.2, 'adam'), 0.8950963973944525)])

In [63]:
fine_tune_lstm_model(defs[0], features, combinations, epochs=50)

Testing combination: LR=0.001, Batch=32, LSTM Units=100, Dropout=0.2, Optimizer=adam


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date and time'] = pd.to_datetime(df['Date and time'])


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Metrics Table:
   Target       MSE       MAE  R-squared       MAPE
0  1 hour  9.095455  1.936007   0.895161  38.862851
Best parameters: (0.001, 32, 100, 0.2, 'adam') with score: 0.8951611798036831


((0.001, 32, 100, 0.2, 'adam'),
 0.8951611798036831,
 [((0.001, 32, 100, 0.2, 'adam'), 0.8951611798036831)])

In [64]:
fine_tune_lstm_model(defs[1], features, combinations, epochs=50)

Testing combination: LR=0.001, Batch=32, LSTM Units=100, Dropout=0.2, Optimizer=adam


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date and time'] = pd.to_datetime(df['Date and time'])


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour  12.114228  2.116969    0.86725  33.040198
Best parameters: (0.001, 32, 100, 0.2, 'adam') with score: 0.8672495369008246


((0.001, 32, 100, 0.2, 'adam'),
 0.8672495369008246,
 [((0.001, 32, 100, 0.2, 'adam'), 0.8672495369008246)])

In [65]:
fine_tune_lstm_model(defs[2], features, combinations, epochs=50)

Testing combination: LR=0.001, Batch=32, LSTM Units=100, Dropout=0.2, Optimizer=adam


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date and time'] = pd.to_datetime(df['Date and time'])


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Metrics Table:
   Target       MSE       MAE  R-squared       MAPE
0  1 hour  8.268374  1.789526   0.841865  29.431063
Best parameters: (0.001, 32, 100, 0.2, 'adam') with score: 0.8418648391538524


((0.001, 32, 100, 0.2, 'adam'),
 0.8418648391538524,
 [((0.001, 32, 100, 0.2, 'adam'), 0.8418648391538524)])

In [66]:
fine_tune_lstm_model(defs[3], features, combinations, epochs=50)

Testing combination: LR=0.001, Batch=32, LSTM Units=100, Dropout=0.2, Optimizer=adam


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date and time'] = pd.to_datetime(df['Date and time'])


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Metrics Table:
   Target       MSE       MAE  R-squared       MAPE
0  1 hour  7.344354  1.768471   0.874744  31.739631
Best parameters: (0.001, 32, 100, 0.2, 'adam') with score: 0.8747439578221069


((0.001, 32, 100, 0.2, 'adam'),
 0.8747439578221069,
 [((0.001, 32, 100, 0.2, 'adam'), 0.8747439578221069)])

In [45]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


In [48]:

def fine_tune_xgboost(df, features, param_grid=None, random_search=False, n_iter=10, cv=5, output_folder="results"):
    # Ensure 'Date and time' is of datetime type
    if not pd.api.types.is_datetime64_any_dtype(df['Date and time']):
        df['Date and time'] = pd.to_datetime(df['Date and time'])
    
    # Sorting by datetime to maintain order
    df = df.sort_values(by='Date and time')
    
    # Creating a 'time_index' column to represent the time step
    df['time_index'] = np.arange(len(df))
    
    # Prepare the dataset for "1 hour" target
    X = df[features]
    y = df['1 hour']
    
    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Default parameter grid
    if param_grid is None:
        param_grid = {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7]
        }
    
    # Select the model tuning method
    if random_search:
        search = RandomizedSearchCV(
            xgb.XGBRegressor(random_state=42), 
            param_distributions=param_grid,
            n_iter=n_iter, 
            cv=cv, 
            scoring='r2', 
            n_jobs=-1, 
            verbose=1, 
            random_state=42
        )
    else:
        search = GridSearchCV(
            xgb.XGBRegressor(random_state=42), 
            param_grid=param_grid,
            cv=cv, 
            scoring='r2', 
            n_jobs=-1, 
            verbose=1
        )
    
    # Perform the search
    search.fit(X_train, y_train)
    
    # Save search results
    results_df = pd.DataFrame(search.cv_results_)
    os.makedirs(output_folder, exist_ok=True)
    results_df.to_csv(f'{output_folder}/xgboost_tuning_results.csv', index=False)
    
    # Get the best model
    best_model = search.best_estimator_
    
    # Predicting the PM2.5 values for the test set
    y_pred = best_model.predict(X_test)
    
    # Calculating performance metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    
    # Save the best model metrics
    metrics_table = [{
        'Target': '1 hour',
        'MSE': mse,
        'MAE': mae,
        'R-squared': r2,
        'MAPE': mape
    }]
    
    # Predicting PM2.5 for each row in the DataFrame
    df['PM2.5_pre_1_hour_xgboost'] = best_model.predict(X)
    
    # Convert metrics table to DataFrame for better visualization
    metrics_df = pd.DataFrame(metrics_table)
    
    # Save the metrics DataFrame and the updated dataset
    metrics_df.to_csv(f'{output_folder}/metrics_table_xgboost_1_hour.csv', index=False)
    df.to_csv(f'{output_folder}/predicted_pm25_xgboost_1_hour.csv', index=False)
    
    print("Best Parameters:", search.best_params_)
    print("Metrics Table:")
    print(metrics_df)
    return df, metrics_df, search.best_params_


In [49]:
fine_tune_xgboost(cor_Hull, features, param_grid=None, random_search=False, n_iter=10, cv=5, output_folder="results")

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Metrics Table:
   Target       MSE       MAE  R-squared      MAPE
0  1 hour  7.622923  1.773977   0.894405  29.46145


(            PM10   PM2.5  Visibility     Ozone  Nitrogen dioxide  \
 23      6.725407   1.100         7.0  6.071917           9.09728   
 0      19.559712  14.000         8.0  4.184318          13.99823   
 1      21.149820  12.200        13.0  3.499127          25.92585   
 2      24.309648  16.400        14.0  3.906583          18.59572   
 3      18.785991  12.700        14.0  3.891615          17.11879   
 ...          ...     ...         ...       ...               ...   
 70120  14.750000  11.392        10.0  4.121121          25.51641   
 70121   7.175000   5.259         8.0  4.505293          20.06227   
 70122   7.700000   5.590         6.0  4.270798          25.51644   
 70123   7.825000   5.991         8.0  3.836733          28.21999   
 70124   8.600000   6.722        10.0  3.487486          27.12930   
 
        Nitrogen oxides  Nitric oxide       Date and time  1 hour  2 hour  \
 23            11.23498       1.39417 2015-01-01 00:00:00   4.400   2.700   
 0             1

In [51]:

def fine_tune_xgboost(df, features, param_grid=None, random_search=False, n_iter=10, cv=5, output_folder="results"):
    # Ensure 'Date and time' is of datetime type
    if not pd.api.types.is_datetime64_any_dtype(df['Date and time']):
        df['Date and time'] = pd.to_datetime(df['Date and time'])
    
    # Sorting by datetime to maintain order
    df = df.sort_values(by='Date and time')
    
    # Creating a 'time_index' column to represent the time step
    df['time_index'] = np.arange(len(df))
    
    # Prepare the dataset for "1 hour" target
    X = df[features]
    y = df['1 hour']
    
    # Splitting the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Default parameter grid
    if param_grid is None:
        param_grid = {
            'n_estimators': [200],
            'learning_rate': [0.1],
            'max_depth': [3]
        }
    
    # Select the model tuning method
    if random_search:
        search = RandomizedSearchCV(
            xgb.XGBRegressor(random_state=42), 
            param_distributions=param_grid,
            n_iter=n_iter, 
            cv=cv, 
            scoring='r2', 
            n_jobs=-1, 
            verbose=1, 
            random_state=42
        )
    else:
        search = GridSearchCV(
            xgb.XGBRegressor(random_state=42), 
            param_grid=param_grid,
            cv=cv, 
            scoring='r2', 
            n_jobs=-1, 
            verbose=1
        )
    
    # Perform the search
    search.fit(X_train, y_train)
    
    # Save search results
    results_df = pd.DataFrame(search.cv_results_)
    os.makedirs(output_folder, exist_ok=True)
    results_df.to_csv(f'{output_folder}/xgboost_tuning_results.csv', index=False)
    
    # Get the best model
    best_model = search.best_estimator_
    
    # Predicting the PM2.5 values for the test set
    y_pred = best_model.predict(X_test)
    
    # Calculating performance metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    
    # Save the best model metrics
    metrics_table = [{
        'Target': '1 hour',
        'MSE': mse,
        'MAE': mae,
        'R-squared': r2,
        'MAPE': mape
    }]
    
    # Predicting PM2.5 for each row in the DataFrame
    df['PM2.5_pre_1_hour_xgboost'] = best_model.predict(X)
    
    # Convert metrics table to DataFrame for better visualization
    metrics_df = pd.DataFrame(metrics_table)
    
    # Save the metrics DataFrame and the updated dataset
    metrics_df.to_csv(f'{output_folder}/metrics_table_xgboost_1_hour.csv', index=False)
    df.to_csv(f'{output_folder}/predicted_pm25_xgboost_1_hour.csv', index=False)
    
    print("Best Parameters:", search.best_params_)
    print("Metrics Table:")
    print(metrics_df)
    return df, metrics_df, search.best_params_


In [53]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [54]:

fine_tune_xgboost(hull_df, features, param_grid=None, random_search=False, n_iter=50, cv=5, output_folder="results")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Metrics Table:
   Target      MSE       MAE  R-squared       MAPE
0  1 hour  7.66681  1.769174   0.893797  29.135225


(             Date      time     Ozone  Nitric oxide  Nitrogen dioxide  \
 23     2015-01-01  00:00:00  6.071917       1.39417           9.09728   
 0      2015-01-01  01:00:00  4.184318       1.91668          13.99823   
 1      2015-01-01  02:00:00  3.499127       2.96234          25.92585   
 2      2015-01-01  03:00:00  3.906583       1.56817          18.59572   
 3      2015-01-01  04:00:00  3.891615       1.69882          17.11879   
 ...           ...       ...       ...           ...               ...   
 70120  2022-12-31  17:00:00  4.121121       3.83409          25.51641   
 70121  2022-12-31  18:00:00  4.505293       3.24661          20.06227   
 70122  2022-12-31  19:00:00  4.270798       3.89593          25.51644   
 70123  2022-12-31  20:00:00  3.836733       4.57617          28.21999   
 70124  2022-12-31  21:00:00  3.487486       4.76169          27.12930   
 
        Nitrogen oxides  Sulphur dioxide       PM10   PM2.5  Wind Direction  \
 23            11.23498        

In [67]:
fine_tune_xgboost(defs[0], features, param_grid=None, random_search=False, n_iter=50, cv=5, output_folder="results")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Metrics Table:
   Target       MSE       MAE  R-squared       MAPE
0  1 hour  9.200085  1.954161   0.893955  39.887956


(             Date      time     Ozone  Nitric oxide  Nitrogen dioxide  \
 23     2015-01-01  00:00:00  6.071917       1.39417           9.09728   
 0      2015-01-01  01:00:00  4.184318       1.91668          13.99823   
 1      2015-01-01  02:00:00  3.499127       2.96234          25.92585   
 2      2015-01-01  03:00:00  3.906583       1.56817          18.59572   
 3      2015-01-01  04:00:00  3.891615       1.69882          17.11879   
 ...           ...       ...       ...           ...               ...   
 17320  2022-12-31  17:00:00  4.121121       3.83409          25.51641   
 17321  2022-12-31  18:00:00  4.505293       3.24661          20.06227   
 17322  2022-12-31  19:00:00  4.270798       3.89593          25.51644   
 17323  2022-12-31  20:00:00  3.836733       4.57617          28.21999   
 17324  2022-12-31  21:00:00  3.487486       4.76169          27.12930   
 
        Nitrogen oxides       PM10   PM2.5  Wind Direction  Wind Speed  ...  \
 23            11.23498   6.725

In [68]:
fine_tune_xgboost(defs[1], features, param_grid=None, random_search=False, n_iter=50, cv=5, output_folder="results")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Metrics Table:
   Target        MSE       MAE  R-squared      MAPE
0  1 hour  12.063746  2.092408   0.867803  33.34812


(             Date      time     Ozone  Nitric oxide  Nitrogen dioxide  \
 23     2015-03-01  00:00:00  3.891615       2.39918          25.72743   
 0      2015-03-01  01:00:00  3.991400       3.72236          25.57268   
 1      2015-03-01  02:00:00  3.918224       3.37353          27.21679   
 2      2015-03-01  03:00:00  4.305723       2.66143          23.85413   
 3      2015-03-01  04:00:00  3.053421       6.36903          33.12832   
 ...           ...       ...       ...           ...               ...   
 17154  2022-05-31  19:00:00  5.972132       0.93909          11.68299   
 17155  2022-05-31  20:00:00  5.448261       1.15821          15.33676   
 17156  2022-05-31  21:00:00  5.822455       0.43824          11.20115   
 17157  2022-05-31  22:00:00  5.488175       0.40694          12.88350   
 17158  2022-05-31  23:00:00  5.343487       0.62606          11.92262   
 
        Nitrogen oxides       PM10  PM2.5  Wind Direction  Wind Speed  ...  \
 23            29.40612  16.5149

In [69]:
fine_tune_xgboost(defs[2], features, param_grid=None, random_search=False, n_iter=50, cv=5, output_folder="results")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Metrics Table:
   Target       MSE     MAE  R-squared       MAPE
0  1 hour  8.550319  1.7913   0.836473  28.437004


(             Date      time     Ozone  Nitric oxide  Nitrogen dioxide  \
 23     2015-06-01  00:00:00  1.746238      10.40467          42.66931   
 0      2015-06-01  01:00:00  3.998052       0.93257          10.87129   
 1      2015-06-01  02:00:00  5.135601       0.29104           7.32743   
 2      2015-06-01  03:00:00  5.218756       0.39321           5.86803   
 3      2015-06-01  04:00:00  4.764734       0.52480           8.22901   
 ...           ...       ...       ...           ...               ...   
 17370  2022-08-31  19:00:00  6.515961       0.68652           5.15338   
 17371  2022-08-31  20:00:00  6.361294       0.74893           4.81526   
 17372  2022-08-31  21:00:00  5.418326       0.68652           9.12913   
 17373  2022-08-31  22:00:00  5.448261       0.49929           7.13743   
 17374  2022-08-31  23:00:00  5.867358       0.49929           5.14956   
 
        Nitrogen oxides       PM10   PM2.5  Wind Direction  Wind Speed  ...  \
 23            58.62291  20.056

In [71]:
fine_tune_xgboost(defs[3], features, param_grid=None, random_search=False, n_iter=50, cv=5, output_folder="results")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Metrics Table:
   Target       MSE       MAE  R-squared       MAPE
0  1 hour  7.284884  1.748536   0.875758  31.268448


(             Date      time     Ozone  Nitric oxide  Nitrogen dioxide  \
 23     2015-09-01  00:00:00  5.762584       0.43843           4.91417   
 0      2015-09-01  01:00:00  4.523587       2.86214          22.40940   
 1      2015-09-01  02:00:00  5.355128       2.97897          19.83326   
 2      2015-09-01  03:00:00  6.850240       0.87592          10.72817   
 3      2015-09-01  04:00:00  7.364133       0.17462           5.44728   
 ...           ...       ...       ...           ...               ...   
 17394  2022-11-30  19:00:00  0.374194      18.87694          44.23362   
 17395  2022-11-30  20:00:00  0.783312      15.56735          38.90137   
 17396  2022-11-30  21:00:00  0.354237      19.64305          41.45788   
 17397  2022-11-30  22:00:00  0.274409      22.40104          38.17081   
 17398  2022-11-30  23:00:00  0.962925      20.71560          41.50856   
 
        Nitrogen oxides       PM10   PM2.5  Wind Direction  Wind Speed  ...  \
 23             5.58641   9.853

In [72]:
features = ['PM10','PM2.5', 'Visibility', 'Ozone', 'Nitrogen dioxide', 'Nitrogen oxides', 'Nitric oxide']
fine_tune_xgboost(cor_Hull, features, param_grid=None, random_search=False, n_iter=50, cv=5, output_folder="results")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
Metrics Table:
   Target       MSE       MAE  R-squared      MAPE
0  1 hour  7.622923  1.773977   0.894405  29.46145


(            PM10   PM2.5  Visibility     Ozone  Nitrogen dioxide  \
 23      6.725407   1.100         7.0  6.071917           9.09728   
 0      19.559712  14.000         8.0  4.184318          13.99823   
 1      21.149820  12.200        13.0  3.499127          25.92585   
 2      24.309648  16.400        14.0  3.906583          18.59572   
 3      18.785991  12.700        14.0  3.891615          17.11879   
 ...          ...     ...         ...       ...               ...   
 70120  14.750000  11.392        10.0  4.121121          25.51641   
 70121   7.175000   5.259         8.0  4.505293          20.06227   
 70122   7.700000   5.590         6.0  4.270798          25.51644   
 70123   7.825000   5.991         8.0  3.836733          28.21999   
 70124   8.600000   6.722        10.0  3.487486          27.12930   
 
        Nitrogen oxides  Nitric oxide       Date and time  1 hour  2 hour  \
 23            11.23498       1.39417 2015-01-01 00:00:00   4.400   2.700   
 0             1

In [73]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}


In [74]:
from sklearn.model_selection import GridSearchCV

def fine_tune_random_forest(df, features, param_grid, random_state=42, cv=5, output_folder="results"):
    # Ensure 'Date and time' is of datetime type
    if not pd.api.types.is_datetime64_any_dtype(df['Date and time']):
        df['Date and time'] = pd.to_datetime(df['Date and time'])
    
    # Sorting by datetime to maintain order
    df = df.sort_values(by='Date and time')
    
    metrics_table = []
    
    # List of target columns
    targets = ['1 hour']

    # Prepare the dataset for one target (e.g., '1 hour')
    X = df[features]
    y = df[targets[0]]  # You can loop over targets to fine-tune separately for each target
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
    
    # Initialize Random Forest Regressor
    rf = RandomForestRegressor(random_state=random_state)
    
    # Perform grid search
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=cv, n_jobs=-1, verbose=2, scoring='r2')
    grid_search.fit(X_train, y_train)
    
    # Get the best model
    best_rf = grid_search.best_estimator_
    
    # Predict on the test set
    y_pred = best_rf.predict(X_test)
    
    # Calculate performance metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    
    # Store the metrics
    metrics_table.append({
        'Target': targets[0],
        'MSE': mse,
        'MAE': mae,
        'R-squared': r2,
        'MAPE': mape
    })
    
    # Predicting PM2.5 for each row in the DataFrame
    df[f'PM2.5_pre_{targets[0]}_rf'] = best_rf.predict(X)
    
    # Convert metrics table to DataFrame for better visualization
    metrics_df = pd.DataFrame(metrics_table)
    
    # Save the metrics DataFrame and the updated dataset
    metrics_df.to_csv(f'{output_folder}/metrics_table_rf.csv', index=False)
    df.to_csv(f'{output_folder}/predicted_pm25_rf.csv', index=False)
    
    print("Best parameters found:", grid_search.best_params_)
    print("Metrics Table:")
    print(metrics_df)
    
    return df, metrics_df, grid_search.best_params_

# Example usage
# best_df, metrics_df, best_params = fine_tune_random_forest(your_dataset, your_features, param_grid)


In [76]:
fine_tune_random_forest(cor_Hull, features, param_grid, random_state=42, cv=5, output_folder="results")

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


540 fits failed out of a total of 1620.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
340 fits failed with the following error:
Traceback (most recent call last):
  File "D:\ai\Lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\ai\Lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "D:\ai\Lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "D:\ai\Lib\site-packages\sklearn\utils\_param_validation.py", line 96, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.InvalidParameterError: The 'max_f

Best parameters found: {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Metrics Table:
   Target       MSE       MAE  R-squared       MAPE
0  1 hour  7.587582  1.783845   0.894895  29.729785


(            PM10   PM2.5  Visibility     Ozone  Nitrogen dioxide  \
 23      6.725407   1.100         7.0  6.071917           9.09728   
 0      19.559712  14.000         8.0  4.184318          13.99823   
 1      21.149820  12.200        13.0  3.499127          25.92585   
 2      24.309648  16.400        14.0  3.906583          18.59572   
 3      18.785991  12.700        14.0  3.891615          17.11879   
 ...          ...     ...         ...       ...               ...   
 70120  14.750000  11.392        10.0  4.121121          25.51641   
 70121   7.175000   5.259         8.0  4.505293          20.06227   
 70122   7.700000   5.590         6.0  4.270798          25.51644   
 70123   7.825000   5.991         8.0  3.836733          28.21999   
 70124   8.600000   6.722        10.0  3.487486          27.12930   
 
        Nitrogen oxides  Nitric oxide       Date and time  1 hour  2 hour  \
 23            11.23498       1.39417 2015-01-01 00:00:00   4.400   2.700   
 0             1

In [78]:
param_grid = {
    'n_estimators': [ 300],
    'max_depth': [ 30],
    'min_samples_split': [ 10],
    'min_samples_leaf': [ 4],
    'max_features': [ 'sqrt']}

In [79]:
 features = ['Nitric oxide', 'Nitrogen dioxide', 'Nitrogen oxides', 'PM10','PM2.5','Temperature', 'Wind Speed',
              'Humidity', 'Wind Cardinal Direction', 'Maximum Gust Speed', 'Total Cloud Amount',
           'Ozone', 'Hourly Total Radiation','Cloud Base Height','hour', 'season']

In [80]:
fine_tune_random_forest(hull_df, features, param_grid, random_state=42, cv=5, output_folder="results")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best parameters found: {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Metrics Table:
   Target       MSE       MAE  R-squared      MAPE
0  1 hour  7.654413  1.786827   0.893969  29.76793


(             Date      time     Ozone  Nitric oxide  Nitrogen dioxide  \
 23     2015-01-01  00:00:00  6.071917       1.39417           9.09728   
 0      2015-01-01  01:00:00  4.184318       1.91668          13.99823   
 1      2015-01-01  02:00:00  3.499127       2.96234          25.92585   
 2      2015-01-01  03:00:00  3.906583       1.56817          18.59572   
 3      2015-01-01  04:00:00  3.891615       1.69882          17.11879   
 ...           ...       ...       ...           ...               ...   
 70120  2022-12-31  17:00:00  4.121121       3.83409          25.51641   
 70121  2022-12-31  18:00:00  4.505293       3.24661          20.06227   
 70122  2022-12-31  19:00:00  4.270798       3.89593          25.51644   
 70123  2022-12-31  20:00:00  3.836733       4.57617          28.21999   
 70124  2022-12-31  21:00:00  3.487486       4.76169          27.12930   
 
        Nitrogen oxides  Sulphur dioxide       PM10   PM2.5  Wind Direction  \
 23            11.23498        

In [81]:
fine_tune_random_forest(defs[0], features, param_grid, random_state=42, cv=5, output_folder="results")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best parameters found: {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Metrics Table:
   Target       MSE       MAE  R-squared      MAPE
0  1 hour  9.530031  1.992882   0.890152  40.84755


(             Date      time     Ozone  Nitric oxide  Nitrogen dioxide  \
 23     2015-01-01  00:00:00  6.071917       1.39417           9.09728   
 0      2015-01-01  01:00:00  4.184318       1.91668          13.99823   
 1      2015-01-01  02:00:00  3.499127       2.96234          25.92585   
 2      2015-01-01  03:00:00  3.906583       1.56817          18.59572   
 3      2015-01-01  04:00:00  3.891615       1.69882          17.11879   
 ...           ...       ...       ...           ...               ...   
 17320  2022-12-31  17:00:00  4.121121       3.83409          25.51641   
 17321  2022-12-31  18:00:00  4.505293       3.24661          20.06227   
 17322  2022-12-31  19:00:00  4.270798       3.89593          25.51644   
 17323  2022-12-31  20:00:00  3.836733       4.57617          28.21999   
 17324  2022-12-31  21:00:00  3.487486       4.76169          27.12930   
 
        Nitrogen oxides       PM10   PM2.5  Wind Direction  Wind Speed  ...  \
 23            11.23498   6.725

In [82]:
fine_tune_random_forest(defs[1], features, param_grid, random_state=42, cv=5, output_folder="results")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best parameters found: {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour  12.213845  2.128478   0.866158  34.044491


(             Date      time     Ozone  Nitric oxide  Nitrogen dioxide  \
 23     2015-03-01  00:00:00  3.891615       2.39918          25.72743   
 0      2015-03-01  01:00:00  3.991400       3.72236          25.57268   
 1      2015-03-01  02:00:00  3.918224       3.37353          27.21679   
 2      2015-03-01  03:00:00  4.305723       2.66143          23.85413   
 3      2015-03-01  04:00:00  3.053421       6.36903          33.12832   
 ...           ...       ...       ...           ...               ...   
 17154  2022-05-31  19:00:00  5.972132       0.93909          11.68299   
 17155  2022-05-31  20:00:00  5.448261       1.15821          15.33676   
 17156  2022-05-31  21:00:00  5.822455       0.43824          11.20115   
 17157  2022-05-31  22:00:00  5.488175       0.40694          12.88350   
 17158  2022-05-31  23:00:00  5.343487       0.62606          11.92262   
 
        Nitrogen oxides       PM10  PM2.5  Wind Direction  Wind Speed  ...  \
 23            29.40612  16.5149

In [83]:
fine_tune_random_forest(defs[2], features, param_grid, random_state=42, cv=5, output_folder="results")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best parameters found: {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Metrics Table:
   Target       MSE      MAE  R-squared       MAPE
0  1 hour  8.395085  1.77493   0.839441  28.585228


(             Date      time     Ozone  Nitric oxide  Nitrogen dioxide  \
 23     2015-06-01  00:00:00  1.746238      10.40467          42.66931   
 0      2015-06-01  01:00:00  3.998052       0.93257          10.87129   
 1      2015-06-01  02:00:00  5.135601       0.29104           7.32743   
 2      2015-06-01  03:00:00  5.218756       0.39321           5.86803   
 3      2015-06-01  04:00:00  4.764734       0.52480           8.22901   
 ...           ...       ...       ...           ...               ...   
 17370  2022-08-31  19:00:00  6.515961       0.68652           5.15338   
 17371  2022-08-31  20:00:00  6.361294       0.74893           4.81526   
 17372  2022-08-31  21:00:00  5.418326       0.68652           9.12913   
 17373  2022-08-31  22:00:00  5.448261       0.49929           7.13743   
 17374  2022-08-31  23:00:00  5.867358       0.49929           5.14956   
 
        Nitrogen oxides       PM10   PM2.5  Wind Direction  Wind Speed  ...  \
 23            58.62291  20.056

In [84]:
fine_tune_random_forest(defs[3], features, param_grid, random_state=42, cv=5, output_folder="results")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Best parameters found: {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Metrics Table:
   Target       MSE       MAE  R-squared       MAPE
0  1 hour  7.439625  1.773166   0.873119  31.950325


(             Date      time     Ozone  Nitric oxide  Nitrogen dioxide  \
 23     2015-09-01  00:00:00  5.762584       0.43843           4.91417   
 0      2015-09-01  01:00:00  4.523587       2.86214          22.40940   
 1      2015-09-01  02:00:00  5.355128       2.97897          19.83326   
 2      2015-09-01  03:00:00  6.850240       0.87592          10.72817   
 3      2015-09-01  04:00:00  7.364133       0.17462           5.44728   
 ...           ...       ...       ...           ...               ...   
 17394  2022-11-30  19:00:00  0.374194      18.87694          44.23362   
 17395  2022-11-30  20:00:00  0.783312      15.56735          38.90137   
 17396  2022-11-30  21:00:00  0.354237      19.64305          41.45788   
 17397  2022-11-30  22:00:00  0.274409      22.40104          38.17081   
 17398  2022-11-30  23:00:00  0.962925      20.71560          41.50856   
 
        Nitrogen oxides       PM10   PM2.5  Wind Direction  Wind Speed  ...  \
 23             5.58641   9.853

In [86]:
def fine_tune_elastic_net(df, features, output_folder="results", param_grid=None, test_size=0.2, random_state=42):
    # Set up a default parameter grid if none is provided
    if param_grid is None:
        param_grid = {
            'alpha': [0.1, 0.5, 1.0, 2.0, 5.0],
            'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
        }

    # Ensure 'Date and time' is of datetime type
    if not pd.api.types.is_datetime64_any_dtype(df['Date and time']):
        df['Date and time'] = pd.to_datetime(df['Date and time'])
    
    # Sorting by datetime to maintain order
    df = df.sort_values(by='Date and time')
    
    # Creating a 'time_index' column to represent the time step
    df['time_index'] = np.arange(len(df))
    
    metrics_table = []
    best_params_table = []
    
    # List of target columns
    targets = ['1 hour', '2 hour', '3 hour']
    
    for target in targets:
        # Preparing the dataset
        X = df[features]
        y = df[target]
        
        # Standardize the features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        
        # Splitting the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=test_size, random_state=random_state)
        
        # Create the Elastic Net model
        model = ElasticNet(random_state=random_state)
        
        # Perform Grid Search
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        
        # Best model
        best_model = grid_search.best_estimator_
        
        # Predicting the PM2.5 values for the test set
        y_pred = best_model.predict(X_test)
        
        # Calculating performance metrics
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
        
        # Append metrics and best parameters to the table
        metrics_table.append({
            'Target': target,
            'MSE': mse,
            'MAE': mae,
            'R-squared': r2,
            'MAPE': mape
        })
        
        best_params_table.append({
            'Target': target,
            'Best Alpha': grid_search.best_params_['alpha'],
            'Best L1 Ratio': grid_search.best_params_['l1_ratio'],
            'Best R-squared': grid_search.best_score_
        })
        
        # Predicting PM2.5 for each row in the DataFrame
        y_pred_full = best_model.predict(X_scaled)
        df[f'PM2.5_pre_{target}_elastic_net'] = y_pred_full
    
    # Convert metrics table to DataFrame for better visualization
    metrics_df = pd.DataFrame(metrics_table)
    best_params_df = pd.DataFrame(best_params_table)
    
    # Save the metrics DataFrame, the best parameters, and the updated dataset
    metrics_df.to_csv(f'{output_folder}/metrics_table_elastic_net.csv', index=False)
    best_params_df.to_csv(f'{output_folder}/best_params_elastic_net.csv', index=False)
    df.to_csv(f'{output_folder}/predicted_pm25_elastic_net.csv', index=False)
    
    print("Metrics Table:")
    print(metrics_df)
    print("\nBest Parameters for each target:")
    print(best_params_df)
    
    return df, metrics_df, best_params_df


In [87]:

features = ['PM10','PM2.5', 'Visibility', 'Ozone', 'Nitrogen dioxide', 'Nitrogen oxides', 'Nitric oxide']
fine_tune_elastic_net(cor_Hull, features, output_folder="results", param_grid=None, test_size=0.2, random_state=42)

Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   8.116330  1.835458   0.887570  28.828017
1  2 hour  14.511558  2.471469   0.797501  39.570012
2  3 hour  19.075720  2.905947   0.730356  48.303601

Best Parameters for each target:
   Target  Best Alpha  Best L1 Ratio  Best R-squared
0  1 hour         0.1            0.9        0.885571
1  2 hour         0.1            0.9        0.794423
2  3 hour         0.1            0.9        0.713385


(            PM10   PM2.5  Visibility     Ozone  Nitrogen dioxide  \
 23      6.725407   1.100         7.0  6.071917           9.09728   
 0      19.559712  14.000         8.0  4.184318          13.99823   
 1      21.149820  12.200        13.0  3.499127          25.92585   
 2      24.309648  16.400        14.0  3.906583          18.59572   
 3      18.785991  12.700        14.0  3.891615          17.11879   
 ...          ...     ...         ...       ...               ...   
 70120  14.750000  11.392        10.0  4.121121          25.51641   
 70121   7.175000   5.259         8.0  4.505293          20.06227   
 70122   7.700000   5.590         6.0  4.270798          25.51644   
 70123   7.825000   5.991         8.0  3.836733          28.21999   
 70124   8.600000   6.722        10.0  3.487486          27.12930   
 
        Nitrogen oxides  Nitric oxide       Date and time  1 hour  2 hour  \
 23            11.23498       1.39417 2015-01-01 00:00:00   4.400   2.700   
 0             1

In [88]:
def fine_tune_elastic_net(df, features, output_folder="results", param_grid=None, test_size=0.2, random_state=42):
    # Set up a default parameter grid if none is provided
    if param_grid is None:
        param_grid = {
            'alpha': [0.1],
            'l1_ratio': [ 0.9]
        }

    # Ensure 'Date and time' is of datetime type
    if not pd.api.types.is_datetime64_any_dtype(df['Date and time']):
        df['Date and time'] = pd.to_datetime(df['Date and time'])
    
    # Sorting by datetime to maintain order
    df = df.sort_values(by='Date and time')
    
    # Creating a 'time_index' column to represent the time step
    df['time_index'] = np.arange(len(df))
    
    metrics_table = []
    best_params_table = []
    
    # List of target columns
    targets = ['1 hour', '2 hour', '3 hour']
    
    for target in targets:
        # Preparing the dataset
        X = df[features]
        y = df[target]
        
        # Standardize the features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        
        # Splitting the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=test_size, random_state=random_state)
        
        # Create the Elastic Net model
        model = ElasticNet(random_state=random_state)
        
        # Perform Grid Search
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        
        # Best model
        best_model = grid_search.best_estimator_
        
        # Predicting the PM2.5 values for the test set
        y_pred = best_model.predict(X_test)
        
        # Calculating performance metrics
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
        
        # Append metrics and best parameters to the table
        metrics_table.append({
            'Target': target,
            'MSE': mse,
            'MAE': mae,
            'R-squared': r2,
            'MAPE': mape
        })
        
        best_params_table.append({
            'Target': target,
            'Best Alpha': grid_search.best_params_['alpha'],
            'Best L1 Ratio': grid_search.best_params_['l1_ratio'],
            'Best R-squared': grid_search.best_score_
        })
        
        # Predicting PM2.5 for each row in the DataFrame
        y_pred_full = best_model.predict(X_scaled)
        df[f'PM2.5_pre_{target}_elastic_net'] = y_pred_full
    
    # Convert metrics table to DataFrame for better visualization
    metrics_df = pd.DataFrame(metrics_table)
    best_params_df = pd.DataFrame(best_params_table)
    
    # Save the metrics DataFrame, the best parameters, and the updated dataset
    metrics_df.to_csv(f'{output_folder}/metrics_table_elastic_net.csv', index=False)
    best_params_df.to_csv(f'{output_folder}/best_params_elastic_net.csv', index=False)
    df.to_csv(f'{output_folder}/predicted_pm25_elastic_net.csv', index=False)
    
    print("Metrics Table:")
    print(metrics_df)
    print("\nBest Parameters for each target:")
    print(best_params_df)
    
    return df, metrics_df, best_params_df


In [89]:
 features = ['Nitric oxide', 'Nitrogen dioxide', 'Nitrogen oxides', 'PM10','PM2.5','Temperature', 'Wind Speed',
              'Humidity', 'Wind Cardinal Direction', 'Maximum Gust Speed', 'Total Cloud Amount',
           'Ozone', 'Hourly Total Radiation','Cloud Base Height','hour', 'season']
fine_tune_elastic_net(hull_df, features, output_folder="results", param_grid=None, test_size=0.2, random_state=42)

Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   8.092537  1.831772   0.887900  28.716793
1  2 hour  14.383830  2.463079   0.799284  39.319451
2  3 hour  18.775624  2.881491   0.734598  47.765979

Best Parameters for each target:
   Target  Best Alpha  Best L1 Ratio  Best R-squared
0  1 hour         0.1            0.9        0.886063
1  2 hour         0.1            0.9        0.796613
2  3 hour         0.1            0.9        0.718010


(             Date      time     Ozone  Nitric oxide  Nitrogen dioxide  \
 23     2015-01-01  00:00:00  6.071917       1.39417           9.09728   
 0      2015-01-01  01:00:00  4.184318       1.91668          13.99823   
 1      2015-01-01  02:00:00  3.499127       2.96234          25.92585   
 2      2015-01-01  03:00:00  3.906583       1.56817          18.59572   
 3      2015-01-01  04:00:00  3.891615       1.69882          17.11879   
 ...           ...       ...       ...           ...               ...   
 70120  2022-12-31  17:00:00  4.121121       3.83409          25.51641   
 70121  2022-12-31  18:00:00  4.505293       3.24661          20.06227   
 70122  2022-12-31  19:00:00  4.270798       3.89593          25.51644   
 70123  2022-12-31  20:00:00  3.836733       4.57617          28.21999   
 70124  2022-12-31  21:00:00  3.487486       4.76169          27.12930   
 
        Nitrogen oxides  Sulphur dioxide       PM10   PM2.5  Wind Direction  \
 23            11.23498        

In [90]:
fine_tune_elastic_net(defs[0], features, output_folder="results", param_grid=None, test_size=0.2, random_state=42)

Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   9.491914  1.985045   0.890591  37.001169
1  2 hour  17.392788  2.733981   0.804737  46.725852
2  3 hour  22.466716  3.207157   0.744877  55.390823

Best Parameters for each target:
   Target  Best Alpha  Best L1 Ratio  Best R-squared
0  1 hour         0.1            0.9        0.891355
1  2 hour         0.1            0.9        0.803963
2  3 hour         0.1            0.9        0.729004


(             Date      time     Ozone  Nitric oxide  Nitrogen dioxide  \
 23     2015-01-01  00:00:00  6.071917       1.39417           9.09728   
 0      2015-01-01  01:00:00  4.184318       1.91668          13.99823   
 1      2015-01-01  02:00:00  3.499127       2.96234          25.92585   
 2      2015-01-01  03:00:00  3.906583       1.56817          18.59572   
 3      2015-01-01  04:00:00  3.891615       1.69882          17.11879   
 ...           ...       ...       ...           ...               ...   
 17320  2022-12-31  17:00:00  4.121121       3.83409          25.51641   
 17321  2022-12-31  18:00:00  4.505293       3.24661          20.06227   
 17322  2022-12-31  19:00:00  4.270798       3.89593          25.51644   
 17323  2022-12-31  20:00:00  3.836733       4.57617          28.21999   
 17324  2022-12-31  21:00:00  3.487486       4.76169          27.12930   
 
        Nitrogen oxides       PM10   PM2.5  Wind Direction  Wind Speed  ...  \
 23            11.23498   6.725

In [91]:
fine_tune_elastic_net(defs[1], features, output_folder="results", param_grid=None, test_size=0.2, random_state=42)

Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour  12.595619  2.136676   0.861974  31.175564
1  2 hour  21.066652  2.834552   0.771027  41.416141
2  3 hour  28.493087  3.340325   0.688525  48.576799

Best Parameters for each target:
   Target  Best Alpha  Best L1 Ratio  Best R-squared
0  1 hour         0.1            0.9        0.879772
1  2 hour         0.1            0.9        0.777819
2  3 hour         0.1            0.9        0.692155


(             Date      time     Ozone  Nitric oxide  Nitrogen dioxide  \
 23     2015-03-01  00:00:00  3.891615       2.39918          25.72743   
 0      2015-03-01  01:00:00  3.991400       3.72236          25.57268   
 1      2015-03-01  02:00:00  3.918224       3.37353          27.21679   
 2      2015-03-01  03:00:00  4.305723       2.66143          23.85413   
 3      2015-03-01  04:00:00  3.053421       6.36903          33.12832   
 ...           ...       ...       ...           ...               ...   
 17154  2022-05-31  19:00:00  5.972132       0.93909          11.68299   
 17155  2022-05-31  20:00:00  5.448261       1.15821          15.33676   
 17156  2022-05-31  21:00:00  5.822455       0.43824          11.20115   
 17157  2022-05-31  22:00:00  5.488175       0.40694          12.88350   
 17158  2022-05-31  23:00:00  5.343487       0.62606          11.92262   
 
        Nitrogen oxides       PM10  PM2.5  Wind Direction  Wind Speed  ...  \
 23            29.40612  16.5149

In [92]:
fine_tune_elastic_net(defs[2], features, output_folder="results", param_grid=None, test_size=0.2, random_state=42)

Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   8.629088  1.810185   0.834966  28.424408
1  2 hour  14.838377  2.378985   0.715312  40.114741
2  3 hour  19.243372  2.751461   0.628763  43.594564

Best Parameters for each target:
   Target  Best Alpha  Best L1 Ratio  Best R-squared
0  1 hour         0.1            0.9        0.844384
1  2 hour         0.1            0.9        0.731197
2  3 hour         0.1            0.9        0.636289


(             Date      time     Ozone  Nitric oxide  Nitrogen dioxide  \
 23     2015-06-01  00:00:00  1.746238      10.40467          42.66931   
 0      2015-06-01  01:00:00  3.998052       0.93257          10.87129   
 1      2015-06-01  02:00:00  5.135601       0.29104           7.32743   
 2      2015-06-01  03:00:00  5.218756       0.39321           5.86803   
 3      2015-06-01  04:00:00  4.764734       0.52480           8.22901   
 ...           ...       ...       ...           ...               ...   
 17370  2022-08-31  19:00:00  6.515961       0.68652           5.15338   
 17371  2022-08-31  20:00:00  6.361294       0.74893           4.81526   
 17372  2022-08-31  21:00:00  5.418326       0.68652           9.12913   
 17373  2022-08-31  22:00:00  5.448261       0.49929           7.13743   
 17374  2022-08-31  23:00:00  5.867358       0.49929           5.14956   
 
        Nitrogen oxides       PM10   PM2.5  Wind Direction  Wind Speed  ...  \
 23            58.62291  20.056

In [93]:
fine_tune_elastic_net(defs[3], features, output_folder="results", param_grid=None, test_size=0.2, random_state=42)

Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   7.493838  1.776508   0.872195  30.737982
1  2 hour  13.040247  2.403537   0.770577  38.922793
2  3 hour  17.776505  2.851953   0.685986  50.449912

Best Parameters for each target:
   Target  Best Alpha  Best L1 Ratio  Best R-squared
0  1 hour         0.1            0.9        0.857085
1  2 hour         0.1            0.9        0.745122
2  3 hour         0.1            0.9        0.650233


(             Date      time     Ozone  Nitric oxide  Nitrogen dioxide  \
 23     2015-09-01  00:00:00  5.762584       0.43843           4.91417   
 0      2015-09-01  01:00:00  4.523587       2.86214          22.40940   
 1      2015-09-01  02:00:00  5.355128       2.97897          19.83326   
 2      2015-09-01  03:00:00  6.850240       0.87592          10.72817   
 3      2015-09-01  04:00:00  7.364133       0.17462           5.44728   
 ...           ...       ...       ...           ...               ...   
 17394  2022-11-30  19:00:00  0.374194      18.87694          44.23362   
 17395  2022-11-30  20:00:00  0.783312      15.56735          38.90137   
 17396  2022-11-30  21:00:00  0.354237      19.64305          41.45788   
 17397  2022-11-30  22:00:00  0.274409      22.40104          38.17081   
 17398  2022-11-30  23:00:00  0.962925      20.71560          41.50856   
 
        Nitrogen oxides       PM10   PM2.5  Wind Direction  Wind Speed  ...  \
 23             5.58641   9.853

In [94]:
def predict_pm25_ridge(df, features, alpha, output_folder="results"):
    # Ensure 'Date and time' is of datetime type
    if not pd.api.types.is_datetime64_any_dtype(df['Date and time']):
        df['Date and time'] = pd.to_datetime(df['Date and time'])
    
    # Sorting by datetime to maintain order
    df = df.sort_values(by='Date and time')
    
    # Creating a 'time_index' column to represent the time step
    df['time_index'] = np.arange(len(df))
    
    metrics_table = []
    
    # List of target columns
    targets = ['1 hour', '2 hour', '3 hour']
    
    for target in targets:
        # Preparing the dataset
        X = df[features]
        y = df[target]
        
        # Splitting the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        
        # Creating and training the Ridge Regression model
        model = Ridge(alpha=alpha)
        model.fit(X_train, y_train)
        
        # Predicting the PM2.5 values for the test set
        y_pred = model.predict(X_test)
        
        # Calculating performance metrics
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
        
        # Append metrics to the table
        metrics_table.append({
            'Target': target,
            'MSE': mse,
            'MAE': mae,
            'R-squared': r2,
            'MAPE': mape
        })
        
        # Predicting PM2.5 for each row in the DataFrame
        df[f'PM2.5_pre_{target}_ridge'] = model.predict(X)
    
    # Convert metrics table to DataFrame for better visualization
    metrics_df = pd.DataFrame(metrics_table)
    
    # Save the metrics DataFrame and the updated dataset
    metrics_df.to_csv(f'{output_folder}/metrics_table_ridge.csv', index=False)
    df.to_csv(f'{output_folder}/predicted_pm25_ridge.csv', index=False)
    
    print("Metrics Table:")
    print(metrics_df)
    return df, metrics_df


In [96]:
features = ['PM10','PM2.5', 'Visibility', 'Ozone', 'Nitrogen dioxide', 'Nitrogen oxides', 'Nitric oxide']

In [98]:
# Define the range of alpha values to try
alpha_values = [0.1, 0.5, 1.0, 2.0, 5.0, 10.0]

best_score = -np.inf
best_alpha = None
results = []

for alpha in alpha_values:
    print(f"Testing alpha={alpha}")
    
    try:
        # Run the Ridge regression model with the current alpha
        df_result, metrics_df = predict_pm25_ridge(df=cor_Hull, features=features, alpha=alpha, output_folder="results")
    except Exception as e:
        print(f"Error occurred with alpha={alpha}: {e}")
        continue
    
    # Extract the R-squared value for the '1 hour' target
    r2_value = metrics_df.loc[metrics_df['Target'] == '1 hour', 'R-squared'].values[0]
    
    # Check if this alpha gives a better R-squared for '1 hour'
    if r2_value > best_score:
        best_score = r2_value
        best_alpha = alpha

    # Record the results for future reference
    results.append((alpha, r2_value))

# Print the best alpha for the '1 hour' target
print(f"Best alpha for 1 hour: {best_alpha} with R-squared: {best_score}")


Testing alpha=0.1
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   8.090371  1.824271   0.887930  28.437669
1  2 hour  14.459092  2.458598   0.798234  39.050019
2  3 hour  18.922357  2.884749   0.732523  47.678586
Testing alpha=0.5
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   8.090371  1.824271   0.887930  28.437671
1  2 hour  14.459092  2.458598   0.798234  39.050020
2  3 hour  18.922357  2.884749   0.732523  47.678587
Testing alpha=1.0
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   8.090370  1.824271   0.887930  28.437673
1  2 hour  14.459091  2.458598   0.798234  39.050022
2  3 hour  18.922357  2.884749   0.732523  47.678589
Testing alpha=2.0
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   8.090370  1.824271   0.887930  28.437676
1  2 hour  14.459091  2.458598   0.798234  39.050025
2  3 hour  18.922357  2.884749   0.732523  47.678593
Testing alpha=5.0
Me

In [99]:
 features = ['Nitric oxide', 'Nitrogen dioxide', 'Nitrogen oxides', 'PM10','PM2.5','Temperature', 'Wind Speed',
              'Humidity', 'Wind Cardinal Direction', 'Maximum Gust Speed', 'Total Cloud Amount',
           'Ozone', 'Hourly Total Radiation','Cloud Base Height','hour', 'season']
predict_pm25_ridge(hull_df, features, alpha =10, output_folder="results")

Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   8.060725  1.820638   0.888341  28.413910
1  2 hour  14.314796  2.451512   0.800247  38.924859
2  3 hour  18.622163  2.867510   0.736767  47.372592


(             Date      time     Ozone  Nitric oxide  Nitrogen dioxide  \
 23     2015-01-01  00:00:00  6.071917       1.39417           9.09728   
 0      2015-01-01  01:00:00  4.184318       1.91668          13.99823   
 1      2015-01-01  02:00:00  3.499127       2.96234          25.92585   
 2      2015-01-01  03:00:00  3.906583       1.56817          18.59572   
 3      2015-01-01  04:00:00  3.891615       1.69882          17.11879   
 ...           ...       ...       ...           ...               ...   
 70120  2022-12-31  17:00:00  4.121121       3.83409          25.51641   
 70121  2022-12-31  18:00:00  4.505293       3.24661          20.06227   
 70122  2022-12-31  19:00:00  4.270798       3.89593          25.51644   
 70123  2022-12-31  20:00:00  3.836733       4.57617          28.21999   
 70124  2022-12-31  21:00:00  3.487486       4.76169          27.12930   
 
        Nitrogen oxides  Sulphur dioxide       PM10   PM2.5  Wind Direction  \
 23            11.23498        

In [100]:
predict_pm25_ridge(defs[0], features, alpha =10, output_folder="results")

Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   9.409811  1.968133   0.891538  36.400208
1  2 hour  17.315603  2.722120   0.805603  45.832106
2  3 hour  22.355234  3.199860   0.746143  54.694809


(             Date      time     Ozone  Nitric oxide  Nitrogen dioxide  \
 23     2015-01-01  00:00:00  6.071917       1.39417           9.09728   
 0      2015-01-01  01:00:00  4.184318       1.91668          13.99823   
 1      2015-01-01  02:00:00  3.499127       2.96234          25.92585   
 2      2015-01-01  03:00:00  3.906583       1.56817          18.59572   
 3      2015-01-01  04:00:00  3.891615       1.69882          17.11879   
 ...           ...       ...       ...           ...               ...   
 17320  2022-12-31  17:00:00  4.121121       3.83409          25.51641   
 17321  2022-12-31  18:00:00  4.505293       3.24661          20.06227   
 17322  2022-12-31  19:00:00  4.270798       3.89593          25.51644   
 17323  2022-12-31  20:00:00  3.836733       4.57617          28.21999   
 17324  2022-12-31  21:00:00  3.487486       4.76169          27.12930   
 
        Nitrogen oxides       PM10   PM2.5  Wind Direction  Wind Speed  ...  \
 23            11.23498   6.725

In [101]:
predict_pm25_ridge(defs[1], features, alpha =10, output_folder="results")

Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour  12.487984  2.113255   0.863154  30.726569
1  2 hour  20.761314  2.815783   0.774346  41.089393
2  3 hour  28.262158  3.329745   0.691050  48.048267


(             Date      time     Ozone  Nitric oxide  Nitrogen dioxide  \
 23     2015-03-01  00:00:00  3.891615       2.39918          25.72743   
 0      2015-03-01  01:00:00  3.991400       3.72236          25.57268   
 1      2015-03-01  02:00:00  3.918224       3.37353          27.21679   
 2      2015-03-01  03:00:00  4.305723       2.66143          23.85413   
 3      2015-03-01  04:00:00  3.053421       6.36903          33.12832   
 ...           ...       ...       ...           ...               ...   
 17154  2022-05-31  19:00:00  5.972132       0.93909          11.68299   
 17155  2022-05-31  20:00:00  5.448261       1.15821          15.33676   
 17156  2022-05-31  21:00:00  5.822455       0.43824          11.20115   
 17157  2022-05-31  22:00:00  5.488175       0.40694          12.88350   
 17158  2022-05-31  23:00:00  5.343487       0.62606          11.92262   
 
        Nitrogen oxides       PM10  PM2.5  Wind Direction  Wind Speed  ...  \
 23            29.40612  16.5149

In [102]:
predict_pm25_ridge(defs[2], features, alpha =10, output_folder="results")

Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   8.629402  1.804909   0.834960  28.249154
1  2 hour  14.856982  2.386282   0.714955  40.152994
2  3 hour  19.260164  2.754517   0.628439  43.444273


(             Date      time     Ozone  Nitric oxide  Nitrogen dioxide  \
 23     2015-06-01  00:00:00  1.746238      10.40467          42.66931   
 0      2015-06-01  01:00:00  3.998052       0.93257          10.87129   
 1      2015-06-01  02:00:00  5.135601       0.29104           7.32743   
 2      2015-06-01  03:00:00  5.218756       0.39321           5.86803   
 3      2015-06-01  04:00:00  4.764734       0.52480           8.22901   
 ...           ...       ...       ...           ...               ...   
 17370  2022-08-31  19:00:00  6.515961       0.68652           5.15338   
 17371  2022-08-31  20:00:00  6.361294       0.74893           4.81526   
 17372  2022-08-31  21:00:00  5.418326       0.68652           9.12913   
 17373  2022-08-31  22:00:00  5.448261       0.49929           7.13743   
 17374  2022-08-31  23:00:00  5.867358       0.49929           5.14956   
 
        Nitrogen oxides       PM10   PM2.5  Wind Direction  Wind Speed  ...  \
 23            58.62291  20.056

In [103]:
predict_pm25_ridge(defs[3], features, alpha =10, output_folder="results")

Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour   7.489071  1.769271   0.872276  30.274251
1  2 hour  12.985722  2.398103   0.771536  38.364854
2  3 hour  17.674774  2.843269   0.687783  49.864756


(             Date      time     Ozone  Nitric oxide  Nitrogen dioxide  \
 23     2015-09-01  00:00:00  5.762584       0.43843           4.91417   
 0      2015-09-01  01:00:00  4.523587       2.86214          22.40940   
 1      2015-09-01  02:00:00  5.355128       2.97897          19.83326   
 2      2015-09-01  03:00:00  6.850240       0.87592          10.72817   
 3      2015-09-01  04:00:00  7.364133       0.17462           5.44728   
 ...           ...       ...       ...           ...               ...   
 17394  2022-11-30  19:00:00  0.374194      18.87694          44.23362   
 17395  2022-11-30  20:00:00  0.783312      15.56735          38.90137   
 17396  2022-11-30  21:00:00  0.354237      19.64305          41.45788   
 17397  2022-11-30  22:00:00  0.274409      22.40104          38.17081   
 17398  2022-11-30  23:00:00  0.962925      20.71560          41.50856   
 
        Nitrogen oxides       PM10   PM2.5  Wind Direction  Wind Speed  ...  \
 23             5.58641   9.853