In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
Hull_df = pd.read_csv('Hull15-22-final.csv')

In [3]:
def shift_columns(df):
    # Shift the PM2.5 column values by 1, 2, and 3 positions
    df['1 hour'] = df['PM2.5'].shift(-1)
    df['2 hour'] = df['PM2.5'].shift(-2)
    df['3 hour'] = df['PM2.5'].shift(-3)
    df = df.iloc[:-3]
    return df

In [5]:
hull_df = shift_columns(Hull_df)

In [6]:
scale_factors = {
   'Ozone':10, 'Hourly Total Radiation': 100,'Cloud Base Height':100, 'Humidity':10
}

# Apply scaling
for column, factor in scale_factors.items():
    if column in hull_df.columns:
        hull_df[column] = hull_df[column] / factor

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hull_df[column] = hull_df[column] / factor


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [8]:
 features = ['Nitric oxide', 'Nitrogen dioxide', 'Nitrogen oxides', 'PM10','PM2.5','Temperature', 'Wind Speed',
              'Humidity', 'Wind Cardinal Direction', 'Maximum Gust Speed', 'Total Cloud Amount',
           'Ozone', 'Hourly Total Radiation','Cloud Base Height','hour', 'season']

In [9]:
def clear_model_memory():
    #Clear the memory of the model
  
    # Clear memory for scikit-learn
    gc.collect()
    
    # Clear memory for Keras/TensorFlow
    tf.keras.backend.clear_session()
    gc.collect()

In [10]:
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Dropout, Conv1D, MaxPooling1D, Flatten, TimeDistributed
from tensorflow.keras.optimizers import Adam,  RMSprop
from keras.callbacks import EarlyStopping





In [11]:
def create_sequences(data, target, time_steps=3):
    X, y = [], []
    for i in range(len(data) - time_steps):
        X.append(data[i:(i + time_steps)])
        y.append(target[i + time_steps])
    return np.array(X), np.array(y)

In [12]:
winter_df = pd.read_csv('winter_data.csv')
spring_df = pd.read_csv('spring_data.csv')
summer_df = pd.read_csv('summer_data.csv')
fall_df = pd.read_csv('autumn_data.csv')

In [14]:
defs = [winter_df, spring_df, summer_df, fall_df]
# Apply scaling
for df in defs:
    for column, factor in scale_factors.items():
        if column in df.columns:
            df[column] = df[column] / factor

def shift_column(df):
    # Shift the PM2.5 column values by 1, 2, and 3 positions
    df['1 hour'] = df['PM2.5'].shift(-1)
    df['2 hour'] = df['PM2.5'].shift(-2)
    df['3 hour'] = df['PM2.5'].shift(-3)
    return df

for i in range(len(defs)):
    defs[i] = shift_column(defs[i])
 # Drop the last 3 rows
for i in range(len(defs)):
    defs[i] =defs[i].iloc[:-3]    

In [17]:
scale_factors = {
   'Ozone':10, 'Hourly Total Radiation': 100,'Cloud Base Height':100, 'Humidity':10
}

# Apply scaling
for column, factor in scale_factors.items():
    if column in hull_df.columns:
        hull_df[column] = hull_df[column] / factor

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hull_df[column] = hull_df[column] / factor


In [18]:
def create_sequences(data, target, time_steps=3):
    X, y = [], []
    for i in range(len(data) - time_steps):
        X.append(data[i:(i + time_steps)])
        y.append(target[i + time_steps])
    return np.array(X), np.array(y)

In [19]:
def predict_pm25_cnn_lstm(df, features, epochs=50, batch_size=64, time_steps=3, output_folder="results"):
    # Ensure 'Date and time' is of datetime type
    if not pd.api.types.is_datetime64_any_dtype(df['Date and time']):
        df['Date and time'] = pd.to_datetime(df['Date and time'])
    
    # Sorting by datetime to maintain order
    df = df.sort_values(by='Date and time')
    
    metrics_table = []
    
    # List of target columns
    targets = ['1 hour']
    
    for target in targets:
        # Preparing the dataset
        X = df[features].values
        y = df[target].values
        
        # Standardize the features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        
        # Create sequences
        X_seq, y_seq = create_sequences(X_scaled, y, time_steps)
        
        # Reshape for Conv1D
        X_seq = X_seq.reshape((X_seq.shape[0], time_steps, X_seq.shape[2], 1))
        
        # Splitting the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.2, random_state=42)
        
        # Creating and training the CNN-LSTM model
        model = Sequential()
        model.add(TimeDistributed(Conv1D(filters=64, kernel_size=2, activation='relu'), input_shape=(time_steps, X_train.shape[2], 1)))
        model.add(TimeDistributed(MaxPooling1D(pool_size=2)))
        model.add(TimeDistributed(Flatten()))
        model.add(LSTM(50, return_sequences=True))
        model.add(Dropout(0.2))
        model.add(LSTM(50))
        model.add(Dropout(0.2))
        model.add(Dense(1))
        model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')
        
        # Early stopping to avoid overfitting
        early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
        
        model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(X_test, y_test), verbose=1, shuffle=False, callbacks=[early_stopping])
        
        # Predicting the PM2.5 values for the test set
        y_pred = model.predict(X_test).flatten()  # Flatten to ensure y_pred is 1D
        
        # Calculating performance metrics
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
        
        # Append metrics to the table
        metrics_table.append({
            'Target': target,
            'MSE': mse,
            'MAE': mae,
            'R-squared': r2,
            'MAPE': mape
        })
        
        # Predicting PM2.5 for each row in the DataFrame
        X_full_seq, _ = create_sequences(X_scaled, y, time_steps)
        X_full_seq = X_full_seq.reshape((X_full_seq.shape[0], time_steps, X_full_seq.shape[2], 1))
        predictions = model.predict(X_full_seq).flatten()
        
        # Add NaN for the first time_steps values to match the original DataFrame length
        predictions = np.concatenate([np.full(time_steps, np.nan), predictions])
        df[f'PM2.5_pre_{target}_cnn_lstm'] = predictions
    
    # Convert metrics table to DataFrame for better visualization
    metrics_df = pd.DataFrame(metrics_table)
    
    # Save the metrics DataFrame and the updated dataset
    metrics_df.to_csv(f'{output_folder}/metrics_table_cnn_lstm.csv', index=False)
    df.to_csv(f'{output_folder}/predicted_pm25_cnn_lstm.csv', index=False)
    
    print("Metrics Table:")
    print(metrics_df)
    return df, metrics_df


In [16]:
from sklearn.model_selection import ParameterGrid
import itertools

def fine_tune_cnn_lstm(df, features, param_grid, time_steps=3, output_folder="results"):
    best_score = -np.inf
    best_params = None
    results = []
    
    # Generate all combinations of hyperparameters
    param_combinations = list(itertools.product(
        param_grid['epochs'],
        param_grid['batch_size'],
        param_grid['learning_rate'],
        param_grid['lstm_units'],
        param_grid['dropout_rate']
    ))
    
    for combination in param_combinations:
        epochs, batch_size, learning_rate, lstm_units, dropout_rate = combination
        print(f"Testing combination: Epochs={epochs}, Batch={batch_size}, Learning Rate={learning_rate}, LSTM Units={lstm_units}, Dropout={dropout_rate}")
        
        try:
            df_result, metrics_df = predict_pm25_cnn_lstm(
                df=df,
                features=features,
                epochs=epochs,
                batch_size=batch_size,
                time_steps=time_steps,
                output_folder=output_folder
            )
        except Exception as e:
            print(f"Error occurred during combination {combination}: {e}")
            continue
        
        # Extract R-squared value for '1 hour' target
        r2_value = metrics_df.loc[metrics_df['Target'] == '1 hour', 'R-squared'].values[0]
        
        # Save results and update best score if applicable
        results.append((combination, r2_value))
        if r2_value > best_score:
            best_score = r2_value
            best_params = combination

    print(f"Best parameters: {best_params} with score: {best_score}")
    return best_params, best_score, results




In [21]:
# Example hyperparameter grid
features = ['PM10','PM2.5', 'Visibility', 'Ozone', 'Nitrogen dioxide', 'Nitrogen oxides', 'Nitric oxide']
param_grid = {
    'epochs': [10],
    'batch_size': [32, 64],
    'learning_rate': [0.001, 0.0001],
    'lstm_units': [50, 100],
    'dropout_rate': [0.2, 0.3]
}

# Fine-tune the CNN-LSTM model
best_params, best_score, results = fine_tune_cnn_lstm(df=hull_df, features=features, param_grid=param_grid)

Testing combination: Epochs=10, Batch=32, Learning Rate=0.001, LSTM Units=50, Dropout=0.2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date and time'] = pd.to_datetime(df['Date and time'])




Epoch 1/10

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour  18.940914  2.768926   0.735084  48.478613
Testing combination: Epochs=10, Batch=32, Learning Rate=0.001, LSTM Units=50, Dropout=0.3
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour  18.956982  2.783417    0.73486  48.951629
Testing combination: Epochs=10, Batch=32, Learning Rate=0.001, LSTM Units=100, Dropout=0.2
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour  18.874915  2.770726   0.736007  48.732952
Testing combination: Epochs=10, Batch=32, Learning Rate=0.001, LSTM Units=100, Dropout=0.3
Epoch 1/10
Epoch 2/10
Epoch 3

In [22]:
param_grid = {
    'epochs': [50],
    'batch_size': [64],
    'learning_rate': [0.0001],
    'lstm_units': [100],
    'dropout_rate': [0.2]
}
best_params, best_score, results = fine_tune_cnn_lstm(df=hull_df, features=features, param_grid=param_grid)

Testing combination: Epochs=50, Batch=64, Learning Rate=0.0001, LSTM Units=100, Dropout=0.2
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour  18.458037  2.678758   0.741838  45.075116
Best parameters: (50, 64, 0.0001, 100, 0.2) with score: 0.7418379564475106


In [23]:
 features = ['Nitric oxide', 'Nitrogen dioxide', 'Nitrogen oxides', 'PM10','PM2.5','Temperature', 'Wind Speed',
              'Humidity', 'Wind Cardinal Direction', 'Maximum Gust Speed', 'Total Cloud Amount',
           'Ozone', 'Hourly Total Radiation','Cloud Base Height','hour', 'season']
best_params, best_score, results = fine_tune_cnn_lstm(df=hull_df, features=features, param_grid=param_grid)

Testing combination: Epochs=50, Batch=64, Learning Rate=0.0001, LSTM Units=100, Dropout=0.2
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour  17.823524  2.616946   0.750713  42.011909
Best parameters: (50, 64, 0.0001, 100, 0.2) with score: 0.750712538448806


In [24]:
best_params, best_score, results = fine_tune_cnn_lstm(df=defs[0], features=features, param_grid=param_grid)

Testing combination: Epochs=50, Batch=64, Learning Rate=0.0001, LSTM Units=100, Dropout=0.2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date and time'] = pd.to_datetime(df['Date and time'])


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Metrics Table:
   Target       MSE       MAE  R-squared       MAPE
0  1 hour  18.72549  2.840388   0.791681  48.958534
Best parameters: (50, 64, 0.0001, 100, 0.2) with score: 0.7916807818822134


In [25]:
best_params, best_score, results = fine_tune_cnn_lstm(df=defs[1], features=features, param_grid=param_grid)

Testing combination: Epochs=50, Batch=64, Learning Rate=0.0001, LSTM Units=100, Dropout=0.2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date and time'] = pd.to_datetime(df['Date and time'])


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour  20.506644  2.794481   0.763936  44.091972
Best parameters: (50, 64, 0.0001, 100, 0.2) with score: 0.7639355016757208


In [26]:
best_params, best_score, results = fine_tune_cnn_lstm(df=defs[2], features=features, param_grid=param_grid)

Testing combination: Epochs=50, Batch=64, Learning Rate=0.0001, LSTM Units=100, Dropout=0.2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date and time'] = pd.to_datetime(df['Date and time'])


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour  15.727692  2.531923   0.686139  39.873999
Best parameters: (50, 64, 0.0001, 100, 0.2) with score: 0.686139285343613


In [27]:
best_params, best_score, results = fine_tune_cnn_lstm(df=defs[3], features=features, param_grid=param_grid)

Testing combination: Epochs=50, Batch=64, Learning Rate=0.0001, LSTM Units=100, Dropout=0.2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date and time'] = pd.to_datetime(df['Date and time'])


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Metrics Table:
   Target        MSE       MAE  R-squared       MAPE
0  1 hour  15.698533  2.582103    0.72583  44.436708
Best parameters: (50, 64, 0.0001, 100, 0.2) with score: 0.7258295982636855
