1. Import the required libraries:

In [1]:
import pandas as pd
import numpy as np
from scipy import signal
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

2. Load and sort your dataset:

In [2]:
data = pd.read_csv('filled.csv')
data = data.set_index('date')
data.sort_index(inplace=True)

3. Check for outliers and handle them as necessary (e.g., remove or replace with median):

In [3]:
# Replace 'threshold' with an appropriate value for your dataset
threshold = 3
z_scores = np.abs((data - data.mean()) / data.std())
print(z_scores)
outliers = (z_scores > threshold).any(axis=1)
for column in data.columns:
    column_median = data[column].median()
    data.loc[outliers, column] = column_median

                           BC     N_CPC     PM-10    PM-2.5    PM-1.0  \
date                                                                    
2019-01-01 00:00:00  1.982537  0.521296  0.153448  0.421661  0.720662   
2019-01-01 01:00:00  3.576515  1.387400  0.140165  1.065990  1.430471   
2019-01-01 02:00:00  1.523931  0.512062  0.271565  0.182133  0.492488   
2019-01-01 03:00:00  0.564016  0.182136  0.539595  0.376822  0.109699   
2019-01-01 04:00:00  1.069967  0.312931  0.498152  0.282722  0.017669   
...                       ...       ...       ...       ...       ...   
2019-12-10 19:00:00  0.539793  0.553649  0.572487  0.690432  0.759232   
2019-12-10 20:00:00  0.485020  0.535307  0.557868  0.656214  0.730710   
2019-12-10 21:00:00  0.235294  0.550992  0.577384  0.687694  0.771972   
2019-12-10 22:00:00  0.469238  0.524682  0.545662  0.659122  0.784711   
2019-12-10 23:00:00  0.692971  0.561491  0.539595  0.647659  0.778246   

                          NO2        O3       SO2 

4. Normalize the data:

In [4]:
scaler = StandardScaler()
data_normalized = pd.DataFrame(scaler.fit_transform(data), columns=data.columns, index=data.index)
data_normalized.describe()

Unnamed: 0,BC,N_CPC,PM-10,PM-2.5,PM-1.0,NO2,O3,SO2,CO,NO,NOX,TEMP,HUM
count,4584.0,4584.0,4584.0,4584.0,4584.0,4584.0,4584.0,4584.0,4584.0,4584.0,4584.0,4584.0,4584.0
mean,7.440238e-17,2.557582e-16,-4.9601590000000004e-17,-3.720119e-17,1.24004e-17,-2.4800790000000003e-17,9.610308000000001e-17,-1.736056e-16,9.920317000000001e-17,6.820218e-17,6.200198e-18,-1.488048e-16,5.208167e-16
std,1.000109,1.000109,1.000109,1.000109,1.000109,1.000109,1.000109,1.000109,1.000109,1.000109,1.000109,1.000109,1.000109
min,-1.535687,-1.807172,-1.990028,-1.840096,-1.547732,-1.571246,-2.235147,-1.54955,-1.273077,-0.6885732,-1.427995,-2.532501,-3.304216
25%,-0.6512812,-0.7198164,-0.6456667,-0.6955285,-0.7584807,-0.7415023,-0.5035736,-0.5009666,-0.5334767,-0.604552,-0.6779947,-0.7746734,-0.5154168
50%,-0.1366352,-0.04638024,0.03031395,-0.02874407,-0.06343894,-0.103238,-0.08712303,-0.5009666,-0.5334767,-0.2684673,-0.1485828,0.002725764,-0.03208392
75%,0.2745589,0.3393854,0.3470539,0.3300215,0.3332721,0.2797206,0.5444839,0.2717533,0.2061237,0.2356598,0.3635376,0.734064,0.7198176
max,5.121507,3.977215,6.618721,4.130767,3.898765,4.109307,3.324115,3.693368,4.643726,9.309947,5.895536,3.164674,2.155777


5. Smooth the data using a rolling window:

In [5]:
window_size = 5
data_smoothed = data_normalized.rolling(window=window_size).mean().dropna()
data_smoothed.describe()

Unnamed: 0,BC,N_CPC,PM-10,PM-2.5,PM-1.0,NO2,O3,SO2,CO,NO,NOX,TEMP,HUM
count,4580.0,4580.0,4580.0,4580.0,4580.0,4580.0,4580.0,4580.0,4580.0,4580.0,4580.0,4580.0,4580.0
mean,-0.000583,1.8e-05,0.000428,0.000151,0.000123,0.000174,-5.1e-05,0.000438,0.000466,-0.00022,3e-05,-7e-05,-0.000362
std,0.752531,0.786999,0.841652,0.8865,0.890145,0.769012,0.854832,0.748232,0.830999,0.672141,0.738617,0.939323,0.903346
min,-1.44785,-1.655544,-1.899566,-1.743147,-1.516525,-1.50742,-2.235147,-1.54955,-1.273077,-0.688573,-1.357407,-2.434745,-3.164125
25%,-0.543866,-0.559347,-0.586708,-0.630283,-0.674701,-0.550023,-0.521801,-0.500967,-0.533477,-0.43651,-0.536818,-0.702193,-0.525565
50%,-0.12758,-0.04638,0.023582,-0.04386,-0.06406,-0.077707,-0.087123,-0.500967,-0.385557,-0.234859,-0.113289,0.002726,-0.032084
75%,0.339299,0.422766,0.457734,0.417518,0.420968,0.369078,0.462462,0.3379,0.301713,0.282388,0.363538,0.725592,0.620798
max,3.840476,3.281256,4.794597,3.494499,3.639639,3.726348,3.160071,3.693368,4.643726,6.15075,3.98083,3.066993,2.128503


In [6]:
test_size = 24*30 #the number of days is 191

train = data_smoothed.iloc[:-test_size]
# y_train_raw = y.iloc[:-test_size]
test = data_smoothed.iloc[-test_size:]
# y_test_raw = y.iloc[-test_size:]
print(train.shape, test.shape)

def build_sequences(df, target_labels=['BC'], window=200, stride=200):
    # Sanity check to avoid runtime errors
    assert window % stride == 0
    dataset = []
    labels = []
    temp_df = df.copy().values
    temp_label = df[target_labels].copy().values
    padding_len = len(df) % window
    
    if padding_len != 0:
        # Compute padding length
        padding_len = window - len(df) % window
        padding = np.zeros((padding_len, temp_df.shape[1]), dtype='float32')
        temp_df = np.concatenate((padding, temp_df))
        padding = np.zeros((padding_len,1), dtype='float32')
        #padding = np.zeros((padding_len, temp_label.shape[1]), dtype='float32')
        temp_label = np.concatenate((padding, temp_label))
        assert len(temp_df) % window == 0

    # Build sequences and labels
    for i in range(0, len(temp_df) - window + 1, stride):
        dataset.append(temp_df[i:i + window])
        labels.append(temp_label[i:i + window])

    return np.array(dataset), np.array(labels)

X_train, y_train = build_sequences(train, window=24, stride=24)
X_test, y_test = build_sequences(test, window=24, stride=24)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

(3860, 13) (720, 13)


((161, 24, 13), (161, 24, 1), (30, 24, 13), (30, 24, 1))

In [7]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Conv1D, Add, Activation, BatchNormalization, MaxPooling1D, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.applications.resnet50 import ResNet50
tf.config.set_visible_devices([], 'GPU') #disables GPU


In [8]:
def resnet_block(input_tensor, filters, kernel_size, stride=1):
    x = Conv1D(filters=filters, kernel_size=kernel_size, strides=stride, padding='same')(input_tensor)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)

    x = Conv1D(filters=filters, kernel_size=kernel_size, strides=1, padding='same')(x)
    x = BatchNormalization()(x)

    if stride != 1:
        input_tensor = Conv1D(filters=filters, kernel_size=1, strides=stride, padding='same')(input_tensor)
        input_tensor = BatchNormalization()(input_tensor)

    x = Add()([x, input_tensor])
    x = Activation('relu')(x)
    return x

In [12]:
def create_resnet_model(input_shape, num_classes):
    inputs = Input(shape=input_shape)
    x = Conv1D(filters=64, kernel_size=7, strides=2, padding='same')(inputs)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = MaxPooling1D(pool_size=3, strides=2, padding='same')(x)

    # x = resnet_block(x, filters=64, kernel_size=3)
    # x = resnet_block(x, filters=128, kernel_size=3, stride=2)
    # x = resnet_block(x, filters=256, kernel_size=3, stride=2)
    # x = resnet_block(x, filters=512, kernel_size=3, stride=2)

    base_model = ResNet50(weights='imagenet', include_top=False, input_tensor=x)
    
    # Freeze the layers of the base model
    for layer in base_model.layers:
        layer.trainable = False

    x = base_model.output
    
    x = GlobalAveragePooling1D()(x)
    outputs = Dense(num_classes, activation='linear')(x)

    model = Model(inputs=inputs, outputs=outputs)
    return model

In [13]:
input_shape = (24, 13)      # Update these based on your preprocessed data
num_classes = 1             # We are predicting only the "BC" column
num_epochs = 100
batch_size = 16

model = create_resnet_model(input_shape, num_classes)
model.summary()
model.compile(optimizer='adam', loss='mse', metrics=['mae'])



history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=num_epochs,
    batch_size=batch_size,
    verbose=1
)

ValueError: Input 0 of layer "conv1_pad" is incompatible with the layer: expected ndim=4, found ndim=3. Full shape received: (None, 6, 64)