# Possibly the last iteration

### The plan:

- Describe the dataset
- Try a first, simple NN as a baseline, to see the effect of the data-cleaning steps
- Data cleaning: each significant step gets its own NN trial
- Different architectures
- Optimizing hyperparameters

*The LSTM and the creation of the windows (timeseries) will be done in a separate window*

## Sensor Data

The sensors monitor environmental factors; we can distinguish the following types of sensor data:
1. Temperature (TEMP)
2. Humidity (HUM)
...

The costly sensor is assumed to measure **Black Carbon concentration (BC)**, which requires more advanced technology and calibration, thus making it more expensive.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the CSV file and index it by the parsed timestamps.
# The raw 'date' column is kept in the frame as well.
data = pd.read_csv("BC-Data-Set.csv")
data = data.set_index(pd.to_datetime(data['date']))
data.describe()

# Fill the days that are missing some hours: every absent hourly timestamp
# receives one row holding the column means of the data set.
#
# The means are computed once, up front. numeric_only=True is needed because
# the raw 'date' column is still part of the frame; the filler rows
# therefore end up with NaN in 'date'.
column_means = data.mean(numeric_only=True)

# Collect all missing timestamps first, so that `data` is not enlarged while
# its own groupby is still being iterated.
missing_timestamps = []
for _, day_rows in data.groupby(data.index.date):
    # Hours 0..23 that have no row on this calendar day.
    absent_hours = set(range(24)) - set(day_rows.index.hour)
    midnight = day_rows.index[0].normalize()
    missing_timestamps.extend(
        midnight + pd.Timedelta(hours=hour) for hour in sorted(absent_hours)
    )

# Setting a non-existent label with .loc appends the row; the Series of
# means is aligned to the columns by name.
for timestamp in missing_timestamps:
    data.loc[timestamp] = column_means

We can see that outliers are present, so let's remove them.
We will also scale the values, because the NN needs this to train efficiently.

In [None]:
# Order the rows by their timestamp index, then replace that index with a
# plain positional one (the old index is discarded, not turned into a column).
data = data.sort_index().reset_index(drop=True)

# Set the raw 'date' column aside and keep only the remaining columns.
dates = data["date"]
data = data.drop(columns=["date"])

In [None]:
def inspect_dataframe(df, columns, figsize=(17, 17)):
    """Plot each requested column of ``df`` in its own vertically stacked subplot.

    All subplots share the x axis, and each one is titled with its column name.

    Args:
        df: Object supporting ``df[col]`` lookups for every entry of ``columns``.
        columns: Iterable of column names to plot, one subplot per name.
        figsize: Figure size forwarded to ``plt.subplots``.

    Returns:
        None. The figure is displayed with ``plt.show()``. Nothing is drawn
        when ``columns`` is empty.
    """
    columns = list(columns)
    if not columns:
        return

    # squeeze=False keeps `axs` a 2-D array even for a single column, so the
    # indexing below works for any number of columns.
    _, axs = plt.subplots(
        len(columns), 1, sharex=True, figsize=figsize, squeeze=False
    )
    for ax, col in zip(axs[:, 0], columns):
        ax.plot(df[col])
        ax.set_title(col)
    plt.show()


inspect_dataframe(data, data.columns)

In [None]:
# A row counts as an outlier when at least one of its values lies more than
# `threshold` standard deviations from its column mean. The cut-off is kept
# fairly high so that some outliers are retained.
threshold = 6
z_scores = ((data - data.mean()) / data.std()).abs()
outliers = z_scores.gt(threshold).any(axis=1)

# Every column of a flagged row is overwritten with that column's median,
# i.e. the whole row is replaced, not just the offending cell.
for column in data.columns:
    data.loc[outliers, column] = data[column].median()

In [None]:
# Hand-rolled min-max scaling: each column is shifted by its own minimum and
# divided by its own range, so the minimum maps to 0 and the maximum to 1.
min_df = data.min()
max_df = data.max()

data_norm = data.sub(min_df).div(max_df - min_df)
data_norm = pd.DataFrame(data_norm, columns=data.columns)

In [None]:
# Pairwise correlations between all normalized columns.
correlations = data_norm.corr()

# Scatter plots of BC against every other column. The feature list excludes
# 'BC' by name instead of skipping the first column, so the plots stay
# correct whatever position 'BC' has in the frame.
feature_columns = [col for col in data_norm.columns if col != 'BC']
sns.pairplot(data_norm, x_vars=feature_columns, y_vars=['BC'])

# Heat map of the correlation matrix, annotated with the coefficients.
plt.figure(figsize=(10, 7))
sns.heatmap(correlations, annot=True, cmap="coolwarm")
plt.show()

In [None]:
# Plot every column again, this time from the normalized frame.
inspect_dataframe(df=data_norm, columns=data.columns)

#### A first, simple NN to see the effect of the data-cleaning steps

In [None]:
import tensorflow as tf
import os
import random
import seaborn as sns
import matplotlib as mpl
import warnings
from sklearn.model_selection import train_test_split
# Suppress all Python warnings from here on.
warnings.filterwarnings(action="ignore")

# Short aliases for the Keras namespaces used in the rest of the notebook.
tfk = tf.keras
tfkl = tfk.layers

# Make no GPU visible to TensorFlow (empty device list for the 'GPU' type).
tf.config.set_visible_devices(devices=[], device_type='GPU')
print(tf.__version__)

In [None]:
# Use one fixed seed for Python, NumPy and TensorFlow so runs are repeatable.
seed = 42

os.environ['PYTHONHASHSEED'] = str(seed)
for set_seed in (
    random.seed,
    np.random.seed,
    tf.random.set_seed,
    tf.compat.v1.set_random_seed,
):
    set_seed(seed)

In [None]:
# Same min-max scaling as the hand-rolled version, this time done with
# scikit-learn's MinMaxScaler, with separate scalers for features and target.
from sklearn.preprocessing import MinMaxScaler

# Target: the BC column as a one-column frame. Features: everything else.
target = pd.DataFrame(data.BC)
X = data.drop(['BC'], axis=1)
print('Feature matrix shape', X.shape)
print('Target shape', target.shape)
X.describe()

# NOTE(review): both scalers are fitted on every row, including the rows that
# are later held out as the test set. Fit on the training rows only if
# leakage from the test set matters for the reported scores.
scaler_x = MinMaxScaler()
x_scaled = scaler_x.fit_transform(X)
# fit_transform returns a bare array; wrap it to get the column names back.
x_scaled = pd.DataFrame(x_scaled, columns=X.columns)

# Do the same for y. scaler_y is kept so predictions can be mapped back to
# the original BC scale with inverse_transform.
scaler_y = MinMaxScaler()
y_scaled = scaler_y.fit_transform(target)
y_scaled = pd.DataFrame(y_scaled, columns=target.columns)


# Only the last expression of a notebook cell is displayed.
y_scaled.describe()
x_scaled.describe()

In [None]:
# Hold-out split by position: the last `test_size` rows become the test set
# and all rows before them the training set. The rows are not shuffled, unlike
# the random train_test_split variant that was used here before.
test_size = 24 * 19

X_train, X_test = x_scaled.iloc[:-test_size], x_scaled.iloc[-test_size:]
y_train, y_test = y_scaled.iloc[:-test_size], y_scaled.iloc[-test_size:]


# Print the shapes of each feature/target pair as a sanity check.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, labels.shape)

In [None]:
# Histogram of the scaled BC target over the training rows.
plt.figure(figsize=(15, 5))
sns.histplot(y_train, x='BC')
plt.show()

In [None]:
# Shape of a single sample: the number of feature columns, as a 1-tuple.
input_shape = (X_train.shape[1],)
input_shape

In [None]:
def build_ffnn(input_shape, hidden_units=(128, 64), learning_rate=0.2):
    """Build and compile a fully connected regression network.

    The network is an input layer, one ReLU ``Dense`` layer per entry of
    ``hidden_units`` (named ``Hidden1``, ``Hidden2``, ...), and a single
    linear output unit. It is compiled with mean squared error as the loss,
    SGD as the optimizer, and MAE and RMSE as additional metrics.

    Args:
        input_shape: Shape of one input sample, passed to ``tfkl.Input``.
        hidden_units: Widths of the hidden layers, in order. The default
            reproduces the original 128 -> 64 architecture.
        learning_rate: Initial learning rate of the SGD optimizer.

    Returns:
        The compiled Keras model, named ``'FFNN'``.
    """
    # Build the neural network layer by layer
    input_layer = tfkl.Input(shape=input_shape, name='Input')
    hidden = input_layer
    for position, units in enumerate(hidden_units, start=1):
        hidden = tfkl.Dense(
            units=units, activation='relu', name=f'Hidden{position}'
        )(hidden)
    output_layer = tfkl.Dense(units=1, activation='linear', name='Output')(hidden)

    # Connect input and output through the Model class
    model = tfk.Model(inputs=input_layer, outputs=output_layer, name='FFNN')

    # Compile the model.
    # NOTE(review): the `legacy` optimizer namespace is not present in every
    # TensorFlow/Keras release - confirm it against the installed version.
    model.compile(
        loss=tfk.losses.MeanSquaredError(),
        optimizer=tfk.optimizers.legacy.SGD(learning_rate),
        metrics=[tfk.metrics.MeanAbsoluteError(), tfk.metrics.RootMeanSquaredError()],
    )

    # Return the model
    return model

In [None]:
# Instantiate the network for the feature shape computed above and print
# its layer-by-layer summary.
ffnn = build_ffnn(input_shape=input_shape)
ffnn.summary()

In [None]:
# Mini-batch size and the maximum number of training epochs.
batch_size, epochs = 32, 5000

In [None]:
# Stop once val_loss has not improved for 20 epochs and restore the weights
# of the best epoch.
early_stopping = tfk.callbacks.EarlyStopping(
    monitor='val_loss', mode='min', patience=20, restore_best_weights=True
)
# Halve the learning rate after 5 epochs without val_loss improvement, down
# to a floor of 1e-5.
reduce_lr = tfk.callbacks.ReduceLROnPlateau(
    monitor='val_loss', mode='min', patience=5, factor=0.5, min_lr=1e-5
)

# validation_split=0.3 sets 30% of the training rows aside for validation.
fit_result = ffnn.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    validation_split=0.3,
    callbacks=[early_stopping, reduce_lr],
    epochs=epochs,
)
# Keep only the per-epoch metric dictionary.
history = fit_result.history

In [None]:
# evaluate() returns the loss followed by the compiled metrics, in order. For
# this model that is MSE (the loss), MAE and RMSE, all on the scaled target.
# These are regression errors, so none of them is an accuracy and lower is
# better.
scores = ffnn.evaluate(X_test, y_test, verbose=1)
for label, value in zip(("MSE (loss)", "MAE", "RMSE"), scores):
    print(f"Test {label}: {value:.4f}")


In [None]:
# Predictions of the trained network for the held-out rows (scaled target).
y_pred = ffnn.predict(x=X_test)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Error and goodness of fit measured on the scaled target values.
mse_scaled = mean_squared_error(y_test, y_pred)
print("MSE", mse_scaled)
r2_scaled = r2_score(y_test, y_pred)
print("R2", r2_scaled)

In [None]:
from sklearn.metrics import mean_squared_error

# Undo the target scaling on both truth and prediction, then score again so
# the MSE is expressed on the original BC scale.
y_test_unscaled = scaler_y.inverse_transform(y_test)
y_pred_unscaled = scaler_y.inverse_transform(y_pred)
print("MSE", mean_squared_error(y_test_unscaled, y_pred_unscaled))
print("R2", r2_score(y_test_unscaled, y_pred_unscaled))

In [None]:
def plot_residuals(model, X_, y_):
    """Plot true targets against model predictions, ordered by true value.

    Samples are ordered by ascending target. True values and predictions are
    drawn as two scatter series over the sample position, and each pair is
    joined by a vertical segment showing the residual. The plotted values
    are the ones passed in. The printed MSE is computed after mapping both
    series through the module-level ``scaler_y.inverse_transform``.

    Args:
        model: Object exposing ``predict(X)``, expected to return one value
            per row of ``X``.
        X_: pandas DataFrame of features. It is left unmodified.
        y_: Targets, one per row of ``X_`` and in the same row order
            (DataFrame, Series or array-like).

    Returns:
        None. The MSE is printed and the figure is shown.
    """
    # Order the samples by target value without touching the caller's frame.
    # No helper column is added to X_, and .iloc returns a new object.
    y_true = np.asarray(y_, dtype=float).reshape(-1, 1)
    order = np.argsort(y_true[:, 0], kind="stable")
    y_true = y_true[order]
    X_sorted = X_.iloc[order]

    y_pred = np.asarray(model.predict(X_sorted)).reshape(-1, 1)
    squared_errors = np.square(
        scaler_y.inverse_transform(y_pred) - scaler_y.inverse_transform(y_true)
    )
    MSE = np.mean(squared_errors)

    print('Mean Squared Error (MSE):', MSE)
    positions = np.arange(len(y_true))
    plt.figure(figsize=(15,5))
    plt.scatter(positions, y_true, label='True')
    plt.scatter(positions, y_pred, label='Prediction')

    # Draw all residual segments in one call. minimum/maximum put the lower
    # end point first whichever of the two values is larger.
    plt.vlines(
        positions,
        np.minimum(y_true, y_pred).ravel(),
        np.maximum(y_true, y_pred).ravel(),
        alpha=.5,
    )

    plt.legend()
    plt.grid(alpha=.3)
    plt.show()

In [None]:
# Residual plot for the held-out test rows.
plot_residuals(model=ffnn, X_=X_test, y_=y_test)

In [None]:
# Residual plot for the training rows.
plot_residuals(model=ffnn, X_=X_train, y_=y_train)