In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from scikeras.wrappers import KerasRegressor
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import mean_absolute_error, mean_squared_error

The cell below contains code that was used to extract chosen variables to be used as features later.

In [None]:
#Loads the raw MET dataset
dataset = pd.read_csv("/Users/maks/Documents/ML_project/Met Dataset 2015-2022.csv")

#Removes rows with any NaN values.
#NOTE(review): this runs before the column selection below, so a row is also dropped when its NaN sits in a
#column that is not kept - confirm that this is intended.
cleaned_dataset = dataset.dropna()

#List specifying which columns to keep: the coordinate and year columns followed by the _1 ... _12 columns of
#each variable, generated in the order hurs, psl, tas, rainfall
columns = ["x_coord", "y_coord", "year"] + [
    f"{variable}_{month}"
    for variable in ("hurs", "psl", "tas", "rainfall")
    for month in range(1, 13)
]

filtered_dataset = cleaned_dataset[columns]

#Saves the filtered dataset. index = False keeps the pandas row index out of the file; without it the index is
#written as an unnamed first column and comes back as an extra "Unnamed: 0" column when the file is re-read.
filtered_dataset.to_csv("/Users/maks/Documents/ML_project/Filtered_Dataset.csv", index = False)

The cells below contain code that was used to preprocess the data and train the model.

In [None]:
#Reads the cleaned and reduced MET dataset back in from disk
filtered_dataset_path = "/Users/maks/Documents/ML_project/Filtered_Dataset.csv"
dataset = pd.read_csv(filtered_dataset_path)

In [None]:
#Names of the feature columns: the _1 ... _12 columns of each variable, in the order hurs, psl, tas, rainfall
feature_columns = [
    f"{variable}_{month}"
    for variable in ("hurs", "psl", "tas", "rainfall")
    for month in range(1, 13)
]

#Prepares the features to get them into data sequences
data = dataset[feature_columns]

#Prepares data sequences: every row except the last is an input, and its target is the "hurs_1" value of the
#row that follows it. Slicing the underlying array builds the same pairs as looping over the rows one by one,
#without the per-row pandas overhead.
#NOTE(review): the target is taken from the next ROW of the file. Whether that row really is the following
#time step for the same location depends on how the CSV is ordered - confirm before relying on it.
feature_values = data.to_numpy()
target_position = feature_columns.index("hurs_1")

x = feature_values[:-1]
y = feature_values[1:, target_position]    #Used to predict next month's relative humidity

In [None]:
#Holds out 20% of the samples for testing (test_size = 0.2). The fixed random_state makes the shuffle, and
#therefore the split, reproducible across cells or runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 13)

#Initializes the scalers and fits them on the training data only. The targets are reshaped into a single
#column because the scaler works on 2-D input.
x_scaler = MinMaxScaler().fit(x_train)
y_scaler = MinMaxScaler().fit(y_train.reshape(-1, 1))

#Scales the training data with the fitted scalers
scaled_x_train = x_scaler.transform(x_train)
scaled_y_train = y_scaler.transform(y_train.reshape(-1, 1))

#Scales the testing data with the same scalers (they are not re-fitted on the test set)
scaled_x_test = x_scaler.transform(x_test)
scaled_y_test = y_scaler.transform(y_test.reshape(-1, 1))

#Checks the sizes of x and y training and testing arrays
for label, scaled_array in (
    ("Shape of scaled_x_train:", scaled_x_train),
    ("Shape of scaled_y_train:", scaled_y_train),
    ("Shape of scaled_x_test:", scaled_x_test),
    ("Shape of scaled_y_test:", scaled_y_test),
):
    print(label, scaled_array.shape)

In [None]:
#Number of input features, read from the width of the scaled training matrix
input_shape = scaled_x_train.shape[1]

#Builds the MLP model. Each hidden layer uses an L2 weight penalty (l2(0.006)) and is followed by Dropout to
#prevent overfitting. The layers narrow from 256 to 32 units while the dropout rate grows from 0.2 to 0.5 the
#deeper the layer is. A single linear unit produces the output.
model = Sequential([Input(shape = (input_shape,))])
for units, dropout_rate in ((256, 0.2), (128, 0.3), (64, 0.4), (32, 0.5)):
    model.add(Dense(units, activation = "relu", kernel_regularizer = l2(0.006)))
    model.add(Dropout(dropout_rate))
model.add(Dense(1))

In [None]:
#Compiles the model: mean squared error as the loss, minimised with the Adam optimizer
model.compile(loss = "mse", optimizer = "adam")

In [None]:
#Prepares a stopper to allow for a larger number of epochs but to help avoid overfitting: training ends once
#val_loss has not improved for 10 epochs, and the weights of the best epoch are restored.
stop = EarlyStopping(monitor = "val_loss", patience = 10, restore_best_weights = True)

#Trains the model.
# - No class_weight is passed: it maps integer class indices to loss weights and is meant for classification.
#   With a continuous target it only re-weights samples arbitrarily.
# - Early stopping is monitored on a slice of the TRAINING data (validation_split) instead of the test set.
#   Choosing the stopping epoch and the restored weights on the test set would leak it into model selection
#   and make the test metrics optimistic.
model.fit(scaled_x_train, scaled_y_train, epochs = 100, batch_size = 32, validation_split = 0.2, callbacks = [stop])

In [None]:
#Scores the model on the held-out test set. The value is the compiled loss computed on the min-max scaled
#targets, so it is not in the original units of the target.
loss = model.evaluate(x = scaled_x_test, y = scaled_y_test)
print(f"Test Loss: {loss}")

In [None]:
#Predicts on the scaled test inputs, then maps the outputs back to the original target units with the scaler
#that was fitted on the training targets
predictions_scaled = model.predict(scaled_x_test)
predictions = y_scaler.inverse_transform(predictions_scaled)

#Prints up to the first 10 prediction/actual pairs. Slicing (rather than indexing 0..9) keeps this from
#raising an IndexError when the test set holds fewer than 10 samples.
for predicted_value, actual_value in zip(predictions[:10, 0], y_test[:10]):
    print(f"Predicted: {predicted_value}, Actual: {actual_value}")

The cell below includes code that saves the predicted values and the corresponding actual values of relative humidity.

In [None]:
#Collects the actual test targets and the model's predictions as two 1-D columns of one table
true_labels = y_test.ravel()
predicted_labels = predictions.ravel()
results_df = pd.DataFrame({"true_label": true_labels, "prediction": predicted_labels})

#Writes the table to disk
results_path = "/Users/maks/Documents/ML_project/model.csv"
results_df.to_csv(results_path)

The cell below includes code that was used to calculate the evaluation metrics of the model.

In [None]:
#The code below evaluates the model using multiple metrics, all computed in the original (unscaled) units.

#predictions is a single-column 2-D array; flattening it once gives every metric the same 1-D input as y_test
flat_predictions = predictions.ravel()

#Distance metrics. The root mean squared error is taken as the square root of the MSE, because the
#squared = False argument of mean_squared_error is deprecated and no longer accepted by recent scikit-learn
#releases.
mean_absolute = mean_absolute_error(y_test, flat_predictions)
mean_squared = mean_squared_error(y_test, flat_predictions)
root_mean = np.sqrt(mean_squared)

#Correlation metrics (each result carries the coefficient together with its p-value)
pearson = pearsonr(y_test, flat_predictions)
spearman = spearmanr(y_test, flat_predictions)

#Displays the results
print(f"Mean absolute error of the model: {mean_absolute}")
print(f"Mean squared error of the model: {mean_squared}")
print(f"Root mean sqared error of the model: {root_mean}")
print(f"Pearson correlation coefficient of the model: {pearson}")
print(f"Spearman rank correlation of the model: {spearman}")