In [293]:
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
from datetime import datetime
import os
import pandas as pd
import numpy as np

In [294]:
#Opens a new dataframe with the Clean csv
cleancsv = pd.read_csv('CSV/CLEAN.csv') 

In [295]:
#Convert data into Date time and create date filter
cleancsv['Date'] = pd.to_datetime(cleancsv['Date'])
cleancsv['Date'] = cleancsv['Date'] + pd.to_timedelta(cleancsv["Hr"], unit="h")
cleancsv.drop('Hr', axis=1, inplace=True)

"""
Use this in future if data set needs specific dates
prediction = data.loc{
    (untouched_csv['Date'] > datetime(x, x, x)) &
    (untouched_csv['Date'] < datetime(x, x, x,))
}
"""

"\nUse this in future if data set needs specific dates\nprediction = data.loc{\n    (untouched_csv['Date'] > datetime(x, x, x)) &\n    (untouched_csv['Date'] < datetime(x, x, x,))\n}\n"

In [None]:
#Prepare colomns into variables
data_main_air_temp = cleancsv['Mainland Air Temp']
data_humidity_per = cleancsv['Humidity (%)']
data_wind_direction = cleancsv['Direction (A)']
data_wind_speed = cleancsv['Wind Speed (A)']
data_gusting = cleancsv['Gusting']
data_pressure = cleancsv['Atmospheric Pressure (IN)']
data_rainfall = cleancsv['Precipitation Rate']
data_bay_temp = cleancsv['Bay Temp']
data_salinity = cleancsv['Salinity']
data_lbi_temp = cleancsv['LBI Air Temp']
data_ocean_temp = cleancsv['Ocean Temp']
data_onshore_flag = cleancsv['Onshore']
data_upwelling_flag = cleancsv['upwelling_flag']

#saves all input data into one Numpy array
dataset = np.column_stack([
    data_main_air_temp.values,
    data_humidity_per.values,
    #data_wind_direction.values,
    data_wind_speed.values,
    data_gusting.values,
    data_pressure.values,
    data_rainfall.values,
    data_bay_temp.values,
    data_salinity.values,
    data_lbi_temp.values,
    data_ocean_temp.values,
    data_onshore_flag.values,
    data_upwelling_flag.values,
])

#Save output data into variables and reshape it to be a 2d array
output_data = data_wind_direction.values
#output_data = np.array(output_data).reshape(-1, 1)
output_data = output_data.reshape(-1, 1)

In [297]:
#Length of training data
training_data_len = int(np.ceil(len(dataset) * 0.90)) #Use 90% of training data

In [298]:
#Scaler
scaler_x= StandardScaler()
scaler_y= StandardScaler()

scaledx = scaler_x.fit_transform(dataset)
#scaledy = scaler_y.fit_transform(output_data)

training_data_x = scaledx[:training_data_len] #95% of all data
#training_data_y = scaledy[:training_data_len] #95% of all data
training_data_y = output_data[:training_data_len]

X_train, y_train = [], []

In [299]:
#Sliding window over last 24 hrs
for i in range(24, training_data_len):
    X_train.append(training_data_x[i-24:i, :])
    y_train.append(training_data_y[i,0])

#Convert lists to arrays
X_train = np.array(X_train)
y_train = np.array(y_train).reshape(-1, 1)

In [300]:
cleancsv["Onshore"].value_counts(normalize=True)


Onshore
0    0.644501
1    0.355499
Name: proportion, dtype: float64

In [301]:
print("X_train:", X_train.shape)   # (samples, 24, 13)
print("y_train:", y_train.shape)   # (samples, 1)
print("First 10 labels:", y_train[:10].ravel())

X_train: (2440, 24, 12)
y_train: (2440, 1)
First 10 labels: [0 0 0 0 0 0 0 0 0 1]


In [302]:
#Put more weight on 1's
labels = cleancsv["Onshore"].astype(int).values
neg, pos = np.bincount(labels)
total = neg + pos

class_weight = {
    0: total / (2.0 * neg),
    1: total / (2.0 * pos)
}
print("class_weight:", class_weight)


class_weight: {0: np.float64(0.7757936507936508), 1: np.float64(1.4064748201438848)}


In [303]:
#Build the model
model = keras.models.Sequential()

In [304]:
#Layer Zero input_shape=(X_train.shape[1], 1)
model.add(keras.layers.Input(shape=(X_train.shape[1], X_train.shape[2])))

In [305]:
#First Layer input_shape=(X_train.shape[1], 1)
model.add(keras.layers.LSTM(64, return_sequences=True))

In [306]:
#Second Layer
model.add(keras.layers.LSTM(64, return_sequences=False))

In [307]:
#3rd Layer (Dense)
model.add(keras.layers.Dense(128, activation="relu"))

In [308]:
#4th Layer (Dropout)
model.add(keras.layers.Dropout(0.5))

In [309]:
#Final Output Layer (Dense)
model.add(keras.layers.Dense(1, activation="sigmoid"))

In [310]:
#Put all the layers together
model.summary()
model.compile(optimizer="adam",
    loss="binary_crossentropy",           # binary classification loss
    metrics=[keras.metrics.BinaryAccuracy(), keras.metrics.AUC()])

In [311]:
#Train the model

#epochs = # of runs
#batch size = how much data is in each batch
training = model.fit(
    X_train,
    y_train, 
    epochs=100, 
    batch_size=32,
    validation_split=0.2,
    shuffle=False,
    class_weight=class_weight
    )

Epoch 1/100


[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - auc_10: 0.5959 - binary_accuracy: 0.4754 - loss: 0.6992 - val_auc_10: 0.5000 - val_binary_accuracy: 0.2234 - val_loss: 0.6955
Epoch 2/100
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - auc_10: 0.5099 - binary_accuracy: 0.4175 - loss: 0.7193 - val_auc_10: 0.5000 - val_binary_accuracy: 0.2234 - val_loss: 0.6990
Epoch 3/100
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - auc_10: 0.4535 - binary_accuracy: 0.4175 - loss: 0.7186 - val_auc_10: 0.5000 - val_binary_accuracy: 0.2234 - val_loss: 0.7024
Epoch 4/100
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - auc_10: 0.5000 - binary_accuracy: 0.4175 - loss: 0.7179 - val_auc_10: 0.5000 - val_binary_accuracy: 0.2234 - val_loss: 0.7056
Epoch 5/100
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - auc_10: 0.4308 - binary_accuracy: 0.4175 - loss: 0.7173 - val_auc_10: 0.5

In [312]:
test_x = scaledx[training_data_len-24:]
X_test = []

#rebuild window
for i in range(24, len(test_x)):
    X_test.append(test_x[i-24:i, :])

X_test = np.array(X_test)   # (samples_test, 24, n_features)

#model outputs probability in [0,1] because of Dense(1, activation="sigmoid")
prediction_prob = model.predict(X_test)          # shape (n_test, 1)
print(prediction_prob[:10])
#convert to 0/1 flag
threshold = 0.5
prediction_flag = (prediction_prob >= threshold).astype(int)  # shape (n_test, 1)

prediction_scaled = model.predict(X_test)

# back to original units
#prediction = scaler_y.inverse_transform(prediction_scaled)  


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[[0.5641844]
 [0.5641844]
 [0.5641844]
 [0.5641844]
 [0.5641844]
 [0.5641844]
 [0.5641844]
 [0.5641844]
 [0.5641844]
 [0.5641844]]
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [313]:
# rows that correspond to X_test / predictions
test_start = training_data_len
test_end   = training_data_len + prediction_flag.shape[0]

test_df = cleancsv.iloc[test_start:test_end].copy()
test_df["Onshore_pred_prob"] = prediction_prob.ravel()
test_df["Onshore_pred_flag"] = prediction_flag.ravel()

test_df.to_csv("CSV/predictions_binary.csv", index=False)
test_df.head()

Unnamed: 0,Date,Mainland Air Temp,Humidity (%),Direction (A),Wind Speed (A),Gusting,Atmospheric Pressure (IN),Precipitation Rate,Bay Temp,Salinity,LBI Air Temp,Ocean Temp,Onshore,upwelling_flag,Onshore_pred_prob,Onshore_pred_flag
2464,2025-09-12 16:00:00,22.6,67.0,67.5,6.1,13.5,30.12,0.0,22.36,30.19,21.4,21.7,1,0,0.564184,1
2465,2025-09-12 17:00:00,22.2,71.0,67.5,3.5,6.0,30.13,0.0,21.97,30.14,21.7,21.7,1,0,0.564184,1
2466,2025-09-12 18:00:00,21.7,70.0,67.5,2.3,5.8,30.12,0.0,22.09,30.05,21.8,21.7,1,0,0.564184,1
2467,2025-09-12 19:00:00,20.6,74.0,22.5,0.5,1.5,30.12,0.0,21.79,29.73,21.4,21.7,1,0,0.564184,1
2468,2025-09-12 20:00:00,18.6,81.0,0.0,0.9,2.4,30.14,0.0,21.74,29.33,20.8,21.7,0,0,0.564184,1


In [314]:
y_true = cleancsv["Onshore"].iloc[test_start:test_end].values
y_pred = prediction_flag.ravel()

print(confusion_matrix(y_true, y_pred))
print(classification_report(y_true, y_pred, digits=3))

[[  0 229]
 [  0  44]]
              precision    recall  f1-score   support

           0      0.000     0.000     0.000       229
           1      0.161     1.000     0.278        44

    accuracy                          0.161       273
   macro avg      0.081     0.500     0.139       273
weighted avg      0.026     0.161     0.045       273



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
