In [None]:
! pip install geopandas

# Import Libraries

In [2]:
import torch.nn as nn
import torch.nn.functional as F
import torch
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.feature_selection as fs
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, StackingClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
import sklearn.preprocessing as preprocessing
from sklearn.metrics import confusion_matrix, mean_absolute_error, mean_squared_error
from torch.utils.data import Dataset, DataLoader
from torch.nn.functional import normalize
import math
from mpl_toolkits.mplot3d import Axes3D
from sklearn.model_selection import KFold
from sklearn.svm import SVC, NuSVC
from sklearn.pipeline import make_pipeline

In [3]:
data = pd.read_csv("../../../../../../Downloads/UKRiverData.csv")
print(len(data))

141431


In [4]:
weather = pd.read_csv("../../../../../../Downloads/archive.csv")
weather['time'] = pd.to_datetime(weather['time'])
len(weather)

253752

# Preprocessing

In [5]:
data = data.dropna(subset=['DO_MGL', 'NH4_N_MGL', 'PH', 'NO2_N_MGL'])
# Convert 'Date' and 'Time' columns to datetime
data['Date'] = pd.to_datetime(data['Date'])
data['Time'] = pd.to_datetime(data['Time'])

# Combine 'Date' and 'Time' columns into 'DateTime' column
data['DateTime'] = data['Date'] + pd.to_timedelta(data['Time'].dt.strftime('%H:%M:%S'))
data = data.rename(columns={'DateTime': 'time'})
data = data.drop_duplicates(subset=['time'])
data['time'] = data['time'].dt.round('H')
print(len(data))

# Remove 'Date' and 'Time' columns if no longer needed
data = data.drop(['Site_Code', 'Site_Status_21Oct2020', 'OBJECTID', 'Station_Name', 'RWB_ID_RBP2', 'FESOL1_UGL',
                  'P_SOL_MGL', 'SS_MGL','ZN_SOL_UGL', 'GlobalID','Primary_Basin', 'Depth', 'ALK_MGL', 'BOD_MGL', 
                  'COND_USCM', 'CUSOL1_MGL', 'CUSOL2_UGL', 'Date', 'Time'], axis=1)

# Combine the dataframs using merge function
weather['time'] = weather['time'].dt.tz_localize('UTC')
comb = pd.merge(data, weather, on=['time'], how='left')

# Convert our problem to a classification problem
comb['decreaseFeed'] = 1
comb.loc[comb['NH4_N_MGL'] <= 0.1, 'decreaseFeed'] = 0

90393


In [None]:
x = comb.sample(2000)
dec = x[x['decreaseFeed'] == 1]
man = x[x['decreaseFeed'] == 0]
plt.plot(dec['NO3_N_MGL'], dec['NO2_N_MGL'], 'ro')
plt.plot(man['NO3_N_MGL'], man['NO2_N_MGL'], 'bo')
plt.show()

# Feature Selection

In [6]:
samp = comb.sample(5000)
X = samp[['DO_MGL','PH','temperature_2m (°C)', 'pressure_msl (hPa)', 'diffuse_radiation (W/m²)']].values
y = samp['decreaseFeed'].values
regressor = RandomForestRegressor(n_estimators=50)
regressor.fit(X, y)

print(regressor.feature_importances_)

[0.10251899 0.12441052 0.1698491  0.12720548 0.10725873 0.36875718]


# Regression/Classification Methods

In [6]:
# split testset into training and tetsing
test = comb.copy(True)
X_train,X_test,y_train,y_test = train_test_split(test[['DO_MGL','PH','temperature_2m (°C)','pressure_msl (hPa)', 'diffuse_radiation (W/m²)']], 
        test['decreaseFeed'], test_size=0.15)

In [7]:
def stats(y_pred, y_test):
    cm = confusion_matrix(y_test, y_pred)
    print(cm)

    tp = cm.data[0, 0]
    tn = cm.data[0, 1]
    fp = cm.data[1, 0]
    fn = cm.data[1, 1]

    print("Precision : ", tp/(tp + tn))
    print("Recall: ", tp/(tp + fp))
    print("Accuracy: ", (tp + fn)/(tp + tn + fp + fn))

    # Calculate MAE
    mae = mean_absolute_error(y_test, y_pred)

    # Calculate RMSE
    mse = mean_squared_error(y_test, y_pred, squared=False)

    print("Mean Absolute Error (MAE):", mae)
    print("Root Mean Squared Error (RMSE):", math.sqrt(mse))

In [9]:
print("------ Polynomial SVC Results ------")
clf = make_pipeline(preprocessing.SplineTransformer(), SVC(kernel='poly'))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
stats(y_pred, y_test)

print("------ Exponential SVC Results ------")
clf = make_pipeline(preprocessing.QuantileTransformer(), SVC(kernel='rbf'))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
stats(y_pred, y_test)

------ Linear SVC Results ------
[[9395  163]
 [3664  337]]
Precision :  0.9829462230592174
Recall:  0.7194272149475458
Accuracy:  0.717752046611107
Mean Absolute Error (MAE): 0.28224795338889297
Root Mean Squared Error (RMSE): 0.7288827973137014
------ Exponential SVC Results ------
[[9233  325]
 [3475  526]]
Precision :  0.9659970705168446
Recall:  0.7265502045955303
Accuracy:  0.7197433439044177
Mean Absolute Error (MAE): 0.28025665609558226
Root Mean Squared Error (RMSE): 0.727593790434319


In [10]:
print("------ Boosting Results ------")
model = make_pipeline(preprocessing.SplineTransformer() ,  GradientBoostingClassifier(learning_rate=0.1, max_depth=6, criterion='squared_error'))
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
stats(y_pred, y_test)

print("------ Bagging Results ------")
clf = BaggingClassifier(n_jobs=5)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
stats(y_pred, y_test)

print("------ Stacking Results ------")
neural = ('nn', make_pipeline(preprocessing.SplineTransformer(), MLPClassifier(hidden_layer_sizes=(16, 32), activation='relu', max_iter=200000)))
clf = StackingClassifier(estimators=[
    ('rf', RandomForestClassifier(max_depth=24, criterion='entropy', max_features='sqrt')), 
    ('svm', SVC(kernel='rbf')),
    ('gb', GradientBoostingClassifier(max_depth=2))
    ])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
stats(y_pred, y_test)

------ Boosting Results ------
[[8846  638]
 [2935 1140]]
Precision :  0.9327288064107971
Recall:  0.750870044987692
Accuracy:  0.7364849915185485
Mean Absolute Error (MAE): 0.26351500848145143
Root Mean Squared Error (RMSE): 0.7164754911783368
------ Bagging Results ------
[[8407 1077]
 [2688 1387]]
Precision :  0.8864403205398566
Recall:  0.7577287066246057
Accuracy:  0.7223246552105612
Mean Absolute Error (MAE): 0.2776753447894387
Root Mean Squared Error (RMSE): 0.7259125920310261
------ Stacking Results ------
[[8687  797]
 [2607 1468]]
Precision :  0.9159637283846478
Recall:  0.7691694705153179
Accuracy:  0.7489490375396416
Mean Absolute Error (MAE): 0.2510509624603584
Root Mean Squared Error (RMSE): 0.7078487552110309


In [1]:
print("------ KNN Results ------")
from sklearn.neighbors import KNeighborsClassifier
k = 2 
knn = KNeighborsClassifier(n_neighbors=k)
# Train the kNN classifier
knn.fit(X_train, y_train)
# Make predictions on the test set
y_pred = knn.predict(X_test)
stats(y_pred, y_test)

print("------ Decision Tree Results ------")
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(max_depth=20)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
stats(y_pred, y_test)

print("------ Random Forest Results ------")
clf = RandomForestClassifier(max_depth=24, criterion='entropy', max_features='sqrt')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
stats(y_pred, y_test)

------ KNN Results ------


NameError: name 'X_train' is not defined

# Deep Learning

In [75]:
class MyDataset(Dataset):
    def __init__(self, dataframe):
        self.numeric_columns = ['DO_MGL','PH','temperature_2m (°C)', 'pressure_msl (hPa)', 'diffuse_radiation (W/m²)']
        self.transformer = preprocessing.MaxAbsScaler()
        transformed_data = self.transformer.fit_transform(dataframe[self.numeric_columns].values)
        dataframe[self.numeric_columns] = transformed_data
        self.data = dataframe.dropna(subset=['decreaseFeed']).reset_index(drop=True)
        

    def __len__(self):
        return len(self.data)
    
    def getInverseTransform(self, inputs):
        return self.quantile_transformer.inverse_transform(inputs)

    def __getitem__(self, idx):
        inputs = self.data[self.numeric_columns].iloc[idx].values.astype(np.float32)
        label = self.data['decreaseFeed'].iloc[idx].astype(np.float32)
        inputs = torch.tensor(inputs)
        label = torch.tensor(label)
        return inputs, label

Artifical Neural Network

In [83]:
model = nn.Sequential(
    nn.Linear(3, 16),
    nn.Dropout(0.5),
    nn.LeakyReLU(),
    nn.Linear(16, 32),
    nn.Dropout(0.5),
    nn.LeakyReLU(),
    nn.Linear(32, 8),
    nn.Dropout(0.5),
    nn.LeakyReLU(),
    nn.Linear(8, 1),
)

Reccurrent Neural Network

In [32]:
class extract_tensor(nn.Module):
    def forward(self,x):
        # Output shape (batch, features, hidden)
        tensor, _ = x
        # Reshape shape (batch, hidden)
        return tensor[-1, :]

model = nn.Sequential(
    nn.RNN(input_size=3, hidden_size=32, num_layers=5, batch_first=True, nonlinearity="relu"),
    extract_tensor(),
    nn.CELU(),
    nn.Linear(32, 1),
)

Long Short Term Memory (LSTM)

In [54]:
model = nn.Sequential(
    nn.LSTM(input_size=3, hidden_size=16, num_layers=3 ,dropout=0.15),
    extract_tensor(),
    nn.Linear(16, 1)
)

A mix of RNN and LSTM

In [63]:
model = nn.Sequential(
    nn.LSTM(input_size=3, hidden_size=8, num_layers=3 ,dropout=0.15),
    extract_tensor(),
    nn.Linear(8, 8),
    nn.CELU(),
    nn.RNN(input_size=8, hidden_size=16, num_layers=2, batch_first=True, nonlinearity="relu"),
    extract_tensor(),
    nn.Linear(16, 1)
)

In [84]:
loss_fn = nn.SmoothL1Loss()
optimizer = optim.AdamW(model.parameters(), lr=0.00005, weight_decay=0.1)
num_epochs = 1

test = comb.copy(True)

# split testset into training and tetsing
trainset, testset = train_test_split(test, test_size=0.15)

dataset = MyDataset(trainset)
data_loader = DataLoader(dataset, drop_last=True)

print(model)
for epoch in range(num_epochs):
    running_loss = 0.0
    model.train()

    for inputs, labels in data_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        
        labels = labels.view(outputs.shape)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += abs(loss.item())

    epoch_loss = running_loss / len(data_loader)
    print(f"Epoch {epoch+1}, Training Loss: {epoch_loss:.4f}")

Sequential(
  (0): Linear(in_features=3, out_features=16, bias=True)
  (1): Dropout(p=0.5, inplace=False)
  (2): LeakyReLU(negative_slope=0.01)
  (3): Linear(in_features=16, out_features=32, bias=True)
  (4): Dropout(p=0.5, inplace=False)
  (5): LeakyReLU(negative_slope=0.01)
  (6): Linear(in_features=32, out_features=8, bias=True)
  (7): Dropout(p=0.5, inplace=False)
  (8): LeakyReLU(negative_slope=0.01)
  (9): Linear(in_features=8, out_features=1, bias=True)
)
Epoch 1, Training Loss: 0.1157


In [85]:
y_test_all = []
y_pred_all = []

dataset = MyDataset(testset)
data_loader = DataLoader(dataset)

# Disable gradient calculation
with torch.no_grad():
    for inputs, label in data_loader:
        # Forward pass to get predictions
        y_pred = model(inputs.unsqueeze(1))
        if y_pred <= 0.5:
            y_pred_all.append(0)  # Append zero to the list
        else:
            y_pred_all.append(1)  # Append one to the list
        
        # Convert the predictions and targets to numpy arrays
        label_np = label.numpy()

        # Append batch results to the overall lists
        y_test_all.append(label_np)

# Concatenate the lists into a single array
y_test_all = np.array(y_test_all)
y_pred_all = np.array(y_pred_all)

stats(y_pred_all, y_test_all)

[[9545    1]
 [4013    0]]
Precision :  0.9998952440812906
Recall:  0.7040123912081427
Accuracy:  0.7039604690611402
Mean Absolute Error (MAE): 0.2960395309388598
Root Mean Squared Error (RMSE): 0.7376280558600531
