In [None]:
! pip install sklearn

In [1]:
import torch.nn as nn
import torch.nn.functional as F
import torch
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.feature_selection as fs
import pickle

# Grab Dataset

In [2]:
ysi = pd.read_csv("./datasets/ysi_dataset.csv")
weather = pd.read_csv("./datasets/weather_dataset.csv")
# convert to date time and round ysi to nearest 15 mins
ysi['datetime'] = pd.to_datetime(ysi['datetime']).dt.round('15min')
weather['datetime'] = pd.to_datetime(weather['datetime'])

In [3]:
# combine both DF's on datetime
combined = pd.merge(ysi, weather, on='datetime')
combined.reset_index(drop=True, inplace=True)
combined = combined.drop(columns=['experimentid_y'])
combined.columns

Index(['datetime', 'experimentid_x', 'ph', 'temperature_oc',
       'par_umol_photons_m2_s', 'dissolved_oxygen_mg_l', 'airtemp_oc',
       'global_light_energy_w_m2', 'humid_rh', 'wdspd_m_s'],
      dtype='object')

# Data Visualization

In [4]:
combined = combined.drop(combined[combined['ph'] < 6].index)
combined = combined.drop(combined[combined['ph'] > 9].index)
combined = combined.drop(combined[combined['dissolved_oxygen_mg_l'] < 2].index)
combined = combined.drop(combined[combined['dissolved_oxygen_mg_l'] > 20].index)
combined = combined.drop(combined[combined['dissolved_oxygen_mg_l'] < 3/100 * combined['global_light_energy_w_m2']].index)
combined = combined.drop(combined[combined['dissolved_oxygen_mg_l'] >( 3/100 * combined['global_light_energy_w_m2']) + 15].index)

In [None]:
feat = combined.sample(3000)
plt.plot(feat['temperature_oc'],feat['dissolved_oxygen_mg_l'], 'bo')
plt.ylabel("ph")
plt.xlabel("global_light_energy_w_m2")
plt.show()

# Feature Selection

In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
import sklearn.preprocessing as preprocessing
from torch.utils.data import Dataset, DataLoader
from torch.nn.functional import normalize

In [35]:
X = feat[['ph', 'temperature_oc', 'airtemp_oc','global_light_energy_w_m2', 'humid_rh']].values
y = feat['dissolved_oxygen_mg_l'].values
regressor = RandomForestRegressor(n_estimators=50)
regressor.fit(X, y)


print(regressor.feature_importances_)

model = SelectFromModel(regressor, prefit=True)
X_new = model.transform(X)

[0.09478879 0.0554407  0.05450639 0.73811225 0.05715187]


features selected: ph, temperature_oc, global_light_energy_w_m2, humid_rh
not considered: wdspd_m_s (low correlation), par_umol_photons_m2_s (duplicated),

# Training

In [8]:
test = combined.copy(True)
X_train,X_test,y_train,y_test = train_test_split(test[['ph', 'temperature_oc', 'global_light_energy_w_m2', 'humid_rh', 'airtemp_oc']], 
        test['dissolved_oxygen_mg_l'], test_size=0.15)

In [9]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import math

def stats(y_pred_all, y_test_all):
    # Calculate the R2 score
    r2 = r2_score(y_test_all, y_pred_all)

    print(f"R2 Score: {r2:.4f}")
    # Calculate MAE
    mae = mean_absolute_error(y_test_all, y_pred_all)

    # Calculate RMSE
    mse = mean_squared_error(y_test_all, y_pred_all, squared=False)

    print("Mean Absolute Error (MAE):", mae)
    print("Root Mean Squared Error (RMSE):", math.sqrt(mse))

In [27]:
from sklearn.svm import SVR, NuSVR
from sklearn.pipeline import make_pipeline

print("------------ Exponential SVR Results ------------")
clf = make_pipeline(preprocessing.SplineTransformer(), NuSVR(kernel='rbf', shrinking=True, C=1.5))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
stats(y_pred, y_test)

print("------------ Exponential SVR Results ------------")
clf = make_pipeline(preprocessing.SplineTransformer(), SVR(kernel='rbf', shrinking=True, C=1.5))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
stats(y_pred, y_test)

print("------------ Polynomial SVR Results ------------")
clf = make_pipeline(preprocessing.SplineTransformer(), NuSVR(kernel='poly', shrinking=False, C=2.5))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
stats(y_pred, y_test)

------------ Exponential SVR Results ------------
R2 Score: 0.7813
Mean Absolute Error (MAE): 1.396238650639873
Root Mean Squared Error (RMSE): 1.407476062361893
------------ Exponential SVR Results ------------
R2 Score: 0.7786
Mean Absolute Error (MAE): 1.3394126779684952
Root Mean Squared Error (RMSE): 1.4119513960322871
------------ Polynomial SVR Results ------------
R2 Score: 0.7878
Mean Absolute Error (MAE): 1.3097490344471785
Root Mean Squared Error (RMSE): 1.396967943487707


In [31]:
print("------------ Bagging Results ------------")
from sklearn.ensemble import BaggingRegressor
model = make_pipeline(preprocessing.SplineTransformer(), BaggingRegressor(n_jobs=5))
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
stats(y_pred, y_test)

print("------------ Stacking Results ------------")
from sklearn.ensemble import StackingRegressor
from sklearn.svm import LinearSVR
model = StackingRegressor(estimators=[('svr', LinearSVR(random_state=42))], final_estimator=RandomForestRegressor(n_estimators=10,random_state=42), n_jobs=8)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
stats(y_pred, y_test)



------------ Bagging Results ------------
R2 Score: 0.8579
Mean Absolute Error (MAE): 0.9612561608300908
Root Mean Squared Error (RMSE): 1.2637650351739422
------------ Stacking Results ------------




R2 Score: 0.4461
Mean Absolute Error (MAE): 2.486207738867272
Root Mean Squared Error (RMSE): 1.7756900565900569




In [19]:
print("------------ Random Forest Results ------------")
rf_regressor = make_pipeline(preprocessing.SplineTransformer(), RandomForestRegressor(n_estimators=24, max_depth=20))
rf_regressor.fit(X_train, y_train)
y_pred = rf_regressor.predict(X_test)
stats(y_pred, y_test)

print("------------ Poly Regression Results ------------")
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

poly_features = PolynomialFeatures(degree=5, include_bias=False)
X_train_poly = poly_features.fit_transform(X_train)
X_test_poly = poly_features.transform(X_test)
# Initialize and fit the linear regression model
model = LinearRegression()
model.fit(X_train_poly, y_train)
# Predict the target variable for training and test sets
y_pred = model.predict(X_test_poly)
stats(y_pred, y_test)

print("------------ Decision Tree Results ------------")
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor(max_depth=24)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
stats(y_pred, y_test)

print("------------ Ada Boost Results ------------")
from sklearn.ensemble import AdaBoostRegressor
model = AdaBoostRegressor(learning_rate=0.01, loss='linear')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
stats(y_pred, y_test)

print("------------ XG Boost Results ------------")
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor(learning_rate=0.1, loss='huber', max_depth=25, criterion='squared_error')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
stats(y_pred, y_test)

------------ Random Forest Results ------------
R2 Score: 0.8606
Mean Absolute Error (MAE): 0.9665799515923718
Root Mean Squared Error (RMSE): 1.2576888631401724
------------ Poly Regression Results ------------
R2 Score: 0.7953
Mean Absolute Error (MAE): 1.388614328916912
Root Mean Squared Error (RMSE): 1.3845104817346046
------------ Decision Tree Results ------------
R2 Score: 0.7702
Mean Absolute Error (MAE): 1.0785718996702511
Root Mean Squared Error (RMSE): 1.4250320418935356
------------ Ada Boost Results ------------
R2 Score: 0.7131
Mean Absolute Error (MAE): 1.7049913766321474
Root Mean Squared Error (RMSE): 1.5063490297530813
------------ XG Boost Results ------------
R2 Score: 0.8250
Mean Absolute Error (MAE): 0.9566795232688703
Root Mean Squared Error (RMSE): 1.3312953307935254


In [None]:
model = nn.Sequential(
    nn.Linear(3, 16),
    nn.CELU(),
    nn.Linear(16, 16),
    nn.CELU(),
    nn.Linear(16, 4),
    nn.CELU(),
    nn.Linear(4, 1)
)

In [None]:
class MyDataset(Dataset):
    def __init__(self, dataframe):
        self.numeric_columns = ['ph', 'temperature_oc', 'global_light_energy_w_m2']
        self.quantile_transformer = preprocessing.PowerTransformer()
        transformed_data = self.quantile_transformer.fit_transform(dataframe[self.numeric_columns].values)
        dataframe[self.numeric_columns] = transformed_data
        self.data = dataframe.dropna(subset=['dissolved_oxygen_mg_l']).reset_index(drop=True)

    def __len__(self):
        return len(self.data)
    
    def getInverseTransform(self, inputs):
        return self.quantile_transformer.inverse_transform(inputs)

    def __getitem__(self, idx):
        inputs = self.data[self.numeric_columns].iloc[idx].values.astype(np.float32)
        label = self.data['dissolved_oxygen_mg_l'].iloc[idx].astype(np.float32)
        inputs = torch.tensor(inputs)
        label = torch.tensor(label)
        return inputs, label


In [None]:
loss_fn = nn.SmoothL1Loss()
optimizer = optim.AdamW(model.parameters(), lr=0.005, weight_decay=0.01)

num_epochs = 20

test = combined.copy(True)

# split testset into training and tetsing
trainset, testset = train_test_split(test, test_size=0.3)


dataset = MyDataset(trainset)
data_loader = DataLoader(dataset, shuffle=True)



# Training loop
for epoch in range(num_epochs):
    running_loss = 0.0

    # Set the model to training mode
    model.train()

    # Iterate over the training batches
    for inputs, labels in data_loader:

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)

        # Compute the loss
        loss = loss_fn(outputs, labels)

        # Backward pass
        loss.backward()

        # Update the parameters
        optimizer.step()

        running_loss += loss.item()

    # Calculate average loss for the epoch
    epoch_loss = running_loss / len(data_loader)

    # Print the loss for each epoch
    print(f"Epoch {epoch+1}, Training Loss: {epoch_loss:.4f}")

# Training complete

In [None]:
# save weights
torch.save(model.state_dict(), './weights/weights2.sav')
model.load_state_dict(torch.load('./weights/weights2.sav'))

# weights 1 : loss = 30.98
# weights 2 : loss = 1.1218

# Evaluation

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import math

# Set the model to evaluation mode
model.eval()

# Initialize lists to store true and predicted values
y_test_all = []
y_pred_all = []

newset = pd.merge(ysi, weather, on='datetime').sample(1000)

dataset = MyDataset(testset)
data_loader = DataLoader(dataset)

# Disable gradient calculation
with torch.no_grad():
    for inputs, label in data_loader:

        # Forward pass to get predictions
        y_pred = model(inputs)

        # Convert the predictions and targets to numpy arrays
        y_pred_np = y_pred.numpy()
        label_np = label.numpy()
        #print(dataset.quantile_transformer.inverse_transform(inputs.numpy()))
        #print(f"actual: { y_pred.item():3f} predicted: {label.item():3f}")

        # Append batch results to the overall lists
        y_test_all.append(label_np)
        y_pred_all.append(y_pred_np)

# Concatenate the lists into a single array
y_test_all = np.concatenate(y_test_all)
y_pred_all = np.concatenate(y_pred_all)

# Calculate the R2 score
r2 = r2_score(y_test_all, y_pred_all)

print(f"R2 Score: {r2:.4f}")
# Calculate MAE
mae = mean_absolute_error(y_test_all, y_pred_all)

# Calculate RMSE
mse = mean_squared_error(y_test_all, y_pred_all, squared=False)

print("Mean Absolute Error (MAE):", mae)
print("Root Mean Squared Error (RMSE):", math.sqrt(mse))

# Pre - Trained Models

In [None]:
play = nn.Sequential(
    nn.Linear(4, 8),
    nn.LeakyReLU(),
    nn.Linear(8, 8),
    nn.LeakyReLU(),
    nn.Linear(8, 1)
)

play.load_state_dict(torch.load('weights.sav'))

# Save Weights

In [22]:
import tarfile
import json
import io

# Save the trained model
model_filename = './weights/do_rf.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(regressor, file)

# Load the saved model
with open(model_filename, 'rb') as file:
    loaded_model = pickle.load(file)

# Specify the name of the pickle file and the .mar archive file
mar_filename = 'model_archive.mar'

# Create a new tar archive
with tarfile.open(mar_filename, 'w') as archive:
    # Add the pickle file to the archive
    archive.add(model_filename)

# Create a manifest file with the required metadata
manifest = {'model-file': model_filename, 'model-name': 'RandomForestRegressor'}

# Add the manifest file to the archive
with tarfile.open(mar_filename, 'a') as archive:
    manifest_string = json.dumps(manifest)
    manifest_bytes = manifest_string.encode('utf-8')
    manifest_file = tarfile.TarInfo('MANIFEST')
    manifest_file.size = len(manifest_bytes)
    archive.addfile(manifest_file, io.BytesIO(manifest_bytes))
