In [82]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

In [79]:
# Load the data
data = pd.read_csv("../datasets/Philly/DO_QAQC.csv")

# Check if DateTime_EST exists in the DataFrame
if 'DateTime_EST' in data.columns:
    # Convert DateTime_EST to datetime
    data['DateTime_EST'] = pd.to_datetime(data['DateTime_EST'])
else:
    raise ValueError("DateTime_EST column doesn't exist in the provided dataset.")


In [101]:
data['Site'].unique()

array(['U_A_0', 'U_A_2.5', 'D_A_4', 'D_A_5.5', 'U_B_12', 'D_B_14.5',
       'D_B_15.5', 'D_All_16.5', 'D_All_33.5'], dtype=object)

In [81]:
weather_data = pd.read_csv("../datasets/Philly/phillyweather.csv")
weather_data = weather_data.rename(columns={'time': 'DateTime_EST'})
weather_data['DateTime_EST'] = pd.to_datetime(weather_data['DateTime_EST'])

# Merge the datasets on 'DateTime_EST'
merged_data = pd.merge(data, weather_data, on='DateTime_EST')

In [83]:
# Select features and target
features = merged_data[['temperature_2m (°C)', 'relativehumidity_2m (%)', 'surface_pressure (hPa)', 
                        'windspeed_10m (km/h)', 'direct_radiation (W/m²)', 'diffuse_radiation (W/m²)', 'Depth_m']]
target = merged_data['DO_mg_L']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)


In [85]:
# Exclude records with missing target values from the features and target datasets
missing_target_index = y_train.index[y_train.isnull()]
X_train = X_train.drop(missing_target_index)
y_train = y_train.dropna()

# Normalize the features using StandardScaler
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply PCA
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Train the SVR model
svr.fit(X_train_pca, y_train)

# Predict on the testing set
y_pred = svr.predict(X_test_pca)

# Calculate evaluation metrics
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print ("SVR on whole dataset")
r2, mae, rmse

(0.2948925792455934, 1.9282660533707225, 2.512321967703182)

SVR model

In [104]:
# Initialize the SVR model
svr = SVR(kernel='rbf')

# Initialize the StandardScaler and PCA
scaler = StandardScaler()
pca = PCA(n_components=0.95, svd_solver='full')  # retain 95% of the variance

# Select the subset of data from site 'D_A_5.5'
data_subset = merged_data[merged_data['Site'] == 'D_A_5.5']
features_subset = data_subset[['temperature_2m (°C)', 'relativehumidity_2m (%)', 'surface_pressure (hPa)', 
                                  'windspeed_10m (km/h)', 'direct_radiation (W/m²)', 'diffuse_radiation (W/m²)', 'Depth_m']]
target_subset = data_subset['DO_mg_L']


# Split the data into training and testing sets
X_train_subset, X_test_subset, y_train_subset, y_test_subset = train_test_split(features_subset, target_subset, 
                                                                                test_size=0.2, random_state=42)

# Normalize the features using StandardScaler
X_train_scaled_subset = scaler.fit_transform(X_train_subset)
X_test_scaled_subset = scaler.transform(X_test_subset)

# Apply PCA
X_train_pca_subset = pca.fit_transform(X_train_scaled_subset)
X_test_pca_subset = pca.transform(X_test_scaled_subset)

# Train the SVR model and compute evaluation metrics
svr.fit(X_train_pca_subset, y_train_subset)
y_pred_subset = svr.predict(X_test_pca_subset)

r2 = r2_score(y_test_subset, y_pred_subset)
mae = mean_absolute_error(y_test_subset, y_pred_subset)
rmse = np.sqrt(mean_squared_error(y_test_subset, y_pred_subset))

r2, mae, rmse


(0.481404427479976, 1.200479630566635, 1.6595924432940934)

46546

Random Forest, Gradient boosting, and LightGMB

In [115]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Initialize the models
models = {
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'LightGBM': LGBMRegressor(random_state=42)
}

data_subset = merged_data[merged_data['Site'] == 'D_A_5.5']
features_subset = data_subset[['temperature_2m (°C)', 'relativehumidity_2m (%)', 'surface_pressure (hPa)', 
                                  'windspeed_10m (km/h)', 'direct_radiation (W/m²)', 'diffuse_radiation (W/m²)', 'Depth_m']]
target_subset = data_subset['DO_mg_L']


# Split the data into training and testing sets
X_train_subset, X_test_subset, y_train_subset, y_test_subset = train_test_split(features_subset, target_subset, 
                                                                                test_size=0.2, random_state=42)

# Normalize the features using StandardScaler
X_train_scaled_subset = scaler.fit_transform(X_train_subset)
X_test_scaled_subset = scaler.transform(X_test_subset)

# Apply PCA
X_train_pca_subset = pca.fit_transform(X_train_scaled_subset)
X_test_pca_subset = pca.transform(X_test_scaled_subset)


# Function to train a model and compute evaluation metrics
def train_and_evaluate(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    return r2, mae, rmse

# Results will be stored in this dictionary
results = {}

# Train each model on the whole dataset
for model_name, model in models.items():
    print(f"Training {model_name} on subset...")
    r2, mae, rmse = train_and_evaluate(model, X_train_pca_subset, X_test_pca_subset, y_train_subset, y_test_subset)
    results[(model_name, 'Whole dataset')] = {'R^2': r2, 'MAE': mae, 'RMSE': rmse}


Training Random Forest on subset...


Training Gradient Boosting on subset...
Training XGBoost on subset...
Training LightGBM on subset...


In [110]:
results #whole dataset

{('Random Forest', 'Whole dataset'): {'R^2': 0.616813397445579,
  'MAE': 1.3651664193232371,
  'RMSE': 1.8520504440251297},
 ('Gradient Boosting', 'Whole dataset'): {'R^2': 0.40270455047862164,
  'MAE': 1.7554510928439677,
  'RMSE': 2.3122897615839766},
 ('XGBoost', 'Whole dataset'): {'R^2': 0.5224408663405107,
  'MAE': 1.5618134436798894,
  'RMSE': 2.0675748034771746},
 ('LightGBM', 'Whole dataset'): {'R^2': 0.49698106610811,
  'MAE': 1.606663644924342,
  'RMSE': 2.121972832598724}}

In [116]:
results

{('Random Forest', 'Whole dataset'): {'R^2': 0.6295954638212846,
  'MAE': 1.1536115570934258,
  'RMSE': 1.5160071120086804},
 ('Gradient Boosting', 'Whole dataset'): {'R^2': 0.48036858619119827,
  'MAE': 1.401986339871773,
  'RMSE': 1.7956049310925377},
 ('XGBoost', 'Whole dataset'): {'R^2': 0.5666777365815845,
  'MAE': 1.26670609478802,
  'RMSE': 1.6397158392227416},
 ('LightGBM', 'Whole dataset'): {'R^2': 0.5854989989644558,
  'MAE': 1.2453077226106546,
  'RMSE': 1.6037101660199813}}