#### Building Advanced Baseline Model - Random Forest
Baseline models serve as a reference point for evaluating the performance of more complex models. They provide a simple, interpretable way to measure improvements and assess whether an advanced model is actually performing better than a naive approach. We started with Linear Regression, then progressing to Random Forest and then finally use Gradient Boosting.

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
root = '/content/drive/MyDrive/SnowPackPredictionChallenge'

In [11]:
# parse swe_data
df = pd.read_csv(root + '/feature_engineered_data.csv')
df

Unnamed: 0,Station,Latitude,Longitude,Elevation,Southness,Date,SWE,nearest_md_latitude,nearest_md_longitude,precip,...,Rmin_roll3,Rmin_roll7,windspeed_roll3,windspeed_roll7,temp_range,snowfall,humidity_diff,day_of_year,humidity_temp_interaction,wind_humidity_interaction
0,Hannagan Meadows,33.65352,-109.30877,9027,0.888152,1991-01-08,279.40,33.65625,-109.28125,0.00,...,45.120000,43.608571,2.963333,4.830000,17.02,0.00,58.98,8,425.8356,155.1174
1,Hannagan Meadows,33.65352,-109.30877,9027,0.888152,1991-01-09,279.40,33.65625,-109.28125,0.35,...,37.123333,40.572857,3.130000,4.901429,15.22,0.00,70.67,9,446.6344,291.8671
2,Hannagan Meadows,33.65352,-109.30877,9027,0.888152,1991-01-10,281.94,33.65625,-109.28125,0.35,...,33.226667,40.642857,3.630000,4.955714,15.22,0.00,70.67,10,446.6344,291.8671
3,Hannagan Meadows,33.65352,-109.30877,9027,0.888152,1991-01-11,281.94,33.65625,-109.28125,0.00,...,28.913333,38.191429,4.096667,4.278571,17.62,0.00,71.92,11,403.4712,289.8376
4,Hannagan Meadows,33.65352,-109.30877,9027,0.888152,1991-01-12,281.94,33.65625,-109.28125,0.00,...,26.910000,35.060000,3.933333,3.545714,19.28,0.00,55.43,12,505.5216,201.7652
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1897995,Garver Creek,48.97523,-115.81915,4250,-0.927766,2016-12-27,73.66,48.96875,-115.84375,0.00,...,55.280000,45.525714,2.180000,2.528571,7.84,0.00,21.00,362,-85.4700,45.7800
1897996,Garver Creek,48.97523,-115.81915,4250,-0.927766,2016-12-28,81.28,48.96875,-115.84375,0.00,...,55.280000,47.964286,2.180000,2.441429,7.84,0.00,21.00,363,-85.4700,45.7800
1897997,Garver Creek,48.97523,-115.81915,4250,-0.927766,2016-12-29,83.82,48.96875,-115.84375,3.70,...,55.673333,50.571429,2.523333,2.501429,9.42,3.70,30.36,364,-17.9124,97.4556
1897998,Garver Creek,48.97523,-115.81915,4250,-0.927766,2016-12-30,83.82,48.96875,-115.84375,3.70,...,56.066667,53.178571,2.866667,2.561429,9.42,3.70,30.36,365,-17.9124,97.4556


In [12]:
# Extract features and target variable
features = ["Latitude", "Longitude", "Elevation", "Southness",
    "precip", "tmin", "tmax", "SPH", "SRAD", "Rmax", "Rmin", "windspeed",
    "SWE_lag1", "SWE_lag3", "SWE_lag7",
    "precip_lag1", "tmin_lag1", "tmax_lag1", "SPH_lag1",
    "SRAD_lag1", "Rmax_lag1", "Rmin_lag1", "windspeed_lag1",
    "SWE_roll3", "SWE_roll7", "precip_roll3", "tmin_roll3"]
target = "SWE"

In [13]:
# Splitting data into training, validation, and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

X_train, y_train = train_df[features], train_df[target]
X_val, y_val = val_df[features], val_df[target]
X_test, y_test = test_df[features], test_df[target]

In [14]:
model = RandomForestRegressor(n_estimators=100, random_state=42)

In [15]:
# Train and evaluate models with validated data
model.fit(X_train, y_train)
y_pred_rf = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2 = r2_score(y_test, y_pred_rf)
model_results = {"RMSE": rmse, "R2": r2}
model_results

{'RMSE': 3.461958103817338, 'R2': 0.9998154625046747}

#### Model Evaluation - Compute NSE, RMSE, R² Score, and Relative Bias

In [18]:
# Compute RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf))

# Compute R² Score
r2 = r2_score(y_test, y_pred_rf)

# Compute Relative Bias (%)
relative_bias = (np.sum(y_pred_rf - y_test) / np.sum(y_test)) * 100

# Compute Actual Error (Prediction - Observed)
actual_error = y_pred_rf - y_test

# Compute NSE (Nash-Sutcliffe Efficiency)
observed_mean = np.mean(y_test)
nse = 1 - (np.sum((y_pred_rf - y_test) ** 2) / np.sum((y_test - observed_mean) ** 2))

# Create a results DataFrame
evaluation_results = pd.DataFrame({
    "Metric": ["Nash-Sutcliffe Efficiency (NSE)", "Root Mean Square Error (RMSE)", "R² Score", "Relative Bias (%)", 'Prediction Error'],
    "Value": [nse, rmse, r2, relative_bias, actual_error]
})

# Display evaluation metrics
print("\n📊 Model Evaluation Metrics:")
evaluation_results



📊 Model Evaluation Metrics:


Unnamed: 0,Metric,Value
0,Nash-Sutcliffe Efficiency (NSE),0.999815
1,Root Mean Square Error (RMSE),3.461958
2,R² Score,0.999815
3,Relative Bias (%),0.012463
4,Prediction Error,369090 3.7084 266627 0.0000 376545 ...


In [22]:
predictions_df = pd.DataFrame({
        "Date": test_df["Date"],
        "Latitude": test_df["Latitude"],
        "Longitude": test_df["Longitude"],
        "SWE_actual": y_test,
        "SWE_predicted": y_pred_rf
    })
predictions_df.to_csv("predictions.csv", index=False)
evaluation_df = pd.DataFrame(evaluation_results)
evaluation_df.to_csv("evaluation.csv", index=False)

In [21]:
cd /content/drive/MyDrive/SnowPackPredictionChallenge


/content/drive/MyDrive/SnowPackPredictionChallenge
