In [1]:
#Step1: Importing necessary libraries
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np

In [2]:
#Step2: Mount google drive
from google.colab import drive

# Attach Google Drive to the Colab filesystem at this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
#Step3: Data Preprocessing and Cleaning for Turbidity Dataset.

# Load the dataset (pandas is imported as `pd` in the first cell)
data = pd.read_csv('/content/drive/MyDrive/Engg680_Project2024_Group15/dataset/Turbidity_cleaned data.csv')
# Clean the 'Sample Date' column by stripping spaces
data['Sample Date'] = data['Sample Date'].str.strip()
# Convert the 'Sample Date' column to datetime; unparseable values become NaT
# Adjust the format if you know it (e.g., '%m/%d/%Y' for MM/DD/YYYY)
data['Sample Date'] = pd.to_datetime(data['Sample Date'], errors='coerce')
# Check for invalid dates (mask computed once and reused for the report)
invalid_dates = data['Sample Date'].isna()
if invalid_dates.any():
  print("Warning: Some dates could not be parsed. Check these rows:")
  print(data[invalid_dates])
# Drop rows with invalid dates, index by 'Sample Date', and sort chronologically.
# The lag features built later use shift(), which only yields "previous
# observation" values when rows are in time order. A stable sort (mergesort)
# keeps the file's original order among rows that share the same date.
data = (
    data.dropna(subset=['Sample Date'])
        .set_index('Sample Date')
        .sort_index(kind='mergesort')
)

In [4]:
#Step4: Extracting and Cleaning the Numeric Column
# This cell rebinds `data` from the full frame to the 'Numeric Result' series.
# Guard the column lookup so that re-running the cell (when `data` is already
# a Series) does not fail with a KeyError.
if isinstance(data, pd.DataFrame):
    data = data['Numeric Result']
# Remove missing measurements
data = data.dropna()

In [13]:
# Step 5: Generating Lagged Features for Supervised Learning

def create_lagged_features(data, n_lags=12):
    """Build a supervised-learning table of lagged 'Numeric Result' values.

    Parameters
    ----------
    data : pandas.Series, pandas.DataFrame, or anything accepted by pd.DataFrame
        The observations, in the order in which lags should be taken.
        A Series is used as the 'Numeric Result' column regardless of its
        name; a DataFrame must already contain a 'Numeric Result' column.
    n_lags : int, default 12
        Number of lag columns to add (``lag_1`` ... ``lag_{n_lags}``).

    Returns
    -------
    pandas.DataFrame
        A new frame holding the input column(s) plus ``lag_k`` columns, where
        ``lag_k`` is 'Numeric Result' shifted down by ``k`` rows. Rows with
        any missing value (including the first ``n_lags`` rows, whose lags
        are undefined) are removed. The input object is never modified.
    """
    if isinstance(data, pd.Series):
        # Name the column explicitly so unnamed / differently named series work.
        df = data.to_frame(name='Numeric Result')
    else:
        # pd.DataFrame(frame) does not copy by default; take an explicit copy
        # so adding lag columns cannot leak back into the caller's frame.
        df = pd.DataFrame(data).copy()

    target = df['Numeric Result']
    for lag in range(1, n_lags + 1):
        df[f'lag_{lag}'] = target.shift(lag)

    # Drop incomplete rows without mutating anything in place.
    return df.dropna()

# Number of lag columns to generate, then build the lagged table from `data`
n_lags = 12
data_lagged = create_lagged_features(data, n_lags=n_lags)

In [14]:
# Step6: Data Splitting and Training a Random Forest Model
# Target is the current value; features are every remaining (lag) column
y = data_lagged['Numeric Result']
X = data_lagged.drop(columns='Numeric Result')

# Train-test split: 20% of rows held out, row order preserved (shuffle=False)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=False,
)

# Train a 100-tree Random Forest with a fixed seed
rf = RandomForestRegressor(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)

In [15]:
# Step7: Model Prediction and RMSE Evaluation
# Predict the held-out rows with the fitted forest
y_pred = rf.predict(X_test)
# RMSE is the square root of the mean squared error
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse:.2f}")

RMSE: 25.47


In [17]:
# Step8: Forecasting Future Values Using Random Forest
# Recursively forecast the next 6 steps after the last observed sample.
# NOTE(review): the original comment says "years 2025-2030"; each step is one
# row of the series, so confirm the data really has one sample per year.
forecast_steps = 6

# Seed the lag window for the first *unobserved* step.
# X.iloc[-1] holds the lags that predict the last observed target (y.iloc[-1]),
# so predicting from it would only re-estimate a value we already know.
# The window for the next step is: newest observation first, then the previous
# lags with the oldest one dropped (columns are ordered lag_1 ... lag_n).
last_observations = pd.DataFrame(
    [[y.iloc[-1]] + X.iloc[-1, :-1].tolist()],
    columns=X.columns,
)
forecasts = []

for _ in range(forecast_steps):
    forecast = rf.predict(last_observations)[0]
    forecasts.append(forecast)
    # Update lagged features with new forecast: it becomes lag_1, every other
    # lag moves one position older, and the oldest lag falls off the end.
    last_observations = pd.DataFrame([[forecast] + last_observations.iloc[0, :-1].tolist()], columns=last_observations.columns)