<a href="https://colab.research.google.com/github/Madhan-crypto/AI-chatbot/blob/main/Water_demand_forecasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from datetime import datetime

# --- Configuration ---
# Name of the consumption column: it is the regression target in
# train_xgboost_model and the source series for the lag features built in
# create_time_features.
TARGET_COL = 'Total_Consumption_L'
# Number of hourly steps the main script requests from make_future_forecast.
FORECAST_HORIZON = 24  # Forecasting next 24 hours

def load_and_preprocess_data(file_path):
    """Read the consumption CSV and index it by its 'Timestamp' column.

    The 'Timestamp' column is parsed as datetimes while reading and then
    becomes the index of the returned DataFrame; all other columns are kept
    as they come out of the CSV.
    """
    raw = pd.read_csv(file_path, parse_dates=['Timestamp'])

    # Extension point: additional sources (weather, occupancy, ...) would be
    # joined onto the timestamp index here, for example
    #   pd.merge(frame, weather_df, left_index=True, right_index=True)
    return raw.set_index('Timestamp')

def create_time_features(df, include_lags=True):
    """Add calendar (and optionally lag) feature columns to ``df`` in place.

    The calendar columns are read from the frame's datetime index. When
    ``include_lags`` is true, two shifted copies of ``TARGET_COL`` are added
    (24 rows and 24*7 rows back) and every row containing a NaN is dropped.
    The same (mutated) DataFrame object is returned.
    """
    stamps = df.index

    # Calendar attributes taken directly from the index
    # (dayofweek: 0=Monday ... 6=Sunday).
    for attr in ('hour', 'dayofweek', 'month', 'dayofyear'):
        df[attr] = getattr(stamps, attr)
    df['weekofyear'] = stamps.isocalendar().week.astype(int)

    # 1 on Saturday/Sunday, 0 otherwise.
    df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)

    if not include_lags:
        return df

    # Lagged consumption: the target value 24 rows back and 24*7 rows back
    # (one day / one week for an hourly series).
    for column, periods in (('lag_24hr', 24), ('lag_7day', 24 * 7)):
        df[column] = df[TARGET_COL].shift(periods)

    # The shifts leave NaNs in the leading rows; remove every row with a NaN.
    df.dropna(inplace=True)

    return df

def train_xgboost_model(df):
    """Fit an XGBoost regressor on ``df`` and print hold-out error metrics.

    Every column other than ``TARGET_COL`` is used as a feature. The first
    80% of the rows are used for training and the remaining rows for testing;
    rows are never shuffled, so row order is preserved across the split.

    Returns a tuple ``(model, last_index)`` where ``last_index`` is the index
    label of the final test row.
    """
    feature_cols = [name for name in df.columns if name != TARGET_COL]
    X, y = df[feature_cols], df[TARGET_COL]

    # Positional split: head of the frame trains, tail evaluates.
    split_at = int(len(df) * 0.8)
    X_train, y_train = X.iloc[:split_at], y.iloc[:split_at]
    X_test, y_test = X.iloc[split_at:], y.iloc[split_at:]

    print(f"Training on {len(X_train)} samples, testing on {len(X_test)} samples.")

    # Regressor settings, kept together so they are easy to scan and adjust.
    xgb_params = dict(
        objective='reg:squarederror',
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.7,
        colsample_bytree=0.7,
        random_state=42,
        n_jobs=-1,
    )
    model = xgb.XGBRegressor(**xgb_params)

    # The test split is supplied as eval_set for per-round monitoring only;
    # no early-stopping arguments are passed.
    model.fit(X_train, y_train, eval_set=[(X_test, y_test)])

    # --- Evaluation on the held-out tail ---
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    print("\n--- Model Performance ---")
    print(f"Root Mean Squared Error (RMSE): {rmse:.2f} L/hr")
    print(f"Mean Absolute Error (MAE): {mae:.2f} L/hr")

    return model, X_test.index[-1]

def make_future_forecast(model, last_timestamp, num_hours, history=None):
    """Generates the forecast for the next N hours.

    Calendar features are derived from the future timestamps. The lag
    features are looked up by timestamp: observed consumption is used
    wherever the lagged hour lies inside ``history``, and the model's own
    earlier predictions are used once a lag reaches into the forecast period
    (recursive forecasting, done in 24-hour blocks).

    The timestamp lookup matches the row-based ``shift`` used for training
    only when the history is a regular hourly series.

    Args:
        model: Fitted regressor exposing ``predict``. If it also exposes
            ``feature_names_in_``, that defines the feature set and order.
        last_timestamp: Timestamp of the last observed hour; the forecast
            starts one hour later.
        num_hours: Number of hourly steps to forecast.
        history: DataFrame with a datetime index and a ``TARGET_COL`` column
            holding observed consumption. When omitted, the module-level
            ``df`` is used, which keeps three-argument calls working.

    Returns:
        pandas.Series named 'Forecast_L', indexed by the forecast timestamps.

    Raises:
        ValueError: If ``history`` is omitted and no module-level ``df``
            exists.
    """
    print(f"\nGenerating forecast for the next {num_hours} hours...")

    if history is None:
        # Backward compatibility: earlier callers relied on a global `df`.
        history = globals().get('df')
        if history is None:
            raise ValueError(
                "No history available: pass `history=` or define a module-level `df`."
            )

    future_index = pd.date_range(
        start=last_timestamp + pd.Timedelta(hours=1),
        periods=num_hours,
        freq='h'
    )
    if len(future_index) == 0:
        # Nothing to forecast; avoid calling the model on an empty frame.
        return pd.Series([], index=future_index, name='Forecast_L', dtype=float)

    # Calendar features only; lags are filled per block below because they
    # may depend on predictions made for earlier blocks.
    future_df = create_time_features(pd.DataFrame(index=future_index), include_lags=False)

    # Lag feature name -> how many hours back it looks.
    lag_hours = {'lag_24hr': 24, 'lag_7day': 24 * 7}

    # Prefer the feature list recorded on the fitted model; otherwise fall
    # back to the history's non-target columns (the same rule training uses).
    expected_features = getattr(model, 'feature_names_in_', None)
    if expected_features is None:
        expected_features = [col for col in history.columns if col != TARGET_COL]
    expected_features = list(expected_features)

    for feature in expected_features:
        if feature not in future_df.columns and feature not in lag_hours:
            # e.g. external weather data that has no future values here.
            print(f"Warning: Feature '{feature}' from training is missing in future_df. Filling with 0.")
            future_df[feature] = 0

    # Consumption known so far, keyed by timestamp. reindex needs unique labels.
    known = history[TARGET_COL].astype(float)
    known = known[~known.index.duplicated(keep='last')]

    # Within one block of `block_size` hours every lag points at a timestamp
    # before the block starts, so the whole block can be predicted at once.
    block_size = min(lag_hours.values())
    pieces = []
    for start in range(0, num_hours, block_size):
        block = future_df.iloc[start:start + block_size].copy()
        for name, hours in lag_hours.items():
            lagged_times = block.index - pd.Timedelta(hours=hours)
            # Timestamps absent from `known` yield NaN (treated as missing).
            block[name] = known.reindex(lagged_times).to_numpy()

        block_pred = np.asarray(model.predict(block[expected_features]))
        pieces.append(block_pred)

        # Make this block's predictions available as lags for later blocks;
        # observed values already in `known` take precedence.
        known = known.combine_first(
            pd.Series(block_pred, index=block.index, dtype=float)
        )

    forecast = np.concatenate(pieces)

    forecast_series = pd.Series(forecast, index=future_index, name='Forecast_L')
    print("Forecast (First 5 hours):")
    print(forecast_series.head())

    return forecast_series

# --- Main Execution ---
if __name__ == "__main__":
    # --- 1. Build synthetic sample data (point csv_path at real data instead) ---
    # 8760 hourly readings (365 days) starting 2024-01-01.
    np.random.seed(42)
    n_hours = 8760
    csv_path = 'sample_campus_water_data.csv'

    timestamps = pd.date_range(start='2024-01-01', periods=n_hours, freq='h')

    # 24-point intraday profile around 2000 L with a 500 L amplitude
    # (the sine runs over 0..4*pi, i.e. two cycles per 24 points).
    daily_pattern = 2000 + 500 * np.sin(np.linspace(0, 4*np.pi, 24))
    noise = np.random.normal(loc=0, scale=100, size=n_hours)

    # Repeat the profile for every day and add Gaussian noise.
    base_consumption = np.tile(daily_pattern, n_hours // 24) + noise

    # Saturdays and Sundays (dayofweek 5 and 6) are scaled down by 20%.
    holiday_mask = np.asarray(timestamps.dayofweek >= 5)
    consumption = base_consumption * (1 - 0.2 * holiday_mask)

    # Floor the series at 100 L and persist it as the sample CSV.
    sample_data = pd.DataFrame({
        'Timestamp': timestamps,
        TARGET_COL: np.clip(consumption, 100, None),
    })
    sample_data.to_csv(csv_path, index=False)
    # -------------------------------------------------------------------

    # 1. Load the CSV back and add calendar + lag features.
    #    `df` stays a module-level name because make_future_forecast reads it.
    df = create_time_features(load_and_preprocess_data(csv_path), include_lags=True)

    # 2. Fit the model; also returns the timestamp of the last known row.
    model, last_known_time = train_xgboost_model(df)

    # 3. Forecast the configured horizon starting right after that timestamp.
    future_forecast = make_future_forecast(model, last_known_time, FORECAST_HORIZON)

    # 4. (Next step) 'future_forecast' is the input for the optimization
    #    engine that generates the optimal pumping schedule.


Training on 6873 samples, testing on 1719 samples.
[0]	validation_0-rmse:371.13841
[1]	validation_0-rmse:354.96163
[2]	validation_0-rmse:341.49406
[3]	validation_0-rmse:326.74318
[4]	validation_0-rmse:313.61235
[5]	validation_0-rmse:300.50084
[6]	validation_0-rmse:288.12324
[7]	validation_0-rmse:276.69197
[8]	validation_0-rmse:265.84706
[9]	validation_0-rmse:255.36153
[10]	validation_0-rmse:245.50059
[11]	validation_0-rmse:236.36830
[12]	validation_0-rmse:227.80341
[13]	validation_0-rmse:219.63684
[14]	validation_0-rmse:212.22769
[15]	validation_0-rmse:205.02813
[16]	validation_0-rmse:198.21447
[17]	validation_0-rmse:192.17834
[18]	validation_0-rmse:186.34182
[19]	validation_0-rmse:180.29013
[20]	validation_0-rmse:175.02253
[21]	validation_0-rmse:170.29058
[22]	validation_0-rmse:165.63899
[23]	validation_0-rmse:161.26627
[24]	validation_0-rmse:157.33026
[25]	validation_0-rmse:153.59062
[26]	validation_0-rmse:150.08750
[27]	validation_0-rmse:146.90063
[28]	validation_0-rmse:144.42155
[2