In [None]:
#importing modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error
from math import sqrt
from sklearn.preprocessing import MinMaxScaler
# import xgboost as xgb
from statsmodels.tsa.arima.model import ARIMA

# from fbprophet import Prophet
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.model_selection import train_test_split
from pmdarima import auto_arima
import xgboost as xgb
from datetime import timedelta
import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Read the prepared modelling dataset from disk
modelling_data = pd.read_parquet(path="modelling_data.parquet")
# Preview the first five rows
modelling_data.head(n=5)

XGBOOST MODEL

In [None]:
# One-hot encode the categorical calendar columns (first level of each dropped)
# and cast the resulting dummy columns to integers.
xg_dummies = pd.get_dummies(
    modelling_data[["day_of_week", "time_category", "month", "is_holiday"]],
    columns=["day_of_week", "time_category", "month", "is_holiday"],
    prefix=["day_of_the_week", "time_period", "month", "is_holiday"],
    sparse=False,
    drop_first=True,
).astype(int)

# Append the dummies to the original frame, remove the columns that are not
# used as model inputs (including the raw categoricals that were just encoded),
# and index the result by "datetime".
xg_encoded = (
    pd.concat([modelling_data, xg_dummies], axis=1)
    .drop(
        columns=[
            "longitude",
            "latitude",
            "occupancy",
            "facility_name",
            "month",
            "time_category",
            "day_of_week",
            "is_holiday",
        ]
    )
    .set_index("datetime")
)

In [None]:
# Group the encoded data per facility so each one can be modelled separately
xg_grouped = xg_encoded.groupby(by="facility_id")
# Preview the first 20 rows of every group
xg_grouped.head(n=20)

In [None]:
# Dictionary to store XGBoost models for each parking lot
xg_models = {}

# Train and evaluate one baseline model for each parking lot
for facility_id, group_data in xg_grouped:
    # Order rows by the index before splitting so the hold-out is the tail of
    # the series (assumes the "datetime" index sorts chronologically -- TODO confirm)
    group_data = group_data.sort_index()

    # Split the data into features and target variable
    X = group_data.drop(columns=["parking_availability"])
    y = group_data["parking_availability"]

    # Chronological hold-out: the last 20% of rows form the test set.
    # shuffle=False prevents rows that come after the test period from leaking
    # into training, and keeps the test rows in order for the line plot below.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Create an XGBoost model
    xg_model = xgb.XGBRegressor(objective='reg:squarederror')

    # Train the model
    xg_model.fit(X_train, y_train)

    # Make predictions on the test set
    predictions = xg_model.predict(X_test)

    # Evaluate the model
    mse = mean_squared_error(y_test, predictions)
    rmse = sqrt(mse)
    mae = mean_absolute_error(y_test, predictions)
    print(f'Mean Squared Error for {facility_id}: {mse}')
    print(f'Root Mean Squared Error for {facility_id}: {rmse}')
    print(f'Mean Absolute Error for {facility_id}: {mae}')

    # Store the trained model in the dictionary
    xg_models[facility_id] = xg_model

    # Plot actual vs predicted values over the hold-out period
    plt.figure(figsize=(15, 5))
    plt.plot(X_test.index, y_test, label='Actual', marker='o')
    plt.plot(X_test.index, predictions, label='Predicted', marker='o')
    plt.title(f'Actual vs Predicted for {facility_id}')
    plt.xlabel('Date')
    plt.ylabel('Parking Availability')
    plt.legend()
    plt.show()


Hyperparameter-tuned model

In [None]:
# Hyperparameter search space for GridSearchCV. Every combination is tried:
# 2 * 3 * 2 * 2 * 4 * 2 = 192 candidate settings.
param_grid = dict(
    learning_rate=[0.01, 0.2],
    max_depth=[5, 6, 7],
    gamma=[0.1, 0.2],
    reg_lambda=[0.01, 0.1],
    reg_alpha=[0, 0.01, 0.1, 0.2],
    n_estimators=[30, 50],
)
class XGBoostTimeSeriesModel:
    """Fit and inspect one grid-searched XGBoost regressor per group.

    Parameters
    ----------
    param_grid : dict
        Hyperparameter grid handed to ``GridSearchCV``.
    target_variable : str, default 'parking_availability'
        Name of the column to predict.
    cv : default 3
        Forwarded unchanged to ``GridSearchCV(cv=...)``.
    n_jobs : int, default -1
        Forwarded unchanged to ``GridSearchCV(n_jobs=...)``.

    Attributes
    ----------
    models : dict
        Maps each group key seen by ``train_models`` to the best estimator
        found for that group.
    """

    def __init__(self, param_grid, target_variable='parking_availability', cv=3, n_jobs=-1):
        self.param_grid = param_grid
        self.target_variable = target_variable
        self.cv = cv
        self.n_jobs = n_jobs
        self.models = {}

    def _get_features_target(self, group_data):
        """Split one group's frame into a feature matrix and a target series.

        Parameters
        ----------
        group_data : pandas.DataFrame
            Rows of a single group; must contain ``self.target_variable``.

        Returns
        -------
        tuple
            ``(X, y)`` where ``y`` is the target column and ``X`` is every
            other column except ``facility_name``.

        Raises
        ------
        KeyError
            If the target column is missing from ``group_data``.
        """
        # Read the target first so a missing target column fails loudly here.
        y = group_data[self.target_variable]
        # "facility_name" may already have been removed upstream; errors="ignore"
        # drops it only when it is present instead of raising KeyError.
        X = group_data.drop(columns=[self.target_variable, "facility_name"], errors="ignore")
        return X, y

    def train_models(self, xg_grouped):
        """Grid-search and store one model per group.

        Parameters
        ----------
        xg_grouped : iterable of (key, pandas.DataFrame)
            Typically a pandas GroupBy object; each frame is fitted separately
            and its best estimator is stored in ``self.models[key]``.
        """
        for facility_name, group_data in xg_grouped:
            X, y = self._get_features_target(group_data)

            # Create an XGBoost model
            xg_model = xgb.XGBRegressor(objective='reg:squarederror')

            # Perform GridSearchCV
            grid_search = GridSearchCV(xg_model, param_grid=self.param_grid, cv=self.cv, n_jobs=self.n_jobs)
            grid_search.fit(X, y)

            # Get the best hyperparameters
            best_params = grid_search.best_params_
            print(f'Best Hyperparameters for {facility_name}: {best_params}')

            # GridSearchCV refits the best configuration on all of X, y by
            # default (refit=True), so best_estimator_ is already trained and
            # no second fit is needed.
            self.models[facility_name] = grid_search.best_estimator_

    def predict_and_plot(self, xg_grouped):
        """Plot actual vs predicted target values for every trained group.

        Predictions are computed on every row of each group using the same
        feature selection as ``train_models``. When ``xg_grouped`` is the
        object the models were trained on, the plots therefore show in-sample
        fit rather than out-of-sample performance.

        Parameters
        ----------
        xg_grouped : pandas GroupBy
            Must support ``get_group(key)`` for every key in ``self.models``.
        """
        for facility_name, model in self.models.items():
            group_data = xg_grouped.get_group(facility_name)

            # Reuse the training-time feature selection so the model receives
            # exactly the columns it was fitted on.
            X, y = self._get_features_target(group_data)
            facility_predictions = model.predict(X)

            # Plot actual vs predicted values for this facility
            plt.figure(figsize=(10, 5))
            plt.plot(X.index, y, label='Actual', marker='o')
            plt.plot(X.index, facility_predictions, label='Predicted', marker='o')
            plt.title(f'Actual vs Predicted for {facility_name}')
            plt.xlabel('Date')
            plt.ylabel('Parking Availability')
            plt.legend()
            plt.show()
        


# Example usage: tune one model per group, then plot actual vs predicted values
xg_model = XGBoostTimeSeriesModel(
    param_grid=param_grid,
    target_variable="parking_availability",
    cv=3,
    n_jobs=-1,
)
xg_model.train_models(xg_grouped)
xg_model.predict_and_plot(xg_grouped)



