In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [27]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [3]:
# Import the necessary packages for EDA
import pandas as pd
# Set option to display all columns
pd.set_option('display.max_columns', None)
import numpy as np
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt
# Import datetime for datetime processing
from datetime import datetime, timedelta

In [4]:
df = pd.read_csv("../data/output.csv")
df.head()

Unnamed: 0,charging_event_id,user_id,cp_id,con_num,start_date,start_time,end_date,end_time,total_kwh,site,charger_model,start_datetime,end_datetime,duration_hr,start_day,end_day,start_period,end_period
0,383322,14Q3User 31,70206,2,2014-12-31,11:25,2014-12-31,11:35,0.57,Elland Road Park and Ride,APT 7kW Dual Outlet,2014-12-31 11:25:00,2014-12-31 11:35:00,0.166667,Wednesday,Wednesday,morning,morning
1,383031,14Q3User 635,70206,1,2014-12-30,13:58,2014-12-30,15:24,4.52,Elland Road Park and Ride,APT 7kW Dual Outlet,2014-12-30 13:58:00,2014-12-30 15:24:00,1.433333,Tuesday,Tuesday,afternoon,afternoon
2,380951,14Q3User 629,70208,1,2014-12-24,11:37,2014-12-24,12:06,3.16,Elland Road Park and Ride,APT 7kW Dual Outlet,2014-12-24 11:37:00,2014-12-24 12:06:00,0.483333,Wednesday,Wednesday,morning,afternoon
3,377470,14Q3User 626,70204,2,2014-12-18,09:29,2014-12-18,11:56,3.16,Woodhouse Lane Car Park,APT 7kW Dual Outlet,2014-12-18 09:29:00,2014-12-18 11:56:00,2.45,Thursday,Thursday,morning,morning
4,377434,14Q3User 44,70204,1,2014-12-18,08:36,2014-12-18,16:56,7.15,Woodhouse Lane Car Park,APT 7kW Dual Outlet,2014-12-18 08:36:00,2014-12-18 16:56:00,8.333333,Thursday,Thursday,morning,afternoon


In [5]:
# Change the relavent columns to datetime format
df[["start_datetime", "end_datetime"]] = df[["start_datetime", "end_datetime"]].apply(pd.to_datetime, yearfirst=True, format="%Y-%m-%d %H:%M:%S")

In [6]:
df = df[~df['site'].str.contains('test site', case=False)]
df.reset_index(drop=True, inplace=True)
df["site"].unique()

array(['Elland Road Park and Ride', 'Woodhouse Lane Car Park',
       'Temple Green Park and Ride', 'Wellington Place ',
       'Torre Road Council Depot'], dtype=object)

In [7]:
weekdays = ['Wednesday', 'Tuesday', 'Thursday', 'Friday', 'Monday']
df_weekday = df.loc[df['start_day'].isin(weekdays)]
df_weekday.shape

(19306, 18)

In [8]:
df_weekend = df.loc[df['start_day'].isin(['Saturday', 'Sunday'])]
df_weekend.shape

(3231, 18)

In [9]:
user_frequency = df.groupby('user_id').size().reset_index(name='usage_count')
df = df.merge(user_frequency, on='user_id', how='left')

In [10]:
# These information need to be extracted out to be used as features
# in a decision tree model that I plan to use.
# Extract year from start_datetime column
df['year'] = df['start_datetime'].dt.year
# Extract the quarter
df['start_quarter'] = df['start_datetime'].dt.quarter
# Extract the start month
df['start_month'] = df['start_datetime'].dt.month
# Extract the start day
df['start_day_of_month'] = df['start_datetime'].dt.day
# Extract the start hours
df['start_hour'] = df['start_datetime'].dt.hour
# Extract the start minutes
df['start_minute'] = df['start_datetime'].dt.minute
# Classify if the start time is on weekday or weekend
df['start_on_weekday'] = df['start_day'].apply(lambda x: 1 if x in weekdays else 0)

In [11]:
# Drop duration less than 0.25 hrs.
# This is because duration below 0.25 hours are very small values.
# Drop duration more than 24 hrs.
# This is just hogging the charging station and really shouldn't be entertained
df = df.loc[(df['duration_hr'] >= 0.25) & (df['duration_hr'] <= 24)]

In [12]:
# Calculate Q1, Q3 and IQR
Q1 = df['duration_hr'].quantile(0.25)
Q3 = df['duration_hr'].quantile(0.75)
IQR = Q3 - Q1

# Define outliers as those outside of Q1 - 1.5*IQR and Q3 + 1.5*IQR
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filter out the outliers
df = df.loc[(df['duration_hr'] >= lower_bound) & (df['duration_hr'] <= upper_bound)]

In [13]:
def count_simultaneous_chargers(row, df):
    # Select events at the same site.
    same_site_events = df[df['site'] == row['site']]

    # Count events where the time period overlaps with the current event's period.
    overlap_count = same_site_events[
        (same_site_events['start_datetime'] < row['end_datetime']) &
        (same_site_events['end_datetime'] > row['start_datetime'])
    ].shape[0]

    # Subtract 1 to exclude the current event from its own count.
    return overlap_count - 1

In [14]:
# Ensure that the DataFrame is sorted by start_datetime.
df = df.sort_values(by='start_datetime')
# Apply the function to each row
df['simultaneous_users'] = df.apply(lambda row: count_simultaneous_chargers(row, df), axis=1)

In [15]:
# Functional imports
import os

# Standard library imports
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import AgglomerativeClustering
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectFromModel
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# Third-party imports
from scipy.cluster.hierarchy import dendrogram
import joblib as joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from xgboost import XGBRegressor

In [16]:
woodhouse_df = df.loc[df['site'] == 'Woodhouse Lane Car Park'].reset_index(drop=True).copy()
woodhouse_df.head()

Unnamed: 0,charging_event_id,user_id,cp_id,con_num,start_date,start_time,end_date,end_time,total_kwh,site,charger_model,start_datetime,end_datetime,duration_hr,start_day,end_day,start_period,end_period,usage_count,year,start_quarter,start_month,start_day_of_month,start_hour,start_minute,start_on_weekday,simultaneous_users
0,233978,14Q2User 15,70204,2,2014-04-10,07:41,2014-04-10,17:12,5.63,Woodhouse Lane Car Park,APT 7kW Dual Outlet,2014-04-10 07:41:00,2014-04-10 17:12:00,9.516667,Thursday,Thursday,morning,afternoon,3,2014,2,4,10,7,41,1,0
1,234600,14Q2User 15,70204,2,2014-04-12,11:55,2014-04-12,21:12,5.39,Woodhouse Lane Car Park,APT 7kW Dual Outlet,2014-04-12 11:55:00,2014-04-12 21:12:00,9.283333,Saturday,Saturday,morning,evening,3,2014,2,4,12,11,55,0,0
2,236159,14Q2User 15,70204,2,2014-04-18,08:41,2014-04-18,18:11,4.7,Woodhouse Lane Car Park,APT 7kW Dual Outlet,2014-04-18 08:41:00,2014-04-18 18:11:00,9.5,Friday,Friday,morning,evening,3,2014,2,4,18,8,41,1,0
3,243232,14Q2User 11,70202,1,2014-05-10,10:18,2014-05-10,11:04,1.26,Woodhouse Lane Car Park,APT 7kW Dual Outlet,2014-05-10 10:18:00,2014-05-10 11:04:00,0.766667,Saturday,Saturday,morning,morning,2,2014,2,5,10,10,18,0,0
4,243750,14Q2User 17,70202,1,2014-05-12,15:55,2014-05-12,17:27,5.7,Woodhouse Lane Car Park,APT 7kW Dual Outlet,2014-05-12 15:55:00,2014-05-12 17:27:00,1.533333,Monday,Monday,afternoon,afternoon,1,2014,2,5,12,15,55,1,0


In [17]:
columns_to_drop = ['charging_event_id',	'user_id',	'cp_id',	'con_num',
                   'start_date',	'start_time',	'end_date',	'end_time',	'year',
                   'total_kwh',	'site',	'start_datetime',	'end_datetime',
                   'start_period', 'start_quarter',	'end_day',	'end_period']
woodhouse_drop_df = woodhouse_df.drop(columns=columns_to_drop)
woodhouse_drop_df.head()

Unnamed: 0,charger_model,duration_hr,start_day,usage_count,start_month,start_day_of_month,start_hour,start_minute,start_on_weekday,simultaneous_users
0,APT 7kW Dual Outlet,9.516667,Thursday,3,4,10,7,41,1,0
1,APT 7kW Dual Outlet,9.283333,Saturday,3,4,12,11,55,0,0
2,APT 7kW Dual Outlet,9.5,Friday,3,4,18,8,41,1,0
3,APT 7kW Dual Outlet,0.766667,Saturday,2,5,10,10,18,0,0
4,APT 7kW Dual Outlet,1.533333,Monday,1,5,12,15,55,1,0


In [18]:
# Encoding categorical variables
label_encoders = {}
encoding_mapping = {}
for column in woodhouse_drop_df.select_dtypes(include=["object", "category"]).columns:
    le = LabelEncoder()
    woodhouse_drop_df[column] = le.fit_transform(woodhouse_drop_df[column])
    label_encoders[column] = le

    # Save the mapping relation of each label encoder
    label_mapping = {original: encoded for encoded, original in enumerate(le.classes_)}
    encoding_mapping[column] = label_mapping

woodhouse_drop_df.head()

Unnamed: 0,charger_model,duration_hr,start_day,usage_count,start_month,start_day_of_month,start_hour,start_minute,start_on_weekday,simultaneous_users
0,0,9.516667,4,3,4,10,7,41,1,0
1,0,9.283333,2,3,4,12,11,55,0,0
2,0,9.5,0,3,4,18,8,41,1,0
3,0,0.766667,2,2,5,10,10,18,0,0
4,0,1.533333,1,1,5,12,15,55,1,0


In [19]:
# Drop charger_model as this site is using the same charger model.
# If site have more than 1 charger model, then it should not be droppped.
woodhouse_drop_df.drop(columns=['charger_model'], inplace=True)
woodhouse_drop_df.head()

Unnamed: 0,duration_hr,start_day,usage_count,start_month,start_day_of_month,start_hour,start_minute,start_on_weekday,simultaneous_users
0,9.516667,4,3,4,10,7,41,1,0
1,9.283333,2,3,4,12,11,55,0,0
2,9.5,0,3,4,18,8,41,1,0
3,0.766667,2,2,5,10,10,18,0,0
4,1.533333,1,1,5,12,15,55,1,0


In [20]:
def convert_ordinal_2_cyclic(df, **kwargs):
    for kw, arg in kwargs.items():
        df[f"{kw}_sin"] = np.sin(2 * np.pi * df[kw]/arg)
        df[f"{kw}_cos"] = np.cos(2 * np.pi * df[kw]/arg)
        df.drop(columns=[kw], inplace=True)
    return df


In [21]:
woodhouse_prep_df = convert_ordinal_2_cyclic(woodhouse_drop_df, start_day=7, start_month=12, start_day_of_month=31, start_hour=24, start_minute=60)
woodhouse_prep_df.head()

Unnamed: 0,duration_hr,usage_count,start_on_weekday,simultaneous_users,start_day_sin,start_day_cos,start_month_sin,start_month_cos,start_day_of_month_sin,start_day_of_month_cos,start_hour_sin,start_hour_cos,start_minute_sin,start_minute_cos
0,9.516667,3,1,0,-0.433884,-0.900969,0.866025,-0.5,0.897805,-0.440394,0.965926,-0.258819,-0.913545,-0.406737
1,9.283333,3,0,0,0.974928,-0.222521,0.866025,-0.5,0.651372,-0.758758,0.258819,-0.965926,-0.5,0.866025
2,9.5,3,1,0,0.0,1.0,0.866025,-0.5,-0.485302,-0.874347,0.866025,-0.5,-0.913545,-0.406737
3,0.766667,2,0,0,0.974928,-0.222521,0.5,-0.866025,0.897805,-0.440394,0.5,-0.866025,0.951057,-0.309017
4,1.533333,1,1,0,0.781831,0.62349,0.5,-0.866025,0.651372,-0.758758,-0.707107,-0.707107,-0.5,0.866025


In [24]:
# Splitting the data into features and target
X = woodhouse_prep_df.drop('duration_hr', axis=1)
y = woodhouse_prep_df['duration_hr']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Defining the XGBRegressor and grid search parameters
xgb_params = {
    'xgbregressor__n_estimators': [370],
    'xgbregressor__max_depth': [29, 30],
    'xgbregressor__learning_rate': [0.01],
    'xgbregressor__colsample_bynode': [0.1],
    'xgbregressor__tree_method': ['gpu_hist'],  # For GPU support
    'xgbregressor__predictor': ['gpu_predictor']  # For GPU support
}

# Creating the pipeline
pipeline = Pipeline([
    ('feature_selector', SelectFromModel(XGBRegressor(tree_method='gpu_hist', predictor='gpu_predictor', random_state=42))),
    ('xgbregressor', XGBRegressor(tree_method='gpu_hist', predictor='gpu_predictor', random_state=42))
])

In [25]:
### Warning: This code may take a long time to run! ####
# Grid Search with Cross-Validation
grid_search = GridSearchCV(pipeline, xgb_params, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

In [26]:
# Best model after grid search
best_model = grid_search.best_estimator_

# Displaying best parameters
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'xgbregressor__colsample_bynode': 0.1, 'xgbregressor__learning_rate': 0.01, 'xgbregressor__max_depth': 29, 'xgbregressor__n_estimators': 370, 'xgbregressor__predictor': 'gpu_predictor', 'xgbregressor__tree_method': 'gpu_hist'}
