In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, OrthogonalMatchingPursuit
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# import r2_score module
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error


In [2]:
# Create filepath to dataset
path = "https://ev-project-datasets.s3.us-east-2.amazonaws.com/clean_table.csv"
EV_stations_df = pd.read_csv(path, encoding='latin-1', on_bad_lines="skip")

In [3]:
EV_stations_df

Unnamed: 0,id,city,province_id,province_name,City_Population,Median_Income,Unemployment_Rate,City_Electricity_Rate,City_EV_registrations,City_EV_stations_locations,incentives_status,EV_Per_Province,No_Certificate_perc,Secondary_HS_perc,Apprenticeship_perc,College_CEGEP_perc,Univ_diploma_Below_Bachelor_perc,Univ_diploma_Above_Bachelor_perc
0,1124279679,Toronto,ON,Ontario,5429524,41200,9.4,13.0,54739,294,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0
1,1124586170,Montreal,QC,Quebec,3519595,40800,7.2,7.3,61203,709,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5
2,1124825478,Vancouver,BC,British Columbia,2264823,40800,7.4,12.6,68894,218,YES,148116,15.5,29.4,8.8,18.1,3.6,24.6
3,1124690423,Calgary,AB,Alberta,1239220,44800,10.9,16.6,0,93,YES,0,16.9,27.9,9.7,19.2,3.0,23.4
4,1124290735,Edmonton,AB,Alberta,1062643,44800,10.9,16.6,0,70,YES,0,16.9,27.9,9.7,19.2,3.0,23.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1699,1124105436,Durham-Sud,QC,Quebec,1008,40800,7.2,7.3,0,0,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5
1700,1124850489,Melbourne,QC,Quebec,1004,40800,7.2,7.3,0,1,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5
1701,1124001339,Nipawin No. 487,SK,Saskatchewan,1004,42400,7.0,18.1,0,0,YES,2222,20.7,30.5,10.4,17.1,3.3,18.0
1702,1124001661,Duck Lake No. 463,SK,Saskatchewan,1004,42400,7.0,18.1,0,0,YES,2222,20.7,30.5,10.4,17.1,3.3,18.0


In [4]:
# Copy original EV_stations_df DataFrame to features_df
features_df = EV_stations_df.copy()
features_df.head()

Unnamed: 0,id,city,province_id,province_name,City_Population,Median_Income,Unemployment_Rate,City_Electricity_Rate,City_EV_registrations,City_EV_stations_locations,incentives_status,EV_Per_Province,No_Certificate_perc,Secondary_HS_perc,Apprenticeship_perc,College_CEGEP_perc,Univ_diploma_Below_Bachelor_perc,Univ_diploma_Above_Bachelor_perc
0,1124279679,Toronto,ON,Ontario,5429524,41200,9.4,13.0,54739,294,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0
1,1124586170,Montreal,QC,Quebec,3519595,40800,7.2,7.3,61203,709,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5
2,1124825478,Vancouver,BC,British Columbia,2264823,40800,7.4,12.6,68894,218,YES,148116,15.5,29.4,8.8,18.1,3.6,24.6
3,1124690423,Calgary,AB,Alberta,1239220,44800,10.9,16.6,0,93,YES,0,16.9,27.9,9.7,19.2,3.0,23.4
4,1124290735,Edmonton,AB,Alberta,1062643,44800,10.9,16.6,0,70,YES,0,16.9,27.9,9.7,19.2,3.0,23.4


In [5]:
# Withhold 20% of dataset before training model in order to test accuracy
features_data = features_df.sample(frac=0.75, random_state=2)
unseen_data = features_df.drop(features_data.index).reset_index(drop=True)

# review shape of modeling and prediction datasets
print('Data for Modeling: ' + str(features_data.shape))
print('Data for Prediction: ' + str(unseen_data.shape))

Data for Modeling: (1278, 18)
Data for Prediction: (426, 18)


In [6]:
# Reset the index of the features_data
features_data = features_data.reset_index(drop=True)
features_data

Unnamed: 0,id,city,province_id,province_name,City_Population,Median_Income,Unemployment_Rate,City_Electricity_Rate,City_EV_registrations,City_EV_stations_locations,incentives_status,EV_Per_Province,No_Certificate_perc,Secondary_HS_perc,Apprenticeship_perc,College_CEGEP_perc,Univ_diploma_Below_Bachelor_perc,Univ_diploma_Above_Bachelor_perc
0,1124885955,Battleford,SK,Saskatchewan,4065,42400,7.0,18.1,1,0,YES,2222,20.7,30.5,10.4,17.1,3.3,18.0
1,1124001550,Saint David,NB,New Brunswick,1529,37600,9.6,12.7,0,0,YES,2112,22.0,28.5,9.1,21.8,1.9,16.7
2,1124178262,St. George's,NL,Newfoundland and Labrador,1203,36800,12.5,13.8,0,0,YES,0,23.4,25.0,11.3,23.1,2.4,14.8
3,1124324905,Courtenay,BC,British Columbia,25599,40800,7.4,12.6,880,9,YES,148116,15.5,29.4,8.8,18.1,3.6,24.6
4,1124175333,Henryville,QC,Quebec,1464,40800,7.2,7.3,0,2,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1273,1124139264,Hamiota,MB,Manitoba,2011,39200,7.6,9.9,0,0,YES,3290,22.0,29.6,7.7,17.7,2.9,20.1
1274,1124000427,Beaubassin East / Beaubassin-est,NB,New Brunswick,6376,37600,9.6,12.7,0,0,YES,2112,22.0,28.5,9.1,21.8,1.9,16.7
1275,1124735582,Kamloops,BC,British Columbia,100046,40800,7.4,12.6,1202,19,YES,148116,15.5,29.4,8.8,18.1,3.6,24.6
1276,1124715267,Beauport,QC,Quebec,7281,40800,7.2,7.3,0,1,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5


In [7]:
# Drop the columns identifying each city: id, city, province_id, province_name
features_train = features_data.drop(columns=["id", "province_id", "city", "province_name"])
features_unseen = unseen_data.drop(columns=["id", "province_id", "city", "province_name"])
# print(features_df.shape)
features_train

Unnamed: 0,City_Population,Median_Income,Unemployment_Rate,City_Electricity_Rate,City_EV_registrations,City_EV_stations_locations,incentives_status,EV_Per_Province,No_Certificate_perc,Secondary_HS_perc,Apprenticeship_perc,College_CEGEP_perc,Univ_diploma_Below_Bachelor_perc,Univ_diploma_Above_Bachelor_perc
0,4065,42400,7.0,18.1,1,0,YES,2222,20.7,30.5,10.4,17.1,3.3,18.0
1,1529,37600,9.6,12.7,0,0,YES,2112,22.0,28.5,9.1,21.8,1.9,16.7
2,1203,36800,12.5,13.8,0,0,YES,0,23.4,25.0,11.3,23.1,2.4,14.8
3,25599,40800,7.4,12.6,880,9,YES,148116,15.5,29.4,8.8,18.1,3.6,24.6
4,1464,40800,7.2,7.3,0,2,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1273,2011,39200,7.6,9.9,0,0,YES,3290,22.0,29.6,7.7,17.7,2.9,20.1
1274,6376,37600,9.6,12.7,0,0,YES,2112,22.0,28.5,9.1,21.8,1.9,16.7
1275,100046,40800,7.4,12.6,1202,19,YES,148116,15.5,29.4,8.8,18.1,3.6,24.6
1276,7281,40800,7.2,7.3,0,1,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5


In [8]:
# Save categorical variable column incentives_status to variable for encoding
incentives = features_train.dtypes[features_train.dtypes == "object"].index.tolist()
incentives_unseen = features_unseen.dtypes[features_unseen.dtypes == "object"].index.tolist()

In [9]:
# Create a OneHotEncoder instance
from base64 import encode


enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable column incentive_status
encode_df = pd.DataFrame(enc.fit_transform(features_train[incentives]))
unseen_encode = pd.DataFrame(enc.fit_transform(features_unseen[incentives_unseen]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names_out(incentives)
unseen_encode.columns = enc.get_feature_names_out(incentives_unseen)
encode_df

Unnamed: 0,incentives_status_NO,incentives_status_YES
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0
...,...,...
1273,0.0,1.0
1274,0.0,1.0
1275,0.0,1.0
1276,0.0,1.0


In [10]:
# Merge the incentive_status_YES column back to original dataframe and drop the original incentives_status column
features_train = features_train.merge(encode_df, left_index=True, right_index=True, how="outer")
features_train = features_train.drop(columns=["incentives_status", "incentives_status_NO"], axis=1)
# features_train.head()

# Merge the unseen incentives back to unseen features
features_unseen = features_unseen.merge(unseen_encode, left_index=True, right_index=True, how="outer")
features_unseen = features_unseen.drop(columns=["incentives_status", "incentives_status_NO"], axis=1)

In [11]:
# Split preprocessed data and define target variable and features
y = features_train["City_EV_stations_locations"]
X = features_train.drop(["City_EV_stations_locations"],axis=1).values
y_unseen = features_unseen["City_EV_stations_locations"]
X_unseen = features_unseen.drop(["City_EV_stations_locations"],axis=1).values

# # Split the preprocessed data into a training and testing dataset
# X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

In [12]:
# # Create a StandardScaler instance
# scaler = StandardScaler()

# # fit the StandardScaler
# X_scaler = scaler.fit(X_train)

# # Scale the data
# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)
# X_unseen_scaled = X_scaler.transform(X_unseen)
# print(len(X_test_scaled))
# print(len(y_test))


## Use RandomForestRegressor model

In [13]:
# Define the model
regr = RandomForestRegressor()

# Fit the model
regr.fit(X, y)

# Make predictions
y_pred = regr.predict(X)
print(y_pred.shape)

(1278,)


In [14]:
# predict the accuracy score
score = r2_score(y, y_pred)
print('r2 score is: ', score)
print('mean squared error is : ', mean_squared_error(y, y_pred))
print('root mean squared error is : ', np.sqrt(mean_squared_error(y, y_pred)))

r2 score is:  0.9443830378157063
mean squared error is :  34.6351544487008
root mean squared error is :  5.885163927088251


In [15]:
# Integrate predictions to features_data dataframe
features_data["model_predictions"] = y_pred

In [16]:
# # Reorder columns to provide direct comparison of existing EV_charing_locations to model predictions
features_data = features_data[['id', 'city', 'province_id', 'province_name', 'City_Population',
       'Median_Income', 'Unemployment_Rate', 'City_Electricity_Rate',
       'City_EV_registrations','incentives_status', 'EV_Per_Province', 'No_Certificate_perc',
       'Secondary_HS_perc', 'Apprenticeship_perc', 'College_CEGEP_perc',
       'Univ_diploma_Below_Bachelor_perc', 'Univ_diploma_Above_Bachelor_perc',
       'City_EV_stations_locations','model_predictions']]

In [17]:
features_data

Unnamed: 0,id,city,province_id,province_name,City_Population,Median_Income,Unemployment_Rate,City_Electricity_Rate,City_EV_registrations,incentives_status,EV_Per_Province,No_Certificate_perc,Secondary_HS_perc,Apprenticeship_perc,College_CEGEP_perc,Univ_diploma_Below_Bachelor_perc,Univ_diploma_Above_Bachelor_perc,City_EV_stations_locations,model_predictions
0,1124885955,Battleford,SK,Saskatchewan,4065,42400,7.0,18.1,1,YES,2222,20.7,30.5,10.4,17.1,3.3,18.0,0,0.25
1,1124001550,Saint David,NB,New Brunswick,1529,37600,9.6,12.7,0,YES,2112,22.0,28.5,9.1,21.8,1.9,16.7,0,0.04
2,1124178262,St. George's,NL,Newfoundland and Labrador,1203,36800,12.5,13.8,0,YES,0,23.4,25.0,11.3,23.1,2.4,14.8,0,0.00
3,1124324905,Courtenay,BC,British Columbia,25599,40800,7.4,12.6,880,YES,148116,15.5,29.4,8.8,18.1,3.6,24.6,9,8.88
4,1124175333,Henryville,QC,Quebec,1464,40800,7.2,7.3,0,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5,2,1.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1273,1124139264,Hamiota,MB,Manitoba,2011,39200,7.6,9.9,0,YES,3290,22.0,29.6,7.7,17.7,2.9,20.1,0,0.00
1274,1124000427,Beaubassin East / Beaubassin-est,NB,New Brunswick,6376,37600,9.6,12.7,0,YES,2112,22.0,28.5,9.1,21.8,1.9,16.7,0,0.37
1275,1124735582,Kamloops,BC,British Columbia,100046,40800,7.4,12.6,1202,YES,148116,15.5,29.4,8.8,18.1,3.6,24.6,19,16.19
1276,1124715267,Beauport,QC,Quebec,7281,40800,7.2,7.3,0,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5,1,3.17


In [18]:
# Save prediction DataFrame to csv file
# Create filepath
filepath = Path('..\Datasets\Training_Station_Predictions.csv')

# Save the file
features_data.to_csv(filepath)

In [19]:
# Save the model
import pickle

# Save the trained model
RFREG_model = "manual_split_model.pkl"
# OMP_model = "omp_station_prediction_model.pkl"

with open(RFREG_model, 'wb') as file:
    pickle.dump(regr, file)

# with open(OMP_model, 'wb') as file:
#     pickle.dump(omp,file)

In [20]:
# Load the model
with open(RFREG_model, 'rb') as file:
    rfreg_model = pickle.load(file)

# with open(OMP_model, 'rb') as file:
#     omp_model = pickle.load(file)

rfreg_model
# omp_model

RandomForestRegressor()

In [21]:
# # Split features_unseen into features and target variables
# y_test = features_unseen["City_EV_stations_locations"]
# X_test = features_unseen.drop(["City_EV_stations_locations"],axis=1).values

In [22]:
# Use the reloaded model to calculate accuracy score and predict target values
rfreg_predictions = rfreg_model.predict(X_unseen)

In [23]:
# Check predictions score
rfreg_model.score(X_unseen,y_unseen)

0.46183377282326754

In [24]:
# # Use the reloaded model to calculate accuracy score and predict target values
# omp_predictions = omp_model.predict(X_unseen)

In [25]:
# # Check predictions score
# omp_model.score(X_unseen,y_unseen)

In [26]:
# Integrate predictions to features_unseen dataframe
unseen_data["model_predictions"] = rfreg_predictions

In [27]:
# Reorder columns to provide direct comparison of existing EV_charing_locations to model predictions
unseen_data = unseen_data[['id', 'city', 'province_id', 'province_name', 'City_Population',
       'Median_Income', 'Unemployment_Rate', 'City_Electricity_Rate',
       'City_EV_registrations','incentives_status', 'EV_Per_Province', 'No_Certificate_perc',
       'Secondary_HS_perc', 'Apprenticeship_perc', 'College_CEGEP_perc',
       'Univ_diploma_Below_Bachelor_perc', 'Univ_diploma_Above_Bachelor_perc',
       'City_EV_stations_locations','model_predictions']]

In [28]:
unseen_data

Unnamed: 0,id,city,province_id,province_name,City_Population,Median_Income,Unemployment_Rate,City_Electricity_Rate,City_EV_registrations,incentives_status,EV_Per_Province,No_Certificate_perc,Secondary_HS_perc,Apprenticeship_perc,College_CEGEP_perc,Univ_diploma_Below_Bachelor_perc,Univ_diploma_Above_Bachelor_perc,City_EV_stations_locations,model_predictions
0,1124823933,Quebec City,QC,Quebec,705103,40800,7.2,7.3,0,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5,149,76.71
1,1124567288,Hamilton,ON,Ontario,693645,41200,9.4,13.0,7616,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0,43,131.81
2,1124704011,Niagara Falls,ON,Ontario,308596,41200,9.4,13.0,269,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0,18,26.14
3,1124364273,Richmond Hill,ON,Ontario,195022,41200,9.4,13.0,2395,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0,20,47.46
4,1124541904,Oshawa,ON,Ontario,166000,41200,9.4,13.0,2274,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0,23,39.59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421,1124324069,Saint-Bonaventure,QC,Quebec,1017,40800,7.2,7.3,0,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5,0,0.06
422,1124000772,Longlaketon No. 219,SK,Saskatchewan,1016,42400,7.0,18.1,3,YES,2222,20.7,30.5,10.4,17.1,3.3,18.0,0,0.05
423,1124260692,Hudson Hope,BC,British Columbia,1012,40800,7.4,12.6,0,YES,148116,15.5,29.4,8.8,18.1,3.6,24.6,0,0.27
424,1124000733,Prince,ON,Ontario,1010,41200,9.4,13.0,0,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0,0,0.00


In [29]:
# Save prediction DataFrame to csv file
# Create filepath
filepath = Path('..\Datasets\Testing_Station_Predictions.csv')

# Save the file
unseen_data.to_csv(filepath)