In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Import regression metrics (r2_score and mean_squared_error are functions in sklearn.metrics)
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error


In [2]:
# URL of the cleaned EV dataset (CSV hosted on S3)
path = "https://ev-project-datasets.s3.us-east-2.amazonaws.com/clean_table.csv"

# Read the CSV as latin-1 text. NOTE: on_bad_lines="skip" drops malformed rows
# without raising, so the loaded row count may be lower than the file's.
EV_stations_df = pd.read_csv(
    path,
    on_bad_lines="skip",
    encoding="latin-1",
)

In [3]:
# Display the loaded DataFrame to eyeball its rows and columns
EV_stations_df

Unnamed: 0,id,city,province_id,province_name,City_Population,Median_Income,Unemployment_Rate,City_Electricity_Rate,City_EV_registrations,City_EV_stations_locations,incentives_status,EV_Per_Province,No_Certificate_perc,Secondary_HS_perc,Apprenticeship_perc,College_CEGEP_perc,Univ_diploma_Below_Bachelor_perc,Univ_diploma_Above_Bachelor_perc
0,1124279679,Toronto,ON,Ontario,5429524,41200,9.4,13.0,54739,294,NO,64941,17.5,27.4,6.0,20.8,2.2,26.0
1,1124586170,Montreal,QC,Quebec,3519595,40800,7.2,7.3,0,709,YES,115611,19.9,21.5,16.9,17.6,3.6,20.5
2,1124825478,Vancouver,BC,British Columbia,2264823,40800,7.4,12.6,68894,218,YES,67370,15.5,29.4,8.8,18.1,3.6,24.6
3,1124690423,Calgary,AB,Alberta,1239220,44800,10.9,16.6,0,93,YES,0,16.9,27.9,9.7,19.2,3.0,23.4
4,1124290735,Edmonton,AB,Alberta,1062643,44800,10.9,16.6,0,70,YES,0,16.9,27.9,9.7,19.2,3.0,23.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1697,1124105436,Durham-Sud,QC,Quebec,1008,40800,7.2,7.3,0,0,YES,115611,19.9,21.5,16.9,17.6,3.6,20.5
1698,1124850489,Melbourne,QC,Quebec,1004,40800,7.2,7.3,0,1,YES,115611,19.9,21.5,16.9,17.6,3.6,20.5
1699,1124001339,Nipawin No. 487,SK,Saskatchewan,1004,42400,7.0,18.1,0,0,YES,981,20.7,30.5,10.4,17.1,3.3,18.0
1700,1124001661,Duck Lake No. 463,SK,Saskatchewan,1004,42400,7.0,18.1,0,0,YES,981,20.7,30.5,10.4,17.1,3.3,18.0


In [4]:
# Check column dtypes and non-null counts; object-dtype columns are the
# non-numeric ones that must be dropped or encoded before modelling
EV_stations_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1702 entries, 0 to 1701
Data columns (total 18 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   id                                1702 non-null   int64  
 1   city                              1702 non-null   object 
 2   province_id                       1702 non-null   object 
 3   province_name                     1702 non-null   object 
 4   City_Population                   1702 non-null   int64  
 5   Median_Income                     1702 non-null   int64  
 6   Unemployment_Rate                 1702 non-null   float64
 7   City_Electricity_Rate             1702 non-null   float64
 8   City_EV_registrations             1702 non-null   int64  
 9   City_EV_stations_locations        1702 non-null   int64  
 10  incentives_status                 1702 non-null   object 
 11  EV_Per_Province                   1702 non-null   int64  
 12  No_Cer

In [5]:
# List the column names available in the dataset
EV_stations_df.columns

Index(['id', 'city', 'province_id', 'province_name', 'City_Population',
       'Median_Income', 'Unemployment_Rate', 'City_Electricity_Rate',
       'City_EV_registrations', 'City_EV_stations_locations',
       'incentives_status', 'EV_Per_Province', 'No_Certificate_perc',
       'Secondary_HS_perc', 'Apprenticeship_perc', 'College_CEGEP_perc',
       'Univ_diploma_Below_Bachelor_perc', 'Univ_diploma_Above_Bachelor_perc'],
      dtype='object')

In [6]:
# Keep the city / province labels in a separate, independent DataFrame
city_prov_df = EV_stations_df[["city", "province_name"]].copy()
print(city_prov_df.shape)
city_prov_df.head()

(1702, 2)


Unnamed: 0,city,province_name
0,Toronto,Ontario
1,Montreal,Quebec
2,Vancouver,British Columbia
3,Calgary,Alberta
4,Edmonton,Alberta


In [7]:
# Copy original EV_stations_df DataFrame to features_df so the feature
# preparation below works on its own table instead of the loaded data
features_df = EV_stations_df.copy()
features_df.head()

Unnamed: 0,id,city,province_id,province_name,City_Population,Median_Income,Unemployment_Rate,City_Electricity_Rate,City_EV_registrations,City_EV_stations_locations,incentives_status,EV_Per_Province,No_Certificate_perc,Secondary_HS_perc,Apprenticeship_perc,College_CEGEP_perc,Univ_diploma_Below_Bachelor_perc,Univ_diploma_Above_Bachelor_perc
0,1124279679,Toronto,ON,Ontario,5429524,41200,9.4,13.0,54739,294,NO,64941,17.5,27.4,6.0,20.8,2.2,26.0
1,1124586170,Montreal,QC,Quebec,3519595,40800,7.2,7.3,0,709,YES,115611,19.9,21.5,16.9,17.6,3.6,20.5
2,1124825478,Vancouver,BC,British Columbia,2264823,40800,7.4,12.6,68894,218,YES,67370,15.5,29.4,8.8,18.1,3.6,24.6
3,1124690423,Calgary,AB,Alberta,1239220,44800,10.9,16.6,0,93,YES,0,16.9,27.9,9.7,19.2,3.0,23.4
4,1124290735,Edmonton,AB,Alberta,1062643,44800,10.9,16.6,0,70,YES,0,16.9,27.9,9.7,19.2,3.0,23.4


In [8]:
# The identifier columns (id, city, province_id, province_name) label a city
# rather than describe it, so they are removed from the feature table
id_columns = ["id", "province_id", "city", "province_name"]
features_df = features_df.drop(columns=id_columns)
features_df.head(10)

Unnamed: 0,City_Population,Median_Income,Unemployment_Rate,City_Electricity_Rate,City_EV_registrations,City_EV_stations_locations,incentives_status,EV_Per_Province,No_Certificate_perc,Secondary_HS_perc,Apprenticeship_perc,College_CEGEP_perc,Univ_diploma_Below_Bachelor_perc,Univ_diploma_Above_Bachelor_perc
0,5429524,41200,9.4,13.0,54739,294,NO,64941,17.5,27.4,6.0,20.8,2.2,26.0
1,3519595,40800,7.2,7.3,0,709,YES,115611,19.9,21.5,16.9,17.6,3.6,20.5
2,2264823,40800,7.4,12.6,68894,218,YES,67370,15.5,29.4,8.8,18.1,3.6,24.6
3,1239220,44800,10.9,16.6,0,93,YES,0,16.9,27.9,9.7,19.2,3.0,23.4
4,1062643,44800,10.9,16.6,0,70,YES,0,16.9,27.9,9.7,19.2,3.0,23.4
5,989567,41200,9.4,13.0,6091,159,NO,64941,17.5,27.4,6.0,20.8,2.2,26.0
6,721599,41200,9.4,13.0,4863,171,NO,64941,17.5,27.4,6.0,20.8,2.2,26.0
7,705244,39200,7.6,9.9,0,45,YES,1444,22.0,29.6,7.7,17.7,2.9,20.1
8,705103,40800,7.2,7.3,0,149,YES,115611,19.9,21.5,16.9,17.6,3.6,20.5
9,693645,41200,9.4,13.0,7616,43,NO,64941,17.5,27.4,6.0,20.8,2.2,26.0


In [9]:
# Collect the names of the object-dtype (categorical) columns that need
# encoding; at this stage that is the incentives_status column
incentives = [
    name for name, dtype in features_df.dtypes.items() if dtype == "object"
]

In [10]:
# Create a OneHotEncoder instance that returns a dense array.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# later removed, so try the new name first and fall back for older releases.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical column(s) listed in
# `incentives`. The encoded variable names are used as column labels, and the
# frame reuses features_df's index so any later index-based merge lines up
# row for row.
encode_df = pd.DataFrame(
    enc.fit_transform(features_df[incentives]),
    columns=enc.get_feature_names_out(incentives),
    index=features_df.index,
)
encode_df.head()

Unnamed: 0,incentives_status_NO,incentives_status_YES
0,1.0,0.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0


In [11]:
# Attach the one-hot columns by row index, then discard the raw
# incentives_status text column and the incentives_status_NO dummy, leaving
# incentives_status_YES as the single 0/1 indicator
features_df = (
    features_df
    .merge(encode_df, left_index=True, right_index=True)
    .drop(columns=["incentives_status", "incentives_status_NO"])
)
features_df.head()

Unnamed: 0,City_Population,Median_Income,Unemployment_Rate,City_Electricity_Rate,City_EV_registrations,City_EV_stations_locations,EV_Per_Province,No_Certificate_perc,Secondary_HS_perc,Apprenticeship_perc,College_CEGEP_perc,Univ_diploma_Below_Bachelor_perc,Univ_diploma_Above_Bachelor_perc,incentives_status_YES
0,5429524,41200,9.4,13.0,54739,294,64941,17.5,27.4,6.0,20.8,2.2,26.0,0.0
1,3519595,40800,7.2,7.3,0,709,115611,19.9,21.5,16.9,17.6,3.6,20.5,1.0
2,2264823,40800,7.4,12.6,68894,218,67370,15.5,29.4,8.8,18.1,3.6,24.6,1.0
3,1239220,44800,10.9,16.6,0,93,0,16.9,27.9,9.7,19.2,3.0,23.4,1.0
4,1062643,44800,10.9,16.6,0,70,0,16.9,27.9,9.7,19.2,3.0,23.4,1.0


In [12]:
# Target: the City_EV_stations_locations column.
# Features: every other remaining column, converted to a NumPy array.
target_col = "City_EV_stations_locations"
y = features_df[target_col]
X = features_df.drop(columns=[target_col]).values

## Begin Tuning of Model

In [13]:
# Define the model. random_state is fixed so the fitted forest, and therefore
# the predictions and scores derived from it, are reproducible across runs.
regr = RandomForestRegressor(random_state=42)

# Fit the model on the full dataset
regr.fit(X, y)

# Make predictions for the same rows the model was fitted on (in-sample);
# no hold-out set is used here, so scores computed from y_pred describe how
# well the forest fits this data, not how well it generalizes
y_pred = regr.predict(X)
print(y_pred.shape)

(1702,)


In [14]:
# Goodness-of-fit metrics comparing the actual values (y) with the predictions:
# R^2, mean squared error, and its square root
score = r2_score(y, y_pred)
mse = mean_squared_error(y, y_pred)
print('r2 score is: ', score)
print('mean squared error is : ', mse)
print('root mean squared error is : ', np.sqrt(mse))

r2 score is:  0.9231148950932653
mean squared error is :  37.63993963222776
root mean squared error is :  6.135139740236383


In [15]:
# Print the prediction array (NumPy abbreviates long arrays with "...")
print(y_pred)

[3.8449e+02 4.9671e+02 1.9706e+02 ... 1.2000e-01 1.2000e-01 0.0000e+00]


In [16]:
# check score: the estimator's own score method (R^2 for a regressor), called
# on the same X and y that were passed to fit
regr.score(X, y)

0.9231148950932653

In [17]:
# Integrate predictions to EV_stations_df dataframe.
# y_pred is a plain array, so values are assigned by position (row i receives
# y_pred[i]); note that this adds the column to EV_stations_df in place.
EV_stations_df["model_predictions"] = y_pred
EV_stations_df

Unnamed: 0,id,city,province_id,province_name,City_Population,Median_Income,Unemployment_Rate,City_Electricity_Rate,City_EV_registrations,City_EV_stations_locations,incentives_status,EV_Per_Province,No_Certificate_perc,Secondary_HS_perc,Apprenticeship_perc,College_CEGEP_perc,Univ_diploma_Below_Bachelor_perc,Univ_diploma_Above_Bachelor_perc,model_predictions
0,1124279679,Toronto,ON,Ontario,5429524,41200,9.4,13.0,54739,294,NO,64941,17.5,27.4,6.0,20.8,2.2,26.0,384.49
1,1124586170,Montreal,QC,Quebec,3519595,40800,7.2,7.3,0,709,YES,115611,19.9,21.5,16.9,17.6,3.6,20.5,496.71
2,1124825478,Vancouver,BC,British Columbia,2264823,40800,7.4,12.6,68894,218,YES,67370,15.5,29.4,8.8,18.1,3.6,24.6,197.06
3,1124690423,Calgary,AB,Alberta,1239220,44800,10.9,16.6,0,93,YES,0,16.9,27.9,9.7,19.2,3.0,23.4,93.76
4,1124290735,Edmonton,AB,Alberta,1062643,44800,10.9,16.6,0,70,YES,0,16.9,27.9,9.7,19.2,3.0,23.4,85.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1697,1124105436,Durham-Sud,QC,Quebec,1008,40800,7.2,7.3,0,0,YES,115611,19.9,21.5,16.9,17.6,3.6,20.5,0.20
1698,1124850489,Melbourne,QC,Quebec,1004,40800,7.2,7.3,0,1,YES,115611,19.9,21.5,16.9,17.6,3.6,20.5,0.71
1699,1124001339,Nipawin No. 487,SK,Saskatchewan,1004,42400,7.0,18.1,0,0,YES,981,20.7,30.5,10.4,17.1,3.3,18.0,0.12
1700,1124001661,Duck Lake No. 463,SK,Saskatchewan,1004,42400,7.0,18.1,0,0,YES,981,20.7,30.5,10.4,17.1,3.3,18.0,0.12


In [18]:
# Reorder columns so the existing City_EV_stations_locations counts sit
# directly beside model_predictions for side-by-side comparison
column_order = [
    'id', 'city', 'province_id', 'province_name',
    'City_Population', 'Median_Income', 'Unemployment_Rate',
    'City_Electricity_Rate', 'City_EV_registrations', 'incentives_status',
    'EV_Per_Province', 'No_Certificate_perc', 'Secondary_HS_perc',
    'Apprenticeship_perc', 'College_CEGEP_perc',
    'Univ_diploma_Below_Bachelor_perc', 'Univ_diploma_Above_Bachelor_perc',
    'City_EV_stations_locations', 'model_predictions',
]
EV_stations_df = EV_stations_df[column_order]

In [19]:
# Preview the first rows: actual station counts next to the model predictions
EV_stations_df.head()

Unnamed: 0,id,city,province_id,province_name,City_Population,Median_Income,Unemployment_Rate,City_Electricity_Rate,City_EV_registrations,incentives_status,EV_Per_Province,No_Certificate_perc,Secondary_HS_perc,Apprenticeship_perc,College_CEGEP_perc,Univ_diploma_Below_Bachelor_perc,Univ_diploma_Above_Bachelor_perc,City_EV_stations_locations,model_predictions
0,1124279679,Toronto,ON,Ontario,5429524,41200,9.4,13.0,54739,NO,64941,17.5,27.4,6.0,20.8,2.2,26.0,294,384.49
1,1124586170,Montreal,QC,Quebec,3519595,40800,7.2,7.3,0,YES,115611,19.9,21.5,16.9,17.6,3.6,20.5,709,496.71
2,1124825478,Vancouver,BC,British Columbia,2264823,40800,7.4,12.6,68894,YES,67370,15.5,29.4,8.8,18.1,3.6,24.6,218,197.06
3,1124690423,Calgary,AB,Alberta,1239220,44800,10.9,16.6,0,YES,0,16.9,27.9,9.7,19.2,3.0,23.4,93,93.76
4,1124290735,Edmonton,AB,Alberta,1062643,44800,10.9,16.6,0,YES,0,16.9,27.9,9.7,19.2,3.0,23.4,70,85.25


In [22]:
# Persist the fitted regressor to disk with pickle
import pickle

# File name for the serialized model
EV_station_predictions = "station_prediction_model.pkl"

with open(EV_station_predictions, mode="wb") as model_file:
    pickle.dump(regr, model_file)

In [21]:
# Save prediction DataFrame to csv file
# Create filepath (relative, so it resolves against the current working directory)
filepath = Path('Station_Predictions.csv')

# Save the file. to_csv writes the row index as an unnamed first column by
# default, so readers of this file should expect that extra column.
EV_stations_df.to_csv(filepath)