In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# import r2_score module
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error


In [2]:
# Create filepath to dataset
path = "https://ev-project-datasets.s3.us-east-2.amazonaws.com/clean_table.csv"
EV_stations_df = pd.read_csv(path, encoding='latin-1', on_bad_lines="skip")

In [3]:
EV_stations_df

Unnamed: 0,id,city,province_id,province_name,City_Population,Median_Income,Unemployment_Rate,City_Electricity_Rate,City_EV_registrations,City_EV_stations_locations,incentives_status,EV_Per_Province,No_Certificate_perc,Secondary_HS_perc,Apprenticeship_perc,College_CEGEP_perc,Univ_diploma_Below_Bachelor_perc,Univ_diploma_Above_Bachelor_perc
0,1124279679,Toronto,ON,Ontario,5429524,41200,9.4,13.0,54739,294,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0
1,1124586170,Montreal,QC,Quebec,3519595,40800,7.2,7.3,61203,709,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5
2,1124825478,Vancouver,BC,British Columbia,2264823,40800,7.4,12.6,68894,218,YES,148116,15.5,29.4,8.8,18.1,3.6,24.6
3,1124690423,Calgary,AB,Alberta,1239220,44800,10.9,16.6,0,93,YES,0,16.9,27.9,9.7,19.2,3.0,23.4
4,1124290735,Edmonton,AB,Alberta,1062643,44800,10.9,16.6,0,70,YES,0,16.9,27.9,9.7,19.2,3.0,23.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1699,1124105436,Durham-Sud,QC,Quebec,1008,40800,7.2,7.3,0,0,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5
1700,1124850489,Melbourne,QC,Quebec,1004,40800,7.2,7.3,0,1,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5
1701,1124001339,Nipawin No. 487,SK,Saskatchewan,1004,42400,7.0,18.1,0,0,YES,2222,20.7,30.5,10.4,17.1,3.3,18.0
1702,1124001661,Duck Lake No. 463,SK,Saskatchewan,1004,42400,7.0,18.1,0,0,YES,2222,20.7,30.5,10.4,17.1,3.3,18.0


In [4]:
# Copy original EV_stations_df DataFrame to features_df
features_df = EV_stations_df.copy()
features_df.head()

Unnamed: 0,id,city,province_id,province_name,City_Population,Median_Income,Unemployment_Rate,City_Electricity_Rate,City_EV_registrations,City_EV_stations_locations,incentives_status,EV_Per_Province,No_Certificate_perc,Secondary_HS_perc,Apprenticeship_perc,College_CEGEP_perc,Univ_diploma_Below_Bachelor_perc,Univ_diploma_Above_Bachelor_perc
0,1124279679,Toronto,ON,Ontario,5429524,41200,9.4,13.0,54739,294,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0
1,1124586170,Montreal,QC,Quebec,3519595,40800,7.2,7.3,61203,709,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5
2,1124825478,Vancouver,BC,British Columbia,2264823,40800,7.4,12.6,68894,218,YES,148116,15.5,29.4,8.8,18.1,3.6,24.6
3,1124690423,Calgary,AB,Alberta,1239220,44800,10.9,16.6,0,93,YES,0,16.9,27.9,9.7,19.2,3.0,23.4
4,1124290735,Edmonton,AB,Alberta,1062643,44800,10.9,16.6,0,70,YES,0,16.9,27.9,9.7,19.2,3.0,23.4


In [5]:
# Withhold 20% of dataset before training model in order to test accuracy
features_data = features_df.sample(frac=0.65, random_state=42).reset_index(drop=True)
unseen_data = features_df.drop(features_data.index).reset_index(drop=True)

# review shape of modeling and prediction datasets
print('Data for Modeling: ' + str(features_data.shape))
print('Data for Prediction: ' + str(unseen_data.shape))

Data for Modeling: (1108, 18)
Data for Prediction: (596, 18)


In [6]:
# Drop the columns identifying each city: id, city, province_id, province_name
features_train = features_data.drop(columns=["id", "province_id", "city", "province_name"])
features_unseen = unseen_data.drop(columns=["id", "province_id", "city", "province_name"])
# print(features_df.shape)
features_train.head(10)

Unnamed: 0,City_Population,Median_Income,Unemployment_Rate,City_Electricity_Rate,City_EV_registrations,City_EV_stations_locations,incentives_status,EV_Per_Province,No_Certificate_perc,Secondary_HS_perc,Apprenticeship_perc,College_CEGEP_perc,Univ_diploma_Below_Bachelor_perc,Univ_diploma_Above_Bachelor_perc
0,2114,40800,7.2,7.3,0,0,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5
1,3570,37600,9.6,12.7,0,0,YES,2112,22.0,28.5,9.1,21.8,1.9,16.7
2,3277,44800,10.9,16.6,0,0,YES,0,16.9,27.9,9.7,19.2,3.0,23.4
3,10215,41200,9.4,13.0,56,0,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0
4,2196,36800,12.5,13.8,0,0,YES,0,23.4,25.0,11.3,23.1,2.4,14.8
5,1061,42400,7.0,18.1,0,0,YES,2222,20.7,30.5,10.4,17.1,3.3,18.0
6,14558,40800,7.2,7.3,0,1,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5
7,1218,41200,9.4,13.0,0,2,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0
8,6408,41200,9.4,13.0,17,8,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0
9,1713,40800,7.4,12.6,0,0,YES,148116,15.5,29.4,8.8,18.1,3.6,24.6


In [7]:
# Save categorical variable column incentives_status to variable for encoding
incentives = features_train.dtypes[features_train.dtypes == "object"].index.tolist()
incentives_unseen = features_unseen.dtypes[features_unseen.dtypes == "object"].index.tolist()

In [8]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable column incentive_status
encode_df = pd.DataFrame(enc.fit_transform(features_train[incentives]))
unseen_encode = pd.DataFrame(enc.fit_transform(features_unseen[incentives_unseen]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names_out(incentives)
unseen_encode.columns = enc.get_feature_names_out(incentives_unseen)

In [9]:
# Merge the incentive_status_YES column back to original dataframe and drop the original incentives_status column
features_train = features_train.merge(encode_df, left_index=True, right_index=True)
features_train = features_train.drop(columns=["incentives_status", "incentives_status_NO"], axis=1)
# features_train.head()

# Merge the unseen incentives back to unseen features
features_unseen = features_unseen.merge(unseen_encode, left_index=True, right_index=True)
features_unseen = features_unseen.drop(columns=["incentives_status", "incentives_status_NO"], axis=1)

In [10]:
# Split preprocessed data and define target variable and features
y = features_train["City_EV_stations_locations"]
X = features_train.drop(["City_EV_stations_locations"],axis=1).values

## Use RandomForestRegressor model

In [11]:
# Define the model
regr = RandomForestRegressor()

# Fit the model
regr.fit(X, y)

# Make predictions
y_pred = regr.predict(X)
print(y_pred.shape)

(1108,)


In [12]:
# predict the accuracy score
score = r2_score(y, y_pred)
print('r2 score is: ', score)
print('mean squared error is : ', mean_squared_error(y, y_pred))
print('root mean squared error is : ', np.sqrt(mean_squared_error(y, y_pred)))

r2 score is:  0.9576581805188837
mean squared error is :  10.038258047219232
root mean squared error is :  3.168321013915609


In [13]:
print(y_pred)

[0.11316667 0.06       0.11       ... 0.         1.13       0.90666667]


In [14]:
# check score
regr.score(X, y)

0.9576581805188837

In [15]:
# Integrate predictions to features_data dataframe
features_data["model_predictions"] = y_pred

In [16]:
# # Reorder columns to provide direct comparison of existing EV_charing_locations to model predictions
features_data = features_data[['id', 'city', 'province_id', 'province_name', 'City_Population',
       'Median_Income', 'Unemployment_Rate', 'City_Electricity_Rate',
       'City_EV_registrations','incentives_status', 'EV_Per_Province', 'No_Certificate_perc',
       'Secondary_HS_perc', 'Apprenticeship_perc', 'College_CEGEP_perc',
       'Univ_diploma_Below_Bachelor_perc', 'Univ_diploma_Above_Bachelor_perc',
       'City_EV_stations_locations','model_predictions']]

In [17]:
features_data.head()

Unnamed: 0,id,city,province_id,province_name,City_Population,Median_Income,Unemployment_Rate,City_Electricity_Rate,City_EV_registrations,incentives_status,EV_Per_Province,No_Certificate_perc,Secondary_HS_perc,Apprenticeship_perc,College_CEGEP_perc,Univ_diploma_Below_Bachelor_perc,Univ_diploma_Above_Bachelor_perc,City_EV_stations_locations,model_predictions
0,1124583281,Saint-Nazaire,QC,Quebec,2114,40800,7.2,7.3,0,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5,0,0.113167
1,1124001302,Atholville,NB,New Brunswick,3570,37600,9.6,12.7,0,YES,2112,22.0,28.5,9.1,21.8,1.9,16.7,0,0.06
2,1124360682,Penhold,AB,Alberta,3277,44800,10.9,16.6,0,YES,0,16.9,27.9,9.7,19.2,3.0,23.4,0,0.11
3,1124000802,North Dumfries,ON,Ontario,10215,41200,9.4,13.0,56,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0,0,0.19
4,1124836835,Twillingate,NL,Newfoundland and Labrador,2196,36800,12.5,13.8,0,YES,0,23.4,25.0,11.3,23.1,2.4,14.8,0,0.12


In [18]:
# Save prediction DataFrame to csv file
# Create filepath
filepath = Path('Training_Station_Predictions.csv')

# Save the file
features_data.to_csv(filepath)

In [19]:
# Save the model
import pickle

# Save the trained model
EV_station_predictions = "station_prediction_model.pkl"

with open(EV_station_predictions, 'wb') as file:
    pickle.dump(regr, file)

In [20]:
# Load the model
with open(EV_station_predictions, 'rb') as file:
    prediction_model = pickle.load(file)

prediction_model

RandomForestRegressor()

In [21]:
# Split features_unseen into features and target variables
y_test = features_unseen["City_EV_stations_locations"]
X_test = features_unseen.drop(["City_EV_stations_locations"],axis=1).values

In [22]:
# Use the reloaded model to calculate accuracy score and predict target values
predictions = prediction_model.predict(X_test)
print(predictions.shape)

(596,)


In [23]:
# Check predictions score
prediction_model.score(X_test,y_test)

0.44369597876312905

In [24]:
# Integrate predictions to features_unseen dataframe
unseen_data["model_predictions"] = predictions

In [25]:
# Reorder columns to provide direct comparison of existing EV_charing_locations to model predictions
unseen_data = unseen_data[['id', 'city', 'province_id', 'province_name', 'City_Population',
       'Median_Income', 'Unemployment_Rate', 'City_Electricity_Rate',
       'City_EV_registrations','incentives_status', 'EV_Per_Province', 'No_Certificate_perc',
       'Secondary_HS_perc', 'Apprenticeship_perc', 'College_CEGEP_perc',
       'Univ_diploma_Below_Bachelor_perc', 'Univ_diploma_Above_Bachelor_perc',
       'City_EV_stations_locations','model_predictions']]

In [26]:
unseen_data.sample(10)

Unnamed: 0,id,city,province_id,province_name,City_Population,Median_Income,Unemployment_Rate,City_Electricity_Rate,City_EV_registrations,incentives_status,EV_Per_Province,No_Certificate_perc,Secondary_HS_perc,Apprenticeship_perc,College_CEGEP_perc,Univ_diploma_Below_Bachelor_perc,Univ_diploma_Above_Bachelor_perc,City_EV_stations_locations,model_predictions
28,1124216164,Pointe-Lebel,QC,Quebec,1973,40800,7.2,7.3,0,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5,0,1.668333
5,1124001796,Clanwilliam-Erickson,MB,Manitoba,2011,39200,7.6,9.9,0,YES,3290,22.0,29.6,7.7,17.7,2.9,20.1,0,0.0
535,1124000632,Weyburn No. 67,SK,Saskatchewan,1064,42400,7.0,18.1,0,YES,2222,20.7,30.5,10.4,17.1,3.3,18.0,0,0.0
120,1124722091,Saint-Placide,QC,Quebec,1715,40800,7.2,7.3,0,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5,0,0.0
477,1124591131,Saint-Modeste,QC,Quebec,1128,40800,7.2,7.3,0,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5,0,0.0
212,1124744995,Hilliers,BC,British Columbia,1540,40800,7.4,12.6,0,YES,148116,15.5,29.4,8.8,18.1,3.6,24.6,0,0.17
24,1124759374,Kedgwick,NB,New Brunswick,1979,37600,9.6,12.7,0,YES,2112,22.0,28.5,9.1,21.8,1.9,16.7,0,0.05
498,1124001996,Air Ronge,SK,Saskatchewan,1106,42400,7.0,18.1,0,YES,2222,20.7,30.5,10.4,17.1,3.3,18.0,0,0.0
169,1124000692,Royston,BC,British Columbia,1616,40800,7.4,12.6,0,YES,148116,15.5,29.4,8.8,18.1,3.6,24.6,0,0.29
428,1124001357,Manning,AB,Alberta,1183,44800,10.9,16.6,0,YES,0,16.9,27.9,9.7,19.2,3.0,23.4,0,0.28


In [27]:
# Save prediction DataFrame to csv file
# Create filepath
filepath = Path('Testing_Station_Predictions.csv')

# Save the file
unseen_data.to_csv(filepath)