In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# import r2_score module
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error


In [2]:
# Create filepath to dataset
path = "https://ev-project-datasets.s3.us-east-2.amazonaws.com/clean_table.csv"
EV_stations_df = pd.read_csv(path, encoding='latin-1', on_bad_lines="skip")

In [3]:
EV_stations_df

Unnamed: 0,id,city,province_id,province_name,City_Population,Median_Income,Unemployment_Rate,City_Electricity_Rate,City_EV_registrations,City_EV_stations_locations,incentives_status,EV_Per_Province,No_Certificate_perc,Secondary_HS_perc,Apprenticeship_perc,College_CEGEP_perc,Univ_diploma_Below_Bachelor_perc,Univ_diploma_Above_Bachelor_perc
0,1124279679,Toronto,ON,Ontario,5429524,41200,9.4,13.0,54739,294,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0
1,1124586170,Montreal,QC,Quebec,3519595,40800,7.2,7.3,61203,709,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5
2,1124825478,Vancouver,BC,British Columbia,2264823,40800,7.4,12.6,68894,218,YES,148116,15.5,29.4,8.8,18.1,3.6,24.6
3,1124690423,Calgary,AB,Alberta,1239220,44800,10.9,16.6,0,93,YES,0,16.9,27.9,9.7,19.2,3.0,23.4
4,1124290735,Edmonton,AB,Alberta,1062643,44800,10.9,16.6,0,70,YES,0,16.9,27.9,9.7,19.2,3.0,23.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1699,1124105436,Durham-Sud,QC,Quebec,1008,40800,7.2,7.3,0,0,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5
1700,1124850489,Melbourne,QC,Quebec,1004,40800,7.2,7.3,0,1,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5
1701,1124001339,Nipawin No. 487,SK,Saskatchewan,1004,42400,7.0,18.1,0,0,YES,2222,20.7,30.5,10.4,17.1,3.3,18.0
1702,1124001661,Duck Lake No. 463,SK,Saskatchewan,1004,42400,7.0,18.1,0,0,YES,2222,20.7,30.5,10.4,17.1,3.3,18.0


In [4]:
# Copy original EV_stations_df DataFrame to features_df
features_df = EV_stations_df.copy()
features_df.head()

Unnamed: 0,id,city,province_id,province_name,City_Population,Median_Income,Unemployment_Rate,City_Electricity_Rate,City_EV_registrations,City_EV_stations_locations,incentives_status,EV_Per_Province,No_Certificate_perc,Secondary_HS_perc,Apprenticeship_perc,College_CEGEP_perc,Univ_diploma_Below_Bachelor_perc,Univ_diploma_Above_Bachelor_perc
0,1124279679,Toronto,ON,Ontario,5429524,41200,9.4,13.0,54739,294,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0
1,1124586170,Montreal,QC,Quebec,3519595,40800,7.2,7.3,61203,709,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5
2,1124825478,Vancouver,BC,British Columbia,2264823,40800,7.4,12.6,68894,218,YES,148116,15.5,29.4,8.8,18.1,3.6,24.6
3,1124690423,Calgary,AB,Alberta,1239220,44800,10.9,16.6,0,93,YES,0,16.9,27.9,9.7,19.2,3.0,23.4
4,1124290735,Edmonton,AB,Alberta,1062643,44800,10.9,16.6,0,70,YES,0,16.9,27.9,9.7,19.2,3.0,23.4


In [5]:
# Drop the columns identifying each city: id, city, province_id, province_name
features_df = features_df.drop(columns=["id", "province_id", "city", "province_name"])

features_df.head(10)

Unnamed: 0,City_Population,Median_Income,Unemployment_Rate,City_Electricity_Rate,City_EV_registrations,City_EV_stations_locations,incentives_status,EV_Per_Province,No_Certificate_perc,Secondary_HS_perc,Apprenticeship_perc,College_CEGEP_perc,Univ_diploma_Below_Bachelor_perc,Univ_diploma_Above_Bachelor_perc
0,5429524,41200,9.4,13.0,54739,294,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0
1,3519595,40800,7.2,7.3,61203,709,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5
2,2264823,40800,7.4,12.6,68894,218,YES,148116,15.5,29.4,8.8,18.1,3.6,24.6
3,1239220,44800,10.9,16.6,0,93,YES,0,16.9,27.9,9.7,19.2,3.0,23.4
4,1062643,44800,10.9,16.6,0,70,YES,0,16.9,27.9,9.7,19.2,3.0,23.4
5,989567,41200,9.4,13.0,6091,159,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0
6,721599,41200,9.4,13.0,4863,171,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0
7,705244,39200,7.6,9.9,0,45,YES,3290,22.0,29.6,7.7,17.7,2.9,20.1
8,705103,40800,7.2,7.3,0,149,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5
9,693645,41200,9.4,13.0,7616,43,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0


In [6]:
# Save categorical variable column incentives_status to variable for encoding
incentives = features_df.dtypes[features_df.dtypes == "object"].index.tolist()

In [7]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable column incentive_status
encode_df = pd.DataFrame(enc.fit_transform(features_df[incentives]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names_out(incentives)

In [8]:
# Merge the incentive_status_YES column back to original dataframe and drop the original incentives_status column
features_df = features_df.merge(encode_df, left_index=True, right_index=True)
features_df = features_df.drop(columns=["incentives_status", "incentives_status_NO"], axis=1)

In [9]:
# Split preprocessed data and define target variable and features
y = features_df["City_EV_stations_locations"]
X = features_df.drop(["City_EV_stations_locations"],axis=1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

In [10]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Use RandomForestRegressor model

In [11]:
# Define the model
regr = RandomForestRegressor()

# Fit the model
regr_model = regr.fit(X_train_scaled, y_train)

# Make predictions
y_pred = regr_model.predict(X_test_scaled)
print(y_pred.shape)

(426,)


In [12]:
# predict the accuracy score
score = r2_score(y_test, y_pred)
print('r2 score is: ', score)
print('mean squared error is : ', mean_squared_error(y_test, y_pred))
print('root mean squared error is : ', np.sqrt(mean_squared_error(y_test, y_pred)))

r2 score is:  0.7498756844115535
mean squared error is :  12.935804250097146
root mean squared error is :  3.596637909228165


In [13]:
print(y_pred)

[2.61000000e+00 1.72000000e+00 1.13000000e+00 2.58500000e-01
 4.50000000e-01 4.06000000e+00 6.00000000e-01 4.59000000e+00
 3.23000000e+00 7.80000000e-01 8.70000000e-01 2.70000000e-01
 0.00000000e+00 2.36500000e+01 6.10000000e-01 2.39000000e+00
 4.40000000e-01 2.22000000e+00 1.93000000e+00 1.70000000e-01
 4.32000000e+00 3.71000000e+00 1.60000000e+00 7.40000000e-01
 5.73000000e+00 7.27000000e+00 4.00000000e-02 7.70000000e-01
 0.00000000e+00 3.88000000e+00 1.21100000e+01 5.41666667e-02
 1.07000000e+00 4.90000000e-01 2.93000000e+00 2.00000000e-02
 1.26000000e+00 2.56500000e-01 2.50000000e-01 6.80000000e-01
 1.93666667e-01 1.25000000e+00 9.00000000e-02 1.16000000e+00
 6.00000000e-01 6.90000000e-01 4.60000000e-01 1.50000000e-02
 1.93666667e-01 1.10000000e-01 4.90000000e-01 8.60000000e-01
 0.00000000e+00 4.56000000e+00 1.22000000e+00 2.61800000e+01
 1.00500000e+01 2.45000000e+00 3.25000000e+00 5.54000000e+00
 4.91000000e+00 6.32400000e+01 7.66666667e-01 0.00000000e+00
 1.73333333e-01 6.000000

In [14]:
# check score
regr_model.score(X_test_scaled, y_test)

0.7498756844115535

In [15]:
# Save the model
import pickle

# Save the trained model
EV_station_predictions = "station_prediction_model.pkl"

with open(EV_station_predictions, 'wb') as file:
    pickle.dump(regr_model, file)

In [16]:
# Load the model
with open(EV_station_predictions, 'rb') as file:
    prediction_model = pickle.load(file)

prediction_model

RandomForestRegressor()

In [17]:
full_preds = prediction_model.predict(X_train_scaled)
print(full_preds.shape)
full_preds

(1278,)


array([ 0.        ,  6.76      ,  0.08333333, ..., 26.34      ,
        0.92      ,  0.07      ])

In [18]:
exist_train = pd.DataFrame(y_train)
exist_train

Unnamed: 0,City_EV_stations_locations
1354,0
298,8
1660,0
49,18
623,2
...,...
310,0
105,0
40,31
470,0


In [19]:
train_predictions = exist_train
train_predictions["model_predictions"] = full_preds
train_predictions

Unnamed: 0,City_EV_stations_locations,model_predictions
1354,0,0.000000
298,8,6.760000
1660,0,0.083333
49,18,14.320000
623,2,1.510000
...,...,...
310,0,0.690000
105,0,1.980000
40,31,26.340000
470,0,0.920000


In [26]:
# Merge the training predictions and the EV_stations_df
EV_train_predictions = EV_stations_df.merge(train_predictions, left_index=True, right_index=True)
EV_train_predictions = EV_train_predictions.drop(columns="City_EV_stations_locations_x", axis=1)
EV_train_predictions

Unnamed: 0,id,city,province_id,province_name,City_Population,Median_Income,Unemployment_Rate,City_Electricity_Rate,City_EV_registrations,incentives_status,EV_Per_Province,No_Certificate_perc,Secondary_HS_perc,Apprenticeship_perc,College_CEGEP_perc,Univ_diploma_Below_Bachelor_perc,Univ_diploma_Above_Bachelor_perc,City_EV_stations_locations_y,model_predictions
0,1124279679,Toronto,ON,Ontario,5429524,41200,9.4,13.0,54739,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0,294,355.20
1,1124586170,Montreal,QC,Quebec,3519595,40800,7.2,7.3,61203,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5,709,515.86
2,1124825478,Vancouver,BC,British Columbia,2264823,40800,7.4,12.6,68894,YES,148116,15.5,29.4,8.8,18.1,3.6,24.6,218,293.29
3,1124690423,Calgary,AB,Alberta,1239220,44800,10.9,16.6,0,YES,0,16.9,27.9,9.7,19.2,3.0,23.4,93,110.13
5,1124399363,Ottawa,ON,Ontario,989567,41200,9.4,13.0,6091,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0,159,154.61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1694,1124000091,Assiginack,ON,Ontario,1013,41200,9.4,13.0,1,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0,0,0.09
1696,1124260692,Hudson Hope,BC,British Columbia,1012,40800,7.4,12.6,0,YES,148116,15.5,29.4,8.8,18.1,3.6,24.6,0,0.12
1699,1124105436,Durham-Sud,QC,Quebec,1008,40800,7.2,7.3,0,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5,0,0.01
1701,1124001339,Nipawin No. 487,SK,Saskatchewan,1004,42400,7.0,18.1,0,YES,2222,20.7,30.5,10.4,17.1,3.3,18.0,0,0.22


In [28]:
# Save prediction DataFrame to csv file
# Create filepath
filepath = Path('..\Datasets\Training_Station_Predictions.csv')

# Save the file
EV_train_predictions.to_csv(filepath)

In [29]:
existing = pd.DataFrame(y_test)
existing

Unnamed: 0,City_EV_stations_locations
220,5
504,3
488,0
1262,5
1135,0
...,...
693,7
1109,0
1189,0
990,0


In [30]:
predictions_df = existing
predictions_df["model_predictions"] = y_pred
predictions_df

Unnamed: 0,City_EV_stations_locations,model_predictions
220,5,2.6100
504,3,1.7200
488,0,1.1300
1262,5,0.2585
1135,0,0.4500
...,...,...
693,7,0.4100
1109,0,0.0000
1189,0,0.0000
990,0,0.0400


In [31]:
# Merge the training predictions and the EV_stations_df
EV_test_predictions = EV_stations_df.merge(predictions_df, left_index=True, right_index=True)
EV_test_predictions = EV_test_predictions.drop(columns="City_EV_stations_locations_x", axis=1)
EV_test_predictions

Unnamed: 0,id,city,province_id,province_name,City_Population,Median_Income,Unemployment_Rate,City_Electricity_Rate,City_EV_registrations,incentives_status,EV_Per_Province,No_Certificate_perc,Secondary_HS_perc,Apprenticeship_perc,College_CEGEP_perc,Univ_diploma_Below_Bachelor_perc,Univ_diploma_Above_Bachelor_perc,City_EV_stations_locations_y,model_predictions
4,1124290735,Edmonton,AB,Alberta,1062643,44800,10.9,16.6,0,YES,0,16.9,27.9,9.7,19.2,3.0,23.4,70,112.01
12,1124158530,Kitchener,ON,Ontario,470015,41200,9.4,13.0,1231,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0,38,56.68
19,1124704011,Niagara Falls,ON,Ontario,308596,41200,9.4,13.0,269,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0,18,26.18
22,1124261024,Windsor,ON,Ontario,276165,41200,9.4,13.0,1601,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0,33,30.50
25,1124817304,Burnaby,BC,British Columbia,232755,40800,7.4,12.6,4454,YES,148116,15.5,29.4,8.8,18.1,3.6,24.6,71,63.24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1695,1124001084,Brébeuf,QC,Quebec,1012,40800,7.2,7.3,0,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5,0,0.01
1697,1124000733,Prince,ON,Ontario,1010,41200,9.4,13.0,0,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0,0,0.09
1698,1124218916,Baie-du-Febvre,QC,Quebec,1010,40800,7.2,7.3,0,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5,1,0.01
1700,1124850489,Melbourne,QC,Quebec,1004,40800,7.2,7.3,0,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5,1,0.01


In [32]:
# Save prediction DataFrame to csv file
# Create filepath
filepath = Path('..\Datasets\Testing_Station_Predictions.csv')

# Save the file
EV_test_predictions.to_csv(filepath)