In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# import r2_score module
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error


In [2]:
# Create filepath to dataset
path = "https://ev-project-datasets.s3.us-east-2.amazonaws.com/clean_table.csv"
EV_stations_df = pd.read_csv(path, encoding='latin-1', on_bad_lines="skip")

In [3]:
EV_stations_df

Unnamed: 0,id,city,province_id,province_name,City_Population,Median_Income,Unemployment_Rate,City_Electricity_Rate,City_EV_registrations,City_EV_stations_locations,incentives_status,EV_Per_Province,No_Certificate_perc,Secondary_HS_perc,Apprenticeship_perc,College_CEGEP_perc,Univ_diploma_Below_Bachelor_perc,Univ_diploma_Above_Bachelor_perc
0,1124279679,Toronto,ON,Ontario,5429524,41200,9.4,13.0,54739,294,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0
1,1124586170,Montreal,QC,Quebec,3519595,40800,7.2,7.3,61203,709,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5
2,1124825478,Vancouver,BC,British Columbia,2264823,40800,7.4,12.6,68894,218,YES,148116,15.5,29.4,8.8,18.1,3.6,24.6
3,1124690423,Calgary,AB,Alberta,1239220,44800,10.9,16.6,0,93,YES,0,16.9,27.9,9.7,19.2,3.0,23.4
4,1124290735,Edmonton,AB,Alberta,1062643,44800,10.9,16.6,0,70,YES,0,16.9,27.9,9.7,19.2,3.0,23.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1699,1124105436,Durham-Sud,QC,Quebec,1008,40800,7.2,7.3,0,0,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5
1700,1124850489,Melbourne,QC,Quebec,1004,40800,7.2,7.3,0,1,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5
1701,1124001339,Nipawin No. 487,SK,Saskatchewan,1004,42400,7.0,18.1,0,0,YES,2222,20.7,30.5,10.4,17.1,3.3,18.0
1702,1124001661,Duck Lake No. 463,SK,Saskatchewan,1004,42400,7.0,18.1,0,0,YES,2222,20.7,30.5,10.4,17.1,3.3,18.0


In [4]:
# Copy original EV_stations_df DataFrame to features_df
features_df = EV_stations_df.copy()
features_df.head()

Unnamed: 0,id,city,province_id,province_name,City_Population,Median_Income,Unemployment_Rate,City_Electricity_Rate,City_EV_registrations,City_EV_stations_locations,incentives_status,EV_Per_Province,No_Certificate_perc,Secondary_HS_perc,Apprenticeship_perc,College_CEGEP_perc,Univ_diploma_Below_Bachelor_perc,Univ_diploma_Above_Bachelor_perc
0,1124279679,Toronto,ON,Ontario,5429524,41200,9.4,13.0,54739,294,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0
1,1124586170,Montreal,QC,Quebec,3519595,40800,7.2,7.3,61203,709,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5
2,1124825478,Vancouver,BC,British Columbia,2264823,40800,7.4,12.6,68894,218,YES,148116,15.5,29.4,8.8,18.1,3.6,24.6
3,1124690423,Calgary,AB,Alberta,1239220,44800,10.9,16.6,0,93,YES,0,16.9,27.9,9.7,19.2,3.0,23.4
4,1124290735,Edmonton,AB,Alberta,1062643,44800,10.9,16.6,0,70,YES,0,16.9,27.9,9.7,19.2,3.0,23.4


In [5]:
# Drop the columns identifying each city: id, city, province_id, province_name
features_df = features_df.drop(columns=["id", "province_id", "city", "province_name"])

features_df.head(10)

Unnamed: 0,City_Population,Median_Income,Unemployment_Rate,City_Electricity_Rate,City_EV_registrations,City_EV_stations_locations,incentives_status,EV_Per_Province,No_Certificate_perc,Secondary_HS_perc,Apprenticeship_perc,College_CEGEP_perc,Univ_diploma_Below_Bachelor_perc,Univ_diploma_Above_Bachelor_perc
0,5429524,41200,9.4,13.0,54739,294,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0
1,3519595,40800,7.2,7.3,61203,709,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5
2,2264823,40800,7.4,12.6,68894,218,YES,148116,15.5,29.4,8.8,18.1,3.6,24.6
3,1239220,44800,10.9,16.6,0,93,YES,0,16.9,27.9,9.7,19.2,3.0,23.4
4,1062643,44800,10.9,16.6,0,70,YES,0,16.9,27.9,9.7,19.2,3.0,23.4
5,989567,41200,9.4,13.0,6091,159,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0
6,721599,41200,9.4,13.0,4863,171,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0
7,705244,39200,7.6,9.9,0,45,YES,3290,22.0,29.6,7.7,17.7,2.9,20.1
8,705103,40800,7.2,7.3,0,149,YES,252020,19.9,21.5,16.9,17.6,3.6,20.5
9,693645,41200,9.4,13.0,7616,43,NO,144128,17.5,27.4,6.0,20.8,2.2,26.0


In [6]:
# Save categorical variable column incentives_status to variable for encoding
incentives = features_df.dtypes[features_df.dtypes == "object"].index.tolist()

In [7]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable column incentive_status
encode_df = pd.DataFrame(enc.fit_transform(features_df[incentives]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names_out(incentives)

In [8]:
# Merge the incentive_status_YES column back to original dataframe and drop the original incentives_status column
features_df = features_df.merge(encode_df, left_index=True, right_index=True)
features_df = features_df.drop(columns=["incentives_status", "incentives_status_NO"], axis=1)

In [9]:
# Split preprocessed data and define target variable and features
y = features_df["City_EV_stations_locations"]
X = features_df.drop(["City_EV_stations_locations"],axis=1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

In [10]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Use RandomForestRegressor model

In [11]:
# Define the model
regr = RandomForestRegressor()

# Fit the model
regr_model = regr.fit(X_train_scaled, y_train)

# Make predictions
y_pred = regr_model.predict(X_test_scaled)
print(y_pred.shape)

(426,)


In [12]:
# predict the accuracy score
score = r2_score(y_test, y_pred)
print('r2 score is: ', score)
print('mean squared error is : ', mean_squared_error(y_test, y_pred))
print('root mean squared error is : ', np.sqrt(mean_squared_error(y_test, y_pred)))

r2 score is:  0.7446640591727156
mean squared error is :  13.205336477525204
root mean squared error is :  3.633914759254158


In [13]:
print(y_pred)

[2.12000000e+00 1.98000000e+00 8.80000000e-01 2.62500000e-01
 4.10000000e-01 4.04000000e+00 9.80000000e-01 4.58000000e+00
 2.90000000e+00 6.20000000e-01 8.00000000e-01 4.60000000e-01
 0.00000000e+00 2.78200000e+01 6.10000000e-01 2.50000000e+00
 4.30000000e-01 2.59000000e+00 2.75000000e+00 2.00000000e-01
 4.09000000e+00 2.67000000e+00 1.52000000e+00 7.50000000e-01
 5.71000000e+00 7.21000000e+00 4.00000000e-02 7.80000000e-01
 0.00000000e+00 3.51000000e+00 1.29300000e+01 1.05666667e-01
 6.60000000e-01 4.10000000e-01 2.91000000e+00 7.00000000e-02
 2.36000000e+00 2.85000000e-01 3.70000000e-01 6.90000000e-01
 1.66666667e-01 1.20000000e+00 9.00000000e-02 9.70000000e-01
 5.40000000e-01 1.14000000e+00 3.90000000e-01 1.55000000e-02
 1.66666667e-01 1.20000000e-01 6.20000000e-01 7.40000000e-01
 0.00000000e+00 5.09000000e+00 1.34000000e+00 2.63700000e+01
 1.00200000e+01 2.27000000e+00 3.13000000e+00 5.89000000e+00
 5.27000000e+00 6.48100000e+01 7.99166667e-01 0.00000000e+00
 8.83333333e-02 6.800000

In [14]:
# check score
regr_model.score(X_test_scaled, y_test)

0.7446640591727156

In [15]:
# Save the model
import pickle

# Save the trained model
EV_station_predictions = "station_prediction_model.pkl"

with open(EV_station_predictions, 'wb') as file:
    pickle.dump(regr_model, file)

In [16]:
# Load the model
with open(EV_station_predictions, 'rb') as file:
    prediction_model = pickle.load(file)

prediction_model

RandomForestRegressor()

In [21]:
# Load test data to DataFrame
test_features_df = pd.DataFrame(X_test)
# Rename columns to match original column titles
test_features_df = test_features_df.rename(columns={0:'City_Population',
       1:'Median_Income', 2: 'Unemployment_Rate', 3:'City_Electricity_Rate',
       4:'City_EV_registrations', 5:'EV_Per_Province',
       6:'EV_Per_Province', 7:'No_Certificate_perc',
       8:'Secondary_HS_perc', 9:'Apprenticeship_perc', 10:'College_CEGEP_perc',
       11:'Univ_diploma_Below_Bachelor_perc', 12:'Univ_diploma_Above_Bachelor_perc',
       13: 'incentives_status'})


In [22]:
# Add existing station locations to the dataframe
test_features_df["existing_stations"] = y_test
test_features_df["existing_stations"] = test_features_df["existing_stations"].fillna(0)
# Add predicted station numbers to the dataframe
test_features_df["station_predictions"] = y_pred

test_features_df

Unnamed: 0,City_Population,Median_Income,Unemployment_Rate,City_Electricity_Rate,City_EV_registrations,EV_Per_Province,EV_Per_Province.1,No_Certificate_perc,Secondary_HS_perc,Apprenticeship_perc,College_CEGEP_perc,Univ_diploma_Below_Bachelor_perc,Univ_diploma_Above_Bachelor_perc,existing_stations,station_predictions
0,17678.0,40800.0,7.4,12.6,192.0,148116.0,15.5,29.4,8.8,18.1,3.6,24.6,1.0,0.0,2.1200
1,6651.0,40800.0,7.2,7.3,0.0,252020.0,19.9,21.5,16.9,17.6,3.6,20.5,1.0,0.0,1.9800
2,7009.0,41200.0,9.4,13.0,10.0,144128.0,17.5,27.4,6.0,20.8,2.2,26.0,0.0,0.0,0.8800
3,1642.0,41200.0,9.4,13.0,0.0,144128.0,17.5,27.4,6.0,20.8,2.2,26.0,0.0,0.0,0.2625
4,1975.0,41200.0,9.4,13.0,2.0,144128.0,17.5,27.4,6.0,20.8,2.2,26.0,0.0,70.0,0.4100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421,4012.0,40800.0,7.2,7.3,0.0,252020.0,19.9,21.5,16.9,17.6,3.6,20.5,1.0,0.0,0.5600
422,2011.0,39200.0,7.6,9.9,0.0,3290.0,22.0,29.6,7.7,17.7,2.9,20.1,1.0,0.0,0.0000
423,1825.0,42400.0,7.0,18.1,0.0,2222.0,20.7,30.5,10.4,17.1,3.3,18.0,1.0,0.0,0.0000
424,2341.0,37600.0,9.6,12.7,0.0,2112.0,22.0,28.5,9.1,21.8,1.9,16.7,1.0,0.0,0.0700
