In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path

In [2]:
# Create reference paths to the raw dataset
path = "raw_datasets/"
EV_registrations_path = path + "EV_sales_table_raw.csv"
EV_stations_raw = path + "EV_stations_raw.csv"


In [3]:
EV__registrations_df = pd.read_csv(EV_registrations_path, index_col=0)
# Confirm file is read to DataFrame correctly
EV__registrations_df.head(10)

Unnamed: 0,REF_DATE,GEO,DGUID,Zero-Emission Vehicles Fuel Type,Vehicle type,Statistics,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,2017-01,Canada,2016A000011124,All zero-emission vehicles,"Total, vehicle type",Number of vehicles,Units,300,units,0,v1277479871,1.1.1.1,3488.0,,,,0
1,2017-01,Canada,2016A000011124,Battery electric,"Total, vehicle type",Number of vehicles,Units,300,units,0,v1277485216,1.2.1.1,1664.0,,,,0
2,2017-01,Canada,2016A000011124,Plug-in hybrid electric,"Total, vehicle type",Number of vehicles,Units,300,units,0,v1277490561,1.3.1.1,1824.0,,,,0
3,2017-01,Newfoundland and Labrador,2016A000210,All zero-emission vehicles,"Total, vehicle type",Number of vehicles,Units,300,units,0,v1277479872,2.1.1.1,,..,,,0
4,2017-01,Newfoundland and Labrador,2016A000210,Battery electric,"Total, vehicle type",Number of vehicles,Units,300,units,0,v1277485217,2.2.1.1,,..,,,0
5,2017-01,Newfoundland and Labrador,2016A000210,Plug-in hybrid electric,"Total, vehicle type",Number of vehicles,Units,300,units,0,v1277490562,2.3.1.1,,..,,,0
6,2017-01,"St. John's, Newfoundland and Labrador",2016S0503001,All zero-emission vehicles,"Total, vehicle type",Number of vehicles,Units,300,units,0,v1277479873,3.1.1.1,,..,,,0
7,2017-01,"St. John's, Newfoundland and Labrador",2016S0503001,Battery electric,"Total, vehicle type",Number of vehicles,Units,300,units,0,v1277485218,3.2.1.1,,..,,,0
8,2017-01,"St. John's, Newfoundland and Labrador",2016S0503001,Plug-in hybrid electric,"Total, vehicle type",Number of vehicles,Units,300,units,0,v1277490563,3.3.1.1,,..,,,0
9,2017-01,Conception Bay South,2016A00051001485,All zero-emission vehicles,"Total, vehicle type",Number of vehicles,Units,300,units,0,v1277479874,4.1.1.1,,..,,,0


In [4]:
# Check column headers
EV__registrations_df.columns

Index(['REF_DATE', 'GEO', 'DGUID', 'Zero-Emission Vehicles Fuel Type',
       'Vehicle type', 'Statistics', 'UOM', 'UOM_ID', 'SCALAR_FACTOR',
       'SCALAR_ID', 'VECTOR', 'COORDINATE', 'VALUE', 'STATUS', 'SYMBOL',
       'TERMINATED', 'DECIMALS'],
      dtype='object')

In [5]:
# Check data types
EV__registrations_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 320700 entries, 0 to 320699
Data columns (total 17 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   REF_DATE                          320700 non-null  object 
 1   GEO                               320700 non-null  object 
 2   DGUID                             319920 non-null  object 
 3   Zero-Emission Vehicles Fuel Type  320700 non-null  object 
 4   Vehicle type                      320700 non-null  object 
 5   Statistics                        320700 non-null  object 
 6   UOM                               320700 non-null  object 
 7   UOM_ID                            320700 non-null  int64  
 8   SCALAR_FACTOR                     320700 non-null  object 
 9   SCALAR_ID                         320700 non-null  int64  
 10  VECTOR                            320700 non-null  object 
 11  COORDINATE                        320700 non-null  o

In [6]:
# Check number of unique values in all columns
EV__registrations_df.nunique()

REF_DATE                               20
GEO                                  5128
DGUID                                5332
Zero-Emission Vehicles Fuel Type        3
Vehicle type                            1
Statistics                              1
UOM                                     1
UOM_ID                                  1
SCALAR_FACTOR                           1
SCALAR_ID                               1
VECTOR                              16035
COORDINATE                          16035
VALUE                                 848
STATUS                                  1
SYMBOL                                  0
TERMINATED                              0
DECIMALS                                1
dtype: int64

In [7]:
# Drop columns with 1 or 0 number of unique values and "Unnamed: 0" as it matches the index
# Drop Vector and Coordinate columns, as they do not provide meaningful data for this analysis
EV__registrations_df = EV__registrations_df.drop([ 
                "Vehicle type", 
                "Statistics", 
                "UOM",
                "UOM_ID",
                "SCALAR_FACTOR",
                "SCALAR_ID",
                "VECTOR",
                "COORDINATE", 
                "STATUS", 
                "SYMBOL", 
                "TERMINATED", 
                "DECIMALS"], axis=1)
EV__registrations_df.sample(5)

Unnamed: 0,REF_DATE,GEO,DGUID,Zero-Emission Vehicles Fuel Type,VALUE
197840,2020-01,Fassett,2016A00052480005,Plug-in hybrid electric,
192180,2019-10,Stewart Crossing,2016A00056001050,All zero-emission vehicles,0.0
49041,2017-10,"Division No. 7, Subd. D",2016A00051007038,All zero-emission vehicles,
86227,2018-04,Mont-Alexandre,2016A00052402902,Battery electric,
73094,2018-01,Oxford House 24,2016A00054622050,Plug-in hybrid electric,0.0


In [8]:
# Drop rows with any NaN values remaining
EV_cleaned_df = EV__registrations_df.dropna()
print(EV_cleaned_df.shape[0])
EV_cleaned_df.sample(5)

161820


Unnamed: 0,REF_DATE,GEO,DGUID,Zero-Emission Vehicles Fuel Type,VALUE
153492,2019-04,Lajord No. 128,2016A00054706011,All zero-emission vehicles,0.0
89876,2018-04,Lampman,2016A00054701032,Plug-in hybrid electric,0.0
103580,2018-07,Zorra,2016A00053532027,Plug-in hybrid electric,3.0
174065,2019-07,Musqueam 4,2016A00055915810,Plug-in hybrid electric,0.0
123291,2018-10,Osage,2016A00054702077,All zero-emission vehicles,0.0


In [9]:
# Filter rows for All zero-emission vehicles to remove duplicated information
all_zev = EV_cleaned_df["Zero-Emission Vehicles Fuel Type"] == "All zero-emission vehicles"
EV_filtered_df = EV_cleaned_df[all_zev]
print(EV_filtered_df.shape)
EV_filtered_df.sample(5)

(53940, 5)


Unnamed: 0,REF_DATE,GEO,DGUID,Zero-Emission Vehicles Fuel Type,VALUE
9483,2017-01,Vibank,2016A00054706007,All zero-emission vehicles,0.0
62091,2017-10,Cariboo D,2016A00055941010,All zero-emission vehicles,0.0
247389,2020-10,Port Colborne,2016A00053526011,All zero-emission vehicles,1.0
127704,2018-10,Central Coast D,2016A00055945012,All zero-emission vehicles,0.0
297195,2021-07,Niverville,2016A00054602046,All zero-emission vehicles,0.0


In [10]:
# Save Geographic Unique Identifier from DGUID column to new column, then drop DGUID
EV_filtered_df["Geo-ID"] = [x[9:] for x in EV_filtered_df["DGUID"]]
EV_filtered_df.sample(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,REF_DATE,GEO,DGUID,Zero-Emission Vehicles Fuel Type,VALUE,Geo-ID
24780,2017-04,Clanwilliam-Erickson,2016A00054615092,All zero-emission vehicles,0.0,4615092
231834,2020-07,West Lincoln,2016A00053526021,All zero-emission vehicles,1.0,3526021
46707,2017-07,Princeton,2016A00055907024,All zero-emission vehicles,0.0,5907024
223902,2020-04,Thomas Point 5,2016A00055943817,All zero-emission vehicles,0.0,5943817
223548,2020-04,Towinock 2,2016A00055931832,All zero-emission vehicles,0.0,5931832


In [11]:
# Drop DGUID and Zero-Emission Vehicles Fuel Type columns, save to new DataFrame
EV_locations_df = EV_filtered_df.drop(columns=["DGUID", "Zero-Emission Vehicles Fuel Type"], axis=1)

In [12]:
# Reset the index
EV_locations_df.reset_index(drop=True)

Unnamed: 0,REF_DATE,GEO,VALUE,Geo-ID
0,2017-01,Canada,3488.0,11124
1,2017-01,Prince Edward Island,0.0,11
2,2017-01,New Brunswick,5.0,13
3,2017-01,Quebec,1247.0,24
4,2017-01,"Campbellton, Quebec part",0.0,24330
...,...,...,...,...
53935,2021-10,Lutselk'e,0.0,6105020
53936,2021-10,Reliance,0.0,6105026
53937,2021-10,"Region 5, Unorganized",0.0,6105097
53938,2021-10,Detah,0.0,6106021


In [13]:
# Reorder the columns for ease of reading
EV_locations_df = EV_locations_df[['GEO', 'Geo-ID', 'VALUE', 'REF_DATE']]
EV_locations_df.head()

Unnamed: 0,GEO,Geo-ID,VALUE,REF_DATE
0,Canada,11124,3488.0,2017-01
1140,Prince Edward Island,11,0.0,2017-01
1797,New Brunswick,13,5.0,2017-01
2643,Quebec,24,1247.0,2017-01
2646,"Campbellton, Quebec part",24330,0.0,2017-01


In [14]:
# Save Canada to separate DataFrame, drop from locations DataFrame
EV_country_df = EV_locations_df.loc[lambda EV_locations_df: EV_locations_df['GEO'] == 'Canada']
EV_country_df

Unnamed: 0,GEO,Geo-ID,VALUE,REF_DATE
0,Canada,11124,3488.0,2017-01
16035,Canada,11124,5025.0,2017-04
32070,Canada,11124,5184.0,2017-07
48105,Canada,11124,5999.0,2017-10
64140,Canada,11124,6844.0,2018-01
80175,Canada,11124,14879.0,2018-04
96210,Canada,11124,12622.0,2018-07
112245,Canada,11124,9938.0,2018-10
128280,Canada,11124,8275.0,2019-01
144315,Canada,11124,19446.0,2019-04


In [15]:
# Save provinces to separate dataframe, drop from original locations dataframe
provinces = ['Alberta', 'British Columbia', 'Manitoba', 'New Brunswick', 'Newfoundland and Labrador', 'Nova Scotia', 'Ontario', 'Prince Edward Island', 'Quebec', 'Saskatchewan', 'Yukon', 'Northwest Territories', 'Nunavut']
EV_provinces_df = EV_locations_df[EV_locations_df.GEO.isin(provinces)]
EV_provinces_df

Unnamed: 0,GEO,Geo-ID,VALUE,REF_DATE
1140,Prince Edward Island,11,0.0,2017-01
1797,New Brunswick,13,5.0,2017-01
2643,Quebec,24,1247.0,2017-01
6594,Ontario,35,1319.0,2017-01
8460,Manitoba,46,10.0,2017-01
...,...,...,...,...
313125,Manitoba,46,184.0,2021-10
313836,Saskatchewan,47,143.0,2021-10
318057,British Columbia,59,5482.0,2021-10
320352,Yukon,60,8.0,2021-10


In [16]:
# Save cleaned data to CSV file
# Define file path to save data
filepath = Path('cleaned_datasets/EV_registrations_provinces.csv')
# Allow folders and subfolders to be created is needed
filepath.parent.mkdir(parents=True, exist_ok=True)
# Save the file
EV_provinces_df.to_csv(filepath)

In [17]:
# Drop rows for Canada and each Province/Territory, save to EV_cities_df
EV_cities_df = EV_locations_df.loc[(EV_locations_df.GEO.values != "Canada") & (~EV_locations_df["GEO"].isin(provinces))]
EV_cities_df

Unnamed: 0,GEO,Geo-ID,VALUE,REF_DATE
2646,"Campbellton, Quebec part",24330,0.0,2017-01
2655,"Matane, Quebec",403,2.0,2017-01
2670,"Rimouski, Quebec",404,8.0,2017-01
2688,"Rivière-du-Loup, Quebec",405,5.0,2017-01
2709,"Baie-Comeau, Quebec",406,3.0,2017-01
...,...,...,...,...
320586,Lutselk'e,6105020,0.0,2021-10
320589,Reliance,6105026,0.0,2021-10
320592,"Region 5, Unorganized",6105097,0.0,2021-10
320595,Detah,6106021,0.0,2021-10


In [18]:
# Save cleaned data to CSV file
# Define file path to save data
filepath = Path('cleaned_datasets/EV_registrations_cities.csv')
# Allow folders and subfolders to be created is needed
filepath.parent.mkdir(parents=True, exist_ok=True)
# Save the file
EV_cities_df.to_csv(filepath)

In [19]:
# Read in the stations dataset, set index as column 0
stations_data = pd.read_csv(EV_stations_raw, low_memory=False, index_col=0)
stations_data

Unnamed: 0,Fuel Type Code,Station Name,Street Address,Intersection Directions,City,State,ZIP,Plus4,Station Phone,Status Code,...,EV Pricing (French),LPG Nozzle Types,Hydrogen Pressures,Hydrogen Standards,CNG Fill Type Code,CNG PSI,CNG Vehicle Class,LNG Vehicle Class,EV On-Site Renewable Source,Restricted Access
0,ELEC,Ramada,1319 2nd St W,,Brooks,AB,T1R 1P7,,403-362-6440,E,...,Gratuit,,,,,,,,,False
1,ELEC,Davis Chevrolet,149 E Lake Crescent NE,,Airdrie,AB,T4A 2H9,,403-948-6909,E,...,Gratuit,,,,,,,,,False
2,ELEC,Go Nissan South,1275 101 St SW,,Edmonton,AB,T6X 1A1,,780-463-5700,T,...,Gratuit,,,,,,,,,False
3,ELEC,Don Wheaton Chevrolet,10727 - 82 Ave,,Edmonton,AB,T6E 2B1,,,E,...,Gratuit,,,,,,,,,False
4,ELEC,Gasonic Instruments,8-823 41st Ave NE,,Calgary,AB,T2E 6Y3,,403-276-2201,E,...,Gratuit,,,,,,,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8954,ELEC,City_of_Penticton,171 main street,,Penticton,BC,V2A 5A9,,888-356-8911,E,...,,,,,,,,,,
8955,ELEC,13601 Glenoaks blvd,13601 Glenoaks blvd,,Los Angeles,CA,91342,,888-356-8911,E,...,,,,,,,,,,
8956,ELEC,6723 Van Nuys blvd,6723 Van Nuys Blvd,,Los Angeles,CA,91405,,888-356-8911,E,...,,,,,,,,,,
8957,ELEC,12225 Grenet,12225 Rue Grenet,,Montréal,QC,H4J 2N7,,855-999-8378,E,...,,,,,,,,,,


In [20]:
# Save relevant columns to new stations_df DataFrame
stations_df = stations_data[["ID", "Fuel Type Code", "City", "State", "Country", "ZIP", "Status Code"]].copy()
stations_df.head()

Unnamed: 0,ID,Fuel Type Code,City,State,Country,ZIP,Status Code
0,82833,ELEC,Brooks,AB,CA,T1R 1P7,E
1,82834,ELEC,Airdrie,AB,CA,T4A 2H9,E
2,82835,ELEC,Edmonton,AB,CA,T6X 1A1,T
3,82836,ELEC,Edmonton,AB,CA,T6E 2B1,E
4,82837,ELEC,Calgary,AB,CA,T2E 6Y3,E


In [21]:
# Rename State column to Province column
stations_df = stations_df.rename(columns={"State": "Province"})
stations_df.head()

Unnamed: 0,ID,Fuel Type Code,City,Province,Country,ZIP,Status Code
0,82833,ELEC,Brooks,AB,CA,T1R 1P7,E
1,82834,ELEC,Airdrie,AB,CA,T4A 2H9,E
2,82835,ELEC,Edmonton,AB,CA,T6X 1A1,T
3,82836,ELEC,Edmonton,AB,CA,T6E 2B1,E
4,82837,ELEC,Calgary,AB,CA,T2E 6Y3,E


In [22]:
# Check data types of columns
stations_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8959 entries, 0 to 8958
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   ID              8959 non-null   int64 
 1   Fuel Type Code  8959 non-null   object
 2   City            8959 non-null   object
 3   Province        8959 non-null   object
 4   Country         8959 non-null   object
 5   ZIP             8958 non-null   object
 6   Status Code     8959 non-null   object
dtypes: int64(1), object(6)
memory usage: 559.9+ KB


In [23]:
# Check data for missing values
stations_df.isnull().sum()

ID                0
Fuel Type Code    0
City              0
Province          0
Country           0
ZIP               1
Status Code       0
dtype: int64

In [24]:
# Drop rows with missing values
stations_df = stations_df.dropna()


In [25]:
# Check the number of unique values for each column to confirm all rows are Country=Canada
stations_df.nunique()

ID                8958
Fuel Type Code       6
City              1865
Province            14
Country              1
ZIP               6841
Status Code          3
dtype: int64

In [26]:
# Get value counts for each value in "State" column to confirm all canadian provinces and territories
stations_df["Province"].value_counts()

QC    3551
ON    2515
BC    1592
AB     510
NB     165
NS     154
SK     149
MB     147
PE      71
NL      61
YT      37
NT       3
CA       2
OH       1
Name: Province, dtype: int64

In [27]:
# View rows of stations where Province = "OH" or "CA"
stations_df.loc[(stations_df.Province.values == "OH") | (stations_df.Province.values == "CA")]

Unnamed: 0,ID,Fuel Type Code,City,Province,Country,ZIP,Status Code
7133,194299,ELEC,Guelph,OH,CA,N1K 1X3,E
8955,223527,ELEC,Los Angeles,CA,CA,91342,E
8956,223530,ELEC,Los Angeles,CA,CA,91405,E


In [28]:
# Remove rows where Province = "CA"
stations_cleaned_df = stations_df.loc[stations_df.Province.values != "CA"]
# Confirm new value counts
stations_cleaned_df["Province"].value_counts()

QC    3551
ON    2515
BC    1592
AB     510
NB     165
NS     154
SK     149
MB     147
PE      71
NL      61
YT      37
NT       3
OH       1
Name: Province, dtype: int64

In [29]:
# Replace miscoded value of "OH" with "ON" 
stations_cleaned_df = stations_cleaned_df.replace(to_replace="OH",
                            value="ON")
# Confirm new value counts
stations_cleaned_df["Province"].value_counts()

QC    3551
ON    2516
BC    1592
AB     510
NB     165
NS     154
SK     149
MB     147
PE      71
NL      61
YT      37
NT       3
Name: Province, dtype: int64

In [30]:
# Filter for Electric fuel type
stations_cleaned_df = stations_cleaned_df[stations_cleaned_df["Fuel Type Code"] == "ELEC"]
stations_cleaned_df.sample(10)

Unnamed: 0,ID,Fuel Type Code,City,Province,Country,ZIP,Status Code
6956,190764,ELEC,Ucluelet,BC,CA,V0R 3A0,E
7059,193093,ELEC,Burnaby,BC,CA,V5H3G4,E
6039,169218,ELEC,Nicolet,QC,CA,J3T 1C3,E
8882,222132,ELEC,Ottawa,ON,CA,K1J 8G9,E
3544,130509,ELEC,Gatineau,QC,CA,J8R 0E1,E
8871,221867,ELEC,Prince Albert,SK,CA,S6V 0C7,E
6000,168624,ELEC,Mississauga,ON,CA,L5T 2T9,E
1925,103868,ELEC,Ottawa,ON,CA,K1J 1A5,E
3261,130205,ELEC,Compton,QC,CA,J0B 1L0,E
6969,190881,ELEC,Saint-Constant,QC,CA,J5A 0W6,E


In [31]:
# Save cleaned data to CSV file
# Define file path to save data
filepath = Path('cleaned_datasets/EV_stations_locations.csv')
# Allow folders and subfolders to be created is needed
filepath.parent.mkdir(parents=True, exist_ok=True)
# Save the file
stations_cleaned_df.to_csv(filepath)

In [32]:
stations_cleaned_df

Unnamed: 0,ID,Fuel Type Code,City,Province,Country,ZIP,Status Code
0,82833,ELEC,Brooks,AB,CA,T1R 1P7,E
1,82834,ELEC,Airdrie,AB,CA,T4A 2H9,E
2,82835,ELEC,Edmonton,AB,CA,T6X 1A1,T
3,82836,ELEC,Edmonton,AB,CA,T6E 2B1,E
4,82837,ELEC,Calgary,AB,CA,T2E 6Y3,E
...,...,...,...,...,...,...,...
8952,223523,ELEC,Québec,QC,CA,G1V 1T2,E
8953,223524,ELEC,Rocky Mountain House,AB,CA,T4T 1A6,E
8954,223525,ELEC,Penticton,BC,CA,V2A 5A9,E
8957,223532,ELEC,Montréal,QC,CA,H4J 2N7,E


In [33]:
EV_cities_df

Unnamed: 0,GEO,Geo-ID,VALUE,REF_DATE
2646,"Campbellton, Quebec part",24330,0.0,2017-01
2655,"Matane, Quebec",403,2.0,2017-01
2670,"Rimouski, Quebec",404,8.0,2017-01
2688,"Rivière-du-Loup, Quebec",405,5.0,2017-01
2709,"Baie-Comeau, Quebec",406,3.0,2017-01
...,...,...,...,...
320586,Lutselk'e,6105020,0.0,2021-10
320589,Reliance,6105026,0.0,2021-10
320592,"Region 5, Unorganized",6105097,0.0,2021-10
320595,Detah,6106021,0.0,2021-10
