In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path

In [2]:
# Create reference paths to the raw dataset
path = "raw_datasets/"
EV_path = path + "EV_sales_table_raw.csv"
EV_df = pd.read_csv(EV_path)
# Confirm file is read to DataFrame correctly
EV_df.head(10)

Unnamed: 0.1,Unnamed: 0,REF_DATE,GEO,DGUID,Zero-Emission Vehicles Fuel Type,Vehicle type,Statistics,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,0,2017-01,Canada,2016A000011124,All zero-emission vehicles,"Total, vehicle type",Number of vehicles,Units,300,units,0,v1277479871,1.1.1.1,3488.0,,,,0
1,1,2017-01,Canada,2016A000011124,Battery electric,"Total, vehicle type",Number of vehicles,Units,300,units,0,v1277485216,1.2.1.1,1664.0,,,,0
2,2,2017-01,Canada,2016A000011124,Plug-in hybrid electric,"Total, vehicle type",Number of vehicles,Units,300,units,0,v1277490561,1.3.1.1,1824.0,,,,0
3,3,2017-01,Newfoundland and Labrador,2016A000210,All zero-emission vehicles,"Total, vehicle type",Number of vehicles,Units,300,units,0,v1277479872,2.1.1.1,,..,,,0
4,4,2017-01,Newfoundland and Labrador,2016A000210,Battery electric,"Total, vehicle type",Number of vehicles,Units,300,units,0,v1277485217,2.2.1.1,,..,,,0
5,5,2017-01,Newfoundland and Labrador,2016A000210,Plug-in hybrid electric,"Total, vehicle type",Number of vehicles,Units,300,units,0,v1277490562,2.3.1.1,,..,,,0
6,6,2017-01,"St. John's, Newfoundland and Labrador",2016S0503001,All zero-emission vehicles,"Total, vehicle type",Number of vehicles,Units,300,units,0,v1277479873,3.1.1.1,,..,,,0
7,7,2017-01,"St. John's, Newfoundland and Labrador",2016S0503001,Battery electric,"Total, vehicle type",Number of vehicles,Units,300,units,0,v1277485218,3.2.1.1,,..,,,0
8,8,2017-01,"St. John's, Newfoundland and Labrador",2016S0503001,Plug-in hybrid electric,"Total, vehicle type",Number of vehicles,Units,300,units,0,v1277490563,3.3.1.1,,..,,,0
9,9,2017-01,Conception Bay South,2016A00051001485,All zero-emission vehicles,"Total, vehicle type",Number of vehicles,Units,300,units,0,v1277479874,4.1.1.1,,..,,,0


In [3]:
# Check column headers
EV_df.columns

Index(['Unnamed: 0', 'REF_DATE', 'GEO', 'DGUID',
       'Zero-Emission Vehicles Fuel Type', 'Vehicle type', 'Statistics', 'UOM',
       'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR', 'COORDINATE', 'VALUE',
       'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS'],
      dtype='object')

In [4]:
# Check data types
EV_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 320700 entries, 0 to 320699
Data columns (total 18 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   Unnamed: 0                        320700 non-null  int64  
 1   REF_DATE                          320700 non-null  object 
 2   GEO                               320700 non-null  object 
 3   DGUID                             319920 non-null  object 
 4   Zero-Emission Vehicles Fuel Type  320700 non-null  object 
 5   Vehicle type                      320700 non-null  object 
 6   Statistics                        320700 non-null  object 
 7   UOM                               320700 non-null  object 
 8   UOM_ID                            320700 non-null  int64  
 9   SCALAR_FACTOR                     320700 non-null  object 
 10  SCALAR_ID                         320700 non-null  int64  
 11  VECTOR                            320700 non-null  o

In [5]:
# Check number of unique values in all columns
EV_df.nunique()

Unnamed: 0                          320700
REF_DATE                                20
GEO                                   5128
DGUID                                 5332
Zero-Emission Vehicles Fuel Type         3
Vehicle type                             1
Statistics                               1
UOM                                      1
UOM_ID                                   1
SCALAR_FACTOR                            1
SCALAR_ID                                1
VECTOR                               16035
COORDINATE                           16035
VALUE                                  848
STATUS                                   1
SYMBOL                                   0
TERMINATED                               0
DECIMALS                                 1
dtype: int64

In [6]:
# Drop columns with 1 or 0 number of unique values and "Unnamed: 0" as it matches the index
# Drop Vector and Coordinate columns, as they do not provide meaningful data for this analysis
EV_df = EV_df.drop(["Unnamed: 0", 
                "Vehicle type", 
                "Statistics", 
                "UOM",
                "UOM_ID",
                "SCALAR_FACTOR",
                "SCALAR_ID",
                "VECTOR",
                "COORDINATE", 
                "STATUS", 
                "SYMBOL", 
                "TERMINATED", 
                "DECIMALS"], axis=1)
EV_df.sample(5)

Unnamed: 0,REF_DATE,GEO,DGUID,Zero-Emission Vehicles Fuel Type,VALUE
205168,2020-01,Cold Lake 149,2016A00054812810,Battery electric,
148151,2019-04,Eastman,2016A00052445093,Plug-in hybrid electric,
74048,2018-01,Hodgeville,2016A00054707016,Plug-in hybrid electric,0.0
269284,2021-01,Louis Bull 138B,2016A00054811803,Battery electric,
6787,2017-01,Markham,2016A00053519036,Battery electric,39.0


In [7]:
# Drop rows with any NaN values remaining
EV_cleaned_df = EV_df.dropna()
print(EV_cleaned_df.shape[0])
EV_cleaned_df.sample(5)

161820


Unnamed: 0,REF_DATE,GEO,DGUID,Zero-Emission Vehicles Fuel Type,VALUE
74401,2018-01,Great Bend No. 405,2016A00054716008,Battery electric,0.0
298452,2021-07,Montmartre No. 126,2016A00054706001,All zero-emission vehicles,0.0
15934,2017-01,"Region 6, Unorganized",2016A00056106097,Battery electric,0.0
314575,2021-10,Coulee No. 136,2016A00054707018,Battery electric,0.0
254829,2020-10,Keremeos,2016A00055907009,All zero-emission vehicles,0.0


In [8]:
# Filter rows for All zero-emission vehicles to remove duplicated information
all_zev = EV_cleaned_df["Zero-Emission Vehicles Fuel Type"] == "All zero-emission vehicles"
EV_filtered_df = EV_cleaned_df[all_zev]
print(EV_filtered_df.shape)
EV_filtered_df.sample(5)

(53940, 5)


Unnamed: 0,REF_DATE,GEO,DGUID,Zero-Emission Vehicles Fuel Type,VALUE
320565,2021-10,Wrigley,2016A00056104044,All zero-emission vehicles,0.0
191289,2019-10,Boothroyd 13,2016A00055909850,All zero-emission vehicles,0.0
215538,2020-04,Tay,2016A00053543071,All zero-emission vehicles,1.0
186024,2019-10,Paddockwood No. 520,2016A00054715099,All zero-emission vehicles,0.0
248331,2020-10,Mattagami 71,2016A00053552052,All zero-emission vehicles,0.0


In [9]:
# Save Geographic Unique Identifier from DGUID column to new column, then drop DGUID
EV_filtered_df["Geo-ID"] = [x[9:] for x in EV_filtered_df["DGUID"]]
EV_filtered_df.sample(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,REF_DATE,GEO,DGUID,Zero-Emission Vehicles Fuel Type,VALUE,Geo-ID
24912,2017-04,Dauphin,2016A00054617050,All zero-emission vehicles,0.0,4617050
175455,2019-07,Thompson-Nicola B (Thompson Headwaters),2016A00055933070,All zero-emission vehicles,0.0,5933070
67032,2018-01,"Thetford Mines, Quebec",2016S0504430,All zero-emission vehicles,4.0,430
7281,2017-01,Marmora and Lake,2016A00053512046,All zero-emission vehicles,0.0,3512046
283425,2021-04,Spiritwood No. 496,2016A00054716056,All zero-emission vehicles,0.0,4716056


In [14]:
# Drop DGUID and Zero-Emission Vehicles Fuel Type columns, save to new DataFrame
EV_locations_df = EV_filtered_df.drop(columns=["DGUID", "Zero-Emission Vehicles Fuel Type"], axis=1)

In [15]:
# Reset the index
EV_locations_df.reset_index(drop=True)

Unnamed: 0,REF_DATE,GEO,VALUE,Geo-ID
0,2017-01,Canada,3488.0,11124
1,2017-01,Prince Edward Island,0.0,11
2,2017-01,New Brunswick,5.0,13
3,2017-01,Quebec,1247.0,24
4,2017-01,"Campbellton, Quebec part",0.0,24330
...,...,...,...,...
53935,2021-10,Lutselk'e,0.0,6105020
53936,2021-10,Reliance,0.0,6105026
53937,2021-10,"Region 5, Unorganized",0.0,6105097
53938,2021-10,Detah,0.0,6106021


In [17]:
# Reorder the columns for ease of reading
EV_locations_df = EV_locations_df[['GEO', 'Geo-ID', 'VALUE', 'REF_DATE']]
EV_locations_df.head()

Unnamed: 0,GEO,Geo-ID,VALUE,REF_DATE
0,Canada,11124,3488.0,2017-01
1140,Prince Edward Island,11,0.0,2017-01
1797,New Brunswick,13,5.0,2017-01
2643,Quebec,24,1247.0,2017-01
2646,"Campbellton, Quebec part",24330,0.0,2017-01


In [19]:
# Save Canada to separate DataFrame, drop from locations DataFrame
EV_country_df = EV_locations_df.loc[lambda EV_locations_df: EV_locations_df['GEO'] == 'Canada']
EV_country_df

Unnamed: 0,GEO,Geo-ID,VALUE,REF_DATE
0,Canada,11124,3488.0,2017-01
16035,Canada,11124,5025.0,2017-04
32070,Canada,11124,5184.0,2017-07
48105,Canada,11124,5999.0,2017-10
64140,Canada,11124,6844.0,2018-01
80175,Canada,11124,14879.0,2018-04
96210,Canada,11124,12622.0,2018-07
112245,Canada,11124,9938.0,2018-10
128280,Canada,11124,8275.0,2019-01
144315,Canada,11124,19446.0,2019-04


In [31]:
# Save provinces to separate dataframe, drop from original locations dataframe
provinces = ['Alberta', 'British Columbia', 'Manitoba', 'New Brunswick', 'Newfoundland and Labrador', 'Nova Scotia', 'Ontario', 'Prince Edward Island', 'Quebec', 'Saskatchewan', 'Yukon', 'Northwest Territories', 'Nunavut']
EV_provinces_df = EV_locations_df[EV_locations_df.GEO.isin(provinces)]
EV_provinces_df

Unnamed: 0,GEO,Geo-ID,VALUE,REF_DATE
1140,Prince Edward Island,11,0.0,2017-01
1797,New Brunswick,13,5.0,2017-01
2643,Quebec,24,1247.0,2017-01
6594,Ontario,35,1319.0,2017-01
8460,Manitoba,46,10.0,2017-01
...,...,...,...,...
313125,Manitoba,46,184.0,2021-10
313836,Saskatchewan,47,143.0,2021-10
318057,British Columbia,59,5482.0,2021-10
320352,Yukon,60,8.0,2021-10


In [40]:
# Drop rows for Canada and each Province/Territory, save to EV_cities_df
EV_cities_df = EV_locations_df.loc[(EV_locations_df.GEO.values != "Canada") & (~EV_locations_df["GEO"].isin(provinces))]
EV_cities_df

Unnamed: 0,GEO,Geo-ID,VALUE,REF_DATE
2646,"Campbellton, Quebec part",24330,0.0,2017-01
2655,"Matane, Quebec",403,2.0,2017-01
2670,"Rimouski, Quebec",404,8.0,2017-01
2688,"Rivière-du-Loup, Quebec",405,5.0,2017-01
2709,"Baie-Comeau, Quebec",406,3.0,2017-01
...,...,...,...,...
320586,Lutselk'e,6105020,0.0,2021-10
320589,Reliance,6105026,0.0,2021-10
320592,"Region 5, Unorganized",6105097,0.0,2021-10
320595,Detah,6106021,0.0,2021-10


In [41]:
# Save cleaned data to CSV file
# Define file path to save data
filepath = Path('cleaned_datasets/EV_stations_cities.csv')
# Allow folders and subfolders to be created is needed
filepath.parent.mkdir(parents=True, exist_ok=True)
# Save the file
EV_cities_df.to_csv(filepath)