# Introduction

In this notebook, we clean the dataset of EV sales by state for 2016-2019 and then compute EV sales per capita for each state.

# Set up

In [1]:
import pandas as pd

### Read the data

In [2]:
# Load the raw EV sales table.
# skiprows uses zero-based file line numbers: lines 0 and 2 are dropped, so
# line 1 becomes the header; lines 54-56 are dropped as well (per the original
# note these hold the 'mean' and 'median' rows -- TODO confirm against the raw CSV).
data = pd.read_csv(
    '../Raw_Data/EV_sales_by_state_2016-2019.csv',
    sep=',',
    skiprows=[0, 2, 54, 55, 56],
)

In [3]:
# Preview the first rows of the freshly loaded table.
data.head()

Unnamed: 0.1,Unnamed: 0,2016,2017,2018,2019
0,California,73854,94873,153442,145020
1,New York,6043,10090,15752,14823
2,Washington,5363,7068,12650,12172
3,Florida,6255,6573,13705,15794
4,Texas,4510,5419,11764,5780


#### The yearly sales columns were read as strings, but we need numeric values, so we convert them.

In [4]:
# Convert each yearly sales column from text such as "73,854" to numbers.
# The dtype guard makes this cell safe to re-run: once a column is numeric,
# the `.str` accessor would raise AttributeError, so converted columns are skipped.
for year in ["2016", "2017", "2018", "2019"]:
    if not pd.api.types.is_numeric_dtype(data[year]):
        # regex=False: the comma is a literal thousands separator, not a pattern.
        data[year] = pd.to_numeric(data[year].str.replace(',', '', regex=False))

In [5]:
# Preview the table again after the numeric conversion.
data.head()

Unnamed: 0.1,Unnamed: 0,2016,2017,2018,2019
0,California,73854,94873,153442,145020
1,New York,6043,10090,15752,14823
2,Washington,5363,7068,12650,12172
3,Florida,6255,6573,13705,15794
4,Texas,4510,5419,11764,5780


In [6]:
# Replace the auto-generated header of the state-name column with 'State'.
data = data.rename({'Unnamed: 0': 'State'}, axis='columns')
data.head()

Unnamed: 0,State,2016,2017,2018,2019
0,California,73854,94873,153442,145020
1,New York,6043,10090,15752,14823
2,Washington,5363,7068,12650,12172
3,Florida,6255,6573,13705,15794
4,Texas,4510,5419,11764,5780


In [7]:
# Now that the data is cleaned, save it to the processed-data folder.
# to_csv is called with its defaults, so the row index is written as the
# first (unnamed) column of the output file.
path = '../Processed_Data/Ev_sales.csv'
data.to_csv(path)

## Find EV sales per capita

In [8]:
# Load the processed population table, using its first column as the row index.
population = pd.read_csv(
    '../Processed_Data/population.csv',
    sep=',',
    index_col=0,
)
population.head()

Unnamed: 0_level_0,1950,1951,1952,1953,1954,1955,1956,1957,1958,1959,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
GeoName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama,3058000,3059000,3068000,3053000,3014000,3050000,3071000,3109000,3163000,3204000,...,4839261,4864399,4886793,4908162,4930595,4952202,4976395,5003418,5024803,5039877
Alaska,135000,158000,189000,205000,215000,222000,224000,231000,224000,224000,...,731106,738057,737638,739127,743410,741949,737717,734823,732441,732673
Arizona,756000,785000,842000,894000,933000,987000,1053000,1125000,1193000,1261000,...,6507351,6563216,6638196,6714328,6801591,6879830,6969944,7070511,7177986,7276316
Arkansas,1908000,1901000,1838000,1780000,1734000,1725000,1704000,1733000,1726000,1756000,...,2948887,2954685,2961191,2970352,2980602,2990801,2997271,3004248,3012232,3025891
California,10677000,11134000,11635000,12251000,12746000,13133000,13713000,14264000,14880000,15467000,...,37970368,38291358,38636290,38966055,39223210,39424071,39535906,39547996,39499738,39237836


In [9]:
# Index the sales table by state name so it can be aligned with the
# population table row by row.
# The guard keeps this cell re-runnable: once 'State' is the index it is no
# longer a column, and calling set_index('State') again would raise KeyError.
if data.index.name != 'State':
    data = data.set_index('State')
data.head()

Unnamed: 0_level_0,2016,2017,2018,2019
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
California,73854,94873,153442,145020
New York,6043,10090,15752,14823
Washington,5363,7068,12650,12172
Florida,6255,6573,13705,15794
Texas,4510,5419,11764,5780


In [10]:
# Divide EV sales by population for the four years covered by the sales data.
# pandas aligns the two frames on row and column labels, so a label that
# appears in only one of them produces NaN in the result.
sales_years = ['2016', '2017', '2018', '2019']
ev_sale_per_capita = data.div(population[sales_years])
ev_sale_per_capita.head()

Unnamed: 0,2016,2017,2018,2019
Alabama,6.7e-05,7.7e-05,0.000174,0.000201
Alaska,0.000126,0.000115,0.00021,0.000195
Arizona,0.000333,0.000433,0.001017,0.001074
Arkansas,4.6e-05,6.3e-05,0.000145,0.000179
California,0.001883,0.002406,0.003881,0.003667


In [11]:
# Save the per-capita table; the row index (state names) is written as the
# first column because to_csv is called with its defaults.
path = '../Processed_Data/Ev_sales_per_capita.csv'
ev_sale_per_capita.to_csv(path)