# Introduction

In this notebook, we clean the "charging station" dataset.

## Set up

In [1]:
import pandas as pd
import numpy as np
import csv
from datetime import datetime

## Read the file

In [8]:
# Load the raw station table from disk.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so columns do not end up with mixed types.
path = '../raw/charging.csv'
df = pd.read_csv(path, low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
# Preview the first rows of the raw table to see what was loaded.
df.head()

Unnamed: 0,Fuel Type Code,Fuel Type,Station Name,Street Address,Intersection Directions,City,State Code,RegionId,State,ZIP,...,E85 Other Ethanol Blends,EV Pricing,LPG Nozzle Types,Hydrogen Pressures,Hydrogen Standards,CNG Fill Type Code,CNG PSI,CNG Vehicle Class,LNG Vehicle Class,EV On-Site Renewable Source
0,E85,Ethanol (E85),Kwik Trip #937,2550 Highway 12 E,,Willmar,MN,US_MN,Minnesota,56201,...,"[""E15""]",,,,,,,,,
1,ELEC,Electric,Hollywood Plaza Port 1 2,8050 International Dr,,Orlando,FL,US_FL,Florida,32819,...,,,,,,,,,,
2,ELEC,Electric,Dwntwn Stations Law Ll2 North,Law School Underground Garage 111 N First Str,,Phoenix,AZ,US_AZ,Arizona,85004,...,,,,,,,,,,
3,ELEC,Electric,The Shops At La Cantera - Cheesecake Factory,15900 La Cantera Pkwy,,San Antonio,TX,US_TX,Texas,78256,...,,,,,,,,,,
4,ELEC,Electric,Endeavor Reg 901 E 6Th St 3,901 E 6th St,,Austin,TX,US_TX,Texas,78702,...,,,,,,,,,,


#### The raw table has far more columns than we need (each row describes one station), so we need to clean it.
#### The only columns we need are each station's state and its open date.

In [5]:
# Iterating a DataFrame yields its column labels, so this lists every column name.
list(df)

['Fuel Type Code',
 'Fuel Type',
 'Station Name',
 'Street Address',
 'Intersection Directions',
 'City',
 'State Code',
 'RegionId',
 'State',
 'ZIP',
 'Plus4',
 'Station Phone',
 'Status Code',
 'Status',
 'Status note',
 'Expected Date',
 'Groups With Access Code',
 'Groups with access code note',
 'Access Days Time',
 'Cards Accepted',
 'BD Blends',
 'NG Fill Type Code',
 'NG PSI',
 'EV Level1 EVSE Num',
 'EV Level2 EVSE Num',
 'EV DC Fast Count',
 'EV Other Info',
 'EV Network',
 'EV Network Web',
 'Geocode Status',
 'Latitude',
 'Longitude',
 'Date Last Confirmed',
 'ID',
 'Updated At',
 'Owner Type Code',
 'Federal Agency ID',
 'Federal Agency Name',
 'Open Date',
 'Hydrogen Status Link',
 'NG Vehicle Class',
 'LPG Primary',
 'E85 Blender Pump',
 'EV Connector Types',
 'Country',
 'Groups With Access Code (French)',
 'Hydrogen Is Retail',
 'Access Code',
 'Access Detail Code',
 'Federal Agency Code',
 'Facility Type',
 'CNG Dispenser Num',
 'CNG On-Site Renewable Source',
 'CNG 

In [6]:
# Keep only the two columns this analysis needs: the station's state and its
# opening date. Selecting them by name is shorter than listing every other
# column to drop, and it does not fail if an unrelated column is missing.
# .copy() gives an independent frame so later column assignments are safe.
df = df[['State', 'Open Date']].copy()

In [7]:
# Check that only 'State' and 'Open Date' remain.
df.head()

Unnamed: 0,State,Open Date
0,Minnesota,09/15/2018
1,Florida,04/03/2019
2,Arizona,10/22/2019
3,Texas,11/23/2020
4,Texas,04/02/2021


In [8]:
# Only the opening year matters here, not the exact day: parse the date
# strings and keep just the year component.
df['Open Date'] = pd.to_datetime(df['Open Date']).dt.year

In [9]:
# Display the frame; 'Open Date' now holds the opening year.
df

Unnamed: 0,State,Open Date
0,Minnesota,2018.0
1,Florida,2019.0
2,Arizona,2019.0
3,Texas,2020.0
4,Texas,2021.0
...,...,...
60466,Minnesota,2020.0
60467,California,2020.0
60468,Utah,2021.0
60469,Nebraska,2021.0


In [10]:
# Some stations have no 'Open Date'. Fill every missing value in the frame
# with the sentinel 0 so the year column can be converted to integers.
df = df.fillna(0)
# Convert the year column to int.
df['Open Date'] = df['Open Date'].astype(int)
# Keep only the rows with a real year (sentinel 0 means "unavailable").
# A boolean mask selects by value, so it does not depend on the index labels
# lining up with row positions, and no helper variable shadows the builtin
# `list` that an earlier cell calls.
df = df[df['Open Date'] != 0]
df

Unnamed: 0,State,Open Date
0,Minnesota,2018
1,Florida,2019
2,Arizona,2019
3,Texas,2020
4,Texas,2021
...,...,...
60466,Minnesota,2020
60467,California,2020
60468,Utah,2021
60469,Nebraska,2021


## Now that we have the information for all 60470 stations, we need to reshape it into:
## "The number of stations for each state by year"

In [10]:
# Sorted array of every distinct opening year present in the data.
a = np.unique(df['Open Date'].to_numpy())
a

array([1974, 1976, 1978, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991,
       1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002,
       2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013,
       2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022])

In [11]:
# Column labels for the summary table: 'state' first, then one label per year.
col = ['state', *a]

In [12]:
# Build an all-zero table with one row per state and one column per year,
# ready to receive "the number of stations for each state by year".
df1 = (
    pd.DataFrame(columns=col)
    .assign(state=df['State'].unique())
    .fillna(0)
    .set_index('state')
)
df1.head()

Unnamed: 0_level_0,1974,1976,1978,1984,1985,1986,1987,1988,1989,1990,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Minnesota,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Florida,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Arizona,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Texas,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Illinois,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Print the distinct values of the 'State' column, in order of first appearance.
print(df['State'].unique())

['Minnesota' 'Florida' 'Arizona' 'Texas' 'Illinois' 'California'
 'North Carolina' 'Ohio' 'Georgia' 'Massachusetts' 'Kentucky' 'Michigan'
 'Iowa' 'Washington' 'New York' 'West Virginia' 'Colorado' 'Idaho'
 'Nevada' 'Kansas' 'Wisconsin' 'Indiana' 'Pennsylvania' 'Maryland'
 'South Carolina' 'Oklahoma' 'South Dakota' 'Tennessee' 'Alabama' 'Utah'
 'New Jersey' 'Oregon' 'Missouri' 'New Mexico' 'Virginia' 'Mississippi'
 'Maine' 'Vermont' 'Delaware' 'Louisiana' 'Rhode Island' 'Montana'
 'Connecticut' 'Hawaii' 'Nebraska' 'Arkansas' 'Ontario'
 'District of Columbia' 'New Hampshire' 'Alaska' 'Wyoming' 'North Dakota'
 'Puerto Rico']


In [14]:
# For every state and every year, count the stations whose opening year is
# less than or equal to that year (a running total), and store it in df1.
# pd.crosstab counts stations per (state, opening year) in one pass; reindex
# restores df1's row order and the full year list (missing pairs count as 0);
# the cumulative sum across the ascending year columns gives the running total.
# This avoids rescanning the whole frame once per state/year combination.
df1 = (
    pd.crosstab(df['State'], df['Open Date'])
    .reindex(index=df1.index, columns=a, fill_value=0)
    .cumsum(axis=1)
    .rename_axis(index='state', columns=None)
)

In [15]:
# Display the filled-in table of station counts per state and year.
df1

Unnamed: 0_level_0,1974,1976,1978,1984,1985,1986,1987,1988,1989,1990,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Minnesota,0,0,0,0,0,0,0,0,0,0,...,287,340,414,478,563,675,778,1011,1225,1252
Florida,0,0,0,0,0,0,0,0,0,0,...,334,379,552,687,864,1062,1380,1829,2813,2962
Arizona,0,0,0,1,1,1,1,1,1,1,...,210,231,313,350,383,434,532,642,1033,1123
Texas,0,0,0,0,0,0,0,0,0,0,...,551,634,840,955,1075,1282,1564,2038,2938,3097
Illinois,0,0,0,0,0,0,0,0,0,0,...,363,418,510,590,671,785,901,1079,1454,1623
California,0,0,0,0,0,0,0,0,0,2,...,1036,1235,1691,2128,2469,2973,3693,7163,14608,15312
North Carolina,0,0,0,0,0,0,0,0,0,0,...,336,382,454,541,639,729,846,1013,1369,1444
Ohio,0,1,1,1,1,2,2,2,2,2,...,229,301,369,466,520,603,738,893,1278,1371
Georgia,0,0,0,0,0,0,1,1,1,1,...,155,189,293,406,490,607,715,979,1655,1740
Massachusetts,0,0,0,0,0,0,0,0,0,0,...,105,132,200,225,246,331,487,933,2113,2261


In [16]:
# The data is now cleaned; write the per-state table to the processed folder.
path = '../processed/charging_station.csv'
df1.to_csv(path_or_buf=path)