# Introduction

In this NoteBook we will clean the dataset of NEV stations.

# Set up

In [1]:
import pandas as pd
from glob import glob
from pathlib import Path
import os

# Read the data
**Show total number of csv files in current directory**

In [2]:
files = glob('../Raw_Data/Station Counts by State and Fuel Type/*.csv')
len(files)

15

**Read and concat all files into one dataframe**

In [3]:
files = Path('../Raw_Data/Station Counts by State and Fuel Type').glob('*.csv') # get all csvs in your dir.

df = pd.DataFrame()
for file in files:
    #name = os.path.basename(file)
    data = pd.read_csv(file,index_col = 0,nrows=51,thousands=',')
    # Find the file name which is the year of current df, split with dot, year is the first part.
    year = os.path.basename(file).split('.')[0]
    # Add year as a new column
    data.insert(data.shape[1], 'Year', year)
    # Change specific column name
    if 'Electric(stations / charging outlets)' in data.columns:
        data = data.rename(columns = {'Electric(stations / charging outlets)':'Electric'})
        # Remove thousand comma mark
        data['Electric'] = data['Electric'].str.replace(',', '')
        # Keep last number in cell
        data['Electric'] = data['Electric'].str.split("/").str[-1]
    df = pd.concat([df,data])
print(df.shape)
df.head()

(765, 9)


Unnamed: 0_level_0,Biodiesel,CNG,E85,Electric,Hydrogen,LNG,Propane,Total,Year
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama,11,3,6,0,0,0,40,60,2008
Alaska,0,1,0,0,0,0,10,11,2008
Arizona,10,40,23,5,1,5,51,135,2008
Arkansas,2,3,7,0,0,0,37,49,2008
California,36,184,13,376,26,28,199,862,2008


**Check states number & Year range**

In [4]:
df.groupby('State').size().count(),df['Year'].min(),df['Year'].max()

(51, '2007', '2021')

# Output Cleaned Data Version

In [5]:
path = '../Processed_Data/station.csv'
df.to_csv(path)