In [137]:
# Dependencies and Setup
import pandas as pd
from pathlib import Path
import scipy.stats as st
import matplotlib.pyplot as plt

# reading each file
data_2020 = pd.read_csv(vehicle_data_2020, low_memory=False)
data_2021 = pd.read_csv(vehicle_data_2021, low_memory=False)
data_2022 = pd.read_csv(vehicle_data_2022, low_memory=False)
county_data = pd.read_csv(census_data)

# adding a year column
data_2020['Year'] = 2020
data_2021['Year'] = 2021
data_2022['Year'] = 2022

# Concatenate the DataFrames
merged_data = pd.concat([data_2020, data_2021, data_2022], ignore_index=True)

# resetting the index of the DataFrame
merged_data.reset_index(drop=True, inplace=True)

# Display the merged DataFrame
print(merged_data)


merged_data.head()

             Date Zip Code Model Year              Fuel       Make   Duty  \
0        1/1/2020    90001       2007          Gasoline      ACURA  Light   
1        1/1/2020    90002       2007          Gasoline      ACURA  Light   
2        1/1/2020    90003       2007          Gasoline      ACURA  Light   
3        1/1/2020    90004       2007          Gasoline      ACURA  Light   
4        1/1/2020    90006       2007          Gasoline      ACURA  Light   
...           ...      ...        ...               ...        ...    ...   
2002823  1/1/2022      OOS      <2008         Flex-Fuel   CHRYSLER  Light   
2002824  1/1/2022      OOS      <2008         Flex-Fuel   CHRYSLER    Unk   
2002825  1/1/2022      OOS      <2008         Flex-Fuel      MAZDA  Light   
2002826  1/1/2022      OOS      <2008         Flex-Fuel    MERCURY  Light   
2002827  1/1/2022      OOS      <2008  Battery Electric  OTHER/UNK  Light   

         Vehicles  Year  
0              15  2020  
1              20  2020

Unnamed: 0,Date,Zip Code,Model Year,Fuel,Make,Duty,Vehicles,Year
0,1/1/2020,90001,2007,Gasoline,ACURA,Light,15,2020
1,1/1/2020,90002,2007,Gasoline,ACURA,Light,20,2020
2,1/1/2020,90003,2007,Gasoline,ACURA,Light,29,2020
3,1/1/2020,90004,2007,Gasoline,ACURA,Light,19,2020
4,1/1/2020,90006,2007,Gasoline,ACURA,Light,15,2020


In [138]:
records_count = len(merged_data)
records_count

2002828

In [139]:
fuel_types = merged_data['Fuel'].unique()

# Print the unique fuel types in this set
print("Fuel types:", fuel_types)

Fuel types: ['Gasoline' 'Natural Gas' 'Diesel and Diesel Hybrid' 'Flex-Fuel'
 'Hybrid Gasoline' 'Other' 'Battery Electric' 'Hydrogen Fuel Cell'
 'Plug-in Hybrid']


In [140]:
# Dropping the columns for model year, make, and duty
merged_data.drop(['Model Year', 'Make', 'Duty'], axis=1, inplace=True)
merged_data.rename(columns={'Zip Code': 'Zipcode'}, inplace=True)
merged_data.head()

Unnamed: 0,Date,Zipcode,Fuel,Vehicles,Year
0,1/1/2020,90001,Gasoline,15,2020
1,1/1/2020,90002,Gasoline,20,2020
2,1/1/2020,90003,Gasoline,29,2020
3,1/1/2020,90004,Gasoline,19,2020
4,1/1/2020,90006,Gasoline,15,2020


In [141]:
# saving the merged data to a new csv, after the dropped fields
merged_data.to_csv('merged_data_modified.csv', index=False)

In [142]:
# merging the vehicle data to the zip and county data 
census_data = ('census_data.csv')
vehicle_data_updt = ('merged_data_modified.csv')

# reading each file
county_data = pd.read_csv(census_data, dtype={
    'Zipcode': 'object',
    'county': 'object'})
all_vehicle_data = pd.read_csv(vehicle_data_updt, low_memory=False)

In [143]:
county_data.drop(['Unnamed: 0', 'Population', 'Household Income', 'state_fips', 'state', 'state_abbr', 'city', 'Year'], axis=1, inplace=True)
county_data.head()

Unnamed: 0,Zipcode,county
0,90001,Los Angeles
1,90001,Los Angeles
2,90001,Los Angeles
3,90001,Los Angeles
4,90001,Los Angeles


In [144]:
# Merge the dataframes on the 'Zip Code' column
merged_data_all = pd.merge(all_vehicle_data, county_data, on='Zipcode', how='left')

# Display the merged dataframe
merged_data_all.head()
merged_data_all

Unnamed: 0,Date,Zipcode,Fuel,Vehicles,Year,county
0,1/1/2020,90001,Gasoline,15,2020,Los Angeles
1,1/1/2020,90001,Gasoline,15,2020,Los Angeles
2,1/1/2020,90001,Gasoline,15,2020,Los Angeles
3,1/1/2020,90001,Gasoline,15,2020,Los Angeles
4,1/1/2020,90001,Gasoline,15,2020,Los Angeles
...,...,...,...,...,...,...
8762195,1/1/2022,OOS,Flex-Fuel,24,2022,
8762196,1/1/2022,OOS,Flex-Fuel,26,2022,
8762197,1/1/2022,OOS,Flex-Fuel,14,2022,
8762198,1/1/2022,OOS,Flex-Fuel,20,2022,


In [145]:
desired_county = 'Alameda'
desired_year = 2022

# filtering dataframe for the desired county and year
county_year_data = merged_data_all.loc[(merged_data_all['county'] == desired_county) & (merged_data_all['Year'] == desired_year)]
county_year_data

Unnamed: 0,Date,Zipcode,Fuel,Vehicles,Year,county
7676193,1/1/2022,94501,Gasoline,4,2022,Alameda
7676194,1/1/2022,94501,Gasoline,4,2022,Alameda
7676195,1/1/2022,94501,Gasoline,4,2022,Alameda
7676196,1/1/2022,94501,Gasoline,4,2022,Alameda
7676197,1/1/2022,94501,Gasoline,4,2022,Alameda
...,...,...,...,...,...,...
7911785,1/1/2022,94710,Flex-Fuel,10,2022,Alameda
7911786,1/1/2022,94710,Flex-Fuel,10,2022,Alameda
7911787,1/1/2022,94710,Flex-Fuel,10,2022,Alameda
7911788,1/1/2022,94710,Flex-Fuel,10,2022,Alameda


In [146]:
battery_electric_data = merged_data_all[merged_data_all['Fuel'] == 'Battery Electric']
#battery_electric_data = battery_electric_data.drop_duplicates()
battery_electric_data

Unnamed: 0,Date,Zipcode,Fuel,Vehicles,Year,county
94987,1/1/2020,92220,Battery Electric,1,2020,Riverside
94988,1/1/2020,92220,Battery Electric,1,2020,Riverside
94989,1/1/2020,92220,Battery Electric,1,2020,Riverside
94990,1/1/2020,92220,Battery Electric,1,2020,Riverside
94991,1/1/2020,92220,Battery Electric,1,2020,Riverside
...,...,...,...,...,...,...
8762058,1/1/2022,OOS,Battery Electric,17,2022,
8762059,1/1/2022,OOS,Battery Electric,22,2022,
8762060,1/1/2022,OOS,Battery Electric,14,2022,
8762061,1/1/2022,OOS,Battery Electric,278,2022,


In [148]:
# filtering the DataFrame by 'Zip Code' to calculate the sum of 'Vehicles' for each zip code
battery_electric_totals = battery_electric_data.groupby(['county','Fuel', 'Year']).agg({'Vehicles': 'count'}).reset_index()

battery_electric_totals


Unnamed: 0,county,Fuel,Year,Vehicles
0,Alameda,Battery Electric,2020,4340
1,Alameda,Battery Electric,2021,4750
2,Alameda,Battery Electric,2022,5580
3,Alpine,Battery Electric,2020,5
4,Alpine,Battery Electric,2021,15
...,...,...,...,...
165,Yolo,Battery Electric,2021,615
166,Yolo,Battery Electric,2022,765
167,Yuba,Battery Electric,2020,150
168,Yuba,Battery Electric,2021,160


In [153]:
# dropping two counties: Sierra and Modoc, total counties will be 56

# filtering the DataFrame by ‘Zip Code’ to calculate the sum of ‘Vehicles’ for each zip code
battery_electric_totals = battery_electric_data.groupby(['county','Fuel', 'Year']).agg({'Vehicles': 'count'}).reset_index()
# We also need to drop any counties where we don’t have all three years of data...
# To do that, we want to find a list of counties
keepcounty = battery_electric_totals.groupby('county').count()
keepcounty = keepcounty.loc[keepcounty['Fuel'] == 3]
# Now we can delete any rows from multi_census_pd
# Where the Zipcode is not in the ‘keepzip’ file
battery_electric_totals = battery_electric_totals[battery_electric_totals['county'].isin(keepcounty.index)]
# totals for each zip code
battery_electric_totals

Unnamed: 0,county,Fuel,Year,Vehicles
0,Alameda,Battery Electric,2020,4340
1,Alameda,Battery Electric,2021,4750
2,Alameda,Battery Electric,2022,5580
3,Alpine,Battery Electric,2020,5
4,Alpine,Battery Electric,2021,15
...,...,...,...,...
165,Yolo,Battery Electric,2021,615
166,Yolo,Battery Electric,2022,765
167,Yuba,Battery Electric,2020,150
168,Yuba,Battery Electric,2021,160


In [154]:
# saving the merged data to a new csv, after the dropped fields
battery_electric_totals.to_csv('battery_electric_totals.csv', index=False)

In [155]:
num_counties = battery_electric_totals['county'].nunique()
num_counties

56