In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, sum, col, lower
import os

In [2]:
aws_key = os.getenv('AWS_API_ID')
aws_secret = os.environ.get('AWS_API_SECRET')

In [3]:
spark = SparkSession \
        .builder \
        .appName("Wrangling Data") \
        .getOrCreate()

spark._jsc.hadoopConfiguration().set("fs.s3a.access.key", aws_key)
spark._jsc.hadoopConfiguration().set("fs.s3a.secret.key", aws_secret)
spark._jsc.hadoopConfiguration().set("fs.s3a.impl","org.apache.hadoop.fs.s3a.S3AFileSystem")

ev_bucket = "udac-evs"
ev_path = "Light_Duty_Vehicles.csv"
ev_population_path = "Electric_Vehicle_Population_Data.csv"
us_population_path = "us_population.csv"
us_state_code_path = "state_code.csv"

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/27 17:01:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# EV Vehicles

In [4]:
ev_df = spark.read.option("header",True).csv(f"s3a://{ev_bucket}/{ev_path}")
ev_df.show(truncate=False)

22/12/27 17:01:19 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


[Stage 0:>                                                          (0 + 1) / 1]                                                                                

22/12/27 17:01:45 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


[Stage 1:>                                                          (0 + 1) / 1]

+----------------------+-------+----------------------------+---------------+-----------+---------------------------+----------+-----------------------------+--------------------------------+---------------------------------+------------------------------+---------------------------------+----------------------------------+-----------------+-----------+--------------------------------+---------------------+----------------------------------+------------+----------------------------+-----------+---------+---------------+-----------------------+-------------------+----------------+---------+-----+----------+
|Vehicle ID            |Fuel ID|Fuel Configuration ID       |Manufacturer ID|Category ID|Model                      |Model Year|Alternative Fuel Economy City|Alternative Fuel Economy Highway|Alternative Fuel Economy Combined|Conventional Fuel Economy City|Conventional Fuel Economy Highway|Conventional Fuel Economy Combined|Transmission Type|Engine Type|Engine Size                    

                                                                                

In [5]:
ev_df = ev_df.where(col('Model').isNotNull())  \
    .where(col('Model Year').isNotNull())  \
    .where(col('Manufacturer').isNotNull())  \
    .select(
    lower(col('Model')).alias('model_name'), 
    col('Model Year').alias('year'), 
    col('Transmission Type').alias('transmission_type'), 
    col('Engine Type').alias('engine_type'), 
    col('Engine Size').alias('engine_size'), 
    lower(col('Manufacturer')).alias('manufacturer'), 
    col('Category').alias('category'), 
    col('Fuel').alias('fuel')) \
    .dropDuplicates()
ev_df.count()


                                                                                

2477

# EV Population Vehicles

In [6]:
ev_pop_df = spark.read.option("header",True).csv(f"s3a://{ev_bucket}/{ev_population_path}")
ev_pop_df.printSchema()
ev_pop_df.count()

[Stage 8:>                                                          (0 + 1) / 1]                                                                                

root
 |-- VIN (1-10): string (nullable = true)
 |-- County: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- ZIP Code: string (nullable = true)
 |-- Model Year: string (nullable = true)
 |-- Make: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Electric Vehicle Type: string (nullable = true)
 |-- Clean Alternative Fuel Vehicle (CAFV) Eligibility: string (nullable = true)
 |-- Electric Range: string (nullable = true)
 |-- Base MSRP: string (nullable = true)
 |-- Legislative District: string (nullable = true)
 |-- DOL Vehicle ID: string (nullable = true)
 |-- Vehicle Location: string (nullable = true)



                                                                                

62261

In [7]:
ev_pop_df = ev_pop_df.where(col('State').isNotNull())  \
    .where(col('Model Year').isNotNull())  \
    .where(col('Make').isNotNull())  \
    .where(col('Model').isNotNull())  \
    .select(col('County').alias('county'), 
            col('State').alias('state'), 
            col('Model Year').alias('model_year'), 
            lower(col('Make')).alias('make'), 
            lower(col('Model')).alias('model'), 
            col('Electric Vehicle Type').alias('electric_vehicle_type'))
ev_pop_df.show(truncate=False)


+---------+-----+----------+---------+---------------+--------------------------------------+
|county   |state|model_year|make     |model          |electric_vehicle_type                 |
+---------+-----+----------+---------+---------------+--------------------------------------+
|King     |WA   |2020      |kia      |niro           |Battery Electric Vehicle (BEV)        |
|King     |WA   |2019      |kia      |niro electric  |Battery Electric Vehicle (BEV)        |
|King     |WA   |2020      |kia      |niro           |Battery Electric Vehicle (BEV)        |
|Thurston |WA   |2019      |chevrolet|bolt           |Battery Electric Vehicle (BEV)        |
|Chelan   |WA   |2015      |nissan   |leaf           |Battery Electric Vehicle (BEV)        |
|King     |WA   |2015      |ford     |c-max energi   |Plug-in Hybrid Electric Vehicle (PHEV)|
|King     |WA   |2017      |chevrolet|bolt           |Battery Electric Vehicle (BEV)        |
|King     |WA   |2020      |kia      |niro           |Batter

# US Population

In [8]:
us_pop_df = spark.read.option("header",True).csv(f"s3a://{ev_bucket}/{us_population_path}")
us_pop_df = us_pop_df.withColumn("population", col("POPESTIMATE2019").cast('int')).drop('POPESTIMATE2019')

total_pop = us_pop_df.agg(sum("population")).collect()[0][0]

In [9]:
def population_ratio(state_population):
    return round((state_population / total_pop) * 100, 1)
 
population_ratio_udf = udf(population_ratio)

In [10]:
us_state_df = spark.read.option("header",True).csv(f"s3a://{ev_bucket}/{us_state_code_path}")
us_state_pop_ratio_df = us_pop_df.join(us_state_df, us_pop_df.STATE == us_state_df.state, "left")  \
                            .select(us_state_df.state, us_state_df.code, col('population'))  \
                            .withColumn('ratio', population_ratio_udf(col('population')))  \
                            .drop('STATE')  \
                            .dropDuplicates()
us_state_pop_ratio_df.count()

                                                                                

51

In [11]:
us_state_pop_ratio_df.show(30, truncate=False)

+----+----------+-----+
|code|population|ratio|
+----+----------+-----+
|MN  |5639632   |1.7  |
|NJ  |8882190   |2.7  |
|HI  |1415872   |0.4  |
|PA  |12801989  |3.9  |
|MA  |6892503   |2.1  |
|OK  |3956971   |1.2  |
|MD  |6045680   |1.8  |
|SC  |5148714   |1.6  |
|IL  |12671821  |3.9  |
|VT  |623989    |0.2  |
|OR  |4217737   |1.3  |
|IA  |3155070   |1.0  |
|ND  |762062    |0.2  |
|SD  |884659    |0.3  |
|FL  |21477737  |6.5  |
|NV  |3080156   |0.9  |
|OH  |11689100  |3.6  |
|KS  |2913314   |0.9  |
|WY  |578759    |0.2  |
|WV  |1792147   |0.5  |
|UT  |3205958   |1.0  |
|MS  |2976149   |0.9  |
|AR  |3017804   |0.9  |
|NE  |1934408   |0.6  |
|WI  |5822434   |1.8  |
|CT  |3565287   |1.1  |
|NY  |19453561  |5.9  |
|LA  |4648794   |1.4  |
|CA  |39512223  |12.0 |
|MT  |1068778   |0.3  |
+----+----------+-----+
only showing top 30 rows



# Consolidating Output

In [None]:
ev_demographics_df = ev_pop_df.join(us_state_pop_ratio_df, us_state_pop_ratio_df.code == ev_pop_df.state, "left")  \
                                .join(ev_df, (ev_df.model_name == ev_pop_df.model) & (ev_df.year == ev_pop_df.model_year) & (ev_df.manufacturer == ev_pop_df.make), "left")  \
                                .drop('model_year', 'model_year', 'make', 'code')

ev_demographics_df.printSchema()
ev_demographics_df.write.format("parquet").partitionBy('state').mode("overwrite").save(f's3a://{ev_bucket}/output')



root
 |-- county: string (nullable = true)
 |-- state: string (nullable = true)
 |-- model: string (nullable = true)
 |-- electric_vehicle_type: string (nullable = true)
 |-- population: integer (nullable = true)
 |-- ratio: string (nullable = true)
 |-- model_name: string (nullable = true)
 |-- year: string (nullable = true)
 |-- transmission_type: string (nullable = true)
 |-- engine_type: string (nullable = true)
 |-- engine_size: string (nullable = true)
 |-- manufacturer: string (nullable = true)
 |-- category: string (nullable = true)
 |-- fuel: string (nullable = true)



                                                                                