# Notebook for Sample Spark Session
This notebook creates a Spark session for the Electric Vehicle Population dataset, loads a CSV file from GCS, and performs fundamental Spark operations.

In [1]:
# !spark-shell

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, sum, max

# Build (or reuse) the Spark session used throughout this notebook.
spark = (
    SparkSession.builder
    .appName("Spark Session for Electric Vehicle Population")
    .getOrCreate()
)

# Location of the Electric Vehicle Population CSV in Google Cloud Storage.
gcs_bucket = "kristin-0105"
path = f"gs://{gcs_bucket}/dataproc_serverless/notebooks/datasets/electric_vehicle_population.csv"

# Read the CSV using its first line as the column names, then preview it.
df = spark.read.option("header", True).csv(path)
df.show()

+----------+---------+------------+-----+-----------+----------+---------+-------+---------------------+-------------------------------------------------+--------------+---------+--------------------+--------------+--------------------+--------------------+-----------------+
|VIN (1-10)|   County|        City|State|Postal Code|Model Year|     Make|  Model|Electric Vehicle Type|Clean Alternative Fuel Vehicle (CAFV) Eligibility|Electric Range|Base MSRP|Legislative District|DOL Vehicle ID|    Vehicle Location|    Electric Utility|2020 Census Tract|
+----------+---------+------------+-----+-----------+----------+---------+-------+---------------------+-------------------------------------------------+--------------+---------+--------------------+--------------+--------------------+--------------------+-----------------+
|5YJ3E1EB2J|  Suffolk|     Suffolk|   VA|      23435|      2018|    TESLA|MODEL 3| Battery Electric ...|                             Clean Alternative...|           215|   

In [3]:
# Print column names and types. Every column is reported as string because
# the CSV was loaded with only header=True (no schema inference or explicit schema).
df.printSchema()

root
 |-- VIN (1-10): string (nullable = true)
 |-- County: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Postal Code: string (nullable = true)
 |-- Model Year: string (nullable = true)
 |-- Make: string (nullable = true)
 |-- Model: string (nullable = true)
 |-- Electric Vehicle Type: string (nullable = true)
 |-- Clean Alternative Fuel Vehicle (CAFV) Eligibility: string (nullable = true)
 |-- Electric Range: string (nullable = true)
 |-- Base MSRP: string (nullable = true)
 |-- Legislative District: string (nullable = true)
 |-- DOL Vehicle ID: string (nullable = true)
 |-- Vehicle Location: string (nullable = true)
 |-- Electric Utility: string (nullable = true)
 |-- 2020 Census Tract: string (nullable = true)



In [4]:
# Preview only the first 5 rows instead of the default-sized listing.
df.show(5)

+----------+--------+--------+-----+-----------+----------+-----+-------+---------------------+-------------------------------------------------+--------------+---------+--------------------+--------------+--------------------+----------------+-----------------+
|VIN (1-10)|  County|    City|State|Postal Code|Model Year| Make|  Model|Electric Vehicle Type|Clean Alternative Fuel Vehicle (CAFV) Eligibility|Electric Range|Base MSRP|Legislative District|DOL Vehicle ID|    Vehicle Location|Electric Utility|2020 Census Tract|
+----------+--------+--------+-----+-----------+----------+-----+-------+---------------------+-------------------------------------------------+--------------+---------+--------------------+--------------+--------------------+----------------+-----------------+
|5YJ3E1EB2J| Suffolk| Suffolk|   VA|      23435|      2018|TESLA|MODEL 3| Battery Electric ...|                             Clean Alternative...|           215|        0|                null|     476647986|POINT

In [5]:
# Count the records in the DataFrame (triggers a Spark job) and report the total.
num_rows = df.count()
print("number of rows: ", num_rows)

number of rows:  121978


In [6]:
# Summary statistics (count, mean, stddev, min, max) for the 'Model Year' column.
# NOTE(review): the column is typed as string in the schema — confirm that
# min/max are compared as intended if non-numeric values can appear.
df.describe('Model Year').show()

+-------+------------------+
|summary|        Model Year|
+-------+------------------+
|  count|            121978|
|   mean|2019.3088999655677|
| stddev|2.9563504561406617|
|    min|              1997|
|    max|              2023|
+-------+------------------+



In [7]:
# Number of distinct values in the 'Model' column.
df.select('Model').distinct().count()

120

In [8]:
# Show the first 5 vehicles with a model year after 2020.
# 'Model Year' is loaded as a string column, so cast it to int explicitly
# rather than relying on Spark's implicit string-to-number coercion.
df.filter(df['Model Year'].cast('int') > 2020).show(5)

+----------+---------+---------+-----+-----------+----------+-----+-------+---------------------+-------------------------------------------------+--------------+---------+--------------------+--------------+--------------------+--------------------+-----------------+
|VIN (1-10)|   County|     City|State|Postal Code|Model Year| Make|  Model|Electric Vehicle Type|Clean Alternative Fuel Vehicle (CAFV) Eligibility|Electric Range|Base MSRP|Legislative District|DOL Vehicle ID|    Vehicle Location|    Electric Utility|2020 Census Tract|
+----------+---------+---------+-----+-----------+----------+-----+-------+---------------------+-------------------------------------------------+--------------+---------+--------------------+--------------+--------------------+--------------------+-----------------+
|WA1LAAGE7M|   Yakima|   Yakima|   WA|      98908|      2021| AUDI| E-TRON| Battery Electric ...|                             Clean Alternative...|           222|        0|                  14|

In [9]:
# Sort vehicles by electric range, ascending.
# 'Electric Range' is loaded as a string column, so cast it to int to get a
# numeric ordering instead of a lexicographic one (e.g. "100" before "20").
# show() returns None, so keep the sorted DataFrame in the variable and
# display it in a separate statement.
sortedByElectricRange = df.orderBy(df['Electric Range'].cast('int'))
sortedByElectricRange.show(10)

+----------+------+-------------+-----+-----------+----------+----------+-------+---------------------+-------------------------------------------------+--------------+---------+--------------------+--------------+--------------------+--------------------+-----------------+
|VIN (1-10)|County|         City|State|Postal Code|Model Year|      Make|  Model|Electric Vehicle Type|Clean Alternative Fuel Vehicle (CAFV) Eligibility|Electric Range|Base MSRP|Legislative District|DOL Vehicle ID|    Vehicle Location|    Electric Utility|2020 Census Tract|
+----------+------+-------------+-----+-----------+----------+----------+-------+---------------------+-------------------------------------------------+--------------+---------+--------------------+--------------+--------------------+--------------------+-----------------+
|5YJYGDEE7M|  King|         Kent|   WA|      98030|      2021|     TESLA|MODEL Y| Battery Electric ...|                             Eligibility unkno...|             0|       

In [10]:
# Reference: https://www.nbshare.io/notebook/97969492/Data-Analysis-With-Pyspark-Dataframe/