# Imports

In [1]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import *
import pyspark.sql.functions as F

In [2]:
spark = (SparkSession
         .builder
         .appName('Chapter 3')
         .getOrCreate())
spark

# Introduction

In [3]:
# create our data
data = [[1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535, ["twitter", "LinkedIn"]],
       [2, "Brooke","Wenig","https://tinyurl.2", "5/5/2018", 8908, ["twitter", "LinkedIn"]],
       [3, "Denny", "Lee", "https://tinyurl.3","6/7/2019",7659, ["web", "twitter", "FB", "LinkedIn"]],
       [4, "Tathagata", "Das","https://tinyurl.4", "5/12/2018", 10568, ["twitter", "FB"]],
       [5, "Matei","Zaharia", "https://tinyurl.5", "5/14/2014", 40578, ["web", "twitter", "FB", "LinkedIn"]],
       [6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568, ["twitter", "LinkedIn"]]
      ]

In [4]:
ddl_schema = """
`Id` INT, `First` STRING, `Last` STRING, `URL` STRING, `PUBLISHED` STRING, `Hits` INT, `Campaigns` ARRAY<STRING> 
"""

In [5]:
df = spark.createDataFrame(data, ddl_schema)
df.show(5)

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              URL|PUBLISHED| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
+---+---------+-------+-----------------+---------+-----+--------------------+
only showing top 5 rows



In [6]:
df.schema

StructType(List(StructField(Id,IntegerType,true),StructField(First,StringType,true),StructField(Last,StringType,true),StructField(URL,StringType,true),StructField(PUBLISHED,StringType,true),StructField(Hits,IntegerType,true),StructField(Campaigns,ArrayType(StringType,true),true)))

In [7]:
df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- First: string (nullable = true)
 |-- Last: string (nullable = true)
 |-- URL: string (nullable = true)
 |-- PUBLISHED: string (nullable = true)
 |-- Hits: integer (nullable = true)
 |-- Campaigns: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [8]:
df.columns

['Id', 'First', 'Last', 'URL', 'PUBLISHED', 'Hits', 'Campaigns']

In [9]:
df.select(F.col('Id') + F.col('Id')).show(5)

+---------+
|(Id + Id)|
+---------+
|        2|
|        4|
|        6|
|        8|
|       10|
+---------+
only showing top 5 rows



In [10]:
df.select(F.expr('Id + Id')).show(5)

+---------+
|(Id + Id)|
+---------+
|        2|
|        4|
|        6|
|        8|
|       10|
+---------+
only showing top 5 rows



In [11]:
df.select(F.col('Hits') * 2).show()

+----------+
|(Hits * 2)|
+----------+
|      9070|
|     17816|
|     15318|
|     21136|
|     81156|
|     51136|
+----------+



In [12]:
df.withColumn('Big Hitters', F.col('Hits') > 10000).show()

+---+---------+-------+-----------------+---------+-----+--------------------+-----------+
| Id|    First|   Last|              URL|PUBLISHED| Hits|           Campaigns|Big Hitters|
+---+---------+-------+-----------------+---------+-----+--------------------+-----------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|      false|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|      false|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|      false|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|       true|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|       true|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|       true|
+---+---------+-------+-----------------+---------+-----+--------------------+-----------+



In [13]:
df.withColumn('AuthorsId', F.concat(F.col('First'), F.col('Last'), F.col('Id'))).show()

+---+---------+-------+-----------------+---------+-----+--------------------+-------------+
| Id|    First|   Last|              URL|PUBLISHED| Hits|           Campaigns|    AuthorsId|
+---+---------+-------+-----------------+---------+-----+--------------------+-------------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|  JulesDamji1|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]| BrookeWenig2|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|    DennyLee3|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|TathagataDas4|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|MateiZaharia5|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|  ReynoldXin6|
+---+---------+-------+-----------------+---------+-----+--------------------+-------------+



In [14]:
rows = [Row("Matei Zaharia", "CA"), Row("Reynold Xin", "CA")]

In [15]:
df_rows = spark.createDataFrame(rows, schema=['Author', 'State'])

In [16]:
df_rows.show()

+-------------+-----+
|       Author|State|
+-------------+-----+
|Matei Zaharia|   CA|
|  Reynold Xin|   CA|
+-------------+-----+



# DataFrame Common Operations 

In [17]:
fire_schema = StructType([
    StructField('CallNumber', IntegerType(), True),
    StructField('UnitID', StringType(), True),
    StructField('IncidentNumber', IntegerType(), True),
    StructField('CallType', StringType(), True),
    StructField('CallDate', StringType(), True),
    StructField('WatchDate', StringType(), True),
    StructField('CallFinalDisposition', StringType(), True),
    StructField('AvailableDtTm', StringType(), True),
    StructField('Address', StringType(), True),
    StructField('City', StringType(), True),
    StructField('Zipcode', IntegerType(), True),
    StructField('Battalion', StringType(), True),
    StructField('StationArea', StringType(), True),
    StructField('Box', StringType(), True),
    StructField('OriginalPriority', StringType(), True),
    StructField('Priority', StringType(), True),
    StructField('FinalPriority', IntegerType(), True),
    StructField('ALSUnit', BooleanType(), True),
    StructField('CallTypeGroup', StringType(), True),
    StructField('NumAlarms', IntegerType(), True),
    StructField('UnitType', StringType(), True),
    StructField('UnitSequenceInCallDispatch', IntegerType(), True),
    StructField('FirePreventionDistrict', StringType(), True),
    StructField('SupervisorDistrict', StringType(), True),
    StructField('Neighborhood', StringType(), True),
    StructField('Location', StringType(), True),
    StructField('RowID', StringType(), True),
    StructField('Delay', FloatType(), True)
]
)

In [18]:
sf_fire_calls = (spark
                 .read
                 .format('csv')
                 .option('header', 'true')
                 .option('schema', fire_schema)
                 .load('data/sf-fire-calls.csv'))

In [19]:
sf_fire_calls.explain()

== Physical Plan ==
FileScan csv [CallNumber#177,UnitID#178,IncidentNumber#179,CallType#180,CallDate#181,WatchDate#182,CallFinalDisposition#183,AvailableDtTm#184,Address#185,City#186,Zipcode#187,Battalion#188,StationArea#189,Box#190,OriginalPriority#191,Priority#192,FinalPriority#193,ALSUnit#194,CallTypeGroup#195,NumAlarms#196,UnitType#197,UnitSequenceInCallDispatch#198,FirePreventionDistrict#199,SupervisorDistrict#200,... 4 more fields] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex[file:/Users/imad/Documents/courses/data-engineering/big-data/notebooks/Learning..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<CallNumber:string,UnitID:string,IncidentNumber:string,CallType:string,CallDate:string,Watc...




In [20]:
sf_fire_calls.count()

175296

In [21]:
(sf_fire_calls
 .select('IncidentNumber', 'AvailableDtTm', 'CallType')
 .where(F.col('CallType') != 'Medical Incident')
 .show(5, truncate=False))

+--------------+----------------------+--------------+
|IncidentNumber|AvailableDtTm         |CallType      |
+--------------+----------------------+--------------+
|2003235       |01/11/2002 01:51:44 AM|Structure Fire|
|2003250       |01/11/2002 04:16:46 AM|Vehicle Fire  |
|2003259       |01/11/2002 06:01:58 AM|Alarms        |
|2003279       |01/11/2002 08:03:26 AM|Structure Fire|
|2003301       |01/11/2002 09:46:44 AM|Alarms        |
+--------------+----------------------+--------------+
only showing top 5 rows



In [22]:
(sf_fire_calls
 .select('CallType')
 .distinct()
 .count())

30

In [23]:
(sf_fire_calls
 .select('CallType')
 .where(F.col('CallType').isNotNull())
 .agg(F.countDistinct('CallType').alias('DistinctCallTypes'))
 .show())

+-----------------+
|DistinctCallTypes|
+-----------------+
|               30|
+-----------------+



In [24]:
(sf_fire_calls
 .withColumnRenamed('Delay', 'ResponseDelayedinMins')
 .select('ResponseDelayedinMins')
 .where(F.col('ResponseDelayedinMins') > 5)
 .show(5, False))

+---------------------+
|ResponseDelayedinMins|
+---------------------+
|6.25                 |
|7.25                 |
|11.916667            |
|8.633333             |
|95.28333             |
+---------------------+
only showing top 5 rows



In [25]:
fire_ts_df = (sf_fire_calls
              .withColumn('IncidentDate', F.to_timestamp(F.col('CallDate'), 'MM/dd/yyyy'))
              .drop('CallDate')
              .withColumn('OnWatchDate', F.to_timestamp(F.col('WatchDate'), 'MM/dd/yyyy'))
              .drop('WatchDate')
              .withColumn('AvailableDtTS', F.to_timestamp(F.col('AvailableDtTm'), 'MM/dd/yyyy hh:mm:ss a'))
              .drop('AvailableDtTm'))

In [26]:
(fire_ts_df
 .select('IncidentDate', 'OnWatchDate', 'AvailableDtTS')
 .show(5, False))

+-------------------+-------------------+-------------------+
|IncidentDate       |OnWatchDate        |AvailableDtTS      |
+-------------------+-------------------+-------------------+
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 01:51:44|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 03:01:18|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 02:39:50|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 04:16:46|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 06:01:58|
+-------------------+-------------------+-------------------+
only showing top 5 rows



In [27]:
(fire_ts_df
 .select(F.year('IncidentDate'))
 .distinct()
 .orderBy(F.year('IncidentDate'))
 .show(20))

+------------------+
|year(IncidentDate)|
+------------------+
|              2000|
|              2001|
|              2002|
|              2003|
|              2004|
|              2005|
|              2006|
|              2007|
|              2008|
|              2009|
|              2010|
|              2011|
|              2012|
|              2013|
|              2014|
|              2015|
|              2016|
|              2017|
|              2018|
+------------------+



In [28]:
(fire_ts_df
 .select('CallType')
 .where(F.col('CallType').isNotNull())
 .groupBy('CallType')
 .count()
 .orderBy('count', ascending=False)
 .show(10, False))

+-------------------------------+------+
|CallType                       |count |
+-------------------------------+------+
|Medical Incident               |113794|
|Structure Fire                 |23319 |
|Alarms                         |19406 |
|Traffic Collision              |7013  |
|Citizen Assist / Service Call  |2524  |
|Other                          |2166  |
|Outside Fire                   |2094  |
|Vehicle Fire                   |854   |
|Gas Leak (Natural and LP Gases)|764   |
|Water Rescue                   |755   |
+-------------------------------+------+
only showing top 10 rows



In [29]:
fire_ts_df.explain(True)

== Parsed Logical Plan ==
Project [CallNumber#177, UnitID#178, IncidentNumber#179, CallType#180, CallFinalDisposition#183, Address#185, City#186, Zipcode#187, Battalion#188, StationArea#189, Box#190, OriginalPriority#191, Priority#192, FinalPriority#193, ALSUnit#194, CallTypeGroup#195, NumAlarms#196, UnitType#197, UnitSequenceInCallDispatch#198, FirePreventionDistrict#199, SupervisorDistrict#200, Neighborhood#201, Location#202, RowID#203, ... 4 more fields]
+- Project [CallNumber#177, UnitID#178, IncidentNumber#179, CallType#180, CallFinalDisposition#183, AvailableDtTm#184, Address#185, City#186, Zipcode#187, Battalion#188, StationArea#189, Box#190, OriginalPriority#191, Priority#192, FinalPriority#193, ALSUnit#194, CallTypeGroup#195, NumAlarms#196, UnitType#197, UnitSequenceInCallDispatch#198, FirePreventionDistrict#199, SupervisorDistrict#200, Neighborhood#201, Location#202, ... 5 more fields]
   +- Project [CallNumber#177, UnitID#178, IncidentNumber#179, CallType#180, CallFinalDispo

In [30]:
(fire_ts_df
 .select(F.sum('NumAlarms'), F.avg('Delay'), F.min('Delay'), F.max('Delay'))
 .show())

+--------------+------------------+-----------+----------+
|sum(NumAlarms)|        avg(Delay)| min(Delay)|max(Delay)|
+--------------+------------------+-----------+----------+
|      176170.0|3.8923641541750134|0.016666668|      99.9|
+--------------+------------------+-----------+----------+



In [31]:
# What were all the different types of fire calls in 2018?
(fire_ts_df
 .select('CallType')
 .where(F.year('IncidentDate') == 2018)
 .distinct()
 .show())

+--------------------+
|            CallType|
+--------------------+
|Elevator / Escala...|
|              Alarms|
|Odor (Strange / U...|
|Citizen Assist / ...|
|              HazMat|
|           Explosion|
|        Vehicle Fire|
|  Suspicious Package|
|               Other|
|        Outside Fire|
|   Traffic Collision|
|       Assist Police|
|Gas Leak (Natural...|
|        Water Rescue|
|   Electrical Hazard|
|      Structure Fire|
|    Medical Incident|
|          Fuel Spill|
|Smoke Investigati...|
|Train / Rail Inci...|
+--------------------+



In [32]:
# What months within the year 2018 saw the highest number of fire calls?
(fire_ts_df
 .select(F.month('IncidentDate').alias('month'))
 .where(F.year('IncidentDate') == 2018)
 .groupBy('month')
 .count()
 .orderBy('count', ascending=False)
 .show())

+-----+-----+
|month|count|
+-----+-----+
|   10| 1068|
|    5| 1047|
|    3| 1029|
|    8| 1021|
|    1| 1007|
|    7|  974|
|    6|  974|
|    9|  951|
|    4|  947|
|    2|  919|
|   11|  199|
+-----+-----+



In [33]:
# Which neighborhood in San Francisco generated the most fire calls in 2018?
(fire_ts_df
 .select('Neighborhood')
 .where(F.year('IncidentDate') == 2018)
 .groupBy('Neighborhood')
 .count()
 .orderBy('count', ascending=False)
 .show(1))

+------------+-----+
|Neighborhood|count|
+------------+-----+
|  Tenderloin| 1393|
+------------+-----+
only showing top 1 row



In [34]:
(fire_ts_df
 .select('Neighborhood')
 .where(F.year('IncidentDate') == 2018)
 .explain())

== Physical Plan ==
*(1) Project [Neighborhood#201]
+- *(1) Filter (year(cast(gettimestamp(CallDate#181, MM/dd/yyyy, Some(America/Chicago)) as date)) = 2018)
   +- FileScan csv [CallDate#181,Neighborhood#201] Batched: false, DataFilters: [(year(cast(gettimestamp(CallDate#181, MM/dd/yyyy, Some(America/Chicago)) as date)) = 2018)], Format: CSV, Location: InMemoryFileIndex[file:/Users/imad/Documents/courses/data-engineering/big-data/notebooks/Learning..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<CallDate:string,Neighborhood:string>




In [35]:
(fire_ts_df
 .where(F.year('IncidentDate') == 2018)
 .select('Neighborhood')
 .explain())

== Physical Plan ==
*(1) Project [Neighborhood#201]
+- *(1) Filter (year(cast(gettimestamp(CallDate#181, MM/dd/yyyy, Some(America/Chicago)) as date)) = 2018)
   +- FileScan csv [CallDate#181,Neighborhood#201] Batched: false, DataFilters: [(year(cast(gettimestamp(CallDate#181, MM/dd/yyyy, Some(America/Chicago)) as date)) = 2018)], Format: CSV, Location: InMemoryFileIndex[file:/Users/imad/Documents/courses/data-engineering/big-data/notebooks/Learning..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<CallDate:string,Neighborhood:string>


