## Example 3.3. Fire Calls


Using DataFrameReader.
Let's read a large CSV file containing data on San Francisco Fire Department calls. 
We will define a schema for this file and use the DataFrameReader class and its methods to tell Spark what to do.

In [2]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._


In [5]:
import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType, FloatType, BooleanType};

// Explicit schema for the SF Fire Department calls CSV.
// Fields are added in the order the reader should map them to the file's columns,
// and every field is declared nullable.
val fireSchema = new StructType()
  .add("CallNumber", IntegerType, nullable = true)
  .add("UnitID", StringType, nullable = true)
  .add("IncidentNumber", IntegerType, nullable = true)
  .add("CallType", StringType, nullable = true)
  // Date/time columns are read as plain strings here.
  .add("CallDate", StringType, nullable = true)
  .add("WatchDate", StringType, nullable = true)
  .add("CallFinalDisposition", StringType, nullable = true)
  .add("AvailableDtTm", StringType, nullable = true)
  .add("Address", StringType, nullable = true)
  .add("City", StringType, nullable = true)
  .add("Zipcode", IntegerType, nullable = true)
  .add("Battalion", StringType, nullable = true)
  .add("StationArea", StringType, nullable = true)
  .add("Box", StringType, nullable = true)
  .add("OriginalPriority", StringType, nullable = true)
  .add("Priority", StringType, nullable = true)
  .add("FinalPriority", IntegerType, nullable = true)
  .add("ALSUnit", BooleanType, nullable = true)
  .add("CallTypeGroup", StringType, nullable = true)
  .add("NumAlarms", IntegerType, nullable = true)
  .add("UnitType", StringType, nullable = true)
  .add("UnitSequenceInCallDispatch", IntegerType, nullable = true)
  .add("FirePreventionDistrict", StringType, nullable = true)
  .add("SupervisorDistrict", StringType, nullable = true)
  .add("Neighborhood", StringType, nullable = true)
  .add("Location", StringType, nullable = true)
  .add("RowID", StringType, nullable = true)
  .add("Delay", FloatType, nullable = true)


import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType, FloatType, BooleanType}
fireSchema: org.apache.spark.sql.types.StructType = StructType(StructField(CallNumber,IntegerType,true),StructField(UnitID,StringType,true),StructField(IncidentNumber,IntegerType,true),StructField(CallType,StringType,true),StructField(CallDate,StringType,true),StructField(WatchDate,StringType,true),StructField(CallFinalDisposition,StringType,true),StructField(AvailableDtTm,StringType,true),StructField(Address,StringType,true),StructField(City,StringType,true),StructField(Zipcode,IntegerType,true),StructField(Battalion,StringType,true),StructField(StationArea,StringType,true),StructField(Box,StringType,true),StructField(OriginalPriority,StringType,true),StructField(Priority,Strin...


In [6]:
// Absolute path of the fire-calls CSV on the author's machine.
val sfFireFile = "C:/Users/mariajose.chinchilla/OneDrive - Bosonit/Escritorio/Bosonit/Spark/datarepositorio/chapter3/data/sf-fire-calls.csv"

// Load the CSV through the DataFrameReader:
//  - "header" = "true": the file's first line holds column names, not data
//  - fireSchema is supplied explicitly instead of letting Spark infer the types
val fireDF = spark.read
  .option("header", "true")
  .schema(fireSchema)
  .csv(sfFireFile)

sfFireFile: String = C:/Users/mariajose.chinchilla/OneDrive - Bosonit/Escritorio/Bosonit/Spark/datarepositorio/chapter3/data/sf-fire-calls.csv
fireDF: org.apache.spark.sql.DataFrame = [CallNumber: int, UnitID: string ... 26 more fields]


## Open question (DUDA): Saving in format x


In [None]:
//Saving a DataFrame as a Parquet file or SQL table
//NOTE(review): the whole cell is commented out and has not been run; the output path below is still a placeholder.

//To save as a Parquet file
//val parquetPath = DUDA   // TODO: "DUDA" (open question) is a placeholder - replace with a real output path
//fireDF.write.format("parquet").save(parquetPath)

//To save as a table
//val parquetTable = "table_fire"
//fireDF.write.format("parquet").saveAsTable(parquetTable)

In [7]:
// Projection + filter: keep three columns and discard every row whose
// CallType is exactly "Medical Incident".
val fewFireDF = fireDF
  .select("IncidentNumber", "AvailableDtTm", "CallType")
  .filter(col("CallType") =!= "Medical Incident")

// Print the first 5 rows without truncating long cell values.
fewFireDF.show(5, truncate = false)

+--------------+----------------------+--------------+
|IncidentNumber|AvailableDtTm         |CallType      |
+--------------+----------------------+--------------+
|2003235       |01/11/2002 01:51:44 AM|Structure Fire|
|2003250       |01/11/2002 04:16:46 AM|Vehicle Fire  |
|2003259       |01/11/2002 06:01:58 AM|Alarms        |
|2003279       |01/11/2002 08:03:26 AM|Structure Fire|
|2003301       |01/11/2002 09:46:44 AM|Alarms        |
+--------------+----------------------+--------------+
only showing top 5 rows



fewFireDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [IncidentNumber: int, AvailableDtTm: string ... 1 more field]


In [8]:
// Count how many distinct call types appear in the data set.
// Columns are referenced with col("...") and the alias is given as a plain
// string: Scala symbol literals ('name) are deprecated since Scala 2.13 and
// additionally require an implicit Symbol-to-Column conversion to be in scope.
import org.apache.spark.sql.functions._
fireDF
 .select("CallType")
 // Drop rows without a call type before aggregating.
 .where(col("CallType").isNotNull)
 // Single-row result with one column named DistinctCallTypes.
 .agg(countDistinct(col("CallType")).as("DistinctCallTypes"))
 .show()


+-----------------+
|DistinctCallTypes|
+-----------------+
|               30|
+-----------------+



import org.apache.spark.sql.functions._


In [9]:
// Show up to 10 of the distinct, non-null call types (values printed untruncated).
fireDF
  .select("CallType")
  .filter(col("CallType").isNotNull)
  .distinct()
  .show(10, truncate = false)

+-----------------------------+
|CallType                     |
+-----------------------------+
|Elevator / Escalator Rescue  |
|Marine Fire                  |
|Aircraft Emergency           |
|Administrative               |
|Alarms                       |
|Odor (Strange / Unknown)     |
|Citizen Assist / Service Call|
|HazMat                       |
|Watercraft in Distress       |
|Explosion                    |
+-----------------------------+
only showing top 10 rows



In [10]:
// Renaming, adding, and dropping columns
// withColumnRenamed() returns a new DataFrame in which the Delay column is
// called ResponseDelayedinMins; fireDF itself is left untouched.
val newFireDF = fireDF.withColumnRenamed("Delay", "ResponseDelayedinMins")

// Show the first 5 response delays that are greater than five.
newFireDF
  .select("ResponseDelayedinMins")
  .filter(col("ResponseDelayedinMins") > 5)
  .show(5, truncate = false)

+---------------------+
|ResponseDelayedinMins|
+---------------------+
|5.35                 |
|6.25                 |
|5.2                  |
|5.6                  |
|7.25                 |
+---------------------+
only showing top 5 rows



newFireDF: org.apache.spark.sql.DataFrame = [CallNumber: int, UnitID: string ... 26 more fields]


In [11]:
// Turn the three date/time string columns into timestamp columns:
//   CallDate      -> IncidentDate   (pattern "MM/dd/yyyy")
//   WatchDate     -> OnWatchDate    (pattern "MM/dd/yyyy")
//   AvailableDtTm -> AvailableDtTS  (pattern "MM/dd/yyyy hh:mm:ss a")
// Each withColumn() appends the converted column; the original string columns
// are then removed in a single drop(). The result is assigned to fireTsDF.
val fireTsDF = newFireDF
  .withColumn("IncidentDate", to_timestamp(col("CallDate"), "MM/dd/yyyy"))
  .withColumn("OnWatchDate", to_timestamp(col("WatchDate"), "MM/dd/yyyy"))
  .withColumn("AvailableDtTS", to_timestamp(col("AvailableDtTm"), "MM/dd/yyyy hh:mm:ss a"))
  .drop("CallDate", "WatchDate", "AvailableDtTm")

// Inspect the first 5 rows of the converted columns.
fireTsDF
  .select(col("IncidentDate"), col("OnWatchDate"), col("AvailableDtTS"))
  .show(5, truncate = false)

+-------------------+-------------------+-------------------+
|IncidentDate       |OnWatchDate        |AvailableDtTS      |
+-------------------+-------------------+-------------------+
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 01:51:44|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 03:01:18|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 02:39:50|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 04:16:46|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 06:01:58|
+-------------------+-------------------+-------------------+
only showing top 5 rows



fireTsDF: org.apache.spark.sql.DataFrame = [CallNumber: int, UnitID: string ... 26 more fields]


In [12]:
// Explore the data: list, in ascending order, the distinct years that occur
// in IncidentDate, i.e. how many years of calls the data set covers.
fireTsDF
  .select(year(col("IncidentDate")))
  .distinct()
  .sort(year(col("IncidentDate")))
  .show()

+------------------+
|year(IncidentDate)|
+------------------+
|              2000|
|              2001|
|              2002|
|              2003|
|              2004|
|              2005|
|              2006|
|              2007|
|              2008|
|              2009|
|              2010|
|              2011|
|              2012|
|              2013|
|              2014|
|              2015|
|              2016|
|              2017|
|              2018|
+------------------+



In [13]:
// Aggregations

// Most common call types: count the rows per non-null CallType and show the
// 10 largest groups, highest count first.
fireTsDF
  .select("CallType")
  .filter(col("CallType").isNotNull)
  .groupBy("CallType")
  .count()
  .sort(col("count").desc)
  .show(10, truncate = false)

+-------------------------------+------+
|CallType                       |count |
+-------------------------------+------+
|Medical Incident               |113794|
|Structure Fire                 |23319 |
|Alarms                         |19406 |
|Traffic Collision              |7013  |
|Citizen Assist / Service Call  |2524  |
|Other                          |2166  |
|Outside Fire                   |2094  |
|Vehicle Fire                   |854   |
|Gas Leak (Natural and LP Gases)|764   |
|Water Rescue                   |755   |
+-------------------------------+------+
only showing top 10 rows



In [14]:
// Other common DataFrame operations:

// One summary row over the whole data set: total number of alarms plus the
// average, minimum and maximum of ResponseDelayedinMins.
// The functions object is imported under the alias F to make the origin of
// each aggregate explicit.
import org.apache.spark.sql.{functions => F}
fireTsDF
  .select(
    F.sum(F.col("NumAlarms")),
    F.avg(F.col("ResponseDelayedinMins")),
    F.min(F.col("ResponseDelayedinMins")),
    F.max(F.col("ResponseDelayedinMins"))
  )
  .show()


+--------------+--------------------------+--------------------------+--------------------------+
|sum(NumAlarms)|avg(ResponseDelayedinMins)|min(ResponseDelayedinMins)|max(ResponseDelayedinMins)|
+--------------+--------------------------+--------------------------+--------------------------+
|        176170|         3.892364154521585|               0.016666668|                   1844.55|
+--------------+--------------------------+--------------------------+--------------------------+



import org.apache.spark.sql.{functions=>F}
