# Learning Spark - Chapter 3 (Scala)
## Apache Spark’s Structured APIs

In [1]:
import org.apache.spark.sql.functions.avg
import org.apache.spark.sql.SparkSession

Intitializing Scala interpreter ...

Spark Web UI available at http://EM2021002778.bosonit.local:4040
SparkContext available as 'sc' (version = 3.1.1, master = local[*], app id = local-1620586059217)
SparkSession available as 'spark'


import org.apache.spark.sql.functions.avg
import org.apache.spark.sql.SparkSession


In [2]:
// Create a DataFrame using SparkSession
val spark = SparkSession
.builder
.appName("AuthorsAges")
.getOrCreate()

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@6da896b3


In [3]:
// Create a DataFrame of names and ages
val dataDF = spark.createDataFrame(Seq(("Brooke", 20), ("Brooke", 25),
("Denny", 31), ("Jules", 30), ("TD", 35))).toDF("name", "age")

dataDF: org.apache.spark.sql.DataFrame = [name: string, age: int]


In [4]:
// Group the same names together, aggregate their ages, and compute an average
val avgDF = dataDF.groupBy("name").agg(avg("age"))

avgDF: org.apache.spark.sql.DataFrame = [name: string, avg(age): double]


In [5]:
// Show the results of the final execution
avgDF.show()

+------+--------+
|  name|avg(age)|
+------+--------+
|Brooke|    22.5|
| Jules|    30.0|
|    TD|    35.0|
| Denny|    31.0|
+------+--------+



### The DataFrame API
#### Spark’s Basic Data Types

In [6]:
import org.apache.spark.sql.types._

import org.apache.spark.sql.types._


In [7]:
val nameTypes = StringType

nameTypes: org.apache.spark.sql.types.StringType.type = StringType


In [8]:
val firstName = nameTypes

firstName: org.apache.spark.sql.types.StringType.type = StringType


In [9]:
val lastName = nameTypes

lastName: org.apache.spark.sql.types.StringType.type = StringType


#### Schemas and Creating DataFrames

In [10]:
val schema = StructType(Array(StructField("author", StringType, false),
StructField("title", StringType, false),
StructField("pages", IntegerType, false)))

schema: org.apache.spark.sql.types.StructType = StructType(StructField(author,StringType,false), StructField(title,StringType,false), StructField(pages,IntegerType,false))


In [11]:
val schema = "author STRING, title STRING, pages INT"

schema: String = author STRING, title STRING, pages INT


In [12]:
// Get the path to the JSON file
val jsonFile = "C:/Users/jorgedario.mendez/OneDrive - Bosonit/Documentos/3. Libro/LearningSparkV2-master/chapter3/data/blogs.json"

jsonFile: String = C:/Users/jorgedario.mendez/OneDrive - Bosonit/Documentos/3. Libro/LearningSparkV2-master/chapter3/data/blogs.json


In [13]:
// Define our schema programmatically
val schema = StructType(Array(StructField("Id", IntegerType, false),
StructField("First", StringType, false),
StructField("Last", StringType, false),
StructField("Url", StringType, false),
StructField("Published", StringType, false),
StructField("Hits", IntegerType, false),
StructField("Campaigns", ArrayType(StringType), false)))

schema: org.apache.spark.sql.types.StructType = StructType(StructField(Id,IntegerType,false), StructField(First,StringType,false), StructField(Last,StringType,false), StructField(Url,StringType,false), StructField(Published,StringType,false), StructField(Hits,IntegerType,false), StructField(Campaigns,ArrayType(StringType,true),false))


In [14]:
// Create a DataFrame by reading from the JSON file
// with a predefined schema
val blogsDF = spark.read.schema(schema).json(jsonFile)

blogsDF: org.apache.spark.sql.DataFrame = [Id: int, First: string ... 5 more fields]


In [15]:
// Show the DataFrame schema as output
blogsDF.show(false)

+---+---------+-------+-----------------+---------+-----+----------------------------+
|Id |First    |Last   |Url              |Published|Hits |Campaigns                   |
+---+---------+-------+-----------------+---------+-----+----------------------------+
|1  |Jules    |Damji  |https://tinyurl.1|1/4/2016 |4535 |[twitter, LinkedIn]         |
|2  |Brooke   |Wenig  |https://tinyurl.2|5/5/2018 |8908 |[twitter, LinkedIn]         |
|3  |Denny    |Lee    |https://tinyurl.3|6/7/2019 |7659 |[web, twitter, FB, LinkedIn]|
|4  |Tathagata|Das    |https://tinyurl.4|5/12/2018|10568|[twitter, FB]               |
|5  |Matei    |Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB, LinkedIn]|
|6  |Reynold  |Xin    |https://tinyurl.6|3/2/2015 |25568|[twitter, LinkedIn]         |
+---+---------+-------+-----------------+---------+-----+----------------------------+



#### Columns

In [16]:
import org.apache.spark.sql.functions._

import org.apache.spark.sql.functions._


In [17]:
blogsDF.columns

res2: Array[String] = Array(Id, First, Last, Url, Published, Hits, Campaigns)


In [18]:
// Access a particular column with col and it returns a Column type
blogsDF.col("Id")

res3: org.apache.spark.sql.Column = Id


In [19]:
// Use an expression to compute a value
blogsDF.select(expr("Hits * 2")).show(2)

+----------+
|(Hits * 2)|
+----------+
|      9070|
|     17816|
+----------+
only showing top 2 rows



In [20]:
// or use col to compute value
blogsDF.select(col("Hits") * 2).show(2)

+----------+
|(Hits * 2)|
+----------+
|      9070|
|     17816|
+----------+
only showing top 2 rows



In [21]:
// Use an expression to compute big hitters for blogs
// This adds a new column, Big Hitters, based on the conditional expression
blogsDF.withColumn("Big Hitters", (expr("Hits > 10000"))).show()

+---+---------+-------+-----------------+---------+-----+--------------------+-----------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|Big Hitters|
+---+---------+-------+-----------------+---------+-----+--------------------+-----------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|      false|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|      false|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|      false|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|       true|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|       true|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|       true|
+---+---------+-------+-----------------+---------+-----+--------------------+-----------+



In [22]:
// Concatenate three columns, create a new column, and show the
// newly created concatenated column
blogsDF.withColumn("AuthorsId", (concat(expr("First"), expr("Last"), expr("Id")))).select(col("AuthorsId")).show(4)

+-------------+
|    AuthorsId|
+-------------+
|  JulesDamji1|
| BrookeWenig2|
|    DennyLee3|
|TathagataDas4|
+-------------+
only showing top 4 rows



In [23]:
// These statements return the same value, showing that
// expr is the same as a col method call
blogsDF.select(expr("Hits")).show(2)
blogsDF.select(col("Hits")).show(2)
blogsDF.select("Hits").show(2)

+----+
|Hits|
+----+
|4535|
|8908|
+----+
only showing top 2 rows

+----+
|Hits|
+----+
|4535|
|8908|
+----+
only showing top 2 rows

+----+
|Hits|
+----+
|4535|
|8908|
+----+
only showing top 2 rows



In [24]:
// Sort by column "Id" in descending order
blogsDF.sort(col("Id").desc).show()
blogsDF.sort($"Id".desc).show()

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+

#### Rows

In [25]:
import org.apache.spark.sql.Row
// Create a Row
val blogRow = Row(6, "Reynold", "Xin", "https://tinyurl.6", 255568, "3/2/2015",Array("twitter", "LinkedIn"))

import org.apache.spark.sql.Row
blogRow: org.apache.spark.sql.Row = [6,Reynold,Xin,https://tinyurl.6,255568,3/2/2015,[Ljava.lang.String;@6fbb2bf9]


In [26]:
// Access using index for individual items
blogRow(1)

res10: Any = Reynold


In [27]:
val rows = Seq(("Matei Zaharia", "CA"), ("Reynold Xin", "CA"))
val authorsDF = rows.toDF("Author", "State")
authorsDF.show()

+-------------+-----+
|       Author|State|
+-------------+-----+
|Matei Zaharia|   CA|
|  Reynold Xin|   CA|
+-------------+-----+



rows: Seq[(String, String)] = List((Matei Zaharia,CA), (Reynold Xin,CA))
authorsDF: org.apache.spark.sql.DataFrame = [Author: string, State: string]


## Fire deparment

In [28]:
val sampleDF = spark.read.option("samplingRatio", 0.001).option("header", true)
.csv("C:/Users/jorgedario.mendez/OneDrive - Bosonit/Documentos/3. Libro/LearningSparkV2-master/databricks-datasets/learning-spark-v2/sf-fire/sf-fire-calls.csv")

sampleDF: org.apache.spark.sql.DataFrame = [CallNumber: string, UnitID: string ... 26 more fields]


## schema firecalls

In [29]:
val fireSchema = StructType(Array(StructField("CallNumber", IntegerType, true),StructField("UnitID", StringType, true),
StructField("IncidentNumber", IntegerType, true),
StructField("CallType", StringType, true),
StructField("CallDate", StringType, true),
StructField("WatchDate", StringType, true),
StructField("CallFinalDisposition", StringType, true),
StructField("AvailableDtTm", StringType, true),
StructField("Address", StringType, true),
StructField("City", StringType, true),
StructField("Zipcode", IntegerType, true),
StructField("Battalion", StringType, true),
StructField("StationArea", StringType, true),
StructField("Box", StringType, true),
StructField("OriginalPriority", StringType, true),
StructField("Priority", StringType, true),
StructField("FinalPriority", IntegerType, true),
StructField("ALSUnit", BooleanType, true),
StructField("CallTypeGroup", StringType, true),
StructField("NumAlarms", IntegerType, true),
StructField("UnitType", StringType, true),
StructField("UnitSequenceInCallDispatch", IntegerType, true),
StructField("FirePreventionDistrict", StringType, true),
StructField("SupervisorDistrict", StringType, true),
StructField("Neighborhood", StringType, true),
StructField("Location", StringType, true),
StructField("RowID", StringType, true),
StructField("Delay", FloatType, true)))

fireSchema: org.apache.spark.sql.types.StructType = StructType(StructField(CallNumber,IntegerType,true), StructField(UnitID,StringType,true), StructField(IncidentNumber,IntegerType,true), StructField(CallType,StringType,true), StructField(CallDate,StringType,true), StructField(WatchDate,StringType,true), StructField(CallFinalDisposition,StringType,true), StructField(AvailableDtTm,StringType,true), StructField(Address,StringType,true), StructField(City,StringType,true), StructField(Zipcode,IntegerType,true), StructField(Battalion,StringType,true), StructField(StationArea,StringType,true), StructField(Box,StringType,true), StructField(OriginalPriority,StringType,true), StructField(Priority,StringType,true), StructField(FinalPriority,IntegerType,true), StructField(ALSUnit,BooleanType,true)...


In [30]:
// Read the file using the CSV DataFrameReader
val sfFireFile="C:/Users/jorgedario.mendez/OneDrive - Bosonit/Documentos/3. Libro/LearningSparkV2-master/databricks-datasets/learning-spark-v2/sf-fire/sf-fire-calls.csv"
val fireDF = spark.read.schema(fireSchema)
.option("header", "true")
.csv(sfFireFile)

sfFireFile: String = C:/Users/jorgedario.mendez/OneDrive - Bosonit/Documentos/3. Libro/LearningSparkV2-master/databricks-datasets/learning-spark-v2/sf-fire/sf-fire-calls.csv
fireDF: org.apache.spark.sql.DataFrame = [CallNumber: int, UnitID: string ... 26 more fields]


In [31]:
import org.apache.spark.sql.types._ 
import org.apache.spark.sql.functions._ 

import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._


In [32]:
fireDF.printSchema()

root
 |-- CallNumber: integer (nullable = true)
 |-- UnitID: string (nullable = true)
 |-- IncidentNumber: integer (nullable = true)
 |-- CallType: string (nullable = true)
 |-- CallDate: string (nullable = true)
 |-- WatchDate: string (nullable = true)
 |-- CallFinalDisposition: string (nullable = true)
 |-- AvailableDtTm: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: integer (nullable = true)
 |-- Battalion: string (nullable = true)
 |-- StationArea: string (nullable = true)
 |-- Box: string (nullable = true)
 |-- OriginalPriority: string (nullable = true)
 |-- Priority: string (nullable = true)
 |-- FinalPriority: integer (nullable = true)
 |-- ALSUnit: boolean (nullable = true)
 |-- CallTypeGroup: string (nullable = true)
 |-- NumAlarms: integer (nullable = true)
 |-- UnitType: string (nullable = true)
 |-- UnitSequenceInCallDispatch: integer (nullable = true)
 |-- FirePreventionDistrict: string (nullable = true)
 

#### Saving a DataFrame as a Parquet file or SQL table.

In [33]:
// to save as a Parquet file
// val parquetPath = "C:/Users/jorgedario.mendez/OneDrive - Bosonit/Documentos/3. Libro/LearningSparkV2-master/JM Jupyter/parquescala1/"
// fireDF.write.format("parquet").save(parquetPath)

In [34]:
// to save as a table
// val parquetTable = "parquetablascala1" // name of the table
// fireDF.write.format("parquet").saveAsTable(parquetTable)

#### Transformations and actions

In [35]:
val fewFireDF = fireDF.select("IncidentNumber", "AvailableDtTm", "CallType").where($"CallType" =!= "Medical Incident")
fewFireDF.show(5, false)

+--------------+----------------------+--------------+
|IncidentNumber|AvailableDtTm         |CallType      |
+--------------+----------------------+--------------+
|2003235       |01/11/2002 01:51:44 AM|Structure Fire|
|2003250       |01/11/2002 04:16:46 AM|Vehicle Fire  |
|2003259       |01/11/2002 06:01:58 AM|Alarms        |
|2003279       |01/11/2002 08:03:26 AM|Structure Fire|
|2003301       |01/11/2002 09:46:44 AM|Alarms        |
+--------------+----------------------+--------------+
only showing top 5 rows



fewFireDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [IncidentNumber: int, AvailableDtTm: string ... 1 more field]


In [36]:
fireDF
.select("CallType")
.where(col("CallType").isNotNull)
.agg(countDistinct('CallType) as 'DistinctCallTypes)
.show(10, false)

+-----------------+
|DistinctCallTypes|
+-----------------+
|30               |
+-----------------+



In [37]:
//distinct call types in the data
fireDF
.select("CallType")
.where($"CallType".isNotNull)
.distinct()
.show(10, false)

+-----------------------------------+
|CallType                           |
+-----------------------------------+
|Elevator / Escalator Rescue        |
|Marine Fire                        |
|Aircraft Emergency                 |
|Confined Space / Structure Collapse|
|Administrative                     |
|Alarms                             |
|Odor (Strange / Unknown)           |
|Citizen Assist / Service Call      |
|HazMat                             |
|Watercraft in Distress             |
+-----------------------------------+
only showing top 10 rows



#### Renaming, adding, and dropping columns.

In [38]:
val newFireDF = fireDF.withColumnRenamed("Delay", "ResponseDelayedinMins")
newFireDF
.select("ResponseDelayedinMins")
.where($"ResponseDelayedinMins" > 5)
.show(5, false)

+---------------------+
|ResponseDelayedinMins|
+---------------------+
|5.35                 |
|6.25                 |
|5.2                  |
|5.6                  |
|7.25                 |
+---------------------+
only showing top 5 rows



newFireDF: org.apache.spark.sql.DataFrame = [CallNumber: int, UnitID: string ... 26 more fields]


In [39]:
// Change date format
val fireTsDF = newFireDF
.withColumn("IncidentDate", to_timestamp(col("CallDate"), "MM/dd/yyyy"))
.drop("CallDate")
.withColumn("OnWatchDate", to_timestamp(col("WatchDate"), "MM/dd/yyyy"))
.drop("WatchDate")
.withColumn("AvailableDtTS", to_timestamp(col("AvailableDtTm"),"MM/dd/yyyy hh:mm:ss a"))
.drop("AvailableDtTm")

fireTsDF: org.apache.spark.sql.DataFrame = [CallNumber: int, UnitID: string ... 26 more fields]


In [40]:
fireTsDF
.select("IncidentDate", "OnWatchDate", "AvailableDtTS")
.show(5, false)

+-------------------+-------------------+-------------------+
|IncidentDate       |OnWatchDate        |AvailableDtTS      |
+-------------------+-------------------+-------------------+
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 01:51:44|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 03:01:18|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 02:39:50|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 04:16:46|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 06:01:58|
+-------------------+-------------------+-------------------+
only showing top 5 rows



In [41]:
fireTsDF
.select(year(col("IncidentDate")))
.distinct()
.orderBy(year($"IncidentDate"))
.show()

+------------------+
|year(IncidentDate)|
+------------------+
|              2000|
|              2001|
|              2002|
|              2003|
|              2004|
|              2005|
|              2006|
|              2007|
|              2008|
|              2009|
|              2010|
|              2011|
|              2012|
|              2013|
|              2014|
|              2015|
|              2016|
|              2017|
|              2018|
+------------------+



#### Aggregations

what were the most common types of fire calls?

In [42]:
fireTsDF
.where(col("CallType").isNotNull)
.groupBy("CallType")
.count()
.orderBy(desc("count"))
.show(10, false)

+-------------------------------+------+
|CallType                       |count |
+-------------------------------+------+
|Medical Incident               |113794|
|Structure Fire                 |23319 |
|Alarms                         |19406 |
|Traffic Collision              |7013  |
|Citizen Assist / Service Call  |2524  |
|Other                          |2166  |
|Outside Fire                   |2094  |
|Vehicle Fire                   |854   |
|Gas Leak (Natural and LP Gases)|764   |
|Water Rescue                   |755   |
+-------------------------------+------+
only showing top 10 rows



### Other common DataFrame operations

the Data‐
Frame API provides descriptive statistical methods like min(), max(), sum(), and avg()

In [43]:
import org.apache.spark.sql.{functions => F}

import org.apache.spark.sql.{functions=>F}


In [44]:
fireTsDF
.select(F.sum("NumAlarms"), F.avg("ResponseDelayedinMins"),
        F.min("ResponseDelayedinMins"), F.max("ResponseDelayedinMins"))
.show()

+--------------+--------------------------+--------------------------+--------------------------+
|sum(NumAlarms)|avg(ResponseDelayedinMins)|min(ResponseDelayedinMins)|max(ResponseDelayedinMins)|
+--------------+--------------------------+--------------------------+--------------------------+
|        176170|         3.892364154521585|               0.016666668|                   1844.55|
+--------------+--------------------------+--------------------------+--------------------------+



### End-to-End DataFrame Example

#####  What were all the different types of fire calls in 2018?

In [45]:
fireTsDF.select("CallType").where(col("CallType").isNotNull).distinct.show(false)

+--------------------------------------------+
|CallType                                    |
+--------------------------------------------+
|Elevator / Escalator Rescue                 |
|Marine Fire                                 |
|Aircraft Emergency                          |
|Confined Space / Structure Collapse         |
|Administrative                              |
|Alarms                                      |
|Odor (Strange / Unknown)                    |
|Citizen Assist / Service Call               |
|HazMat                                      |
|Watercraft in Distress                      |
|Explosion                                   |
|Oil Spill                                   |
|Vehicle Fire                                |
|Suspicious Package                          |
|Extrication / Entrapped (Machinery, Vehicle)|
|Other                                       |
|Outside Fire                                |
|Traffic Collision                           |
|Assist Polic

##### What months within the year 2018 saw the highest number of fire calls?

In [46]:
fireTsDF.where(year($"IncidentDate") === "2018")
.groupBy(month($"IncidentDate"))
.count()
.orderBy(desc("count"))
.show(12, false)

+-------------------+-----+
|month(IncidentDate)|count|
+-------------------+-----+
|10                 |1068 |
|5                  |1047 |
|3                  |1029 |
|8                  |1021 |
|1                  |1007 |
|6                  |974  |
|7                  |974  |
|9                  |951  |
|4                  |947  |
|2                  |919  |
|11                 |199  |
+-------------------+-----+



##### Which neighborhood in San Francisco generated the most fire calls in 2018?

In [47]:
fireTsDF.select("Neighborhood").where(col("City") === "San Francisco").groupBy(col("Neighborhood")).count().orderBy(desc("count")).show(false)

+------------------------------+-----+
|Neighborhood                  |count|
+------------------------------+-----+
|Tenderloin                    |7067 |
|South of Market               |5323 |
|Mission                       |4666 |
|Financial District/South Beach|3907 |
|Bayview Hunters Point         |2643 |
|Sunset/Parkside               |1900 |
|Western Addition              |1826 |
|Nob Hill                      |1660 |
|Castro/Upper Market           |1364 |
|Outer Richmond                |1319 |
|Hayes Valley                  |1317 |
|North Beach                   |1182 |
|Pacific Heights               |1111 |
|West of Twin Peaks            |1103 |
|Excelsior                     |1071 |
|Chinatown                     |1025 |
|Potrero Hill                  |995  |
|Marina                        |976  |
|Bernal Heights                |839  |
|Haight Ashbury                |834  |
+------------------------------+-----+
only showing top 20 rows



##### Which neighborhoods had the worst response times to fire calls in 2018?

In [48]:
fireTsDF.select("Neighborhood","ResponseDelayedinMins")
.where(col("ResponseDelayedinMins").isNotNull)
.groupBy(col("Neighborhood"))
.agg(avg(col("ResponseDelayedinMins")).alias("avgTimeDelay"))
.orderBy(desc("avgTimeDelay"))
.show(10,false)

+---------------------+------------------+
|Neighborhood         |avgTimeDelay      |
+---------------------+------------------+
|Treasure Island      |5.47149999499321  |
|Presidio             |4.9653753549934505|
|Mission Bay          |4.530760579728938 |
|McLaren Park         |4.309822855580256 |
|None                 |4.307180858534226 |
|Twin Peaks           |4.294008398269729 |
|Golden Gate Park     |4.249903662329421 |
|Lakeshore            |4.201812146300208 |
|Bayview Hunters Point|4.150424644461803 |
|Seacliff             |4.137820510795483 |
+---------------------+------------------+
only showing top 10 rows



##### Which week in the year in 2018 had the most fire calls?

In [49]:
fireTsDF
.select("IncidentDate")
.where(year($"IncidentDate") === "2018")
.groupBy(weekofyear($"IncidentDate"))
.count()
.orderBy(desc("count")).show(12, false)

+------------------------+-----+
|weekofyear(IncidentDate)|count|
+------------------------+-----+
|22                      |259  |
|40                      |255  |
|43                      |250  |
|25                      |249  |
|1                       |246  |
|44                      |244  |
|13                      |243  |
|32                      |243  |
|11                      |240  |
|5                       |236  |
|18                      |236  |
|23                      |235  |
+------------------------+-----+
only showing top 12 rows



##### Is there a correlation between neighborhood, zip code, and number of fire calls?

In [50]:
fireTsDF
.where(col("Zipcode")isNotNull)
.groupBy(col("Zipcode"))
.count().orderBy(desc("count"))
.show(15, false)

+-------+-----+
|Zipcode|count|
+-------+-----+
|94102  |21840|
|94103  |20897|
|94110  |14801|
|94109  |14686|
|94124  |9236 |
|94112  |8421 |
|94115  |7812 |
|94107  |6941 |
|94122  |6355 |
|94133  |6246 |
|94117  |5804 |
|94114  |5175 |
|94118  |5157 |
|94134  |5009 |
|94121  |4555 |
+-------+-----+
only showing top 15 rows





##### How can we use Parquet files or SQL tables to store this data and read it back?

val parquetPath = "C:/Users/jorgedario.mendez/OneDrive - Bosonit/Documentos/3. Libro/LearningSparkV2-master/JM Jupyter/fireTsDF1/"
fireTsDF.write.format("parquet").save(parquetPath)

val parquetTable = "fireTsDF_table1" // name of the table
fireTsDF.write.format("parquet").saveAsTable(parquetTable)

### Typed Objects, Untyped Objects, and Generic Rows

In [51]:
import org.apache.spark.sql.Row
val row = Row(350, true, "Learning Spark 2E", null)

import org.apache.spark.sql.Row
row: org.apache.spark.sql.Row = [350,true,Learning Spark 2E,null]


In [52]:
row.getInt(0)

res29: Int = 350


In [53]:
row.getBoolean(1)

res30: Boolean = true


In [54]:
row.getString(2)

res31: String = Learning Spark 2E


## Creating Datasets
### Scala: Case classes

In [55]:
case class DeviceIoTData (battery_level: Long, c02_level: Long, cca2: String, 
                          cca3: String, cn: String, device_id: Long,
                          device_name: String, humidity: Long, ip: String, 
                          latitude: Double, lcd: String, longitude: Double, 
                          scale:String, temp: Long, timestamp: Long)

defined class DeviceIoTData


In [56]:
val ds = spark.read.json("C:/Users/jorgedario.mendez/OneDrive - Bosonit/Documentos/3. Libro/LearningSparkV2-master/databricks-datasets/learning-spark-v2/iot-devices/iot_devices.json").as[DeviceIoTData]

ds: org.apache.spark.sql.Dataset[DeviceIoTData] = [battery_level: bigint, c02_level: bigint ... 13 more fields]


In [57]:
ds.show(5, false)

+-------------+---------+----+----+-------------+---------+---------------------+--------+-------------+--------+------+---------+-------+----+-------------+
|battery_level|c02_level|cca2|cca3|cn           |device_id|device_name          |humidity|ip           |latitude|lcd   |longitude|scale  |temp|timestamp    |
+-------------+---------+----+----+-------------+---------+---------------------+--------+-------------+--------+------+---------+-------+----+-------------+
|8            |868      |US  |USA |United States|1        |meter-gauge-1xbYRYcj |51      |68.161.225.1 |38.0    |green |-97.0    |Celsius|34  |1458444054093|
|7            |1473     |NO  |NOR |Norway       |2        |sensor-pad-2n2Pea    |70      |213.161.254.1|62.47   |red   |6.15     |Celsius|11  |1458444054119|
|2            |1556     |IT  |ITA |Italy        |3        |device-mac-36TWSKiT  |44      |88.36.5.1    |42.83   |red   |12.83    |Celsius|19  |1458444054120|
|6            |1080     |US  |USA |United States|4  

### Dataset Operations

In [58]:
val filterTempDS = ds.filter(d => {d.temp > 30 && d.humidity > 70})

filterTempDS: org.apache.spark.sql.Dataset[DeviceIoTData] = [battery_level: bigint, c02_level: bigint ... 13 more fields]


In [59]:
val filterTempDS = ds.where(col("temp") > 30  && col("humidity") > 70)

filterTempDS: org.apache.spark.sql.Dataset[DeviceIoTData] = [battery_level: bigint, c02_level: bigint ... 13 more fields]


In [60]:
filterTempDS.show(5, false)

+-------------+---------+----+----+-------------+---------+----------------------+--------+---------------+--------+------+---------+-------+----+-------------+
|battery_level|c02_level|cca2|cca3|cn           |device_id|device_name           |humidity|ip             |latitude|lcd   |longitude|scale  |temp|timestamp    |
+-------------+---------+----+----+-------------+---------+----------------------+--------+---------------+--------+------+---------+-------+----+-------------+
|0            |1466     |US  |USA |United States|17       |meter-gauge-17zb8Fghhl|98      |161.188.212.254|39.95   |red   |-75.16   |Celsius|31  |1458444054129|
|9            |986      |FR  |FRA |France       |48       |sensor-pad-48jt4eL    |97      |90.37.208.1    |43.88   |green |4.9      |Celsius|31  |1458444054151|
|8            |1436     |US  |USA |United States|54       |sensor-pad-5410CWPrNb6|73      |204.15.64.249  |32.89   |red   |-117.13  |Celsius|34  |1458444054155|
|4            |1090     |US  |USA 

In [61]:
case class DeviceTempByCountry(temp: Long, device_name: String, device_id: Long,cca3: String)

defined class DeviceTempByCountry


val dsTemp = ds
.where(col("temp") > 25)
.map(col("temp"), col("device_name"), col("device_id"), col("cca3"))
.toDF("temp", "device_name", "device_id", "cca3")
.as[DeviceTempByCountry]

In [62]:
val dsTemp = ds.select("temp","device_name", "device_id","cca3")
.where(col("temp") > 25)
.toDF("temp", "device_name", "device_id", "cca3")
.as[DeviceTempByCountry]

dsTemp: org.apache.spark.sql.Dataset[DeviceTempByCountry] = [temp: bigint, device_name: string ... 2 more fields]


In [63]:
dsTemp.show(5, false)

+----+---------------------+---------+----+
|temp|device_name          |device_id|cca3|
+----+---------------------+---------+----+
|34  |meter-gauge-1xbYRYcj |1        |USA |
|28  |sensor-pad-4mzWkz    |4        |USA |
|27  |sensor-pad-6al7RTAobR|6        |USA |
|27  |sensor-pad-8xUD6pzsQI|8        |JPN |
|26  |sensor-pad-10BsywSYUF|10       |USA |
+----+---------------------+---------+----+
only showing top 5 rows



In [64]:
val device = dsTemp.first()
println(device)

DeviceTempByCountry(34,meter-gauge-1xbYRYcj,1,USA)


device: DeviceTempByCountry = DeviceTempByCountry(34,meter-gauge-1xbYRYcj,1,USA)


In [65]:
val dsTemp2 = ds
.select($"temp", $"device_name", $"device_id", $"device_id", $"cca3")
.where("temp > 25")
.as[DeviceTempByCountry]
dsTemp2.show()

+----+--------------------+---------+---------+----+
|temp|         device_name|device_id|device_id|cca3|
+----+--------------------+---------+---------+----+
|  34|meter-gauge-1xbYRYcj|        1|        1| USA|
|  28|   sensor-pad-4mzWkz|        4|        4| USA|
|  27|sensor-pad-6al7RT...|        6|        6| USA|
|  27|sensor-pad-8xUD6p...|        8|        8| JPN|
|  26|sensor-pad-10Bsyw...|       10|       10| USA|
|  31|meter-gauge-17zb8...|       17|       17| USA|
|  31|sensor-pad-18XULN9Xv|       18|       18| CHN|
|  29|meter-gauge-19eg1...|       19|       19| USA|
|  30|  device-mac-21sjz5h|       21|       21| AUT|
|  28|sensor-pad-24Pytz...|       24|       24| CAN|
|  27|therm-stick-25kK6...|       25|       25| USA|
|  27|sensor-pad-34F1Ju...|       34|       34| RUS|
|  34| sensor-pad-40NjeMqS|       40|       40| FRA|
|  27| sensor-pad-448DeWGL|       44|       44| DEU|
|  26|device-mac-45fN2C...|       45|       45| ITA|
|  29|meter-gauge-47WsF9s8|       47|       47

dsTemp2: org.apache.spark.sql.Dataset[DeviceTempByCountry] = [temp: bigint, device_name: string ... 3 more fields]


## End-to-End Dataset Example

##### Detect failing devices with battery levels below a threshold

In [66]:
ds.select("device_id","device_name","battery_level")
.where($"battery_level" < 8)
.orderBy("battery_level")
.show()

+---------+--------------------+-------------+
|device_id|         device_name|battery_level|
+---------+--------------------+-------------+
|   185211|device-mac-185211...|            0|
|   185423|meter-gauge-18542...|            0|
|   185231|meter-gauge-18523...|            0|
|   185202|sensor-pad-185202...|            0|
|   185240|sensor-pad-185240...|            0|
|   185139|device-mac-185139...|            0|
|   185255|therm-stick-18525...|            0|
|   185141|meter-gauge-18514...|            0|
|   185272|sensor-pad-185272...|            0|
|   185143|meter-gauge-18514...|            0|
|   185283|device-mac-185283...|            0|
|   185105|therm-stick-18510...|            0|
|   185292|sensor-pad-185292...|            0|
|   185078|sensor-pad-185078...|            0|
|   185297|meter-gauge-18529...|            0|
|   185157|device-mac-185157...|            0|
|   185316|sensor-pad-185316...|            0|
|   185160|sensor-pad-185160...|            0|
|   185324|se

##### Identify offending countries with high levels of CO2 emissions.

In [67]:
ds.select("cn","c02_level")
.where($"c02_level".isNotNull)
.groupBy($"cn")
.agg(avg($"c02_level").alias("avg_CO2"))
.orderBy(desc("avg_CO2"))
.show(false)

+----------------+------------------+
|cn              |avg_CO2           |
+----------------+------------------+
|Gabon           |1523.0            |
|Falkland Islands|1424.0            |
|Monaco          |1421.5            |
|Kosovo          |1389.0            |
|San Marino      |1379.6666666666667|
|Liberia         |1374.5            |
|Syria           |1345.8            |
|Mauritania      |1344.4285714285713|
|Congo           |1333.375          |
|Tonga           |1323.0            |
|East Timor      |1310.0            |
|Guinea          |1308.0            |
|Botswana        |1302.6666666666667|
|Haiti           |1291.3333333333333|
|Laos            |1291.0            |
|Maldives        |1284.7272727272727|
|Sint Maarten    |1282.2857142857142|
|Andorra         |1279.0            |
|Lesotho         |1274.6            |
|Mozambique      |1264.0            |
+----------------+------------------+
only showing top 20 rows



##### Compute the min and max values for temperature, battery level, CO2, and humidity.

In [68]:
ds.select(min("temp").alias("Min_Temp"), max("temp").alias("Max_Temp"),
          min("battery_level").alias("Min_Batt"), max("battery_level").alias("Max_Batt"),
          min("c02_level").alias("Min_CO2"), max("c02_level").alias("Max_CO2"),
          min("humidity").alias("Min_humidity"), max("humidity").alias("Max_humidity")
         ).show()

+--------+--------+--------+--------+-------+-------+------------+------------+
|Min_Temp|Max_Temp|Min_Batt|Max_Batt|Min_CO2|Max_CO2|Min_humidity|Max_humidity|
+--------+--------+--------+--------+-------+-------+------------+------------+
|      10|      34|       0|       9|    800|   1599|          25|          99|
+--------+--------+--------+--------+-------+-------+------------+------------+



In [69]:
ds.printSchema

root
 |-- battery_level: long (nullable = true)
 |-- c02_level: long (nullable = true)
 |-- cca2: string (nullable = true)
 |-- cca3: string (nullable = true)
 |-- cn: string (nullable = true)
 |-- device_id: long (nullable = true)
 |-- device_name: string (nullable = true)
 |-- humidity: long (nullable = true)
 |-- ip: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- lcd: string (nullable = true)
 |-- longitude: double (nullable = true)
 |-- scale: string (nullable = true)
 |-- temp: long (nullable = true)
 |-- timestamp: long (nullable = true)



###### Sort and group by average temperature, CO2, humidity, and country.

In [70]:
ds.select("temp", "humidity", "cn")
.where($"temp" > 25 && $"humidity" > 75)
.groupBy($"cn")
.avg()
.sort($"avg(temp)".desc, $"avg(humidity)".desc).alias("avg_humidity")
.show(10, false)

+----------------------+---------+-------------+
|cn                    |avg(temp)|avg(humidity)|
+----------------------+---------+-------------+
|Monaco                |34.0     |91.0         |
|Anguilla              |34.0     |83.0         |
|British Virgin Islands|34.0     |81.0         |
|Turkmenistan          |34.0     |80.0         |
|Suriname              |34.0     |79.0         |
|Gibraltar             |34.0     |78.0         |
|Liechtenstein         |34.0     |76.0         |
|Vanuatu               |33.5     |84.0         |
|Cameroon              |33.0     |91.0         |
|Fiji                  |33.0     |78.0         |
+----------------------+---------+-------------+
only showing top 10 rows

