In [1]:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._

Intitializing Scala interpreter ...

Spark Web UI available at http://172.16.18.84:4042
SparkContext available as 'sc' (version = 3.5.0, master = local[*], app id = local-1701119662919)
SparkSession available as 'spark'


import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._


In [2]:
val spark = SparkSession
    .builder
    .appName("IoT_example")
    .getOrCreate()

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@6d8425e7


In [3]:
case class DeviceIoTData (battery_level: Long, c02_level: Long,
cca2: String, cca3: String, cn: String, device_id: Long,
device_name: String, humidity: Long, ip: String, latitude: Double,
lcd: String, longitude: Double, scale:String, temp: Long,
timestamp: Long)

defined class DeviceIoTData


In [5]:
val ds = spark.read
        .json("/Users/jiashu/Documents/StudyNotes/spark/examples/data/iot_devices.json")
        .as[DeviceIoTData]

ds: org.apache.spark.sql.Dataset[DeviceIoTData] = [battery_level: bigint, c02_level: bigint ... 13 more fields]


## Questions: 

1. Detect failing devices with battery levels below a threshold.
2. Identify offending countries with high levels of CO2 emissions.
3. Compute the min and max values for temperature, battery level, CO2, and
humidity.
4. Sort and group by average temperature, CO2, humidity, and country.

#### Detect failing devices with battery levels below a threshold.

In [6]:
ds.schema

res0: org.apache.spark.sql.types.StructType = StructType(StructField(battery_level,LongType,true),StructField(c02_level,LongType,true),StructField(cca2,StringType,true),StructField(cca3,StringType,true),StructField(cn,StringType,true),StructField(device_id,LongType,true),StructField(device_name,StringType,true),StructField(humidity,LongType,true),StructField(ip,StringType,true),StructField(latitude,DoubleType,true),StructField(lcd,StringType,true),StructField(longitude,DoubleType,true),StructField(scale,StringType,true),StructField(temp,LongType,true),StructField(timestamp,LongType,true))


In [7]:
ds.columns

res1: Array[String] = Array(battery_level, c02_level, cca2, cca3, cn, device_id, device_name, humidity, ip, latitude, lcd, longitude, scale, temp, timestamp)


In [8]:
ds.show(3, true)

+-------------+---------+----+----+-------------+---------+--------------------+--------+-------------+--------+-----+---------+-------+----+-------------+
|battery_level|c02_level|cca2|cca3|           cn|device_id|         device_name|humidity|           ip|latitude|  lcd|longitude|  scale|temp|    timestamp|
+-------------+---------+----+----+-------------+---------+--------------------+--------+-------------+--------+-----+---------+-------+----+-------------+
|            8|      868|  US| USA|United States|        1|meter-gauge-1xbYRYcj|      51| 68.161.225.1|    38.0|green|    -97.0|Celsius|  34|1458444054093|
|            7|     1473|  NO| NOR|       Norway|        2|   sensor-pad-2n2Pea|      70|213.161.254.1|   62.47|  red|     6.15|Celsius|  11|1458444054119|
|            2|     1556|  IT| ITA|        Italy|        3| device-mac-36TWSKiT|      44|    88.36.5.1|   42.83|  red|    12.83|Celsius|  19|1458444054120|
+-------------+---------+----+----+-------------+---------+-----

In [13]:
val threshold = 3

threshold: Int = 3


In [14]:
ds
.where($"battery_level"> threshold)
.show(5)

+-------------+---------+----+----+-------------+---------+--------------------+--------+---------------+--------+------+---------+-------+----+-------------+
|battery_level|c02_level|cca2|cca3|           cn|device_id|         device_name|humidity|             ip|latitude|   lcd|longitude|  scale|temp|    timestamp|
+-------------+---------+----+----+-------------+---------+--------------------+--------+---------------+--------+------+---------+-------+----+-------------+
|            8|      868|  US| USA|United States|        1|meter-gauge-1xbYRYcj|      51|   68.161.225.1|    38.0| green|    -97.0|Celsius|  34|1458444054093|
|            7|     1473|  NO| NOR|       Norway|        2|   sensor-pad-2n2Pea|      70|  213.161.254.1|   62.47|   red|     6.15|Celsius|  11|1458444054119|
|            6|     1080|  US| USA|United States|        4|   sensor-pad-4mzWkz|      32|  66.39.173.154|   44.06|yellow|  -121.32|Celsius|  28|1458444054121|
|            4|      931|  PH| PHL|  Philippin

In [15]:
ds
.filter(col("battery_level")> threshold)
.show(5)

+-------------+---------+----+----+-------------+---------+--------------------+--------+---------------+--------+------+---------+-------+----+-------------+
|battery_level|c02_level|cca2|cca3|           cn|device_id|         device_name|humidity|             ip|latitude|   lcd|longitude|  scale|temp|    timestamp|
+-------------+---------+----+----+-------------+---------+--------------------+--------+---------------+--------+------+---------+-------+----+-------------+
|            8|      868|  US| USA|United States|        1|meter-gauge-1xbYRYcj|      51|   68.161.225.1|    38.0| green|    -97.0|Celsius|  34|1458444054093|
|            7|     1473|  NO| NOR|       Norway|        2|   sensor-pad-2n2Pea|      70|  213.161.254.1|   62.47|   red|     6.15|Celsius|  11|1458444054119|
|            6|     1080|  US| USA|United States|        4|   sensor-pad-4mzWkz|      32|  66.39.173.154|   44.06|yellow|  -121.32|Celsius|  28|1458444054121|
|            4|      931|  PH| PHL|  Philippin

#### Identify offending countries with high levels of CO2 emissions.

In [16]:
val country_co2 = ds
    .groupBy("cca3")
    .sum("c02_level")

country_co2: org.apache.spark.sql.DataFrame = [cca3: string, sum(c02_level): bigint]


In [21]:
val high_level = 15000

high_level: Int = 15000


In [22]:
country_co2
    .filter(col("sum(c02_level)") > high_level)
    .show(5)

+----+--------------+
|cca3|sum(c02_level)|
+----+--------------+
| HTI|         15496|
| PSE|         31116|
| POL|       3275637|
| LVA|        425710|
| BRB|         47787|
+----+--------------+
only showing top 5 rows



#### Compute the min and max values for temperature, battery level, CO2, and humidity.

In [23]:
ds.columns

res10: Array[String] = Array(battery_level, c02_level, cca2, cca3, cn, device_id, device_name, humidity, ip, latitude, lcd, longitude, scale, temp, timestamp)


In [25]:
val selected_cols = array("temp", "battery_level", "c02_level", "humidity")

selected_cols: org.apache.spark.sql.Column = array(temp, battery_level, c02_level, humidity)


In [30]:
ds
.agg(min(selected_cols), max(selected_cols))
.show(true)

+----------------------------------------------------+----------------------------------------------------+
|min(array(temp, battery_level, c02_level, humidity))|max(array(temp, battery_level, c02_level, humidity))|
+----------------------------------------------------+----------------------------------------------------+
|                                    [10, 0, 800, 94]|                                   [34, 9, 1597, 94]|
+----------------------------------------------------+----------------------------------------------------+



#### Sort and group by average temperature, CO2, humidity, and country.

In [33]:
val ds_by_country = ds
        .groupBy("cca3")
        .agg(avg("temp"), avg("c02_level"), avg("humidity"))

ds_by_country: org.apache.spark.sql.DataFrame = [cca3: string, avg(temp): double ... 2 more fields]


In [34]:
ds_by_country.show()

+----+------------------+------------------+------------------+
|cca3|         avg(temp)|    avg(c02_level)|     avg(humidity)|
+----+------------------+------------------+------------------+
| HTI|25.333333333333332|1291.3333333333333| 64.58333333333333|
| PSE|             20.84|           1244.64|              64.4|
| POL|21.983965014577258|1193.7452623906706| 62.33163265306123|
| LVA|21.899441340782122|1189.1340782122904| 63.11173184357542|
| BRB|23.210526315789473|1257.5526315789473| 58.36842105263158|
| JAM|22.113636363636363|1162.2272727272727| 63.86363636363637|
| BRA|21.958126550868485|1208.7382133995038| 61.96867245657568|
| ARM| 21.58823529411765|1207.9117647058824| 63.23529411764706|
| MOZ| 19.59090909090909|            1264.0| 58.77272727272727|
| JOR|21.065217391304348|1222.3478260869565| 63.84782608695652|
| CUB|25.866666666666667|1222.5333333333333| 49.53333333333333|
| FRA|22.115739868049012|1200.7059377945334| 61.82054665409991|
| ABW|             20.75|          1190.

In [35]:
ds_by_country.queryExecution.logical

res18: org.apache.spark.sql.catalyst.plans.logical.LogicalPlan =
'Aggregate [cca3#11], [cca3#11, avg('temp) AS avg(temp)#609, avg('c02_level) AS avg(c02_level)#610, avg('humidity) AS avg(humidity)#611]
+- Relation [battery_level#8L,c02_level#9L,cca2#10,cca3#11,cn#12,device_id#13L,device_name#14,humidity#15L,ip#16,latitude#17,lcd#18,longitude#19,scale#20,temp#21L,timestamp#22L] json
