### Pre-requisite:
Before running this notebook, you will have to:
1. download the csv file named `dht_1k.csv` 
stored under https://github.com/IBMProjectEventStore/db2eventstore-IoT-Analytics/tree/master/data.
2. Go to the `Project tab` and load the above-mentioned csv file into the current project as a dataset.
----
**Note: This Notebook can only run in Python version >= 3.0**

In [1]:
from eventstore.oltp import EventContext
from eventstore.sql import EventSession
from eventstore.common import ConfigurationReader
from pyspark.sql import SparkSession

# Credentials used for the Event Store connection. Both are left empty in this
# notebook; presumably they must be filled in for your deployment before the
# following cells can connect — avoid committing real values.
ConfigurationReader.setEventUser("")
ConfigurationReader.setEventPassword("")

In [2]:
# Alternative session setup that additionally puts ./spark-time-series-sql.jar on
# the Spark classpath (disabled; kept for reference):
#sparkSession = SparkSession.builder.config('spark.jars', './spark-time-series-sql.jar').appName("EventStore SQL in Python").getOrCreate()
# Create (or reuse) the Spark session that backs the Event Store SQL session.
sparkSession = SparkSession.builder.appName("EventStore SQL in Python").getOrCreate()
# SQL-side handle on the "EVENTDB" database, bound to this Spark context.
eventSession = EventSession(sparkSession.sparkContext, "EVENTDB")
# NOTE(review): "SnapshotNow" presumably selects reading the most recent snapshot —
# confirm against the Event Store documentation.
eventSession.set_query_read_option("SnapshotNow")
eventSession.open_database()
# Separate context for the same database; later cells use it to list tables,
# create the table and insert rows.
ctx = EventContext.get_event_context("EVENTDB")

In [3]:
# Call SqlGeometry.registerAll on the underlying JVM Spark session via the
# session's JVM gateway. Presumably this is what makes the ST_* spatial
# functions used in the queries further down available — TODO confirm.
eventSession._jvm.org.apache.spark.sql.types.SqlGeometry.registerAll(eventSession._jsparkSession)

In [4]:
from eventstore.catalog import TableSchema
from pyspark.sql.types import *

In [5]:
# List the tables currently present in the database, with a running index,
# to see what exists before creating anything.
table_names = ctx.get_names_of_tables()
for idx, name in enumerate(table_names):
    print(idx, name)

0 ADMIN.DHT_TABLE
1 ADMIN.SDS_TABLE
2 ADMIN.IOTPERF
3 ADMIN.DHT_FULL_TABLE


In [6]:
from datetime import datetime

def datetime_converter(datetime_string):
    """Convert an ISO-8601-style UTC timestamp string to whole epoch seconds.

    Accepts strings such as ``"2017-08-31T01:55:49.000Z"``. Surrounding
    whitespace (e.g. the newline left on the last column of a CSV line), a
    trailing ``Z`` and any fractional-seconds part are ignored, so
    ``"...:49.123Z"``, ``"...:49Z"`` and ``"...:49"`` all yield the same value.

    Parameters:
        datetime_string: timestamp text in ``%Y-%m-%dT%H:%M:%S`` form,
            optionally followed by ``.<fraction>`` and/or ``Z``.

    Returns:
        int: seconds elapsed since 1970-01-01T00:00:00, with the input
        interpreted as UTC (fractional seconds are truncated).

    Raises:
        ValueError: if the remaining text does not match ``%Y-%m-%dT%H:%M:%S``.
    """
    # Drop whitespace/newlines first so the suffix checks below see the real end.
    cleaned = datetime_string.strip()
    # Drop the UTC designator if present.
    if cleaned.endswith('Z'):
        cleaned = cleaned[:-1]
    # Drop any fractional seconds; the date/time part itself never contains '.'.
    cleaned = cleaned.split('.', 1)[0]

    utc_time = datetime.strptime(cleaned, "%Y-%m-%dT%H:%M:%S")

    # Naive-datetime subtraction against the epoch keeps the value in UTC,
    # independent of the local timezone of the machine.
    return int((utc_time - datetime(1970, 1, 1)).total_seconds())

In [7]:
def showPartitionInfo(df):
    """Print the partition count of ``df`` and the record count of each partition.

    @df: Spark DataFrame
    """
    def _count_rows(rows):
        # Exhaust one partition's iterator and hand back a single-element
        # iterator carrying how many records it held.
        total = 0
        for _ in rows:
            total += 1
        return iter([total])

    partition_total = df.rdd.getNumPartitions()
    print("- number of partitions prior to time series (after loading table): ", partition_total)

    rows_per_partition = df.rdd.mapPartitions(_count_rows).collect()
    print("- partition sizes prior to time series (after loading table): ", rows_per_partition)

### Data Preparation
Create the table, ingest data into it, load the table as a Spark DataFrame, and repartition the DataFrame appropriately.

In [8]:
# Define the schema of the table to be created (the table itself is created in a later cell).
# The `with` statement rebinds the notebook-level `ctx`.
# NOTE(review): later cells keep calling `ctx` after this block exits — confirm the
# context stays usable once the `with` block has been left.
with EventContext.get_event_context("EVENTDB") as ctx:
    schema = StructType([
        StructField("sensor_id", IntegerType(), nullable = False),
        # Stored as an integer; the ingest cell fills it via datetime_converter().
        StructField("timestamp", IntegerType(), nullable = False),
        StructField("location", IntegerType(), nullable = False),
        # humidity is the only nullable column.
        StructField("humidity", FloatType(), nullable = True),
        StructField("temperature", FloatType(), nullable = False),
        StructField("LAT", FloatType(), nullable = False),
        StructField("LON", FloatType(), nullable = False),
        StructField("sensor_type", StringType(), nullable = False)
    ])  
    # Rows are sharded by sensor; the primary key is (timestamp, sensor_id).
    table_schema = TableSchema("dht_full_table", schema,
                                sharding_columns=["sensor_id"],
                                pk_columns=["timestamp","sensor_id"])

In [9]:
## Create the table and load data for DHT

In [10]:
# Try to create the table. Creation fails if the table already exists (or for
# any other reason); in that case the error is reported and the notebook carries on.
# To start from an empty table, uncomment the drop below before creating it:
# try:
#     ctx.drop_table("dht_full_table")
# except Exception as error:
#     print(error)
try:
    ctx.create_table(table_schema)
except Exception as error:
    # Deliberately best-effort: report the failure instead of aborting the run.
    print("Table not created. Table may already exist.")
    print(error)

# Show the tables that now exist so the outcome can be checked by eye.
table_names = ctx.get_names_of_tables()
for name in table_names:
    print(name)

Table not created. Table may already exist.
An error occurred while calling o2.createTable.
: com.ibm.event.EventException: createTableWithIndex() : createTable() : client request failed: DB2 SQL Error: SQLCODE=-601, SQLSTATE=42710, SQLERRMC=ADMIN.DHT_FULL_TABLE;TABLE, DRIVER=4.25.4
	at com.ibm.event.oltp.EventContext.createTableWithIndexInternal(EventContext.scala:1144)
	at com.ibm.event.oltp.EventContext.createTable(EventContext.scala:993)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.comma

In [11]:
# Load the Event Store table through the SQL session. The same load is repeated
# after the ingest cell, and that later result is the one used downstream.
dht_table = eventSession.load_event_table("dht_full_table")

In [12]:
# Ingest the rows of dht_1k.csv into the Event Store table.
import csv
import os

# Resolved schema of the target table; passed to batch_insert below.
resolved_table_schema = ctx.get_table("dht_full_table")
print(resolved_table_schema)

# Parse with the csv module instead of str.split(","): it strips the line
# terminator from the last column, copes with quoted fields, and yields an
# empty list for blank lines so they can be skipped rather than crash the cast.
with open(os.environ['DSX_PROJECT_DIR']+'/datasets/dht_1k.csv', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row (no-op on an empty file)
    content = [row for row in reader if row]

# Column positions read from each CSV row:
#   0 location, 1 temperature, 2 humidity, 3 lat, 4 lon,
#   5 sensor_id, 6 sensor_type, 7 timestamp (converted by datetime_converter)
batch = [dict(sensor_id=int(c[5]), timestamp=datetime_converter(c[7]), location=int(c[0]),
              humidity=float(c[2]), temperature=float(c[1]), lat=float(c[3]), lon=float(c[4]),
              sensor_type=str(c[6])) for c in content]
ctx.batch_insert(resolved_table_schema, batch)

ResolvedTableSchema(tableName=ADMINDHT_FULL_TABLE, schema=StructType(List(StructField(SENSOR_ID,IntegerType,false),StructField(TIMESTAMP,IntegerType,false),StructField(LOCATION,IntegerType,false),StructField(HUMIDITY,FloatType,true),StructField(TEMPERATURE,FloatType,false),StructField(LAT,FloatType,false),StructField(LON,FloatType,false),StructField(SENSOR_TYPE,StringType,false))), sharding_columns=['SENSOR_ID'], pk_columns=['TIMESTAMP', 'SENSOR_ID'], partition_columns=None, schema_name=Some(ADMIN))


In [13]:
# Verify the ingest: reload the table so the DataFrame reflects the inserted
# rows, then display the total row count as the cell result.
dht_table = eventSession.load_event_table("dht_full_table")
dht_table.count()

41569

### Repartitioning
Optimize parallelism by repartitioning. Note that when the query is pushed down to Db2 Event Store and the data is retrieved, the data will be received by Spark as a single-partition data frame. It is therefore necessary for the user to explicitly repartition the dataframe. It is suggested that one partition be created for each CPU core in the Spark cluster.

In [14]:
# Partition layout of the table as loaded, before any repartitioning.
showPartitionInfo(dht_table)

- number of partitions prior to time series (after loading table):  1
- partition sizes prior to time series (after loading table):  [41569]


In [15]:
# Redistribute the rows over 48 partitions and print the new layout.
# NOTE(review): 48 is hard-coded; the Repartitioning text suggests one partition
# per CPU core in the Spark cluster, so adjust it to match your cluster.
dht_table = dht_table.repartition(48)
showPartitionInfo(dht_table)

- number of partitions prior to time series (after loading table):  48
- partition sizes prior to time series (after loading table):  [866, 867, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866, 866]


In [16]:
# Expose the repartitioned DataFrame to SQL under the name "dht_full_table";
# the queries below select from this view.
dht_table.createOrReplaceTempView("dht_full_table")

In [17]:
# Sanity check: print a handful of rows from the view.
eventSession.sql("select * from dht_full_table LIMIT 5").show()

+---------+----------+--------+--------+-----------+------+-----+-----------+
|SENSOR_ID| TIMESTAMP|LOCATION|HUMIDITY|TEMPERATURE|   LAT|  LON|SENSOR_TYPE|
+---------+----------+--------+--------+-----------+------+-----+-----------+
|     1309|1504144549|     647|    99.9|       20.0|48.839|9.315|      DHT22|
|     1309|1504151592|     647|    99.9|       19.4|48.839|9.315|      DHT22|
|     1309|1504158635|     647|    99.9|       20.0|48.839|9.315|      DHT22|
|     1309|1504168937|     647|    99.9|       22.6|48.839|9.315|      DHT22|
|     1309|1504185747|     647|    90.1|       19.4|48.839|9.315|      DHT22|
+---------+----------+--------+--------+-----------+------+-----+-----------+



## Objective: Group Sensors into Geohashes by using SQL with ST Support.
### Use SQL
We utilize SQL for this step because the volume of raw data can be huge - there are a lot of sensors and each sensor has a lot of readings per day. We use sql so that we can avoid pulling the whole raw data which could cause some serious memory issues. It is often suggested to, whenever possible, run Spatial operations in SQL first as a preprocessing step to reduce the complexity and volume of the data.
### SQL with ST Support
The key part in this query is the ST support - 
- `ST_Point(lon, lat)` creates a spatial ST_Point object from given latitude and longitude in the raw data.
- `ST_ContainingGeohash(ST_Point, distance_buffer)` encodes the point into its geohash.

Everything else is just the normal SQL query - 
- We get geohash, humidity from the raw dataset.
- We group these readings by geohash and calculate the average reading for each geohash.

For a full list of geospatial functions available on SQL Query, [click here](https://www.ibm.com/support/knowledgecenter/SSGNPV_2.0.0/com.ibm.swg.im.dashdb.analytics.doc/doc/geo_intro.html)

In [18]:
# Average humidity per geohash.
# Inner query: build an ST_Point from each row's (lon, lat), take its containing
# geohash with a buffer argument of 300, and cast the geohash to a string.
# NOTE(review): the unit of the 300 buffer is not stated in this notebook — confirm.
# Outer query: group by that geohash string and average the humidity readings.
stmt = """
    SELECT geohash, AVG(humidity) as avg_h
    FROM(
        SELECT cast(ST_ContainingGeohash(ST_Point(lon, lat), 300) as string) as geohash, humidity
        FROM dht_full_table
    )
    GROUP BY geohash
"""

# Register the aggregate as a temp view so later queries can read it by name.
eventSession.sql(stmt).createOrReplaceTempView("dht_spatial_agg_table")

Objectives - 

- Since all the humidity sensors are discretely and spatially distributed, we want to group them into areas based on their locations so that we are able to tell the humidity for different areas instead of discrete points. To achieve this, we will utilize Geohashes since each geohash represents a grid area on the earth, and we compute the geohash for each sensor location and group them by geohashes.

In [19]:
# Print the per-geohash averages (show() prints the first rows and truncates long strings).
eventSession.sql("select * from dht_spatial_agg_table").show()

+--------------------+------------------+
|             geohash|             avg_h|
+--------------------+------------------+
|11010000001110011...|58.117268643045534|
|11010000011000001...| 77.29051867398348|
|11010000100100000...| 64.97717030761326|
|11010000001110011...| 77.50434785925823|
|11010000001111100...| 66.54934394039975|
|11010000001110011...| 87.21721242391146|
|11010000100100011100| 59.80641026263471|
|11010000001110011...| 63.08857686510693|
|11010000010010110...| 63.79313880487195|
|11010000001110011...|12.257560955722157|
|11010000001101000...| 68.66730075434118|
|11010001101100001...| 81.45821407398726|
|11010000110001101...| 72.19365398246984|
|11010000001110011001| 47.90347020425701|
|11000101110101010...|51.095640883634985|
|11010000110001101...| 97.05609872419103|
|11010000001110010...|57.990811994612386|
|11010000011000001...| 51.01045743406208|
|11010000001111010...| 78.36612912916368|
| 1101000000111001001| 51.00362491833121|
+--------------------+------------

In [20]:
# Decode each geohash back to a coordinate and rank the cells by humidity.
# For every geohash: decode it, take the envelope, take the centre of its
# bounding box, and read that point's X as `lon` and Y as `lat`.
# Rows are ordered by average humidity, highest first.
decode_stmt = """
    SELECT geohash, ST_X(ST_BoundingBoxCenter(ST_Envelope(ST_GeohashDecode(geohash)))) as lon, 
    ST_Y(ST_BoundingBoxCenter(ST_Envelope(ST_GeohashDecode(geohash)))) AS lat, avg_h
    FROM dht_spatial_agg_table
    ORDER BY avg_h desc
"""

# Register the decoded, ordered result as a temp view for the cells below.
eventSession.sql(decode_stmt).createOrReplaceTempView("dht_spatial_decoded_agg_table")

In [21]:
# Keep the decoded per-geohash result as a Spark DataFrame for further use.
df_spark = eventSession.sql("select * from dht_spatial_decoded_agg_table")

In [22]:
# Print the leading rows; given the ORDER BY in the view, these are the most humid cells.
df_spark.show()

+--------------------+--------------+----------------+------------------+
|             geohash|           lon|             lat|             avg_h|
+--------------------+--------------+----------------+------------------+
|11010000110001101...|13.46923828125| 52.547607421875| 97.05609872419103|
| 1101000001101111011|   10.37109375|     53.26171875| 88.24107743750439|
|11010000001100100...| 7.71240234375|  48.01025390625| 87.74399086785695|
|11010000001110011...|   9.228515625|   48.8232421875| 87.21721242391146|
|11010001101100001...| 18.2373046875|   59.2822265625| 81.45821407398726|
|11010000001111010...|9.063720703125| 50.086669921875| 78.36612912916368|
|11010000001110011...| 9.25048828125|  48.75732421875| 77.50434785925823|
|11010000011000001...| 6.74560546875| 50.855712890625| 77.29051867398348|
|11010000110001101...| 13.4912109375|   52.5146484375| 72.19365398246984|
|11010000100100000...|11.53564453125| 48.153076171875| 70.01383566990478|
|11010000001101000...|   6.064453125| 