### Pre-requisite:
Before running this notebook, you will have to:
1. Download the CSV files named `dht_1k.csv` and `sds_1k.csv`  
stored under https://github.com/IBMProjectEventStore/db2eventstore-IoT-Analytics/tree/master/data.
2. Go to the `Project tab` and load both of the above-mentioned CSV files into the current project as datasets.
----
**Note: This Notebook can only run in Python version >= 3.0**

In [1]:
from eventstore.oltp import EventContext
from eventstore.sql import EventSession
from eventstore.common import ConfigurationReader
from pyspark.sql import SparkSession

# Take the Event Store credentials from the environment so real secrets never
# have to be saved inside the notebook. EVENT_USER / EVENT_PASSWORD override
# the demo account values, which remain the defaults so the sample still runs
# unchanged when the variables are not set.
import os

ConfigurationReader.setEventUser(os.environ.get("EVENT_USER", "user5"))
ConfigurationReader.setEventPassword(os.environ.get("EVENT_PASSWORD", "EventStore20"))

In [2]:
# Build (or reuse) the Spark session that backs all SQL in this notebook.
sparkSession = SparkSession.builder.appName("EventStore SQL in Python").getOrCreate()
# SQL entry point bound to the "EVENTDB" database.
eventSession = EventSession(sparkSession.sparkContext, "EVENTDB")
# NOTE(review): presumably makes queries read the most recent snapshot — confirm
# against the Event Store documentation.
eventSession.set_query_read_option("SnapshotNow")
# Register the time-series SQL support on the underlying JVM session; presumably
# this is what makes TIME_SERIES / TS_* usable in the queries below — confirm.
eventSession._jvm.org.apache.spark.sql.types.SqlTimeSeries.register(eventSession._jsparkSession)
eventSession.open_database()
# Context used by later cells to create tables and batch-insert rows.
ctx = EventContext.get_event_context("EVENTDB")

In [3]:
from eventstore.catalog import TableSchema
from pyspark.sql.types import *

In [4]:
# Print every table name reported by the context, numbered from 0.
table_names = ctx.get_names_of_tables()
for idx, name in enumerate(table_names):
    print(idx, name)

0 USER5.DHT_FULL_TABLE
1 USER5.IOT_TEMP_NOTEBOOK_SAMPLE


In [5]:
from datetime import datetime

def datetime_converter(datetime_string):
    """Convert a UTC ISO-8601 timestamp string to whole seconds since the Unix epoch.

    Accepts strings such as ``2017-08-30T00:02:25.000Z``. Surrounding
    whitespace (for example the newline left on the last CSV column), a
    trailing ``Z`` and any fractional-seconds part are ignored, so
    ``...:25Z`` and ``...:25.123Z`` are handled as well as ``...:25.000Z``.

    Args:
        datetime_string: Timestamp in ``YYYY-MM-DDTHH:MM:SS[.fff][Z]`` form,
            interpreted as UTC.

    Returns:
        int: Seconds elapsed since 1970-01-01T00:00:00 UTC (fraction truncated).

    Raises:
        ValueError: If the date/time portion does not match the expected format.
    """
    text = datetime_string.strip()
    # Drop the UTC designator; the value is treated as UTC either way.
    if text[-1:] in ("Z", "z"):
        text = text[:-1]
    # Keep only whole seconds; the result is an integer epoch anyway.
    whole_seconds = text.split(".", 1)[0]
    utc_time = datetime.strptime(whole_seconds, "%Y-%m-%dT%H:%M:%S")

    return int((utc_time - datetime(1970, 1, 1)).total_seconds())

In [6]:
## Create the DHT table and load its data

In [7]:
# Describe the DHT (humidity) table: four columns, sharded by sensor and keyed
# on (sensor_id, timestamp, location).
# NOTE(review): the `with` rebinds the notebook-level `ctx`, which later cells
# keep using after this block exits — confirm the context stays usable.
with EventContext.get_event_context("EVENTDB") as ctx:
    schema = (
        StructType()
        .add("sensor_id", IntegerType(), False)
        .add("timestamp", IntegerType(), False)
        .add("location", IntegerType(), False)
        .add("humidity", FloatType(), True)
    )
    table_schema = TableSchema(
        "dht_table",
        schema,
        sharding_columns=["sensor_id"],
        pk_columns=["sensor_id", "timestamp", "location"],
    )

In [8]:
# Create the DHT table if it does not exist yet.
# To start from a clean table, uncomment the drop below before creating it.
# try:
#     ctx.drop_table("DHT_TABLE")
# except Exception as error:
#     print(error)
try:
    ctx.create_table(table_schema)
except Exception as error:
    # Presumably the table already exists; report the reason instead of
    # swallowing it so that other failures do not go unnoticed.
    print("Table not created: {}".format(error))

# List the tables now present so the reader can confirm DHT_TABLE is there.
table_names = ctx.get_names_of_tables()
for name in table_names:
    print(name)

USER5.DHT_FULL_TABLE
USER5.IOT_TEMP_NOTEBOOK_SAMPLE
USER5.DHT_TABLE


In [9]:
# Load "dht_table" through the SQL session; it is loaded again after the
# ingest cell below before being queried.
dht_table = eventSession.load_event_table("dht_table")

In [10]:
# Ingest the DHT sample CSV into dht_table.
import csv
import os

resolved_table_schema = ctx.get_table("dht_table")
print(resolved_table_schema)

dht_csv_path = os.path.join(os.environ['DSX_PROJECT_DIR'], 'datasets', 'dht_1k.csv')
with open(dht_csv_path, newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row
    content = [row for row in reader if row]  # ignore blank lines

# Columns used: 0 = location, 2 = humidity, 5 = sensor_id, 7 = timestamp.
# Rows without a humidity reading are skipped (float("") would raise), which
# mirrors the empty-value filter applied when loading the SDS data.
batch = [
    dict(
        sensor_id=int(c[5]),
        timestamp=datetime_converter(c[7]),
        location=int(c[0]),
        humidity=float(c[2]),
    )
    for c in content
    if c[2] != ""
]
ctx.batch_insert(resolved_table_schema, batch)

ResolvedTableSchema(tableName=USER5DHT_TABLE, schema=StructType(List(StructField(SENSOR_ID,IntegerType,false),StructField(TIMESTAMP,IntegerType,false),StructField(LOCATION,IntegerType,false),StructField(HUMIDITY,FloatType,true))), sharding_columns=['SENSOR_ID'], pk_columns=['SENSOR_ID', 'TIMESTAMP', 'LOCATION'], partition_columns=None, schema_name=Some(USER5))


In [11]:
# Load the table again after the batch insert so the following cells can
# verify the ingested rows.
dht_table = eventSession.load_event_table("dht_table")

In [12]:
# Expose the DHT table to SQL under the name "dht_raw_table".
dht_table.createOrReplaceTempView("dht_raw_table")

In [13]:
# Row count of the DHT view, as a quick check that the ingest populated it.
eventSession.sql("select count(*) from dht_raw_table").show()

+--------+
|count(1)|
+--------+
|   41569|
+--------+



In [14]:
## Create the SDS table and load its data

In [15]:
# Describe the SDS (particulate matter) table: four columns, sharded by sensor
# and keyed on (sensor_id, timestamp, location).
with EventContext.get_event_context("EVENTDB") as ctx:
    schema = (
        StructType()
        .add("sensor_id", IntegerType(), False)
        .add("timestamp", LongType(), False)
        .add("location", IntegerType(), False)
        .add("p_1", DoubleType(), True)
    )
    table_schema = TableSchema(
        "sds_table",
        schema,
        sharding_columns=["sensor_id"],
        pk_columns=["sensor_id", "timestamp", "location"],
    )


In [16]:
# Create the SDS table if it does not exist yet.
# To start from a clean table, uncomment the drop below before creating it.
# try:
#     ctx.drop_table("SDS_TABLE")
# except Exception as error:
#     print(error)
try:
    ctx.create_table(table_schema)
except Exception as error:
    # Include the reason so an "already exists" case can be told apart from
    # other failures.
    print("Table not created: {}".format(error))

# List the tables now present so the reader can confirm SDS_TABLE is there.
table_names = ctx.get_names_of_tables()
for name in table_names:
    print(name)

USER5.DHT_FULL_TABLE
USER5.IOT_TEMP_NOTEBOOK_SAMPLE
USER5.DHT_TABLE
USER5.SDS_TABLE


In [17]:
# Load "sds_table" through the SQL session; it is loaded again after the
# ingest cell below before being queried.
sds_table = eventSession.load_event_table("sds_table")

In [18]:
# Ingest the SDS sample CSV into sds_table.
# Imported here so the cell does not depend on an earlier cell having run.
import csv
import os

with EventContext.get_event_context("EVENTDB") as ctx:
    resolved_table_schema = ctx.get_table("sds_table")

    sds_csv_path = os.path.join(os.environ['DSX_PROJECT_DIR'], 'datasets', 'sds_1k.csv')
    with open(sds_csv_path, newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        content = [row for row in reader if row]  # ignore blank lines

    # Columns used: 0 = location, 2 = p_1, 5 = sensor_id, 7 = timestamp.
    # Rows with an empty p_1 reading are skipped.
    batch = [
        dict(
            sensor_id=int(c[5]),
            timestamp=datetime_converter(c[7]),
            location=int(c[0]),
            p_1=float(c[2]),
        )
        for c in content
        if c[2] != ""
    ]
    ctx.batch_insert(resolved_table_schema, batch)

In [19]:
# Load the table again after the batch insert; the row count is the cell output.
sds_table=eventSession.load_event_table("sds_table")
sds_table.count()

51247

In [20]:
# Expose the SDS table to SQL under the name "sds_raw_table".
sds_table.createOrReplaceTempView("sds_raw_table")

In [21]:
# Preview the first five rows of the SDS view.
eventSession.sql("select * from sds_raw_table").show(5)

+---------+----------+--------+-----+
|SENSOR_ID| TIMESTAMP|LOCATION|  P_1|
+---------+----------+--------+-----+
|     5323|1504051345|    2684| 29.0|
|     5323|1504051492|    2684| 27.5|
|     5323|1504051638|    2684|27.43|
|     5323|1504051785|    2684| 30.1|
|     5323|1504051931|    2684|28.73|
+---------+----------+--------+-----+
only showing top 5 rows



In [22]:
## Query

In [23]:
# Count the rows of the DHT view. The aggregate yields a single row, so the
# limit passed to show() has no visible effect here.
sql="SELECT count(*) FROM dht_raw_table"
eventSession.sql(sql).show(5)

+--------+
|count(1)|
+--------+
|   41569|
+--------+



# Alignment of multiple time-series from different IoT sensors

The below example shows how one can align two timeseries using an inner join with the nearest timestamp. The problem arises when two sensors are generating timestamps with clocks that are not fully synchronized. This can occur regularly and is common in IoT scenarios.

In what follows, we will show an example on the OK Lab data that considers the time series generated by the SDS (particulate matter) sensors and the DHT (humidity/temperature) sensors. The SDS manual states that SDS values are valid only when the DHT sensor measures a humidity below 70%. Enforcing this constraint requires the data from the two sensors to be synchronized in time. Note first that a traditional join would look for exact timestamp matches, which will not work in this case because the readings come from two different devices (at the same location). One approach would be to use a standard SQL statement such as:

```sql
SELECT 
    humidity, 
    p_1
FROM sds_raw_table STORED AS PARQUET sds011, dht_raw_table STORED AS PARQUET dht22
WHERE dht22.humidity <= 70
AND ((sds011.timestamp - INTERVAL 10 SECONDS) < dht22.timestamp) 
AND (dht22.timestamp < (sds011.timestamp + INTERVAL 10 SECONDS))
```

However, if executed as it stands, it can take several hours to complete given that it results in a traditional Cartesian Join.

In our example, we will down-select the data using DB2 Eventstore into two dataframes (sds and dht). Our approach then is to do a clever join that takes windowing into account and not do a full Cartesian join. 

In what follows, we will show how the time-series capabilities of DB2 Event Store can be used to address this problem of unaligned sensors. Although this is one application that time-series support in DB2 Event Store covers, the time-series capabilities are not limited to this one use case: there are also functions for simple statistical methods (fft, avg, percentile, etc.), time-series distance metrics (DL, DTW, SBD, etc.), sophisticated forms of segmentation (time-based, anchor-based, marker-based, record-based, etc.), and more.

# Initial exploration on data

In many cases, the first step to doing any Time-Series analysis is to learn about your time-series. To do so, we use what is called a describe. This will provide a rich set of metrics (avg, percentiles, timing-statistics, etc.) over a time-series such that a user can have some knowledge of the time-series they are working with.

In [24]:
# For each location, build a humidity time series from the readings below 70
# and summarise it with TS_DESCRIBE.
stmt = """SELECT location, TS_DESCRIBE(ts) FROM ( SELECT location,TIME_SERIES(timestamp, humidity) AS ts FROM dht_raw_table where humidity < 70 GROUP BY location)
"""

# Here `df` is a pandas DataFrame; later cells rebind `df` to Spark results.
df = eventSession.sql(stmt).toPandas()
df.head()

Unnamed: 0,location,ts_time_min,ts_time_max,ts_time_mean,ts_mode,ts_unique,ts_frequency,ts_first_time_tick,ts_first_value,ts_last_time_tick,ts_last_value,ts_count,ts_num_mean,ts_num_std,ts_num_min,ts_num_max,ts_num_25,ts_num_50,ts_num_75
0,2662,146,50321,277.408397,30.0,387,12,1503834699,37.900002,1504198104,50.700001,1311,46.191533,12.138422,27.299999,69.900002,35.400002,43.5,57.799999
1,2683,146,46404,317.732601,57.799999,159,6,1504034813,69.400002,1504121554,69.0,274,56.771533,6.525704,42.200001,69.900002,51.200001,57.75,60.400002
2,2682,146,36449,274.428233,44.299999,275,13,1503960327,67.699997,1504191670,69.800003,844,51.431398,9.830193,33.400002,69.900002,43.900002,48.099998,61.5
3,2680,147,40779,241.413181,25.200001,370,9,1503990245,69.400002,1504180720,69.199997,790,45.783544,14.517849,21.799999,69.900002,31.150001,45.800001,58.5
4,2684,133,38120,241.702523,67.199997,350,8,1503994577,69.699997,1504176579,69.800003,754,49.919231,13.417248,23.9,69.800003,38.299999,52.4,61.799999


## Description of performing a time-series sql temporal align

This query has a few main things to consider:

### Creating your time series

First, a time series must be created for each sensor type.

#### sds time series

```sql
SELECT location, D_TIME_SERIES(timestamp, p_1) AS sds FROM sds_raw_table GROUP BY location
```

#### dht time series

```sql
SELECT location, D_TIME_SERIES(timestamp, humidity) AS dht FROM dht_raw_table GROUP BY location
```

### Performing full temporal align

Performing a full temporal align requires 2 parameters:

- The left Time Series
- The right Time Series

Once given, the returned output will be 2 columns (the 2 aligned time series) as **left_column_aligned** and **right_column_aligned**

*Note: With this method, all missing values will be replaced with null*

In [25]:
# Per location, build the humidity series (readings below 70 only) and the
# p_1 series, join them on location and align them with TS_FULL_ALIGN using
# TS_INTERPOLATOR_NEAREST(-1.0). The result has one row per location with the
# columns dht_aligned and sds_aligned.
stmt = """
    SELECT sds_table.location, TS_FULL_ALIGN(dht, sds, TS_INTERPOLATOR_NEAREST(-1.0)) FROM 
        (SELECT location, TIME_SERIES(timestamp, humidity) AS dht FROM dht_raw_table where humidity < 70 GROUP BY location) AS dht_table
        INNER JOIN
        (SELECT location, TIME_SERIES(timestamp, p_1) AS sds FROM sds_raw_table GROUP BY location) AS sds_table
        ON dht_table.location = sds_table.location
"""
df = eventSession.sql(stmt)
df.show()
# Register the DataFrame we already have instead of planning the statement a
# second time. The previous mid-cell df.count() was dropped: its result was
# never displayed, so it only triggered an extra Spark job.
df.createOrReplaceTempView("dht_sds_ts_table")

+--------+--------------------+--------------------+
|location|         dht_aligned|         sds_aligned|
+--------+--------------------+--------------------+
|    2662|[(1503834699,37.9...|[(1503834699,5.33...|
|    2683|[(1503961757,-1.0...|[(1503961757,4.8)...|
|    2682|[(1503960326,-1.0...|[(1503960326,8.87...|
|    2680|[(1503958922,-1.0...|[(1503958922,30.8...|
|    2684|[(1503963045,-1.0...|[(1503963045,38.3...|
|    1773|[(1501703936,-1.0...|[(1501703936,424....|
|    2633|[(1503575337,-1.0...|[(1503575337,6.2)...|
|    1323|[(1503897678,-1.0...|[(1503897678,6.27...|
|    2686|[(1503964139,64.4...|[(1503964139,7.0)...|
|    2681|[(1503960072,-1.0...|[(1503960072,24.2...|
|     966|[(1501545689,-1.0...|[(1501545689,7.83...|
|    2640|[(1503609572,64.4...|[(1503609572,3.0)...|
|     732|[(1501545688,-1.0...|[(1501545688,0.3)...|
|    2652|[(1503702364,67.9...|[(1503702364,9.1)...|
|    1314|[(1501577454,53.5...|[(1501577454,5.43...|
|     462|[(1501545650,-1.0...|[(1501545650,4.

# Display the aligned TimeSeries table

In [26]:
# Number of rows in the aligned view.
eventSession.sql("select count(*) from dht_sds_ts_table").show()

+--------+
|count(1)|
+--------+
|      23|
+--------+



# Interpolate missing values after alignment

Because in IoT use cases, sensors tend to be clocked at different rates, it's important to properly fill values where they don't exist in the data. Just because a value is not in our data, does not mean it did not exist. To approximate the missing value, we can provide an interpolator as simple as nearest, next, prev, but as sophisticated as linear interpolation or cubic spline interpolation. In the following example, we will fill all missing values based on a nearest interpolation method.

In [27]:
# Fill the gaps of each aligned series with TS_FILLNA and a nearest-value
# interpolator, registering one view per sensor type (columns: location, ts).
# NOTE(review): the -1.0 argument is presumably the value used when no
# neighbouring observation exists — confirm.
eventSession.sql("SELECT location, TS_FILLNA(dht_aligned,TS_INTERPOLATOR_NEAREST(-1.0)) as ts FROM dht_sds_ts_table").createOrReplaceTempView("dht_no_nulls")
eventSession.sql("SELECT location, TS_FILLNA(sds_aligned,TS_INTERPOLATOR_NEAREST(-1.0)) as ts FROM dht_sds_ts_table").createOrReplaceTempView("sds_no_nulls")

# Converting Time-Series data to tabular data

Once all time-series analysis has been done, a user may want to display their data in a tabular format to prepare for graphing or further analysis, because time-series types are not directly ingestible. The following shows how one could convert that data.

In [28]:
# Turn each time series back into rows with TS_EXPLODE; the join further down
# reads the resulting ts_timeTick and ts_value columns.
eventSession.sql("SELECT location, TS_EXPLODE(ts) FROM dht_no_nulls").createOrReplaceTempView("dht_exploded")
eventSession.sql("SELECT location, TS_EXPLODE(ts) FROM sds_no_nulls").createOrReplaceTempView("sds_exploded")

In [29]:
# Row count of the exploded DHT view (compare with the SDS count in the next cell).
eventSession.sql("select * from dht_exploded").count()

42953

In [30]:
# Row count of the exploded SDS view (compare with the DHT count in the previous cell).
eventSession.sql("select * from sds_exploded").count()

42953

# Joining the tabular data

Lastly, we will perform a classical join on the location and time_tick for humidity and coarse particulate matter data to properly display the aligned values

In [31]:
# Join the two exploded views on location and time tick so that each row
# carries the humidity and p_1 values for the same aligned timestamp.
stmt = """
    select dht_exploded.location, dht_exploded.ts_timeTick as timestamp, dht_exploded.ts_value as humidity, sds_exploded.ts_value as p_1 FROM
        dht_exploded
        INNER JOIN
        sds_exploded
        ON dht_exploded.location=sds_exploded.location and dht_exploded.ts_timeTick=sds_exploded.ts_timeTick
"""
df = eventSession.sql(stmt)
df.show()
# Last expression of the cell, so the joined row count is displayed.
df.count()

+--------+----------+------------------+-----+
|location| timestamp|          humidity|  p_1|
+--------+----------+------------------+-----+
|     138|1501548094|               1.0|  5.9|
|     138|1501564534|               1.0|  7.2|
|     138|1501583914|               1.0|  4.1|
|     138|1501604784|               1.0|  4.2|
|     138|1501606399|               1.0| 4.83|
|     138|1501610653|               1.0| 8.07|
|     138|1501636474|58.400001525878906| 5.63|
|     138|1501656427|               1.0|  4.1|
|     138|1501683577|               1.0|  7.9|
|     314|1504069209|56.400001525878906| 9.52|
|     314|1504105745| 53.20000076293945|  4.8|
|     314|1504109853|54.599998474121094| 5.05|
|     314|1504115575|58.099998474121094|10.38|
|     314|1504127465| 62.70000076293945|  8.5|
|     314|1504165064|  69.9000015258789| 3.95|
|     314|1504184729| 63.29999923706055| 1.58|
|     314|1504218776|  69.9000015258789| 3.08|
|     462|1501580732| 65.19999694824219|  5.6|
|     462|150

42953

## Timing for Time-Series SQL
This section shows how long it takes to run each time-series SQL query.

In [32]:
import time

# Build one humidity time series over the whole DHT view and register it as
# "table_a" for the timing cells that follow.
stmt = "SELECT TIME_SERIES(timestamp, humidity) AS dht FROM dht_raw_table"

df = eventSession.sql(stmt)
df.cache()
df.createOrReplaceTempView("table_a")
# Time the show() call with perf_counter(), a monotonic clock intended for
# measuring elapsed time (time.time() can jump when the system clock changes).
start = time.perf_counter()
df.show()
end = time.perf_counter()

+--------------------+
|                 dht|
+--------------------+
|[(1501545650,87.1...|
+--------------------+



In [33]:
# Report how long the timed query in the previous cell took. The label says
# "query time" because no data is ingested here.
total_time = end - start
print("query time: " + str(total_time) + " seconds")

ingestion time: 125.03376197814941 seconds


In [34]:
# Time a TS_RESAMPLE query over the series registered as table_a.
stmt = "SELECT TS_RESAMPLE(dht, 3600, TS_INTERPOLATOR_NEAREST(0.0)) as dht_interp from table_a"
# perf_counter() is a monotonic clock intended for measuring elapsed time.
start = time.perf_counter()
eventSession.sql(stmt).show()
end = time.perf_counter()

+--------------------+
|          dht_interp|
+--------------------+
|[(1501545600,0.0)...|
+--------------------+



In [35]:
# Report how long the timed query in the previous cell took. The label says
# "query time" because no data is ingested here.
total_time = end - start
print("query time: " + str(total_time) + " seconds")

ingestion time: 0.43882179260253906 seconds


In [36]:
# Time exploding the series registered as table_a back into rows and counting them.
stmt = "select count(*) from (SELECT TS_EXPLODE(dht) from table_a)"
# perf_counter() is a monotonic clock intended for measuring elapsed time.
start = time.perf_counter()
eventSession.sql(stmt).show()
end = time.perf_counter()

+--------+
|count(1)|
+--------+
|   41569|
+--------+



In [37]:
# Report how long the timed query in the previous cell took. The label says
# "query time" because no data is ingested here.
total_time = end - start
print("query time: " + str(total_time) + " seconds")

ingestion time: 0.1370067596435547 seconds
