# Initialize Spark

In [1]:
from pyspark.sql.types import *
from pyspark.streaming import StreamingContext

# `spark` is the SparkSession that the notebook kernel provides at session
# start-up. StreamingContext is documented to take the underlying SparkContext
# (not the session object), followed by the batch interval in seconds.
# NOTE(review): the visible cells use Structured Streaming (spark.readStream)
# and never reference `scc`; confirm whether this DStream context is still needed.
scc = StreamingContext(spark.sparkContext, 4)

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
3,application_1655488930143_0004,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Initialize data streaming

In [2]:
# S3 prefixes the streaming readers below load their input from.
# Device monitor feed:
MONITOR_SOURCE = 's3://guglielmo-data-lake/monitor'
# Sensor measurement feeds (temperature and pollution):
T_SENSORS_SOURCE = 's3://guglielmo-data-lake/measurements/temperature'
P_SENSORS_SOURCE = 's3://guglielmo-data-lake/measurements/pollution'

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

### Initialize monitor data streaming

In [3]:
# Schema of the monitor records; every column is nullable.
schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        ("device_id", StringType()),
        ("device_health", IntegerType()),
        ("type", StringType()),
        ("area", StringType()),
        ("customer", StringType()),
    )
])

# Streaming reader over the JSON files under MONITOR_SOURCE.
monitor_stream = (
    spark.readStream
    .schema(schema)
    .format("json")
    .option('path', MONITOR_SOURCE)
    .load()
)
monitor_stream

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[device_id: string, device_health: int, type: string, area: string, customer: string]

### Initialize sensors data streaming

In [4]:
# Schema of the temperature sensor records; every column is nullable.
# `measured` and `arrived` are the two timestamps the cleansing step compares.
schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        ("device_id", StringType()),
        ("measured", TimestampType()),
        ("arrived", TimestampType()),
        ("humidity", IntegerType()),
        ("temperature", IntegerType()),
    )
])

# Streaming reader over the JSON files under T_SENSORS_SOURCE.
t_sensors_stream = (
    spark.readStream
    .schema(schema)
    .format("json")
    .option('path', T_SENSORS_SOURCE)
    .load()
)
t_sensors_stream

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[device_id: string, measured: timestamp, arrived: timestamp, humidity: int, temperature: int]

In [5]:
# Schema of the pollution sensor records; every column is nullable.
schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in (
        ("device_id", StringType()),
        ("measured", TimestampType()),
        ("arrived", TimestampType()),
        ("CO2_level", IntegerType()),
    )
])

# Streaming reader over the JSON files under P_SENSORS_SOURCE.
p_sensors_stream = (
    spark.readStream
    .schema(schema)
    .format("json")
    .option('path', P_SENSORS_SOURCE)
    .load()
)
p_sensors_stream

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[device_id: string, measured: timestamp, arrived: timestamp, CO2_level: int]

In [6]:
# Sanity check: print the isStreaming flag of each of the three inputs,
# in the order monitor, temperature, pollution.
print(*(
    source.isStreaming
    for source in (monitor_stream, t_sensors_stream, p_sensors_stream)
))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

True True True

### Cleanse and aggregate data

In [None]:
import dateutil.parser

# One cleansed, monitor-enriched streaming DataFrame per sensor stream.
# Despite the name, these are DataFrames, not started StreamingQuery objects.
queries = []

for stream in [t_sensors_stream, p_sensors_stream]:

    # Remove duplicates: keep a single reading per device and measurement time.
    # NOTE(review): no watermark is set, so the de-duplication (and the join
    # below) keep state for every key seen -- consider withWatermark("measured", ...).
    query = stream.dropDuplicates(["device_id", "measured"])

    # Remove values arrived more than one day late.
    # Compare elapsed time instead of datediff(): datediff counts calendar-date
    # boundaries, so a reading measured at 23:59 that arrived at 00:01 would be
    # dropped although it is only two minutes late.
    query = query.filter("arrived <= measured + INTERVAL 1 DAY")

    # Join with monitor data (inner join on the shared device_id column)
    query = query.join(monitor_stream, on="device_id")

    queries.append(query)

In [None]:
# Disabled snippet kept for reference: the triple-quoted string below is a bare
# string literal, so nothing in it is executed. If enabled it would write
# monitor_stream as JSON to the given S3 output path, using the given S3
# checkpoint location.
'''
monitor_query = monitor_stream \
    .writeStream \
  .format("json")\
  .option("path", "s3://test-output-emr/monitor/")\
  .option("checkpointLocation", "s3://test-spark-checkpoints/monitor")\
  .start()'''

In [7]:
import boto3

# Driver-side handles, kept for interactive use and backward compatibility.
# The streaming sink below does NOT use them: it opens its own handle on the
# executor (see MeasurementWriter.open).
dynamodb = boto3.resource('dynamodb', region_name='us-east-1')
table = dynamodb.Table('measurements')


def _open_table():
    """Create a fresh handle to the DynamoDB 'measurements' table."""
    return boto3.resource('dynamodb', region_name='us-east-1').Table('measurements')


def _to_item(row):
    """Convert a joined measurement Row into a DynamoDB item dict.

    The two timestamps are stored as strings, and a `dynamo_id` key is derived
    from the device id concatenated with the measurement time.
    """
    item = row.asDict()
    item['measured'] = str(item['measured'])
    item['arrived'] = str(item['arrived'])
    # The joined rows carry `device_id` (there is no `id` column in any schema).
    item['dynamo_id'] = item['device_id'] + item['measured']
    return item


class MeasurementWriter:
    """foreach() sink that writes each row to DynamoDB.

    The table handle is created in open(), i.e. on the executor, instead of
    being captured from the driver: the writer object is serialized and shipped
    to the executors, and a live boto3 resource is not something that can be
    relied on to survive that.
    """

    def open(self, partition_id, epoch_id):
        """Open the table handle; returning True tells Spark to process the rows."""
        self.table = _open_table()
        return True

    def process(self, row):
        """Write a single row to the table."""
        self.table.put_item(Item=_to_item(row))

    def close(self, error):
        """Nothing to release; the handle is dropped with the writer."""
        pass


def process(row, table=None):
    """Write one row to DynamoDB (single-row entry point kept for compatibility).

    Args:
        row: a Row with at least `device_id`, `measured` and `arrived`.
        table: optional table handle to write to; a new one is opened if omitted.
    """
    if table is None:
        table = _open_table()
    table.put_item(Item=_to_item(row))


# Start one streaming query per prepared DataFrame and keep the returned
# StreamingQuery handles so they can be monitored and stopped later.
streaming_queries = [
    stream_df.writeStream.foreach(MeasurementWriter()).start()
    for stream_df in queries
]

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Stop every streaming query still running in this Spark session.
# `queries` holds DataFrames, which have no stop() method; the objects that can
# be stopped are the StreamingQuery handles, which the session tracks in
# spark.streams.active.
for active_query in spark.streams.active:
    active_query.stop()