# Initialize Spark

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
from pyspark.streaming import StreamingContext

# Build the notebook's SparkSession, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("CloudProject")
    .getOrCreate()
)


Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/06/28 03:02:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
from pyspark.sql.types import *
# StreamingContext is documented to take a SparkContext (not a SparkSession),
# so hand it the session's underlying context. The second argument is the
# batch interval in seconds.
# NOTE(review): `scc` is not referenced by any other visible cell - confirm it
# is still needed alongside the Structured Streaming readers below.
scc = StreamingContext(spark.sparkContext, 4)

# Initialize data streaming

In [3]:
import os

# Root directory that holds the three JSON input folders. It defaults to the
# original local path and can be overridden through the SENSOR_DATA_DIR
# environment variable so the notebook is not tied to one machine.
DATA_DIR = os.environ.get('SENSOR_DATA_DIR', '/home/guglielmo/Scaricati')

# One input folder per stream: device monitor, pollution and temperature sensors.
MONITOR_SOURCE = f'{DATA_DIR}/monitor'
P_SENSORS_SOURCE = f'{DATA_DIR}/pollution'
T_SENSORS_SOURCE = f'{DATA_DIR}/temperature'

### Initialize monitor data streaming

In [4]:
# Schema of the device-monitor JSON records; every column is nullable.
schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in [
        ("device_id", StringType()),
        ("device_health", IntegerType()),
        ("type", StringType()),
        ("area", StringType()),
        ("customer", StringType()),
    ]
])

# Streaming JSON reader over the MONITOR_SOURCE path, using the schema above.
monitor_stream = (
    spark.readStream
    .format("json")
    .schema(schema)
    .option("path", MONITOR_SOURCE)
    .load()
)
monitor_stream

DataFrame[device_id: string, device_health: int, type: string, area: string, customer: string]

In [5]:
# Sanity check: display whether the monitor DataFrame is a streaming DataFrame.
monitor_stream.isStreaming

True

### Initialize sensor data streaming

In [6]:
# Schema of the temperature-sensor JSON records; every column is nullable.
schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in [
        ("device_id", StringType()),
        ("measured", TimestampType()),
        ("arrived", TimestampType()),
        ("humidity", IntegerType()),
        ("temperature", IntegerType()),
    ]
])

# Streaming JSON reader over the T_SENSORS_SOURCE path, using the schema above.
t_sensors_stream = (
    spark.readStream
    .format("json")
    .schema(schema)
    .option("path", T_SENSORS_SOURCE)
    .load()
)
t_sensors_stream

DataFrame[device_id: string, measured: timestamp, arrived: timestamp, humidity: int, temperature: int]

In [7]:
# Schema of the pollution-sensor JSON records; every column is nullable.
schema = StructType([
    StructField(column, dtype, True)
    for column, dtype in [
        ("device_id", StringType()),
        ("measured", TimestampType()),
        ("arrived", TimestampType()),
        ("CO2_level", IntegerType()),
    ]
])

# Streaming JSON reader over the P_SENSORS_SOURCE path, using the schema above.
p_sensors_stream = (
    spark.readStream
    .format("json")
    .schema(schema)
    .option("path", P_SENSORS_SOURCE)
    .load()
)
p_sensors_stream

DataFrame[device_id: string, measured: timestamp, arrived: timestamp, CO2_level: int]

In [8]:
# One streaming flag per source, in order: monitor, temperature, pollution.
print(*(stream.isStreaming for stream in (monitor_stream, t_sensors_stream, p_sensors_stream)))

True True True


### Cleanse data

In [9]:
import dateutil.parser

# A reading is identified by its device and its measurement timestamp;
# keep a single row for each such pair.
# NOTE(review): no watermark is set on this stream; per the Structured Streaming
# guide, deduplication without a watermark keeps state for all past records -
# confirm that unbounded state is acceptable here.
dedup_keys = ["device_id", "measured"]
t_query = t_sensors_stream.dropDuplicates(subset=dedup_keys)

# Disabled: filter out readings that arrived a day or more after being measured.
#t_query = t_query.filter("datediff(arrived, measured) < 1")

### Aggregate data

In [10]:
# Attach the monitor columns to each deduplicated temperature reading by
# matching rows on their shared device_id (inner join).
m_query = monitor_stream
final = t_query.join(other=m_query, on="device_id", how="inner")

In [11]:
# Disabled console sinks for the monitor and temperature queries. The
# triple-quoted string below is a no-op expression, so `m_stream` and
# `t_stream` are NOT defined when this cell runs.
'''m_stream = m_query \
    .writeStream \
    .outputMode("append") \
    .format("console") \
    .start()

t_stream = t_query \
    .writeStream \
    .outputMode("append") \
    .format("console") \
    .start()
'''

def process(row):
    """Print one joined row as a plain dict with its timestamps stringified.

    Used as the per-row callback handed to ``writeStream.foreach``.

    Args:
        row: object exposing ``asDict()``; the resulting dict must contain the
            keys ``'measured'`` and ``'arrived'``.

    Raises:
        KeyError: if either timestamp key is missing from the row.
    """
    record = row.asDict()
    # Replace both timestamp values with their str() form before printing.
    for timestamp_key in ('measured', 'arrived'):
        record[timestamp_key] = str(record[timestamp_key])
    print(record)
    

# Start the streaming query: every row of the joined stream is passed to `process`.
# NOTE(review): this rebinds `final` from the joined DataFrame to the object
# returned by start(), so re-running this cell requires re-running the join
# cell first.
final = final.writeStream.foreach(process).start()

# Disabled console sink for the pollution stream. Written as real comments
# rather than a trailing string literal, which the notebook echoes as the
# cell's output value.
# p_sensors_query = p_sensors_stream \
#     .writeStream \
#     .outputMode("append") \
#     .format("console") \
#     .start()

22/06/28 03:02:44 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-c72dbd9e-16af-411b-bd61-26582f71011b. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
22/06/28 03:02:45 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


'p_sensors_query = p_sensors_stream     .writeStream     .outputMode("append")     .format("console")     .start()'

{'device_id': 't-15023820', 'measured': '2022-06-28 02:54:46.063000', 'arrived': '2022-06-29 02:54:46.063000', 'humidity': 69, 'temperature': 23, 'device_health': 3, 'type': 'temperature', 'area': 'residential', 'customer': 'AB-Service'}
                                                                                

# Stop every streaming query that is still active in this session.
# Iterating over the session's active queries avoids referring to `m_stream`,
# `t_stream` or `p_sensors_query`, which appear only in disabled code and
# would raise NameError if used here.
for active_query in spark.streams.active:
    active_query.stop()