In [0]:
from pyspark.sql.types import StructType, StringType, TimestampType
from pyspark.sql.functions import window

# Schema of the mobile event records; all three fields are declared non-nullable.
mobile_data_schema = (
    StructType()
    .add("id", StringType(), False)
    .add("action", StringType(), False)
    .add("ts", TimestampType(), False)
)

# Streaming source: JSON files under /FileStore/mobile, parsed with the schema above.
mobile_ss_df = (
    spark.readStream
    .schema(mobile_data_schema)
    .json("/FileStore/mobile")
)

# Sanity check: prints whether the DataFrame is a streaming DataFrame.
print(mobile_ss_df.isStreaming)

# Count events per (10-minute window over `ts`, action) pair.
ts_window = window("ts", "10 minutes")
action_count_df = mobile_ss_df.groupBy(ts_window, "action").count()

# Start the streaming query: complete output mode, untruncated rows, console sink.
mobile_console_sq = (
    action_count_df.writeStream
    .outputMode("complete")
    .format("console")
    .option("truncate", "false")
    .start()
)

True


In [0]:
# Render the per-window, per-action counts as a table in the notebook.
# NOTE(review): action_count_df is a streaming DataFrame; presumably the notebook's
# display() starts its own streaming query to render it — confirm for this environment.
display(action_count_df)

window,action,count
"List(2018-03-02T10:00:00.000+0000, 2018-03-02T10:10:00.000+0000)",open,4
"List(2018-03-02T11:00:00.000+0000, 2018-03-02T11:10:00.000+0000)",crash,3
"List(2018-03-02T10:10:00.000+0000, 2018-03-02T10:20:00.000+0000)",open,1
"List(2018-03-02T10:00:00.000+0000, 2018-03-02T10:10:00.000+0000)",close,3
"List(2018-03-02T11:10:00.000+0000, 2018-03-02T11:20:00.000+0000)",swipe,3


In [0]:
# Question 7:

# The results would change once the new file is added: the stream picks up the new file, processes its actions/timestamps, and the output then includes the updated counts for the affected time windows.

In [0]:
# Stop the console streaming query that was started by writeStream...start().
mobile_console_sq.stop()

In [0]:
# Question 9:

# Data Source: JSON file stream. Reads the JSON files as they're added to the folder.

# Output Mode: Complete. Every time a new batch is processed, the entire result table is output, not just the changes.

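# Trigger Type: Default (no trigger is specified). Processes data in small micro-batches, starting each one as soon as the previous batch has finished; the 10-minute window is the aggregation window, not a trigger interval.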

# Data Sink: Console. Results of the streaming job are output in the console.

In [0]:
# Question 10:

# Reads JSON data from the mobile folder, parses it using the schema, and aggregates the data in 10-minute windows. Groups the data by window and the action field and counts the number of times each action occurs in each window. The results are then output to the console in complete mode.