In [1]:
# Importing Spark session object
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse, via getOrCreate) the SparkSession for this notebook:
# Hive support enabled, application name 'Demo', master set to 'yarn'
spark = (
    SparkSession.builder
    .enableHiveSupport()
    .appName('Demo')
    .master('yarn')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/26 01:16:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Set the number of shuffle partitions to 2 for this session, overriding the default value
spark.conf.set('spark.sql.shuffle.partitions', '2')

### Spark Streaming reading messages from port 9100

In [4]:
# Pipes the output of tail_api.sh into nc, which listens (-l) and keeps
# listening (-k) on this host's fully qualified name (`hostname -f`), port 9100.
# This command is meant to be run from a console, where the content of the file
# cta_api_dump.csv is streamed to the port.
# NOTE(review): the recorded run of this cell failed with "Can't open tail_api.sh"
# and was interrupted with ^C — presumably the script was not in the working
# directory; confirm its location before re-running.
!sh tail_api.sh|nc -lk `hostname -f` 9100

sh: 0: Can't open tail_api.sh
^C


In [5]:
# Import the socket module and look up this machine's host name.
# The bare `hostname` expression on the last line displays the value as the cell output.
import socket
hostname = socket.gethostname()
hostname

'bigdata-project'

In [6]:
# Create a streaming DataFrame from the socket source, reading the messages
# sent to port 9100 on this host
api_messages = (
    spark.readStream
    .format("socket")
    .option("host", hostname)
    .option("port", 9100)
    .load()
)

24/02/26 01:17:50 WARN TextSocketSourceProvider: The socket source should not be used for production applications! It does not support recovery.


In [7]:
# Check that api_messages is a streaming DataFrame rather than a static one
api_messages.isStreaming

True

In [8]:
# Print the schema of the stream; the recorded output shows a single string column named `value`
api_messages.printSchema()

root
 |-- value: string (nullable = true)



In [9]:
# Start a streaming query that writes the rows of api_messages to the console
# in "append" mode, triggered every 10 seconds; truncate=false keeps the full
# message text visible. The cell's value is the started StreamingQuery.
(
    api_messages.writeStream
    .outputMode("append")
    .format("console")
    .option('truncate', 'false')
    .trigger(processingTime='10 seconds')
    .start()
)

24/02/26 01:17:59 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-82610cac-94bb-4e1f-887b-f3ee3e278837. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/02/26 01:17:59 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.query.StreamingQuery at 0x7f19c072e790>

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+-----+
|value|
+-----+
+-----+

-------------------------------------------
Batch: 1
-------------------------------------------
+---------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                  |
+---------------------------------------------------------------------------------------------------------------------------------------+
|red,811,30173,Howard,40190,Sox-35th,2024-02-25T19:13:10,2024-02-25T19:15:10,0,41.81036,-87.63101                                       |
|red,813,30089,95th/Dan Ryan,41220,Fullerton,2024-02-25T19:13:46,2024-02-25T19:14:46,0,41.92916,-87.65298                               |
|red,815,30173,Howard,40540,Wilson,2024-02-25T19:13:53,2024-02-25T19:15:53,0,41.964

In [11]:
# Importing required libraries
from pyspark.sql.functions import split, count, lit

# Define route_count: take the first comma-separated field of each message as
# the route color and count the messages per color. This cell only defines the
# aggregation; the write-stream cells that follow send its results to the console.
route_count = (
    api_messages
    .select(split('value', ',')[0].alias('route_color'))
    .groupBy('route_color')
    .agg(count(lit(1)).alias('count'))
)

In [12]:
# Using "complete" mode to print the results of route_count to the console,
# triggered every 10 seconds
(
    route_count.writeStream
    .outputMode("complete")
    .format("console")
    .option('truncate', 'false')
    .trigger(processingTime='10 seconds')
    .start()
)

24/02/26 01:18:15 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-57ab24c9-c167-45a5-9601-b756eeceaabf. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/02/26 01:18:15 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.query.StreamingQuery at 0x7f1998d33850>

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+-----------+-----+
|route_color|count|
+-----------+-----+
+-----------+-----+

-------------------------------------------
Batch: 2
-------------------------------------------
+-------------------------------------------------------------------------------------------------------------+
|value                                                                                                        |
+-------------------------------------------------------------------------------------------------------------+
|red,810,30173,Howard,41660,Lake,2024-02-25T19:17:52,2024-02-25T19:18:52,0,41.88321,-87.62775                 |
|red,812,30089,95th/Dan Ryan,40190,Sox-35th,2024-02-25T19:17:45,2024-02-25T19:18:45,0,41.84691,-87.63114      |
|red,814,30089,95th/Dan Ryan,40880,Thorndale,2024-02-25T19:17:55,2024-02-25T19:18:55,0,41.99196,-87.65912     |
|red,816,30089,95th/Dan Ryan,41420,Addison,2024-02-25

In [None]:
# Using "update" mode to print the results of route_count to the console,
# triggered every 10 seconds
(
    route_count.writeStream
    .outputMode("update")
    .format("console")
    .option('truncate', 'false')
    .trigger(processingTime='10 seconds')
    .start()
)