# Setup
* Start weblog server
* Find spark instance

In [1]:
WEBLOG_PORT = 7777  # TCP port the weblog server listens on

import os  # retained from the original cell; the launcher below no longer needs it
import socket
import subprocess

import findspark

# Locate the Spark installation; `spark` holds its path (printed at the end).
findspark.init()
spark = findspark.find()

try:
    # Probe the port: binding only succeeds when nothing is listening on it yet.
    socket.create_server(('', WEBLOG_PORT)).close()
except OSError:
    # Port is taken, which this notebook treats as "the server is already up".
    print("Server already running")
else:
    # Start the weblog server detached from the kernel. Kept out of the `try`
    # so a failed launch (e.g. no `python` on PATH) raises here instead of being
    # reported as "already running" or silently ignored.
    subprocess.Popen(
        ["python", "logsender/server.py", "logsender/web.log", str(WEBLOG_PORT)],
        stdin=subprocess.DEVNULL,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        start_new_session=True,  # own session, so it outlives the notebook's terminal
    )
    print(f"Server started @ localhost:{WEBLOG_PORT}")
print("Spark is @", spark)

Server already running
Spark is @ /home/joaooliv/.local/lib/python3.10/site-packages/pyspark


* Set up the context: run the cells below before each exercise to reset the context, because every exercise stops the stream when it finishes.

In [12]:
# Stop the streaming context left over from a previous exercise, if there is one.
# On a fresh kernel `stream` does not exist yet, so the guard keeps
# "Restart & Run All" from failing with a NameError here.
if "stream" in globals():
    stream.stop()

In [13]:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# Fresh Spark context on the local master "local[4]"; only FATAL messages are logged.
sc = SparkContext(master="local[4]", appName="WebLogProcessor")
sc.setLogLevel("FATAL")

# Streaming context with a batch duration of 1 second, checkpointing into "checkpoints".
stream = StreamingContext(sc, batchDuration=1)
stream.checkpoint("checkpoints")

# Stream of text lines read from the weblog server's socket.
log = stream.socketTextStream(hostname="localhost", port=WEBLOG_PORT)

----
# Exercises

Do the following exercises:

Every 3 seconds, perform each of the following:

In [4]:
# Seconds between two printed results; passed as the slide interval of the
# windowed operations in the exercises.
PRINT_PERIOD = 3

1. Dump the number of requests in the last 10 seconds;

In [5]:
WINDOW_SIZE = 10  # window length, in seconds

# Every PRINT_PERIOD seconds, count the log lines received during the last
# WINDOW_SIZE seconds and print that count.
(
    log.window(WINDOW_SIZE, PRINT_PERIOD)
    .count()
    .map(lambda c: f"{c} processed in the last {WINDOW_SIZE} seconds")
    .pprint()
)

stream.start()
try:
    # Let the job run for at most 10 seconds.
    stream.awaitTermination(10)
finally:
    # Stop the stream even if the cell is interrupted; otherwise it keeps
    # running and the context cannot be recreated for the next exercise.
    stream.stop()

                                                                                

-------------------------------------------
Time: 2023-03-06 23:35:41
-------------------------------------------
278 processed in the last 10 seconds



                                                                                

-------------------------------------------
Time: 2023-03-06 23:35:44
-------------------------------------------
498 processed in the last 10 seconds



                                                                                

-------------------------------------------
Time: 2023-03-06 23:35:47
-------------------------------------------
1181 processed in the last 10 seconds



Exception in thread "receiver-supervisor-future-0" java.lang.InterruptedException: sleep interrupted
	at java.base/java.lang.Thread.sleep(Native Method)
	at org.apache.spark.streaming.receiver.ReceiverSupervisor.$anonfun$restartReceiver$1(ReceiverSupervisor.scala:196)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at scala.concurrent.Future$.$anonfun$apply$1(Future.scala:659)
	at scala.util.Success.$anonfun$map$1(Try.scala:255)
	at scala.util.Success.map(Try.scala:213)
	at scala.concurrent.Future.$anonfun$map$1(Future.scala:292)
	at scala.concurrent.impl.Promise.liftedTree1$1(Promise.scala:33)
	at scala.concurrent.impl.Promise.$anonfun$transform$1(Promise.scala:33)
	at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:64)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.ja

2. Dump the number of requests in the last 10 seconds, only if they total more than 100;

In [8]:
WINDOW_SIZE = 10   # window length, in seconds
LOWER_BOUND = 100  # counts at or below this value are not printed

# Every PRINT_PERIOD seconds, count the log lines received during the last
# WINDOW_SIZE seconds and print the count only when it exceeds LOWER_BOUND.
(
    log.window(WINDOW_SIZE, PRINT_PERIOD)
    .count()
    .filter(lambda c: c > LOWER_BOUND)
    .map(lambda c: f"{c} processed in the last {WINDOW_SIZE} seconds")
    .pprint()
)

stream.start()
try:
    # Let the job run for at most 10 seconds.
    stream.awaitTermination(10)
finally:
    # Stop the stream even if the cell is interrupted; otherwise it keeps
    # running and the context cannot be recreated for the next exercise.
    stream.stop()

                                                                                

-------------------------------------------
Time: 2023-03-06 23:36:04
-------------------------------------------
273 processed in the last 10 seconds



                                                                                

-------------------------------------------
Time: 2023-03-06 23:36:07
-------------------------------------------
550 processed in the last 10 seconds



                                                                                

-------------------------------------------
Time: 2023-03-06 23:36:10
-------------------------------------------
1094 processed in the last 10 seconds



Exception in thread "receiver-supervisor-future-0" java.lang.InterruptedException: sleep interrupted
	at java.base/java.lang.Thread.sleep(Native Method)
	at org.apache.spark.streaming.receiver.ReceiverSupervisor.$anonfun$restartReceiver$1(ReceiverSupervisor.scala:196)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at scala.concurrent.Future$.$anonfun$apply$1(Future.scala:659)
	at scala.util.Success.$anonfun$map$1(Try.scala:255)
	at scala.util.Success.map(Try.scala:213)
	at scala.concurrent.Future.$anonfun$map$1(Future.scala:292)
	at scala.concurrent.impl.Promise.liftedTree1$1(Promise.scala:33)
	at scala.concurrent.impl.Promise.$anonfun$transform$1(Promise.scala:33)
	at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:64)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.ja

3. Dump the number of requests in the last 10 seconds, if there is an IP address with more than 100 requests;

In [11]:
# Window length, in seconds.
WINDOW_SIZE = 10
# An IP is reported only when its request count in the window exceeds this value.
LOWER_BOUND = 100

def extractIP(log_entry: str) -> str:
    """Return the second space-separated field of a log line.

    This notebook treats that field as the client IP address.

    Raises:
        IndexError: if the line contains no single-space separator.
    """
    fields = log_entry.split(" ")
    return fields[1]

# Every PRINT_PERIOD seconds, count requests per IP over the last WINDOW_SIZE
# seconds and print a warning for each IP whose count exceeds LOWER_BOUND.
# The lambda parameter is named `ip_count` so it no longer shadows the builtin `tuple`.
(
    log.map(extractIP)
    .countByValueAndWindow(WINDOW_SIZE, PRINT_PERIOD, 1)
    .filter(lambda ip_count: ip_count[1] > LOWER_BOUND)
    .map(lambda ip_count: f"WARNING! {ip_count[0]} made {ip_count[1]} requests in the last {WINDOW_SIZE} seconds")
    .pprint()
)

stream.start()
try:
    # Let the job run for at most 10 seconds.
    stream.awaitTermination(10)
finally:
    # Stop the stream even if the cell is interrupted; otherwise it keeps
    # running and the context cannot be recreated for the next exercise.
    stream.stop()

                                                                                

-------------------------------------------
Time: 2023-03-06 23:36:26
-------------------------------------------



                                                                                

-------------------------------------------
Time: 2023-03-06 23:36:29
-------------------------------------------



                                                                                

-------------------------------------------
Time: 2023-03-06 23:36:32
-------------------------------------------



Exception in thread "receiver-supervisor-future-0" java.lang.InterruptedException: sleep interrupted
	at java.base/java.lang.Thread.sleep(Native Method)
	at org.apache.spark.streaming.receiver.ReceiverSupervisor.$anonfun$restartReceiver$1(ReceiverSupervisor.scala:196)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at scala.concurrent.Future$.$anonfun$apply$1(Future.scala:659)
	at scala.util.Success.$anonfun$map$1(Try.scala:255)
	at scala.util.Success.map(Try.scala:213)
	at scala.concurrent.Future.$anonfun$map$1(Future.scala:292)
	at scala.concurrent.impl.Promise.liftedTree1$1(Promise.scala:33)
	at scala.concurrent.impl.Promise.$anonfun$transform$1(Promise.scala:33)
	at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:64)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.ja

4. Dump the proportion of IPv4 vs IPv6 requests in the last 20 seconds.

In [14]:
from ipaddress import ip_address, IPv4Address


# Window length, in seconds, for the IPv4/IPv6 counts of this exercise.
WINDOW_SIZE = 20

def extractIPType(log_entry: str) -> str:
    """Classify the address in a log line as "IPv4" or "IPv6".

    The address is taken from the second space-separated field of the line.

    Raises:
        IndexError: if the line contains no single-space separator.
        ValueError: if that field is not a valid IP address.
    """
    address = ip_address(log_entry.split(" ")[1])
    if isinstance(address, IPv4Address):
        return "IPv4"
    return "IPv6"

# def extractIP(log_entry: str) -> str:
#     return log_entry.split(" ")[1]

# Every PRINT_PERIOD seconds, count the requests of each IP version seen during
# the last WINDOW_SIZE seconds and print one line per version.
# The lambda parameter is named `type_count` so it no longer shadows the builtin `tuple`.
(
    log.map(extractIPType)
    .countByValueAndWindow(WINDOW_SIZE, PRINT_PERIOD, 1)
    .map(lambda type_count: f"Processed {type_count[1]} {type_count[0]} addresses in the last {WINDOW_SIZE} seconds")
    .pprint()
)

stream.start()
try:
    # Let the job run for at most 10 seconds.
    stream.awaitTermination(10)
finally:
    # Stop the stream even if the cell is interrupted; otherwise it keeps
    # running and the context cannot be recreated for the next exercise.
    stream.stop()

                                                                                

-------------------------------------------
Time: 2023-03-06 23:36:52
-------------------------------------------
Processed 270 IPv4 addresses in the last 20 seconds
Processed 8 IPv6 addresses in the last 20 seconds



                                                                                

-------------------------------------------
Time: 2023-03-06 23:36:55
-------------------------------------------
Processed 601 IPv4 addresses in the last 20 seconds
Processed 25 IPv6 addresses in the last 20 seconds



                                                                                

-------------------------------------------
Time: 2023-03-06 23:36:58
-------------------------------------------
Processed 1058 IPv4 addresses in the last 20 seconds
Processed 36 IPv6 addresses in the last 20 seconds



Exception in thread "receiver-supervisor-future-0" java.lang.InterruptedException: sleep interrupted
	at java.base/java.lang.Thread.sleep(Native Method)
	at org.apache.spark.streaming.receiver.ReceiverSupervisor.$anonfun$restartReceiver$1(ReceiverSupervisor.scala:196)
	at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)
	at scala.concurrent.Future$.$anonfun$apply$1(Future.scala:659)
	at scala.util.Success.$anonfun$map$1(Try.scala:255)
	at scala.util.Success.map(Try.scala:213)
	at scala.concurrent.Future.$anonfun$map$1(Future.scala:292)
	at scala.concurrent.impl.Promise.liftedTree1$1(Promise.scala:33)
	at scala.concurrent.impl.Promise.$anonfun$transform$1(Promise.scala:33)
	at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:64)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.ja