# Setup

In [None]:
# imports

import findspark

# findspark.init() locates the Spark installation and adds its Python
# libraries to sys.path, so it has to run before the first pyspark import.
findspark.init()

import pyspark
from pyspark.streaming import StreamingContext

# last expression of the cell: shows which Spark home was picked up
findspark.find()

In [None]:
# dropping context
#
# Re-running this cell tears down whatever a previous run left behind.
# The streaming context is stopped first, while the SparkContext it runs on
# is still alive; the sc.stop() that follows is then only a safety net.

if 'ssc' in locals():
    ssc.stop()
if 'sc' in locals():
    sc.stop()

# setting up context

sc = pyspark.SparkContext("local[*]")  # local mode, one worker thread per core
sc.setLogLevel("FATAL")                # keep Spark's logging out of the cell output
ssc = StreamingContext(sc, 1) # 1 sec mini-batches

# NOTE(review): the windowed operations used in the exercises
# (countByValueAndWindow / reduceByKeyAndWindow) presumably rely on this
# checkpoint directory -- confirm against the PySpark docs before removing it.
ssc.checkpoint(directory="spark_checkpoints")

---
# Exercises

## Exercise 1

In a denial-of-service event, it is important to identify the source IPs that might be attacking the system by issuing a large number of requests.

Write a program to find the source IPs that have made more than 50 requests in the last 10 seconds -- dump this information every 5 seconds.


In [None]:
lines = ssc.socketTextStream("localhost", 7777)

# Second space-separated field of every non-empty line.
# NOTE(review): presumably the source IP, going by the exercise statement.
sources = (
    lines
    .filter(lambda text: len(text) > 0)
    .map(lambda text: text.split(" ")[1])
)

# Occurrences of each value over a 10 s window, recomputed every 5 s.
hits_per_source = sources.countByValueAndWindow(10, 5)

# Keep only the (value, count) pairs whose count exceeds 50.
heavy_hitters = hits_per_source.filter(lambda pair: pair[1] > 50)
heavy_hitters.pprint()

ssc.start()
ssc.awaitTermination(60)  # let the job run for up to 60 seconds
# NOTE(review): per the PySpark docs, stop() with default arguments also stops
# the SparkContext -- re-run the setup cell before starting another exercise.
ssc.stop()

## Exercise 2

#### a)
Write a program to dump the number of requests, the minimum processing time, and the maximum processing time of the requests in the last 10 seconds, **for all** source IPs that performed more than 100 requests -- dump this information every 5 seconds.

In [None]:
lines = ssc.socketTextStream("localhost", 7777)

# One windowed aggregation per key (field 1 of the line) instead of two
# windowed streams joined back together.
# Every request starts as (1, t, t) with t = field 5 parsed as a float; merging
# two partial results adds the counts and keeps the smaller / larger time.
# Result: (key, (request_count, min_time, max_time)) for the last 10 s,
# refreshed every 5 s -- the same order the exercise statement lists them in.
metrics = (
    lines.filter(lambda line: len(line) > 0)
    .map(lambda line: line.split(" "))
    .map(lambda values: (values[1], (1, float(values[5]), float(values[5]))))
    .reduceByKeyAndWindow(
        lambda a, b: (a[0] + b[0], min(a[1], b[1]), max(a[2], b[2])),
        None,  # no inverse function: each window is recomputed from its batches
        10,
        5,
    )
)

# Only the keys with more than 100 requests in the window are reported.
more_than_100 = metrics.filter(lambda kv: kv[1][0] > 100)
more_than_100.pprint()

ssc.start()
ssc.awaitTermination(50)  # let the job run for up to 50 seconds
# NOTE(review): per the PySpark docs, stop() with default arguments also stops
# the SparkContext -- re-run the setup cell before starting another exercise.
ssc.stop()

#### b)

Write a program to dump the number of requests, the minimum processing time, and the maximum processing time of the requests in the last 10 seconds, **only if at least one** source IP has performed more than 100 requests -- dump this information every 5 seconds.

In [None]:
lines = ssc.socketTextStream("localhost", 7777)

# Per-key (request_count, min_time, max_time) over the last 10 s, refreshed
# every 5 s. The key is field 1 of the line, the time is field 5 as a float.
metrics = (
    lines.filter(lambda line: len(line) > 0)
    .map(lambda line: line.split(" "))
    .map(lambda values: (values[1], (1, float(values[5]), float(values[5]))))
    .reduceByKeyAndWindow(
        lambda a, b: (a[0] + b[0], min(a[1], b[1]), max(a[2], b[2])),
        None,  # no inverse function: each window is recomputed from its batches
        10,
        5,
    )
)

# Fold the per-key rows of each window into a single window-wide row:
#   (total_requests, min_time, max_time, largest_per_key_count)
# The 4th slot exists only to decide whether the row should be printed.
overall = (
    metrics.map(lambda kv: (kv[1][0], kv[1][1], kv[1][2], kv[1][0]))
    .reduce(
        lambda a, b: (
            a[0] + b[0],
            min(a[1], b[1]),
            max(a[2], b[2]),
            max(a[3], b[3]),
        )
    )
)

# Report the window-wide figures only if at least one key made more than
# 100 requests; the helper slot is dropped from the printed row.
more_than_100 = overall.filter(lambda row: row[3] > 100).map(lambda row: row[:3])
more_than_100.pprint()

ssc.start()
ssc.awaitTermination(50)  # let the job run for up to 50 seconds
# NOTE(review): per the PySpark docs, stop() with default arguments also stops
# the SparkContext -- re-run the setup cell before starting another exercise.
ssc.stop()

## Exercise 3
Write a program to dump the IP sources that deviate most from the average in terms of the number of requests made in the last 30 seconds - dump this information every 5 seconds.

In [None]:
lines = ssc.socketTextStream("localhost", 7777)

# (key, number of requests) over the last 30 s, refreshed every 5 s.
# The key is field 1 of each non-empty line.
request_per_ip = (
    lines.filter(lambda line: len(line) > 0)
    .map(lambda line: line.split(" ")[1])
    .countByValueAndWindow(30, 5)
)


def rank_by_deviation(rdd):
    """Order one window's (key, count) pairs by distance from the mean count.

    Returns an RDD of (key, count, abs(count - mean)) sorted with the largest
    deviation first. An empty window is returned unchanged.
    """
    # Sum of counts and number of keys in one pass; aggregate() copes with an
    # empty RDD, where reduce() would raise.
    total, n_keys = rdd.values().aggregate(
        (0, 0),
        lambda acc, count: (acc[0] + count, acc[1] + 1),
        lambda left, right: (left[0] + right[0], left[1] + right[1]),
    )
    if n_keys == 0:
        return rdd
    mean = total / n_keys
    return (
        rdd.map(lambda kv: (kv[0], kv[1], abs(kv[1] - mean)))
        .sortBy(lambda row: row[2], ascending=False)
    )


# pprint() shows the first rows of each window, i.e. the keys that deviate most.
request_per_ip.transform(rank_by_deviation).pprint()

ssc.start()
ssc.awaitTermination(50)  # let the job run for up to 50 seconds
# NOTE(review): per the PySpark docs, stop() with default arguments also stops
# the SparkContext -- re-run the setup cell before starting another exercise.
ssc.stop()

## Exercise 4

Run additional logsender servers for subsets of the logs (IPv4 and IPv6 logs), using the following commands.

```
!nohup python logsender/server.py logsender/webipv4.log 7778 > /dev/null 2> /dev/null &
!nohup python logsender/server.py logsender/webipv6.log 7779 > /dev/null 2> /dev/null &
```

Write a program that combines the two streams, dumping the number of requests made in the last 15 seconds - dump this information every 5 seconds.

## Exercise 5

Write a program that combines the two streams from the previous exercise and dumps the proportion of IPv4 vs IPv6 requests in the last 20 seconds - dump this information every 5 seconds.
