# DSCI 417 - Homework 01

**Kyle Fowler**

In [0]:
import math
import pyspark
from pyspark.sql import SparkSession
from pyspark.mllib.random import RandomRDDs

## Problem 1: Terminology

1. Spark provides APIs for Python, Scala, Java, and R. But it was written in what language?
Scala
2. What is the type of object that represents the primary entry point for accessing Spark functionality?
SparkSession - the unified entry point to Spark; it wraps the SparkContext, which represents the connection to a Spark cluster
3. What is the type of object that provides tools for working with RDDs?
SparkContext
4. What does the acronym RDD stand for?
Resilient Distributed Dataset
5. When an RDD is created, it is broken into smaller pieces that can be distributed over the cluster. What
are these pieces called?
Partitions
6. Does a transformation or an action return a new RDD?
Transformations
7. What type of method triggers the evaluation of an RDD, a transformation or an action?
An Action
8. Is sample() a transformation or an action?
A Transformation
9. Is take() a transformation or an action?
An Action
10. Is map() a transformation or an action?
A Transformation
11. Is reduce() a transformation or an action?
An Action
12. What data type is returned by the collect() method?
List
13. What is the term used for the node that manages the other nodes in a cluster?
Driver Node
14. What is the term used for the other nodes in a cluster?
Worker Nodes
15. What is the term used for the process that manages tasks in a Spark application?
Driver
16. What is the term used for the processes that perform tasks in a Spark application?
Executors


## Problem 2: Working with a Numerical RDD

In [0]:
# `sc` is only predefined on platforms that inject it (e.g. Databricks).
# getOrCreate() returns the already-running session when there is one, so this
# is harmless there and makes the notebook runnable from a fresh kernel elsewhere.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

# 1.2 million values generated by RandomRDDs.uniformRDD; the fixed seed keeps
# the summary statistics below reproducible.
random_rdd = RandomRDDs.uniformRDD(sc, size=1200000, seed=1)

print('Sum:     ', random_rdd.sum())
print('Mean:    ', random_rdd.mean())
print('Std Dev: ', random_rdd.stdev())
print('Minimum: ', random_rdd.min())
print('Maximum: ', random_rdd.max())

Sum:      599731.0925959006
Mean:     0.4997759104965818
Std Dev:  0.2887300658502359
Minimum:  1.0351479373671424e-07
Maximum:  0.9999990237273222


In [0]:
# glom() turns every partition into a single list, so the length of each list
# is the number of elements stored in that partition.  The number of partitions
# is reported from len(collected_rrd), so no separate getNumPartitions() call
# is needed here (its result was previously discarded).
partitions_rdd = random_rdd.glom()
collected_rrd = partitions_rdd.map(len).collect()


In [0]:
# collected_rrd holds one size per partition, so its length is the partition count.
partition_count = len(collected_rrd)
print('Number of Partitions: ', partition_count)
print('Size of Partitions:   ', collected_rrd)

Number of Partitions:  8
Size of Partitions:    [150000, 150000, 150000, 150000, 150000, 150000, 150000, 150000]


## Problem 3: Transformations

In [0]:
# Multiply every element of random_rdd by 10.
scaled_rdd = random_rdd.map(lambda value: value * 10)

# Each entry pairs the printed label with the RDD action that produces the value;
# the action is only called when its line is printed.
scaled_summary = [
    ('Sum:     ', scaled_rdd.sum),
    ('Mean:    ', scaled_rdd.mean),
    ('Std Dev: ', scaled_rdd.stdev),
    ('Minimum: ', scaled_rdd.min),
    ('Maximum: ', scaled_rdd.max),
]
for label, action in scaled_summary:
    print(label, action())

Sum:      5997310.925958995
Mean:     4.997759104965838
Std Dev:  2.887300658502357
Minimum:  1.0351479373671424e-06
Maximum:  9.999990237273222


In [0]:
# Natural logarithm of every scaled value.
log_rdd = scaled_rdd.map(lambda value: math.log(value))

# Each entry pairs the printed label with the RDD action that produces the value;
# the action is only called when its line is printed.
log_summary = [
    ('Sum:     ', log_rdd.sum),
    ('Mean:    ', log_rdd.mean),
    ('Std Dev: ', log_rdd.stdev),
    ('Minimum: ', log_rdd.min),
    ('Maximum: ', log_rdd.max),
]
for label, action in log_summary:
    print(label, action())

Sum:      1561638.08183098
Mean:     1.3013650681924729
Std Dev:  1.0021344727804842
Minimum:  -13.780966206806882
Maximum:  2.302584116720891


## Problem 4: Calculating SSE

In [0]:
# Load the pairs file as an RDD of text lines.  The path is absolute (leading
# slash), matching how the NASA log file is loaded later in this notebook, so
# it does not depend on the file system's current working directory.
# minPartitions is left at its default (None).
pairs_raw = sc.textFile("/FileStore/tables/pairs_data.txt")

# Number of lines in the file (last expression, so the notebook displays it).
pairs_raw.count()

12743548

In [0]:
# Show the first five raw lines, numbered from 1.
raw_preview = pairs_raw.take(5)
for i, element in enumerate(raw_preview, start=1):
    print(f"{i}. {element}")

1. 12.3 12.1
2. 9.1 8.7
3. 9.3 9.9
4. 8.5 8.5
5. 11.2 10.8


In [0]:
def process_line(row):
    """Convert one whitespace-separated text line into a pair of floats.

    Only the first two tokens of the line are used; any further tokens are
    ignored.

    Args:
        row: A string containing at least two numeric tokens.

    Returns:
        A tuple ``(first, second)`` with both tokens converted to float.

    Raises:
        IndexError: If the line holds fewer than two tokens.
        ValueError: If a token cannot be converted to float.
    """
    fields = row.split()
    first_value = float(fields[0])
    second_value = float(fields[1])
    return first_value, second_value

# Parse every raw line into a (float, float) tuple.
pairs = pairs_raw.map(process_line)

# Show the first five parsed pairs, numbered from 1.
pairs_preview = pairs.take(5)
for i, element in enumerate(pairs_preview, start=1):
    print(f"{i}. {element}")

1. (12.3, 12.1)
2. (9.1, 8.7)
3. (9.3, 9.9)
4. (8.5, 8.5)
5. (11.2, 10.8)


In [0]:
# Squared difference between the two values of every pair.
SSE_before_sum = pairs.map(lambda pair: (pair[0] - pair[1]) ** 2)

# Checking: show the first five squared differences, numbered from 1.
squared_error_preview = SSE_before_sum.take(5)
for i, element in enumerate(squared_error_preview, start=1):
    print(f"{i}. {element}")

# Sum of squared errors over the whole data set.
SSE = SSE_before_sum.sum()
print(SSE)

1. 0.040000000000000424
2. 0.16000000000000028
3. 0.3599999999999996
4. 0.0
5. 0.15999999999999887
4597380.190042952


## Problem 5: Calculating r-Squared

In [0]:
# Average of the first value of every pair.
mean = pairs.map(lambda pair: pair[0]).mean()
print(mean)

10.00013136059118


In [0]:
# Total sum of squares: squared deviation of each pair's first value from `mean`.
SST = pairs.map(lambda pair: (pair[0] - mean) ** 2).sum()
print(SST)

24980514.859974924


In [0]:
# r-squared is one minus the ratio of SSE to SST.
unexplained_share = SSE / SST
r2 = 1 - unexplained_share
print(r2)

0.815961351644953


## Problem 6: NASA Server Logs

In [0]:
# Load the server log as an RDD of text lines; minPartitions is left at its
# default (None).
nasa = sc.textFile("/FileStore/tables/NASA_server_logs_Aug_1995.txt")

# Number of log lines (last expression, so the notebook displays it).
nasa.count()

1569888

In [0]:
# Show the first five log lines, numbered from 1.
nasa_preview = nasa.take(5)
for i, element in enumerate(nasa_preview, start=1):
    print(f"{i}. {element}")

1. in24.inetnebr.com [01/Aug/1995:00:00:01] "GET /shuttle/missions/sts-68/news/sts-68-mcc-05.txt" 200 1839
2. uplherc.upl.com [01/Aug/1995:00:00:07] "GET /" 304 0
3. uplherc.upl.com [01/Aug/1995:00:00:08] "GET /images/ksclogo-medium.gif" 304 0
4. uplherc.upl.com [01/Aug/1995:00:00:08] "GET /images/MOSAIC-logosmall.gif" 304 0
5. uplherc.upl.com [01/Aug/1995:00:00:08] "GET /images/USA-logosmall.gif" 304 0


In [0]:
def extract_request_method(line):
    """Return the HTTP method named in one server-log line.

    Assumes the layout seen in the first rows of the log:
        host [timestamp] "METHOD /path" status bytes
    so the method is the first word inside the first pair of double quotes.

    Args:
        line: One line of the log as a string.

    Returns:
        The method token (e.g. 'GET'), or None when the line has no quoted
        request or the quoted request is empty.
    """
    quoted_parts = line.split('"')
    if len(quoted_parts) < 2:
        return None
    request_tokens = quoted_parts[1].split()
    return request_tokens[0] if request_tokens else None


# Tally lines by their parsed method in a single pass.  Testing `'GET' in line`
# would also match the letters "GET" (or "POST"/"HEAD") appearing in a host
# name or URL path, which can assign a line to the wrong method or count it
# more than once.
method_counts = nasa.map(extract_request_method).countByValue()

nasa_GET_requests = method_counts.get('GET', 0)
nasa_POST_requests = method_counts.get('POST', 0)
nasa_HEAD_requests = method_counts.get('HEAD', 0)

print('Number of GET requests: ', nasa_GET_requests)
print('Number of POST requests: ', nasa_POST_requests)
print('Number of HEAD requests: ', nasa_HEAD_requests)

Number of GET requests:  1565812
Number of POST requests:  111
Number of HEAD requests:  3965
