In [1]:
#%pip install pyspark==3.5.1 pandas pyarrow numpy jupyter ipykernel

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Homework13") \
    .config("spark.sql.shuffle.partitions", "8") \
    .getOrCreate()

sc = spark.sparkContext

In [4]:
# this code is to estimate the value of pi
# we will use the Monte Carlo method to estimate the value of pi
import sys
from random import random
from operator import add

n = 1000000 #number of points
def inside(partition):
    count = 0
    # randomly generates a point in the unit square (between 0 and 1)
    for _ in partition:
        x, y = random(), random()
        if x*x + y*y < 1:
            count += 1
    yield count

# we will create a parallelized RDD to count the number of points inside the unit circle
# if it is inside the unit circle, we will return 1, otherwise 0
count = sc.parallelize(range(n), 500).mapPartitions(inside).reduce(add)

# then we will calculate the value of pi
pi = 4 * count / n
print('The value of pi is estimated to be %f' % pi)



The value of pi is estimated to be 3.134000


In [8]:
# this code is to generate matrices to perform matrix multiplication
import numpy as np
sc.setLogLevel("INFO")

# use numpy to generate two matrices
# make assumption of the matrix size
m, n, p = 3, 6, 4
matrix_a = np.random.randint(1, 20, (m, n))
matrix_b = np.random.randint(1, 20, (n, p))

# convert the matrices to lists
A = [(i, k, matrix_a[i, k]) for i in range(m) for k in range(n)]
B = [(k, j, matrix_b[k, j]) for k in range(n) for j in range(p)]

# convert A and B to RDDs
rddA = sc.parallelize(A)
rddB = sc.parallelize(B)

print(rddA.collect())
print(rddB.collect())

# perform matrix multiplication
# in this step, we will first map the elements of A and B 
# as for A, we will map the element to (k, (i, matrix_a[i, k]))
# as for B, we will map the element to (k, (j, matrix_b[k, j]))
# then we will join the two RDDs and do the multiplication
# we will use the reduceByKey to sum the results
# after that, we can get the matrix multiplication result
rddA2 = rddA.map(lambda x: (x[1], (x[0], x[2])))
rddB2 = rddB.map(lambda x: (x[0], (x[1], x[2])))

def multiply(x):
    k, ((i, a_ik), (j, b_kj)) = x
    return ((i, j), a_ik * b_kj)
rddC = rddA2.join(rddB2).map(multiply)
rddC_result = rddC.reduceByKey(lambda x, y: x + y)

print(rddC_result.sortByKey().collect())
for (i, j), value in rddC_result.sortByKey().collect():
    print(f"({i}, {j}) = {value}")

[(0, 0, np.int32(13)), (0, 1, np.int32(8)), (0, 2, np.int32(19)), (0, 3, np.int32(12)), (0, 4, np.int32(4)), (0, 5, np.int32(18)), (1, 0, np.int32(14)), (1, 1, np.int32(10)), (1, 2, np.int32(10)), (1, 3, np.int32(18)), (1, 4, np.int32(4)), (1, 5, np.int32(16)), (2, 0, np.int32(12)), (2, 1, np.int32(1)), (2, 2, np.int32(18)), (2, 3, np.int32(7)), (2, 4, np.int32(13)), (2, 5, np.int32(15))]
[(0, 0, np.int32(7)), (0, 1, np.int32(6)), (0, 2, np.int32(9)), (0, 3, np.int32(19)), (1, 0, np.int32(5)), (1, 1, np.int32(1)), (1, 2, np.int32(16)), (1, 3, np.int32(11)), (2, 0, np.int32(17)), (2, 1, np.int32(8)), (2, 2, np.int32(5)), (2, 3, np.int32(6)), (3, 0, np.int32(15)), (3, 1, np.int32(4)), (3, 2, np.int32(14)), (3, 3, np.int32(8)), (4, 0, np.int32(15)), (4, 1, np.int32(17)), (4, 2, np.int32(7)), (4, 3, np.int32(2)), (5, 0, np.int32(10)), (5, 1, np.int32(18)), (5, 2, np.int32(3)), (5, 3, np.int32(9))]
[((0, 0), np.int32(874)), ((0, 1), np.int32(678)), ((0, 2), np.int32(590)), ((0, 3), np.int32

In [9]:
# this code is to convert the code from scala to pyspark
# we will use the code from the class to do the conversion
# this is the code from RDD Transformations 1
rdd1 = sc.parallelize(["dog", "salmon", "rat"], 2)
rdd2 = rdd1.map(lambda x: (len(x), x))
print(rdd1.collect())
print(rdd1.take(0))
print(rdd1.take(1))
print(rdd1.take(2))
print(rdd2.collect())


['dog', 'salmon', 'rat']
[]
['dog']
['dog', 'salmon']
[(3, 'dog'), (6, 'salmon'), (3, 'rat')]


In [10]:
# this code is to convert the code from scala to pyspark
# we will use the code from the class to do the conversion
# this is the code from RDD Transformations 2
rdd1 = sc.parallelize([1, 2, 3], 2)
rdd2 = rdd1.map(lambda x: [x, x, x])
rdd3 = rdd1.flatMap(lambda x: [x, x, x])

print(rdd1.collect())
print(rdd2.collect())
print(rdd3.collect())

[1, 2, 3]
[[1, 1, 1], [2, 2, 2], [3, 3, 3]]
[1, 1, 1, 2, 2, 2, 3, 3, 3]


In [11]:
# this code is to convert the code from scala to pyspark
# we will use the code from the class to do the conversion
# this is the code from RDD Transformations 3
rdd1 = sc.parallelize(range(1,11), 1)
rdd2 = rdd1.filter(lambda x: (x % 2 == 0))

print(rdd1.collect())
print(rdd2.collect())


[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
[2, 4, 6, 8, 10]


In [12]:
# this code is to convert the code from scala to pyspark
# we will use the code from the class to do the conversion
# this is the code from RDD Transformations 4
rdd1 = sc.parallelize(["dog", "salmon", "rat"], 2)
rdd2 = sc.parallelize(["cat", "rabbit", "horse"], 2)
rdd3 = rdd1.map(lambda x: (len(x), x))
rdd4 = rdd2.map(lambda x: (len(x), x))
rdd5 = rdd3.join(rdd4)

print(rdd3.collect())
print(rdd4.collect())
print(rdd5.collect())


[(3, 'dog'), (6, 'salmon'), (3, 'rat')]
[(3, 'cat'), (6, 'rabbit'), (5, 'horse')]
[(6, ('salmon', 'rabbit')), (3, ('dog', 'cat')), (3, ('rat', 'cat'))]


In [14]:
# this code is to convert the code from scala to pyspark
# we will use the code from the class to do the conversion
# this is the code from RDD Transformations 5
rdd1 = sc.parallelize(["dog", "salmon", "rat"], 2)
rdd2 = rdd1.map(lambda x: (len(x), x))
rdd3 = rdd2.groupByKey()

for key, value in rdd3.collect():
    print(key, list(value))

6 ['salmon']
3 ['dog', 'rat']


In [15]:
# this code is to convert the code from scala to pyspark
# we will use the code from the class to do the conversion
# this is the code from RDD Transformations 6
rdd1 = sc.parallelize(["dog", "salmon", "rat"], 2)
rdd2 = rdd1.map(lambda x: (len(x), x))
rdd3 = rdd2.reduceByKey(lambda x, y: x + '#' + y)

print(rdd2.collect())
print(rdd3.collect())


[(3, 'dog'), (6, 'salmon'), (3, 'rat')]
[(6, 'salmon'), (3, 'dog#rat')]


In [16]:
# this code is to join two RDDS, one is large and one is much smaller
# the large RDD used is StackOverflow RDD
# first, read the StackOverflow RDD in S3 bucket
from csv import reader
import time

# read the StackOverflow RDD
file_path = "H:/DE/DE Lecture/survey_results_public.csv"
stack_overflow = spark.read.csv(file_path, header = True)

# get the Respondent and Country column
filtered_stackoverflow = stack_overflow.select("ResponseId", "Country")
stackoverflow_rdd = filtered_stackoverflow.rdd.map(lambda row: (row.ResponseId, row.Country))

# print the first 10 elements of the StackOverflow RDD to check if it is correctly read
print(stackoverflow_rdd.take(10))

# create a much smaller RDD
rdd1 = sc.parallelize([('1', 'Canada'), ('2', 'United Kingdom of Great Britain and Northern Ireland'), ('3', 'China'), 
('4', 'Egypt'), ('5', 'Spain'), ('6', 'United States of America'), ('7', 'Germany'), ('8', 'Brazil')])

# join the two RDDs using traditional join method
rdd_join = stackoverflow_rdd.join(rdd1)
start_time = time.time()
count = rdd_join.count()
time_duration = time.time() - start_time

print(rdd_join.sortByKey().take(10))
print(f"time_duration: {time_duration}")

# join the two RDDs using broadcast join method
# broadcast the small RDD to all the nodes
broadcast_rdd1 = sc.broadcast(dict(rdd1.collect()))
# get the broadcasted RDD from the executor
# use map join method to join the two RDDs
start_time = time.time()
# use map join method to join the two RDDs
def map(record):
    ResponseId, Country = record
    if ResponseId in broadcast_rdd1.value:
        return (ResponseId, (Country, broadcast_rdd1.value[ResponseId]))
    else:
        return (ResponseId, None)

rdd_broadcast_join = stackoverflow_rdd.map(map).filter(lambda x: x[1] is not None)
count = rdd_broadcast_join.count()
time_duration = time.time() - start_time

print(rdd_broadcast_join.take(10))
print(f"time_duration: {time_duration}")


[('1', 'United States of America'), ('2', 'United Kingdom of Great Britain and Northern Ireland'), ('3', 'United Kingdom of Great Britain and Northern Ireland'), ('4', 'Canada'), ('5', 'Norway'), ('6', 'United States of America'), ('7', 'United States of America'), ('8', 'Uzbekistan'), ('9', 'United Kingdom of Great Britain and Northern Ireland'), ('10', 'Serbia')]
[('1', ('United States of America', 'Canada')), ('2', ('United Kingdom of Great Britain and Northern Ireland', 'United Kingdom of Great Britain and Northern Ireland')), ('3', ('United Kingdom of Great Britain and Northern Ireland', 'China')), ('4', ('Canada', 'Egypt')), ('5', ('Norway', 'Spain')), ('6', ('United States of America', 'United States of America')), ('7', ('United States of America', 'Germany')), ('8', ('Uzbekistan', 'Brazil'))]
time_duration: 51.844245195388794
[('1', ('United States of America', 'Canada')), ('2', ('United Kingdom of Great Britain and Northern Ireland', 'United Kingdom of Great Britain and North

In [17]:
# this code is to aggregate speficic metrics from the StackOverflow RDD
# we will aggregate the country metrics
# read the StackOverflow RDD
file_path = "H:/DE/DE Lecture/survey_results_public.csv"
stack_overflow = spark.read.csv(file_path, header = True)

# get the Respondent and Country column
filtered_stackoverflow = stack_overflow.select("ResponseId", "Country")
stackoverflow_rdd = filtered_stackoverflow.rdd.map(lambda row: (row.ResponseId, row.Country))

# print the first 10 elements of the StackOverflow RDD to check if it is correctly read
print(stackoverflow_rdd.take(10))

# use traditional method to aggregate the country metrics
count = stackoverflow_rdd.filter(lambda x: x[1] == "Singapore").count()
print(f"The number of Singapore respondents is {count}")

# use the accumulator to aggregate the country metrics
accumulator = sc.accumulator(0)
stackoverflow_rdd.foreach(lambda x: accumulator.add(1) if x[1] == "Singapore" else None)
print(f"The number of Singapore respondents is {accumulator.value}")

# Based on the result, it can be concluded that the accumulator method matches the traditional method
# It means that the accumulator method can also generate the expected result


[('1', 'United States of America'), ('2', 'United Kingdom of Great Britain and Northern Ireland'), ('3', 'United Kingdom of Great Britain and Northern Ireland'), ('4', 'Canada'), ('5', 'Norway'), ('6', 'United States of America'), ('7', 'United States of America'), ('8', 'Uzbekistan'), ('9', 'United Kingdom of Great Britain and Northern Ireland'), ('10', 'Serbia')]
The number of Singapore respondents is 177
The number of Singapore respondents is 177
