In [None]:
from pyspark.context import SparkContext
from pyspark.conf import SparkConf

# getOrCreate reuses an already running context, so re-running this cell in the
# notebook does not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(
    SparkConf().setMaster('local').setAppName('RDD_Practice')
)


### Creating an RDD

Two common methods:
* textFile (input: text file) 
* parallelize (input: a list)

In [None]:
# Let's create our first RDD using a csv of some public data
crimes_rdd = sc.textFile("Chicago-Crimes-2018.csv")
# Creates an RDD of strings (one line of the file = one element)

In [None]:
# investigate the type of the variable we created
print(type(crimes_rdd))
print(crimes_rdd)

In [None]:
# investigate the data in the RDD
crimes_rdd.take(3)

In [None]:
# Let's create another RDD using parallelize function
simple_list = ["This", "is", "a", "list"]
list_rdd = sc.parallelize(simple_list)

In [None]:
print(type(list_rdd))
print(list_rdd)

In [None]:
list_rdd.take(3)

### Operations
2 types of operations:
* Transformations
    * Returns 1 or more RDDs
    * Lazy - no immediate execution
* Actions
    * Returns non-RDD values
    * Eager - triggers evaluation

### Transformations

#### Map, flatMap

In [None]:
# you can use help to find some quick info on a function
help(list_rdd.map)

In [None]:
map_list_rdd = list_rdd.map(lambda x: (x, len(x)))
map_list_rdd.take(4)

# note: lambda functions are common in Spark RDDs. Read more in the link below, if unfamiliar with lambda:
# https://www.w3schools.com/python/python_lambda.asp

In [None]:
# it is possible to use regular functions as well. Useful for complex functions
# but generally this is too wordy and needless complexity

def give_length(x):
    """Pair ``x`` with its length, returned as an ``(x, len(x))`` tuple."""
    size = len(x)
    return x, size

list_rdd.map(give_length).take(4)

In [None]:
# flatMap flattens the output to a single level list

list_rdd.flatMap(lambda x: (x, len(x))).take(4) #.take(10)

In [None]:
#crimes_rdd.take(4) # Every row is a long string, hard to do data processing
crimes_rdd.map(lambda x: x.split('\t')).take(4) # Now we have some structure - a list of lists, where every sublist is a row and each index corresponds to a column

#### Filter

In [None]:
list_rdd.filter(lambda x: len(x)>2).take(4)

In [None]:
# transformations can be chained one after another

# notation: wrap the chain in (brackets) and put every transformation/action on its own line for readability

(list_rdd
 .map(lambda word: (len(word), word))
 .filter(lambda pair: pair[0] > 2)
 .take(4)
)

In [None]:
# keep only the crimes that led to an arrest (column 9 = Arrest, i.e. index 8 after the split)
(crimes_rdd
 .map(lambda line: line.split('\t'))
 .filter(lambda fields: fields[8] == "true")
 .take(4)
)

#### Union & intersection

<img src= 'https://i.ytimg.com/vi/sdflTUW6gHo/maxresdefault.jpg'>

In [None]:
# Let's create some more RDDs. These are called pair RDDs, as they are key-value pairs

first_rdd = sc.parallelize([
  (1, "Batman"),
  (2, "Superman"),
  (3, "Spiderman")
])

second_rdd = sc.parallelize([
  (3, "Spiderman"),
  (4, "Hulk"),
  (5, "Peppa Pig")
])

In [None]:
(first_rdd
 .union(second_rdd)
 .take(10)
)

In [None]:
(first_rdd
 .intersection(second_rdd)
 .take(10)
)

#### Distinct

In [None]:
# returns duplicate values for spiderman
(first_rdd
 .union(second_rdd)
 .take(10)
)

In [None]:
# returns distinct values
(first_rdd
 .union(second_rdd)
 .distinct()
 .take(10) 
)
# note: order not preserved

#### ByKey operations
* groupByKey
* reduceByKey
* sortByKey
* aggregateByKey

These operations expect pair RDDs _(key, value)_

In [None]:
# groupByKey yields (key, iterable of values) tuples: unpack each tuple and
# materialise the iterable with list() so its contents can be printed
for key, values in first_rdd.union(second_rdd).groupByKey().take(10):
    print(key, list(values))
  

GroupByKey vs ReduceByKey <br>
<img src='https://www.edureka.co/community/?qa=blob&qa_blobid=6565348686735863167'> <br>
<img src='https://www.edureka.co/community/?qa=blob&qa_blobid=8024890559746280233'>

_source: https://www.edureka.co/community/11996/groupbykey-vs-reducebykey-in-apache-spark_

reduceByKey is generally faster than groupByKey and is the preferred choice when the grouped values are going to be aggregated.

reduceByKey first performs the calculation (e.g. an aggregation) within each partition, so less data has to be moved between partitions.

Further info: https://stackoverflow.com/questions/24804619/how-does-spark-aggregate-function-aggregatebykey-work/24805905#24805905

In [None]:
# NB! aggregation must be commutative and associative (eg add, multiply). Average/standard deviation are not directly implementable
# NOTE: the tuple-building lambda below is itself neither commutative nor associative;
# it is only used to make visible which values get combined for a repeated key

(first_rdd
 .union(second_rdd)
 .reduceByKey(lambda x,y: (x, y))
 .take(10)
)

In [None]:
(first_rdd
 .union(second_rdd)
 .map(lambda x: (x[1],x[0]))
 .reduceByKey(lambda x, y: x+y)
 .take(10)
)

In [None]:
# sorting
(first_rdd
 .union(second_rdd)
 .sortByKey(ascending=False)
 .take(10)
)

In [None]:
# sorting previous distinct query
(first_rdd
 .union(second_rdd)
 .distinct()
 .sortByKey()
 .take(10)
)

In [None]:
# aggregateByKey takes 3 inputs: the zero (initial) value, a function applied within a partition,
# and a function that combines the per-partition results
# NOTE: here values are concatenated without a separator within a partition,
# while the per-partition results are joined with ","
(first_rdd
 .union(second_rdd)
 .aggregateByKey("",lambda x,y:x+y,lambda x,y:x+","+y)
 .take(5)
)

In [None]:
# Let's create another RDD for further calculations
third_rdd = sc.parallelize([
  (10, 'Batman'),
  (20, 'Superman'),
  (30, 'Hulk'),
  (40, 'Hulk')
])

In [None]:
# demonstration for calculating average: accumulate a (sum, count) tuple per key;
# enable the mapValues line below to divide the sum by the count
(first_rdd
 .union(second_rdd)
 .union(third_rdd)
 .map(lambda pair: (pair[1], pair[0]))
 .aggregateByKey(
     (0, 0),
     lambda acc, value: (acc[0] + value, acc[1] + 1),
     lambda left, right: (left[0] + right[0], left[1] + right[1]),
 )
 #.mapValues(lambda x: x[0]/x[1]) # mapValues works on pair RDDs, only mapping the values part of the key-values
 .take(5)
)

In [None]:
# some general math functions based on reduce
(first_rdd
 .union(second_rdd)
 .union(third_rdd)
 .map(lambda x: (x[1],x[0]))
# .reduceByKey(lambda x,y: x+y) # sum
# .reduceByKey(min) 
# .reduceByKey(max)
 .take(10)
)

In [None]:
# the mean (or std deviation, etc) is not a natural fit for reduceByKey: every value first has to be
# turned into a (value, 1) tuple so that sums and counts can be reduced together.
# See above for the aggregateByKey solution
(first_rdd
 .union(second_rdd)
 .union(third_rdd)
 .map(lambda pair: (pair[1], pair[0]))
 .mapValues(lambda value: (value, 1))
 .reduceByKey(lambda left, right: (left[0] + right[0], left[1] + right[1]))
 .map(lambda item: (item[0], item[1][0] / item[1][1]))
 .take(10)
)

#### Join

In [None]:
# join is by key, returns (key, (value A, value B))
(first_rdd
.join(second_rdd)
#.fullOuterJoin(second_rdd)
#.leftOuterJoin(second_rdd)
.take(5)
)

In [None]:
(first_rdd
 .map(lambda x: (x[1],x[0]))
 .join(third_rdd.map(lambda x: (x[1],x[0])))
 .take(5)
)

### Actions

In [None]:
# displaying contents
#first_rdd.collect() # NB takes full dataset into driver memory
#first_rdd.take(1) # returns a list
#first_rdd.first() # returns first element from RDD - NB type is not same as for take(1)
#first_rdd.top(1) # returns the n largest items, sorted in descending order

In [None]:
# count of items/rows in RDD
first_rdd.count()
#crimes_rdd.count()

In [None]:
# reduce - calculate a single result for the whole RDD
# first_rdd holds (key, value) tuples, so the numeric key is projected first;
# adding the tuples themselves would concatenate them instead of summing
(first_rdd
 .map(lambda x: x[0])
 .reduce(lambda x,y: x+y) #sum
 #.reduce(max)
 #.reduce(min)
)

In [None]:
# simple sum/mean/max/min functions

#first_rdd.map(lambda x: x[0]).sum()
#first_rdd.map(lambda x: x[0]).mean()
#first_rdd.map(lambda x: x[0]).min()
#first_rdd.map(lambda x: x[0]).max()

### Example
Let's use sample data from https://jsonplaceholder.typicode.com/

We will try to see what is the **average length** of **post title**, per **user**

In [None]:
import requests

users_path = "https://jsonplaceholder.typicode.com/users"
# requests waits indefinitely by default; a timeout keeps the notebook from hanging
users_resp = requests.get(users_path, timeout=10)
# fail loudly on a 4xx/5xx answer instead of parsing an error body as data
users_resp.raise_for_status()

posts_path = "https://jsonplaceholder.typicode.com/posts"
posts_resp = requests.get(posts_path, timeout=10)
posts_resp.raise_for_status()

# each response body is a JSON array of objects -> one RDD element per object
users_rdd = sc.parallelize(users_resp.json())
posts_rdd = sc.parallelize(posts_resp.json())

In [None]:
users_rdd.take(3)
# we will be interested in id and name fields

In [None]:
posts_rdd.take(3)
# userId matches id from users_rdd. We are also interested in the title.

In [None]:
# both RDDs are a list of dictionaries, so we can use Python dictionary methods for accessing the fields we are interested in
(users_rdd
 .map(lambda x: (x["id"], x["name"]))
 .take(5)
)

In [None]:
(posts_rdd
 .map(lambda x: (x["userId"], x["title"])) # we do not need the actual title, can take len()
 .take(5)
)

In [None]:
# let's join these two RDDs
(users_rdd
 .map(lambda x: (x["id"], x["name"]))
 .join(posts_rdd
       .map(lambda x: (x["userId"], len(x["title"]))))
 .take(5)
)

In [None]:
# final query: average post-title length per user name, longest average first
(users_rdd
 .map(lambda user: (user["id"], user["name"]))
 .join(posts_rdd
       .map(lambda post: (post["userId"], len(post["title"]))))
 .map(lambda joined: (joined[1][0], joined[1][1]))  # drop the id, keep (name, title length)
 .aggregateByKey(
     (0, 0),
     lambda acc, title_len: (acc[0] + title_len, acc[1] + 1),
     lambda left, right: (left[0] + right[0], left[1] + right[1]),
 )
 .mapValues(lambda totals: totals[0] / totals[1])
 .sortBy(lambda row: row[1], ascending=False)
 .take(10)
)

### Additional reading material

Official documentation:<br>
https://spark.apache.org/docs/latest/rdd-programming-guide.html <br>
https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.RDD.html

RDDs pros and cons:<br>
https://www.databricks.com/glossary/what-is-rdd <br>
https://dzone.com/articles/apache-spark-3-reasons-why-you-should-not-use-rdds <br>
https://towardsdatascience.com/a-modern-guide-to-spark-rdds-725cd7c14059

### Task 1

Using the Chicago crimes dataset from above, create:
1. an RDD holding a distinct list of the column "Primary type" (30 rows total including heading)
2. an RDD holding the same distinct list as RDD 1, but including a count of each "Primary type" in the dataset, eg: ('HOMICIDE', 48)


In [None]:
# first RDD


In [None]:
# second RDD


### Task 2

Using the jsonplaceholder data, create an RDD that would return username and proportion (decimal percentage) of todos done.

Endpoints:
* https://jsonplaceholder.typicode.com/users
* https://jsonplaceholder.typicode.com/todos

Expected output is a sorted RDD, descending by proportion.<br>
Example take(3):<br>
[('Moriah.Stanton', 0.6),<br>
 ('Kamren', 0.6),<br>
 ('Maxime_Nienow', 0.55)]<br>

In [None]:
# RDD
