## Spark RDD Transformation & Actions

In [1]:
from pyspark.sql import SparkSession

spark=SparkSession. \
        builder. \
        master("local[5]"). \
        appName("rdd tutorial"). \
        config("inferSchema" , "True"). \
        getOrCreate()

# spark.conf.set("spark.sql.shuffle.partitions", "1000")
# spark.conf.set("spark.default.parallelism", "1000")

24/01/12 16:22:14 WARN Utils: Your hostname, navneetsajwan-ThinkPad-L480 resolves to a loopback address: 127.0.1.1; using 192.168.0.149 instead (on interface enp0s31f6)
24/01/12 16:22:14 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/12 16:22:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/01/12 16:22:21 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


### filter(),map(),FlatMap(),fold(),aggregate(),reduce(),reduceByKey(),sortbyKey()

In [2]:
rdd = spark.sparkContext.textFile("../data/data.txt")

In [3]:
rdd.collect()

                                                                                

['Project Gutenberg’s',
 'Alice’s Adventures in Wonderland',
 'Project Gutenberg’s',
 'Adventures in Wonderland',
 'Project Gutenberg’s']

In [4]:
#flatten all the data
rdd2=rdd.flatMap(lambda x:x.split(" "))
rdd2.collect()

                                                                                

['Project',
 'Gutenberg’s',
 'Alice’s',
 'Adventures',
 'in',
 'Wonderland',
 'Project',
 'Gutenberg’s',
 'Adventures',
 'in',
 'Wonderland',
 'Project',
 'Gutenberg’s']

In [5]:
# maping value by one
rdd3=rdd2.map(lambda x:(x,1))
rdd3.collect()

[('Project', 1),
 ('Gutenberg’s', 1),
 ('Alice’s', 1),
 ('Adventures', 1),
 ('in', 1),
 ('Wonderland', 1),
 ('Project', 1),
 ('Gutenberg’s', 1),
 ('Adventures', 1),
 ('in', 1),
 ('Wonderland', 1),
 ('Project', 1),
 ('Gutenberg’s', 1)]

In [6]:
# now perform countByValue,reduceByKey,reduce
"""
count unique word
"""
rdd3.reduceByKey(lambda x,y:x+y).collect()

                                                                                

[('Project', 3),
 ('Gutenberg’s', 3),
 ('Alice’s', 1),
 ('in', 2),
 ('Adventures', 2),
 ('Wonderland', 2)]

In [7]:
"""
find word which starting substr "A"
"""
rdd4=rdd2.filter(lambda x:x.startswith("A"))
rdd4.collect()

['Alice’s', 'Adventures', 'Adventures']

In [9]:
rdd3.reduceByKey(lambda x,y:x+y).collect()

[('Project', 3),
 ('Gutenberg’s', 3),
 ('Alice’s', 1),
 ('in', 2),
 ('Adventures', 2),
 ('Wonderland', 2)]

In [13]:
rdd3.reduceByKey(lambda x,y:x+y).sortByKey?

Object `sortByKey` not found.


In [8]:
"""
sortByKey()
"""
rdd3.reduceByKey(lambda x,y:x+y).sortByKey().collect()

                                                                                

[('Adventures', 2),
 ('Alice’s', 1),
 ('Gutenberg’s', 3),
 ('Project', 3),
 ('Wonderland', 2),
 ('in', 2)]

In [14]:
"""
countByValue()
"""
rdd = sc.parallelize([1,3,1,7,3,11], 2)
rdd.countByValue()

NameError: name 'sc' is not defined

## --------------------------------------------------------
## Prototype:

    aggregate(zeroValue, seqOp, combOp)

## Description:

    aggregate() lets you take an RDD and generate a single value that is of a different type than what was stored in the original RDD.

Parameters:

    zeroValue: The initialization value, for your result, in the desired format.
    seqOp: The operation you want to apply to RDD records. Runs once for every record in a partition.
    combOp: Defines how the resulted objects (one for every partition), gets combined.
    Example:

Compute the sum of a list and the length of that list. Return the result in a pair of (sum, length).

    In a Spark shell, I first created a list with 4 elements, with 2 partitions:



In [101]:

# listRDD = sc.parallelize([1,2,3,4], 2)
listRDD = sc.parallelize([1,3,5,7,9,11], 2)

In [102]:
# then I defined my seqOp:
seqOp = (lambda local_result, list_element: (local_result[0] + list_element, local_result[1] + 1) )


In [103]:
# and my combOp:

combOp = (lambda some_local_result, another_local_result: (some_local_result[0] + another_local_result[0], some_local_result[1] + another_local_result[1]) )

In [104]:
listRDD.aggregate( (0, 0), seqOp, combOp)

(36, 6)

<img src="https://i.stack.imgur.com/Sbk4V.png">

In [100]:
# example described in 'figure':

"""
            (0, 0) <-- zeroValue

[1, 2]                  [3, 4]

0 + 1 = 1               0 + 3 = 3
0 + 1 = 1               0 + 1 = 1

1 + 2 = 3               3 + 4 = 7
1 + 1 = 2               1 + 1 = 2       
    |                       |
    v                       v
  (3, 2)                  (7, 2)
      \                    / 
       \                  /
        \                /
         \              /
          \            /
           \          / 
           ------------
           |  combOp  |
           ------------
                |
                v
             (10, 4)
"""

'\n            (0, 0) <-- zeroValue\n\n[1, 2]                  [3, 4]\n\n0 + 1 = 1               0 + 3 = 3\n0 + 1 = 1               0 + 1 = 1\n\n1 + 2 = 3               3 + 4 = 7\n1 + 1 = 2               1 + 1 = 2       \n    |                       |\n    v                       v\n  (3, 2)                  (7, 2)\n      \\                    / \n       \\                  /\n        \\                /\n         \\              /\n          \\            /\n           \\          / \n           ------------\n           |  combOp  |\n           ------------\n                |\n                v\n             (10, 4)\n'