In [65]:
from pyspark.sql import SparkSession

In [66]:
# Start (or reuse) a SparkSession with a local master; the Spark web UI
# is configured on port 4040.
spark = SparkSession.builder.master("local").appName("Local Spark").config(
    'spark.ui.port', '4040'
).getOrCreate()

# SparkContext handle, used by the cells below to create RDDs.
sc = spark.sparkContext

In [67]:
# Small in-memory sample of integers for the numeric RDD examples.
data = [
    1, 5, 4,
    7, 1, 2,
]

In [68]:
# Distribute the sample list as an RDD, requesting 3 partitions.
rddData = sc.parallelize(data, numSlices=3)

In [69]:
# Load the text file as an RDD of lines (one element per line of 'tlb.txt').
rddTlb = sc.textFile(name='tlb.txt')

# Bring every line back to the driver to display the file contents.
rddTlb.collect()

['cause every little thing', 'is gonna be all right', 'little gonna cause all']

## Map
Return a new RDD formed by passing each element of the source through a function *func*

In [70]:
# Map each line to its length in characters; the builtin `len` is passed
# directly instead of wrapping it in a lambda.
rddTlb.map(len).collect()

[24, 21, 22]

In [71]:
# Map each line to the list of its space-separated words
# (one list per input line, so the result is nested).
rddTlb.map(lambda line: line.split(" ")).collect()

[['cause', 'every', 'little', 'thing'],
 ['is', 'gonna', 'be', 'all', 'right'],
 ['little', 'gonna', 'cause', 'all']]

## FlatMap
Similar to map, but each input item can be mapped to 0 or more output items (so *func* should return a Seq rather than a single item)

In [72]:
# Split each line on spaces and flatten the per-line lists into a
# single RDD of words.
rddTlb.flatMap(lambda line: line.split(" ")).collect()

['cause',
 'every',
 'little',
 'thing',
 'is',
 'gonna',
 'be',
 'all',
 'right',
 'little',
 'gonna',
 'cause',
 'all']

## Filter
Return a new RDD formed by selecting those elements of the source on which *func* returns true

In [73]:
# Split lines into words, then keep only the words longer than four characters.
rddTlb.flatMap(lambda line: line.split(" ")).filter(lambda word: len(word) > 4).collect()

['cause',
 'every',
 'little',
 'thing',
 'gonna',
 'right',
 'little',
 'gonna',
 'cause']

In [74]:
# Keep only the numbers strictly greater than 3.
rddData.filter(lambda n: n > 3).collect()

[5, 4, 7]

## Creating key-value RDDs

In [75]:
# Build a key-value RDD: each word is paired with its length, (word, length).
(
    rddTlb
    .flatMap(lambda line: line.split(" "))
    .map(lambda word: (word, len(word)))
    .collect()
)

[('cause', 5),
 ('every', 5),
 ('little', 6),
 ('thing', 5),
 ('is', 2),
 ('gonna', 5),
 ('be', 2),
 ('all', 3),
 ('right', 5),
 ('little', 6),
 ('gonna', 5),
 ('cause', 5),
 ('all', 3)]

In [76]:
# Pair each word with its position in the RDD: (word, index), index starting at 0.
rddTlb.flatMap(lambda line: line.split(" ")).zipWithIndex().collect()

[('cause', 0),
 ('every', 1),
 ('little', 2),
 ('thing', 3),
 ('is', 4),
 ('gonna', 5),
 ('be', 6),
 ('all', 7),
 ('right', 8),
 ('little', 9),
 ('gonna', 10),
 ('cause', 11),
 ('all', 12)]

In [77]:
# Pair every word with the constant 1: (word, 1).
rddTlb.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1)).collect()

[('cause', 1),
 ('every', 1),
 ('little', 1),
 ('thing', 1),
 ('is', 1),
 ('gonna', 1),
 ('be', 1),
 ('all', 1),
 ('right', 1),
 ('little', 1),
 ('gonna', 1),
 ('cause', 1),
 ('all', 1)]

## GroupByKey
When called on an RDD of **(K, V) pairs**, returns a dataset of **(K, Iterable<V>) pairs**

In [78]:
# Group words by their length: key = len(word), value = the word itself.
(
    rddTlb
    .flatMap(lambda line: line.split(" "))
    .map(lambda word: (len(word), word))
    .groupByKey()
    # Materialize each group's Iterable as a list so it prints readably;
    # the builtin `list` is passed directly instead of a wrapping lambda.
    .mapValues(list)
    .collect()
)

[(5, ['cause', 'every', 'thing', 'gonna', 'right', 'gonna', 'cause']),
 (6, ['little', 'little']),
 (2, ['is', 'be']),
 (3, ['all', 'all'])]

## ReduceByKey
When called on an RDD of (K, V) pairs, returns a dataset of (K, V) pairs where the values for each key are aggregated using the given reduce function *func*

In [79]:
# Word count: emit (word, 1) for every word, then sum the ones per word.
(
    rddTlb
    .flatMap(lambda line: line.split(" "))
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
    .collect()
)

[('cause', 2),
 ('every', 1),
 ('little', 2),
 ('thing', 1),
 ('is', 1),
 ('gonna', 2),
 ('be', 1),
 ('all', 2),
 ('right', 1)]

## SortByKey
When called on an RDD of (K, V) pairs, returns a dataset of (K, V) pairs sorted by keys in ascending or descending order

In [84]:
# Key each word by its length, then sort by key in descending order.
(
    rddTlb
    .flatMap(lambda line: line.split(" "))
    .map(lambda word: (len(word), word))
    .sortByKey(ascending=False)
    .collect()
)

[(6, 'little'),
 (6, 'little'),
 (5, 'cause'),
 (5, 'every'),
 (5, 'thing'),
 (5, 'gonna'),
 (5, 'right'),
 (5, 'gonna'),
 (5, 'cause'),
 (3, 'all'),
 (3, 'all'),
 (2, 'is'),
 (2, 'be')]

## Join
When called on datasets of type (K, V) and (K, W), returns a dataset of (K, (V, W)) pairs with all pairs of elements for each key

In [90]:
# Two small key-value RDDs for the join example; only key "a" appears in both.
rdd1 = sc.parallelize([
    ("a", 10),
    ("b", 20),
])
rdd2 = sc.parallelize([
    ("a", 1),
    ("a", 2),
    ("c", 3),
])

In [91]:
# Join the two RDDs on their keys: each result is (key, (value_from_rdd1, value_from_rdd2)).
# Keys present in only one of the RDDs ("b", "c") do not appear in the output.
rdd1.join(other=rdd2).collect()

[('a', (10, 1)), ('a', (10, 2))]