In [1]:
spark

# Simple RDD manipulation

In [1]:
r = range(0,10)

In [105]:
rdd1 = sc.parallelize(r)
rdd1

PythonRDD[200] at RDD at PythonRDD.scala:53

In [17]:
rdd1.collect()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [115]:
rdd1.first()

0

In [116]:
rdd1.take(5)

[0, 1, 2, 3, 4]

In [20]:
rdd1.count()

10

## Operations on RDD

In [106]:
rdd1.filter(lambda x: x%2 == 0)

PythonRDD[201] at RDD at PythonRDD.scala:53

In [77]:
rdd1.filter(lambda x: x%2 == 0).collect()

[0, 2, 4, 6, 8]

In [78]:
rdd1.map(lambda x: 2*x).collect()

[0, 2, 4, 6, 8, 10, 12, 14, 16, 18]

In [107]:
rdd1.map(lambda x: 2*x).sum()

90

In [13]:
# For each element x, build the list 1 .. x-1; map keeps exactly one list per input element.
rdd1.map(lambda x: list(range(1, x))).collect()

[[],
 [],
 [1],
 [1, 2],
 [1, 2, 3],
 [1, 2, 3, 4],
 [1, 2, 3, 4, 5],
 [1, 2, 3, 4, 5, 6],
 [1, 2, 3, 4, 5, 6, 7],
 [1, 2, 3, 4, 5, 6, 7, 8]]

In [15]:
# flatMap iterates over whatever the function returns and concatenates the items,
# so the per-element sequences 1 .. x-1 end up in one flat RDD.
rdd1.flatMap(lambda x: range(1, x)).collect()

[1,
 1,
 2,
 1,
 2,
 3,
 1,
 2,
 3,
 4,
 1,
 2,
 3,
 4,
 5,
 1,
 2,
 3,
 4,
 5,
 6,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8]

## Read file

In [2]:
import re
# Matches any single character that is NOT an ASCII letter or a space, so that
# pattern.sub('', line) strips digits, punctuation and other symbols.
# The previous pattern '[^[^a-zA-Z ]' had a stray '[^' inside the character
# class, which made '[' and '^' count as characters to keep.
pattern = re.compile('[^a-zA-Z ]')

In [3]:
rdd_text = sc.textFile('LICENSE_spark.txt')

In [4]:
rdd_text.take(5)

['                                 Apache License',
 '                           Version 2.0, January 2004',
 '                        http://www.apache.org/licenses/',
 '',
 '   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION']

**Q1:** limpiar la lista: quitar espacios y caracteres no alfabéticos

**Q2:** separar las palabras

**Q3:** ¿cuántas palabras diferentes hay?

**Q4:** ¿cuáles son las palabras más frecuentes?

## Accumulator

In [25]:
acc = sc.accumulator(0)

In [10]:
def initiate(x):
    """Turn ``x`` into an ``(x, 1)`` pair and count the call on ``acc``.

    Side effect: adds 1 to the global accumulator ``acc`` every time it runs.

    NOTE(review): when this runs inside a transformation, Spark may re-execute
    tasks, so the accumulator total is presumably not guaranteed to be
    exactly one per element — confirm against the Spark accumulator docs.
    """
    pair = (x, 1)
    acc.add(1)
    return pair

In [14]:
# NOTE(review): `words` is not defined in any cell shown above — presumably it is
# the RDD of cleaned words produced by the Q1/Q2 exercise; confirm before Run All.
words_pairs2 = words.map(initiate)  # pair every word with 1; each call also adds 1 to `acc`

In [17]:
words_pairs2.foldByKey(0, lambda x,y: x+y).take(5)

[('Apache', 5),
 ('', 98),
 ('January', 1),
 ('httpwwwapacheorglicenses', 1),
 ('CONDITIONS', 4)]

In [18]:
acc.value

4192

In [None]:
# NOTE(review): this cell was never executed (prompt is In [None]). Per the PySpark
# docs only the driver may read an accumulator; reading `acc.value` inside a task
# is expected to raise — presumably that is what this cell demonstrates; confirm.
words_pairs2.foreach(lambda x: acc.value)

In [24]:
list_nt = sc.parallelize(range(1,100))

In [27]:
list_nt.foreach(lambda x: acc.add(1))

In [28]:
acc.value

99

## Broadcast

In [37]:
# Wrap the list of brand names in a broadcast variable; tasks read it via brands.value.
# Called on the active context `sc` (same form as `sc.broadcast([3,5])` further down)
# instead of the unbound `SparkContext.broadcast(sc, ...)`, which needs the
# SparkContext class to be in scope although no cell imports it.
brands = sc.broadcast(["Apache", "Spark", "Hive", "AWS"])

In [40]:
# Keep only the words that appear in the broadcast brand list, then turn each
# survivor into a (word, 1) pair through initiate.
words_pairs3 = (
    words
    .filter(lambda w: w in brands.value)
    .map(initiate)
)

In [41]:
words_pairs3.foldByKey(0, lambda x,y: x+y).collect()

[('Apache', 5)]

In [42]:
br_ints = sc.broadcast([3,5])

In [47]:
# Keep the numbers that are divisible by every broadcast divisor.
list_nt.filter(lambda x: all(x % d == 0 for d in br_ints.value)).collect()

[15, 30, 45, 60, 75, 90]

## Analysis on numerical RDDs

In [20]:
import random

In [111]:
# Draw the 25 sample integers (0..10 inclusive) from a locally seeded generator
# so that the statistics, histogram and partition listings derived from rdd2
# are identical on every Restart & Run All. The seed value itself is arbitrary.
rdd2_rng = random.Random(42)
rdd2 = sc.parallelize([rdd2_rng.randint(0, 10) for _ in range(25)])

In [112]:
rdd2.stats()

(count: 25, mean: 4.12, stdev: 3.166322788346128, max: 10.0, min: 1.0)

In [113]:
rdd2.histogram(5)

([1.0, 2.8, 4.6, 6.4, 8.2, 10], [10, 5, 4, 3, 3])

## Partitions

In [114]:
rdd2.getNumPartitions()

4

In [19]:
def printList(l):
    """Print the items of ``l`` on one line as ``[a,b,c]`` (no spaces after commas)."""
    joined = ','.join(map(str, l))
    print(f'[{joined}]')
# `rdd` was never defined in this notebook; rdd2 is the numeric RDD built above.
# Each partition is printed by the worker process that handles it, so the text
# goes to that worker's stdout (presumably not to the notebook output).
rdd2.foreachPartition(printList)

In [21]:
# mapPartitions consumes whatever the function hands back as an iterable of
# output elements: yielding the list makes the whole partition ONE element,
# whereas `return list(iterator)` would give the partition's items back one by one.
def collectList(iterator):
    """Yield a single list holding every element of ``iterator``."""
    items = [element for element in iterator]
    yield items
#rdd.mapPartitions(lambda i: list(i)).take(3)
rdd2.mapPartitions(collectList).collect()

[[3, 0, 6, 4, 0, 5],
 [0, 5, 2, 1, 2, 3],
 [9, 8, 10, 3, 10, 9],
 [10, 2, 0, 7, 3, 4, 7]]

In [67]:
# there is a predefined function to do that: glom()
rdd2.glom().collect()

[[7, 1, 0, 0, 1, 0],
 [0, 6, 4, 2, 3, 3],
 [8, 10, 5, 5, 9, 5],
 [5, 4, 8, 9, 2, 2, 2]]

In [22]:
def lenIter(iterator):
    """Yield the number of elements in ``iterator`` as a single value."""
    # yield (rather than return) keeps the result iterable: a bare int handed
    # back to mapPartitions could not be iterated over.
    count = 0
    for _ in iterator:
        count += 1
    yield count
rdd2.mapPartitions(lenIter).collect()

[6, 6, 6, 7]

In [57]:
# toDebugString() hands back bytes here (the old output started with b'...'),
# so decode it instead of str()-ing the bytes object; that printed the repr
# and needed the '\\n' -> newline replace workaround.
lineage_coalesce = rdd2.coalesce(2).glom().toDebugString()
print(lineage_coalesce.decode('utf-8') if isinstance(lineage_coalesce, bytes) else lineage_coalesce)
rdd2.coalesce(2).glom().collect()

b'(2) PythonRDD[126] at RDD at PythonRDD.scala:53 []
 |  CoalescedRDD[125] at coalesce at <unknown>:0 []
 |  ParallelCollectionRDD[5] at readRDDFromFile at PythonRDD.scala:262 []'


[[3, 0, 6, 4, 0, 5, 0, 5, 2, 1, 2, 3],
 [9, 8, 10, 3, 10, 9, 10, 2, 0, 7, 3, 4, 7]]

In [58]:
rdd2.coalesce(6).glom().collect()

[[3, 0, 6, 4, 0, 5],
 [0, 5, 2, 1, 2, 3],
 [9, 8, 10, 3, 10, 9],
 [10, 2, 0, 7, 3, 4, 7]]

In [55]:
# toDebugString() hands back bytes here (the old output started with b'...'),
# so decode it instead of str()-ing the bytes object; that printed the repr
# and needed the '\\n' -> newline replace workaround.
lineage_repartition = rdd2.repartition(2).glom().toDebugString()
print(lineage_repartition.decode('utf-8') if isinstance(lineage_repartition, bytes) else lineage_repartition)
rdd2.repartition(2).glom().collect()

b'(2) PythonRDD[116] at RDD at PythonRDD.scala:53 []
 |  MapPartitionsRDD[115] at coalesce at <unknown>:0 []
 |  CoalescedRDD[114] at coalesce at <unknown>:0 []
 |  ShuffledRDD[113] at coalesce at <unknown>:0 []
 +-(4) MapPartitionsRDD[112] at coalesce at <unknown>:0 []
    |  PythonRDD[111] at RDD at PythonRDD.scala:53 []
    |  ParallelCollectionRDD[5] at readRDDFromFile at PythonRDD.scala:262 []'


[[3, 0, 6, 4, 0, 5, 9, 8, 10, 3, 10, 9, 10, 2, 0, 7, 3, 4, 7],
 [0, 5, 2, 1, 2, 3]]

In [59]:
rdd2.repartition(6).glom().collect()

[[],
 [3, 0, 6, 4, 0, 5],
 [10, 2, 0, 7, 3, 4, 7],
 [],
 [9, 8, 10, 3, 10, 9],
 [0, 5, 2, 1, 2, 3]]

In [69]:
# Key every value by x % 6 and spread the pairs over 6 partitions; in the output
# below each partition holds the pairs of exactly one key.
rdd2.map(lambda x: (x%6, x)).partitionBy(6).glom().collect()
# presumably the same partition layout (with keys x instead of x % 6) as:
# rdd2.map(lambda x: (x,x)).partitionBy(6, partitionFunc=(lambda x: x%6)).glom().collect()

[[(0, 0), (0, 0), (0, 0), (0, 0), (0, 6)],
 [(1, 7), (1, 1), (1, 1)],
 [(2, 2), (2, 8), (2, 8), (2, 2), (2, 2), (2, 2)],
 [(3, 3), (3, 3), (3, 9), (3, 9)],
 [(4, 4), (4, 10), (4, 4)],
 [(5, 5), (5, 5), (5, 5), (5, 5)]]

## Persistence

In [50]:
# Draw the 50 sample integers (0..10 inclusive) from a locally seeded generator
# so the grouped output is the same on every run. The seed value is arbitrary.
rdd3_rng = random.Random(43)
# Key each value by x % 5 and mark the keyed RDD as cached, since the cells
# that follow run several actions on data derived from it.
# (The original author also noted .checkpoint() as an alternative to .cache().)
rdd3 = (
    sc.parallelize([rdd3_rng.randint(0, 10) for _ in range(50)])
    .map(lambda x: (x % 5, x))
    .cache()
)

In [51]:
grps = rdd3.groupByKey()

In [52]:
grps.count()

5

In [53]:
grps.mapValues(lambda l: [x for x in l]).collect()

[(4, [9, 4, 9, 4, 4, 9]),
 (0, [10, 10, 5, 10, 10, 5, 10, 0, 0, 0, 5, 0, 0, 5, 10, 10, 5, 0, 5, 5, 0]),
 (1, [6, 1, 1, 1, 6, 6, 1]),
 (2, [2, 2, 2, 7, 7, 2, 7]),
 (3, [8, 3, 8, 3, 3, 8, 8, 8, 3])]