In [None]:
# `spark` is expected to be predefined by the PySpark notebook kernel; evaluating it displays the session.
spark

# RDD creation

In [None]:
# Local Python range 0..9 used as source data.
r = range(0,10)

In [None]:
# Distribute the local range as an RDD. `sc` (the SparkContext) is assumed to be predefined by the kernel.
rdd1 = sc.parallelize(r)
rdd1

In [None]:
# Action: bring every element back to the driver as a list.
rdd1.collect()

In [None]:
# Action: first element only.
rdd1.first()

In [None]:
# Action: first 5 elements.
rdd1.take(5)

In [None]:
# Action: number of elements.
rdd1.count()

In [None]:
# One RDD element per line of the text file (relative path: the file must be reachable by Spark).
rdd_text = sc.textFile('LICENSE_spark.txt')

In [None]:
rdd_text.take(5)

In [None]:
# Number of lines in the file.
rdd_text.count()

# Operations on RDD

In [None]:
# Transformation only: this returns a new, lazy RDD; nothing is computed yet.
rdd1.filter(lambda x: x%2 == 0)

In [None]:
# Adding an action (collect) triggers the computation: keeps the even numbers.
rdd1.filter(lambda x: x%2 == 0).collect()

In [None]:
# map: exactly one output element per input element (here doubled).
rdd1.map(lambda x: 2*x).collect()

In [None]:
# Sum of the doubled values.
rdd1.map(lambda x: 2*x).sum()

In [None]:
# map keeps one (list) element per input: the result is a list of lists [1 .. x-1].
rdd1.map(lambda x: [y for y in  range(1,x)]).collect()

In [None]:
# flatMap flattens those per-element lists into one sequence of values.
rdd1.flatMap(lambda x: [y for y in  range(1,x)]).collect()

In [None]:
# Elements present in both RDDs.
rdd2 = sc.parallelize(range(4,13))
rdd1.intersection(rdd2).collect()

In [None]:
# NOTE: rdd2 is rebound here. union() keeps duplicates (8 and 9 occur in both inputs).
rdd2 = sc.parallelize(range(8,12))
rdd1.union(rdd2).collect()

In [None]:
# Cartesian product: every (x, y) combination. `z` is reused later by z.keys().
x = sc.parallelize([1,2,3])
y = sc.parallelize([10,11,12])
z = x.cartesian(y)
z.collect()

## Aggregations

In [None]:
# Sum of all elements using a binary reducer.
rdd1.reduce(lambda x,y: x + y)

In [None]:
# aggregate(zeroValue, seqOp, combOp): seqOp folds the elements inside each partition,
# combOp merges the per-partition results.
rdd1.aggregate(0, lambda x,y: x + y, lambda x,y: x + y)

In [None]:
# seqOp takes the max inside each partition and combOp adds those maxima,
# so the result depends on how rdd1 is partitioned.
rdd1.aggregate(0, lambda x,y: max(x,y), lambda x,y: x + y)

In [None]:
# The second argument of parallelize is the number of partitions.
a = sc.parallelize(["negro", "azul", "blanco", "verde", "gris"], 2)

In [None]:
# Total number of characters: the running value is an int while the elements are strings.
a.aggregate(0, lambda x,y: x + len(y), lambda x,y: x + y)

## With keys

In [None]:
# z holds the (x, y) tuples of the cartesian product; keys() returns the first item of each pair.
z.keys().collect()

In [None]:
# Group the words by length. The grouped values are iterables, hence sorted(y) to materialise them.
a = sc.parallelize(["negro", "azul", "blanco", "verde", "gris"], 2)
b = a.groupBy(lambda x: len(x))
sorted([(x,sorted(y)) for (x,y) in b.collect()])

In [None]:
# Counts records per key; after groupBy there is a single (key, group) record per key.
b.countByKey()

In [None]:
# Number of words in each length group.
b.mapValues(len).collect()

In [None]:
# keyBy builds (len(word), word) pairs; join is an inner join on the key.
# NOTE: a and b are rebound here and reused by the outer-join cells below.
a = sc.parallelize(["azul", "verde", "naranja"], 3)
b = a.keyBy(lambda x: len(x))
c = sc.parallelize(["negro", "blanco", "gris"], 3)
d = c.keyBy(lambda x: len(x))
b.join(d).collect()

In [None]:
# Keeps every key of b; a key with no match in d gets None on the right side.
b.leftOuterJoin(d).collect()

In [None]:
# Keeps every key of d; a key with no match in b gets None on the left side.
b.rightOuterJoin(d).collect()

In [None]:
# Keeps the keys of both sides, with None where one side has no match.
b.fullOuterJoin(d).collect()

In [None]:
# Same colour list as in the aggregation and groupBy cells, split over 2 partitions.
a = sc.parallelize(["negro", "azul", "blanco", "verde", "gris"], 2)
# Key each word by its length, then concatenate the words sharing a length.
b = a.map(lambda x: (len(x), x))
b.reduceByKey(lambda x,y: x + y).collect()

## Word count

In [None]:
import re
# Matches any single character that is neither an ASCII letter nor a space.
# Intended for pattern.sub('', line) to strip digits and punctuation before splitting words.
pattern = re.compile(r'[^a-zA-Z ]')

In [None]:
# Reload the license file: one RDD element per line.
rdd_text = sc.textFile('LICENSE_spark.txt')

In [None]:
# Peek at the first 5 lines.
rdd_text.take(5)

**Q1:** limpiar la lista: quitar espacios y caracteres no alfabéticos

**Q2:** separar las palabras

**Q3:** ¿cuántas palabras diferentes hay?

**Q4:** ¿cuáles son las palabras más frecuentes?

# Accumulator

In [None]:
# Shared counter starting at 0: tasks add to it, the driver reads it with acc.value.
acc = sc.accumulator(0)

In [None]:
def initiate(x):
    """Turn an element into an ``(x, 1)`` pair and count the call in ``acc``.

    Relies on the notebook-level accumulator ``acc``: every invocation adds 1 to it.
    """
    acc.add(1)
    return (x,1)

In [None]:
# NOTE(review): `words` is not defined in the notebook as shown; presumably it is the RDD of
# words built while answering Q1-Q2 above. Confirm before running.
words_pairs2 = words.map(initiate)

In [None]:
# map() is lazy: initiate() (and therefore acc) only runs once an action such as take() is called.
words_pairs2.foldByKey(0, lambda x,y: x+y).take(5)

In [None]:
# Read the counter on the driver.
acc.value

In [None]:
# NOTE(review): this reads acc.value inside a task. PySpark documents the value as readable
# only on the driver, so this cell is expected to fail. Presumably shown on purpose; confirm.
words_pairs2.foreach(lambda x: acc.value)

In [None]:
# Integers 1..99.
list_nt = sc.parallelize(range(1,100))

In [None]:
# Side-effect-only action: adds 1 to acc for every element.
list_nt.foreach(lambda x: acc.add(1))

In [None]:
acc.value

# Broadcast

In [None]:
# Broadcast a read-only list of brand names; tasks read it through brands.value.
# Uses the existing context instance `sc`, as done for br_ints further down.
brands = sc.broadcast(["Apache", "Spark", "Hive", "AWS"])

In [None]:
# Keep only the words found in the broadcast list, then pair them with 1 via initiate()
# (which also increments acc).
# NOTE(review): `words` is not defined in the notebook as shown; presumably the word RDD
# from the Q1-Q2 exercise.
words_pairs3 = words\
    .filter(lambda x: x in brands.value)\
    .map(initiate)

In [None]:
# Occurrences per brand name.
words_pairs3.foldByKey(0, lambda x,y: x+y).collect()

In [None]:
br_ints = sc.broadcast([3,5])

In [None]:
# Numbers of list_nt divisible by every broadcast integer (3 and 5).
list_nt.filter(lambda x: all([x % i == 0 for i in br_ints.value])).collect()

# Analysis on numerical RDDs

In [None]:
import random

In [None]:
# 25 random integers between 0 and 10 inclusive, generated on the driver (unseeded, so results vary).
# NOTE: rebinds rdd2.
rdd2 = sc.parallelize([random.randint(0,10) for _ in range(25)])

In [None]:
# Summary statistics (count, mean, stdev, max, min).
rdd2.stats()

In [None]:
# 5 evenly spaced buckets between min and max; returns (bucket boundaries, counts).
rdd2.histogram(5)

# Partitions

In [None]:
# Number of partitions Spark chose for rdd2 (no explicit count was passed to parallelize).
rdd2.getNumPartitions()

In [None]:
def printList(l):
    """Print the elements of the iterable ``l`` on one line, formatted as ``[a,b,c]``."""
    joined = ','.join(map(str, l))
    print('[%s]' % joined)
# Runs printList once per partition on the executors: the text goes to the executors'
# stdout, which is not necessarily the notebook cell output.
rdd2.foreachPartition(printList)

In [None]:
# Variant to compare with: `def collectList(iterator): return list(iterator)` hands the
# elements back individually instead of as one list per partition.
def collectList(iterator):
    """Yield the whole content of a partition iterator as a single list."""
    items = []
    items.extend(iterator)
    yield items
#rdd2.mapPartitions(lambda i: list(i)).take(3)
# One list per partition, built by collectList.
rdd2.mapPartitions(collectList).collect()

In [None]:
# glom() is the built-in equivalent: it turns each partition into a list.
rdd2.glom().collect()

In [None]:
# A plain `return sum(1 for x in iterator)` is not suitable here: mapPartitions expects
# the function to produce an iterable, hence the generator below.
def lenIter(iterator):
    """Yield the number of items produced by ``iterator`` (one count per partition)."""
    count = 0
    for _ in iterator:
        count += 1
    yield count
# Number of elements in each partition.
rdd2.mapPartitions(lenIter).collect()

In [None]:
# toDebugString() describes the RDD lineage. str() + replace turns the escaped "\n"
# sequences into real line breaks (presumably because PySpark returns bytes; confirm for your version).
print(str(rdd2.coalesce(2).glom().toDebugString()).replace('\\n','\n'))
rdd2.coalesce(2).glom().collect()

In [None]:
# coalesce() merges partitions without a shuffle by default: it can lower the partition
# count but not raise it.
rdd2.coalesce(6).glom().collect()

In [None]:
# repartition() shuffles the data, so it can both decrease ...
print(str(rdd2.repartition(2).glom().toDebugString()).replace('\\n','\n'))
rdd2.repartition(2).glom().collect()

In [None]:
# ... and increase the number of partitions.
rdd2.repartition(6).glom().collect()

In [None]:
# Key each value by x % 6, then partition the pairs on that key into 6 partitions.
rdd2.map(lambda x: (x%6, x)).partitionBy(6).glom().collect()
# comparable layout (with x itself as key): rdd2.map(lambda x: (x,x)).partitionBy(6, partitionFunc=(lambda x: x%6)).glom().collect()

# Persistence

In [None]:
# 50 random (x % 5, x) pairs. cache() marks the RDD to be kept once computed so that
# later actions can reuse it instead of recomputing the lineage.
rdd3 = sc.parallelize([random.randint(0,10) for _ in range(50)])\
    .map(lambda x: (x%5, x)).cache() #.checkpoint()

In [None]:
# Lazy: groups the values by key (keys are x % 5).
grps = rdd3.groupByKey()

In [None]:
# Action: number of distinct keys.
grps.count()

In [None]:
# Materialise each group's iterable as a plain list for display.
grps.mapValues(lambda l: [x for x in l]).collect()