In [1]:
# Obtain the notebook's SparkContext via getOrCreate() instead of calling the
# constructor directly. Background on why getOrCreate() is used:
# http://stackoverflow.com/questions/28999332/how-to-access-sparkcontext-in-pyspark-script
# NOTE(review): SparkContext is not imported in any visible cell -- presumably
# the kernel pre-loads it; otherwise `from pyspark import SparkContext` is needed.
sc = SparkContext.getOrCreate()

In [1]:
# Turn a small local list of (name, number) pairs into an RDD.
data = sc.parallelize([
    ('Amber', 22),
    ('Alfred', 23),
    ('Skye', 4),
    ('Albert', 12),
    ('Amber', 9),
])
# Printing the RDD shows only its handle; collect() returns the elements.
print(data)
print(data.collect())

ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:195
[('Amber', 22), ('Alfred', 23), ('Skye', 4), ('Albert', 12), ('Amber', 9)]


In [2]:
import numpy as np

# Ten draws from np.random.randn(), one call per value.
# No seed is set, so the printed numbers differ on every run.
rData = []
for _ in range(10):
    rData.append(np.random.randn())
print(rData)

[-0.1476690330609267, 1.5905135738187912, -0.47087220886818454, 0.34155116366645183, 0.42825442325465435, -1.4363675694570615, 1.1667971803200048, -0.9744371182908209, -0.0461867991886116, -1.6275070540080683]


In [3]:
# Build an RDD from the random floats in rData and inspect it three ways.
data = sc.parallelize(rData)
print("data\n", data)  # the RDD handle, not its contents
print("all data\n", data.collect())  # every element, returned as a list
print("the first 5\n", data.take(5))  # only the first five elements

data
 ParallelCollectionRDD[1] at parallelize at PythonRDD.scala:195
all data
 [-0.1476690330609267, 1.5905135738187912, -0.47087220886818454, 0.34155116366645183, 0.42825442325465435, -1.4363675694570615, 1.1667971803200048, -0.9744371182908209, -0.0461867991886116, -1.6275070540080683]
the first 5
 [-0.1476690330609267, 1.5905135738187912, -0.47087220886818454, 0.34155116366645183, 0.42825442325465435]


In [4]:
# An RDD can hold tuples; collect() returns them as they were given.
data = sc.parallelize([("park", 43), ("kim", 25)])
print(data.collect())

[('park', 43), ('kim', 25)]


In [5]:
# Load a text file (relative path) as an RDD with one element per line,
# then preview the first ten lines.
data = sc.textFile("./test1.txt")
data.take(10)

['ROMEO AND JULIET',
 '',
 '',
 'ACT I',
 '',
 '',
 '',
 'SCENE I\tVerona. A public place.',
 '',
 '']

# 4. PySpark

<img src="images/start.png" width=800>

# pyspark map

<img src="images/map.png">

In [4]:
# Source RDD for the map example: collect() shows the elements, print(x) only the handle.
x = sc.parallelize(["b", "a", "c"])
print(x.collect())
print(x)

['b', 'a', 'c']
ParallelCollectionRDD[3] at parallelize at PythonRDD.scala:195


In [6]:
# map() applies the function to every element: each letter becomes a (letter, 1) pair.
y = x.map(lambda letter: (letter, 1))
print(y.collect())
print(y)

[('b', 1), ('a', 1), ('c', 1)]
PythonRDD[5] at collect at <ipython-input-6-d048126179ab>:2


# filter

<img src="images/filter.png">

In [7]:
x = sc.parallelize([1, 2, 3])
# filter() keeps the elements for which the predicate is true -- here, the odd values.
# The lambda parameter is named `n` so it does not shadow the RDD `x`.
y = x.filter(lambda n: n % 2 == 1)
print(x.collect())
print(y.collect())

[1, 2, 3]
[1, 3]


# flatMap

<img src="images/flatmap.png">

In [8]:
x = sc.parallelize([1, 2, 3])
# flatMap() splices the items of each returned tuple into one flat result.
y = x.flatMap(lambda n: (n, n * 100, 42))
print(x.collect())
print(y.collect())

[1, 2, 3]
[1, 100, 42, 2, 200, 42, 3, 300, 42]


In [9]:
x = sc.parallelize([1, 2, 3])
# With map(), each returned tuple stays a single element (contrast with flatMap).
y = x.map(lambda n: (n, n * 100, 42))
print(x.collect())
print(y.collect())

[1, 2, 3]
[(1, 100, 42), (2, 200, 42), (3, 300, 42)]


# groupBy

<img src="images/groupby.png">

In [11]:
x = sc.parallelize(['John', 'Fred', 'Anna', 'James'])
# Group the names by their first letter.
y = x.groupBy(lambda name: name[0])
# Each group is an iterable; convert it to a list so its members print.
print([(initial, list(names)) for initial, names in y.collect()])

[('J', ['John', 'James']), ('F', ['Fred']), ('A', ['Anna'])]


In [12]:
# Printing the grouped RDD shows only its handle, not the groups.
print(y)

PythonRDD[17] at collect at <ipython-input-11-8d58b3ac6c76>:3


In [14]:
# Without list(), each group displays as an opaque ResultIterable object.
[(initial, names) for initial, names in y.collect()]

[('J', <pyspark.resultiterable.ResultIterable at 0x1f2bb179240>),
 ('F', <pyspark.resultiterable.ResultIterable at 0x1f2bb1797f0>),
 ('A', <pyspark.resultiterable.ResultIterable at 0x1f2bb179a20>)]

# groupByKey

<img src="images/groupbykey.png">

In [15]:
x = sc.parallelize([('B', 5), ('B', 4), ('A', 3), ('A', 2), ('A', 1)])
# groupByKey() gathers all values that share a key.
y = x.groupByKey()
print(x.collect())
# Materialise each group's values as a list so they can be printed.
print([(key, list(values)) for key, values in y.collect()])

[('B', 5), ('B', 4), ('A', 3), ('A', 2), ('A', 1)]
[('B', [5, 4]), ('A', [3, 2, 1])]


In [16]:
# The raw result: each value is a ResultIterable until converted with list().
y.collect()

[('B', <pyspark.resultiterable.ResultIterable at 0x1f2bb14a828>),
 ('A', <pyspark.resultiterable.ResultIterable at 0x1f2bb14a550>)]

# reduceByKey vs groupByKey

<img src="images/reducegroup.png">

In [20]:
words = ["one", "two", "two", "three", "three", "three"]
# Pair every word with a count of 1, ready for per-key aggregation.
wordPairsRDD = sc.parallelize(words).map(lambda word: (word, 1))
wordPairsRDD.collect()

[('one', 1), ('two', 1), ('two', 1), ('three', 1), ('three', 1), ('three', 1)]

In [28]:
from operator import add

# Sum the counts per word with reduceByKey(). operator.add replaces the
# equivalent hand-written `lambda x, y: x + y`, so the import is actually used.
wordsCountsWithReduce = wordPairsRDD.reduceByKey(add).collect()
wordsCountsWithReduce

[('two', 2), ('three', 3), ('one', 1)]

In [35]:
# Same word count via groupByKey(): gather every 1 for a word, then sum them.
wordCountsWithGroup = (
    wordPairsRDD
    .groupByKey()
    .map(lambda pair: (pair[0], sum(pair[1])))
    .collect()
)
wordCountsWithGroup

[('two', 2), ('three', 3), ('one', 1)]

# mapPartitions

<img src="images/mappartition.png">

In [41]:
# Three elements spread over two partitions (second argument).
x = sc.parallelize([1,2,3], 2)
def f(iterator):
    """Yield one value for the partition: the sum of all elements in `iterator`."""
    total = 0
    for element in iterator:
        total = total + element
    yield total
# mapPartitions() calls f once per partition, passing an iterator over that partition.
y = x.mapPartitions(f)
# glom() gathers the elements of each partition into its own list, exposing the partition layout
print(x.glom().collect())
print(y.glom().collect())

[[1], [2, 3]]
[[1], [5]]


# mapPartitionsWithIndex

<img src="images/mappartitionswithindex.png">

In [42]:
# Three elements spread over two partitions (second argument).
x = sc.parallelize([1,2,3], 2)
def f(partitionIndex, iterator):
    """Yield a single (partition index, sum of the partition's elements) pair.

    Note: this rebinds the notebook-global name `f`, which the mapPartitions
    example also defines with a different signature.
    """
    partition_total = sum(iterator)
    pair = (partitionIndex, partition_total)
    yield pair
# mapPartitionsWithIndex() calls f once per partition with (partition index, iterator).
y = x.mapPartitionsWithIndex(f)
# glom() gathers the elements of each partition into its own list, exposing the partition layout
print(x.glom().collect())
print(y.glom().collect())


[[1], [2, 3]]
[[(0, 1)], [(1, 5)]]


# sample

<img src="images/sample.png">

In [47]:
x = sc.parallelize([1, 2, 3, 4, 5])
# Sample without replacement at fraction 0.8; the two seeds select different subsets.
y = x.sample(withReplacement=False, fraction=0.8, seed=42)
z = x.sample(withReplacement=False, fraction=0.8, seed=41)
print(x.collect())
print(y.collect())
print(z.collect())

[1, 2, 3, 4, 5]
[1, 2, 4, 5]
[1, 2, 3, 4]


In [49]:
# union: combine two RDDs. The output shows the partitions of both inputs
# appended as-is, with the duplicate value 3 kept.
x = sc.parallelize([1,2,3], 2)
y = sc.parallelize([3,4], 1)
z = x.union(y)
print(x.glom().collect())
print(y.glom().collect())
print(z.glom().collect())

[[1], [2, 3]]
[[3, 4]]
[[1], [2, 3], [3, 4]]


# join

<img src="images/join.png">

In [50]:
# join() matches pairs by key and emits (key, (value from x, value from y))
# for every combination, so key "a" appears twice in the result.
x = sc.parallelize([("a", 1), ("b", 2)])
y = sc.parallelize([("a", 3), ("a", 4), ("b", 5)])
z = x.join(y)
print(z.collect())

[('b', (2, 5)), ('a', (1, 3)), ('a', (1, 4))]


In [51]:
# distinct() removes the duplicate 3.
x = sc.parallelize([1,2,3,3,4])
y = x.distinct()
print(y.collect())

[1, 2, 3, 4]


In [52]:
# Same data split over 3 partitions: the distinct values come back in a different order.
x = sc.parallelize([1,2,3,3,4],3)
y = x.distinct()
print(y.collect())

[3, 1, 4, 2]


In [55]:
# Compare the partition layout before (x) and after (y) distinct().
print(x.glom().collect())
print(y.glom().collect())

[[1], [2, 3], [3, 4]]
[[3], [1, 4], [2]]


# coalesce

<img src="images/coalesce.png">

In [56]:
# coalesce(2) reduces the RDD from three partitions to two.
x = sc.parallelize([1, 2, 3, 4, 5], 3)
y = x.coalesce(2)
print(x.glom().collect())
print(y.glom().collect())

[[1], [2, 3], [4, 5]]
[[1], [2, 3, 4, 5]]


# keyBy

<img src="images/keybey.png">

In [58]:
x = sc.parallelize(['John', 'Fred', 'Anna', 'James'])
# keyBy() pairs each element with the key computed from it -- here, its first letter.
y = x.keyBy(lambda name: name[0])
print(y.collect())

[('J', 'John'), ('F', 'Fred'), ('A', 'Anna'), ('J', 'James')]


In [59]:
x = sc.parallelize(['John', 'Fred', 'Anna', 'James'])
# Same as the first keyBy example, but keyed by the second letter of each name.
y = x.keyBy(lambda name: name[1])
print(y.collect())

[('o', 'John'), ('r', 'Fred'), ('n', 'Anna'), ('a', 'James')]


# partitionBy

<img src="images/partitionby.png">

In [62]:
x = sc.parallelize(
    [('J', 'James'), ('F', 'Fred'), ('A', 'Anna'), ('J', 'John')],
    3,
)
# Repartition into two: keys sorting before 'H' go to partition 0, the rest to partition 1.
y = x.partitionBy(2, lambda key: 0 if key[0] < 'H' else 1)
print(x.glom().collect())
print(y.glom().collect())

[[('J', 'James')], [('F', 'Fred')], [('A', 'Anna'), ('J', 'John')]]
[[('F', 'Fred'), ('A', 'Anna')], [('J', 'James'), ('J', 'John')]]


# zip

<img src="images/zip.png">

In [63]:
x = sc.parallelize([1, 2, 3])
# Square every element to get a second RDD of the same length.
y = x.map(lambda value: value * value)
# zip() pairs the elements of the two RDDs position by position.
z = x.zip(y)
print(z.collect())

[(1, 1), (2, 4), (3, 9)]


# getNumPartitions

<img src="images/getnumpartitions.png">

In [64]:
# getNumPartitions() returns how many partitions the RDD has (a plain int, here 2).
x = sc.parallelize([1,2,3], 2)
y = x.getNumPartitions()
print(x.glom().collect())
print(y)

[[1], [2, 3]]
2


# collect

<img src="images/collect.png">

In [65]:
# collect() returns all elements as one flat Python list; the partition
# boundaries shown by glom() are not part of the result.
x = sc.parallelize([1,2,3], 2)
y = x.collect()
print(x.glom().collect())
print(y)

[[1], [2, 3]]
[1, 2, 3]


# reduce

<img src="images/reduce.png">

<img src="images/reduce2.png" width=700>

In [66]:
x = sc.parallelize([1, 2, 3, 4])
# reduce() folds all elements into a single value with a two-argument function -- here, their sum.
y = x.reduce(lambda left, right: left + right)
print(x.collect())
print(y)

[1, 2, 3, 4]
10
