In [1]:
import json

import pyspark as ps
# Start the SparkContext that the cells below use to read the data file.
# No arguments are passed, so no master/app name is set explicitly here.
sc = ps.SparkContext()

In [2]:
import os

# Capture the notebook's working directory: it is later prefixed with
# 'file://' to build the absolute local-file URI handed to sc.textFile.
cwd = os.getcwd()

In [3]:
# Read the toy data set from the local filesystem (one JSON object per line)
# and peek at the first two records.
toy_data_uri = 'file://' + cwd + '/data/toy_data.txt'
file_rdd = sc.textFile(toy_data_uri)
file_rdd.take(2)

[u'{"Jane": "2"}', u'{"Jane": "1"}']

In [4]:
# No partition count was passed to textFile above, so Spark chose the split
# count itself; the result recorded below is 2.
# Original note: default OSX block size is 4kb, and minPartitions is 2
# NOTE(review): the 4kb block-size figure cannot be checked from this
# notebook -- confirm before relying on it.
file_rdd.getNumPartitions()

2

In [5]:
# Show the raw input file (9 single-entry JSON lines) for reference.
cat data/toy_data.txt

{"Jane": "2"}
{"Jane": "1"}
{"Pete": "20"}
{"Tyler": "3"}
{"Duncan": "4"}
{"Yuki": "5"}
{"Duncan": "6"}
{"Duncan": "4"}
{"Duncan": "5"}


In [6]:
# Dump the rows held by each partition next to the partition's index.
# The lambda returns a 2-tuple per partition and the collected output below
# is flat (index, rows, index, rows, ...) rather than a list of pairs.
rows_by_partition = file_rdd.mapPartitionsWithIndex(
    lambda part_idx, rows: (part_idx, list(rows)))
rows_by_partition.collect()

[0,
 [u'{"Jane": "2"}',
  u'{"Jane": "1"}',
  u'{"Pete": "20"}',
  u'{"Tyler": "3"}',
  u'{"Duncan": "4"}'],
 1,
 [u'{"Yuki": "5"}',
  u'{"Duncan": "6"}',
  u'{"Duncan": "4"}',
  u'{"Duncan": "5"}']]

In [7]:
# Re-read the same file, this time passing 10 as the partition-count argument.
toy_data_uri = 'file://' + cwd + '/data/toy_data.txt'
requested_partitions = 10
file_rdd = sc.textFile(toy_data_uri, requested_partitions)

In [8]:
# The requested count is only a minimum: the output below shows 11
# partitions (0-10). A single line cannot be split across partitions, so
# with 9 input lines some partitions come back empty.
rows_by_partition = file_rdd.mapPartitionsWithIndex(
    lambda part_idx, rows: (part_idx, list(rows)))
rows_by_partition.collect()

[0,
 [u'{"Jane": "2"}'],
 1,
 [u'{"Jane": "1"}'],
 2,
 [u'{"Pete": "20"}'],
 3,
 [u'{"Tyler": "3"}'],
 4,
 [u'{"Duncan": "4"}'],
 5,
 [u'{"Yuki": "5"}'],
 6,
 [u'{"Duncan": "6"}'],
 7,
 [u'{"Duncan": "4"}'],
 8,
 [],
 9,
 [u'{"Duncan": "5"}'],
 10,
 []]

In [9]:
# Re-read the file once more with 1 as the partition-count argument, so the
# lineage printed in later cells is as simple as possible.
toy_data_uri = 'file://' + cwd + '/data/toy_data.txt'
requested_partitions = 1
file_rdd = sc.textFile(toy_data_uri, requested_partitions)

In [10]:
# With a single partition every row is reported under index 0.
rows_by_partition = file_rdd.mapPartitionsWithIndex(
    lambda part_idx, rows: (part_idx, list(rows)))
rows_by_partition.collect()

[0,
 [u'{"Jane": "2"}',
  u'{"Jane": "1"}',
  u'{"Pete": "20"}',
  u'{"Tyler": "3"}',
  u'{"Duncan": "4"}',
  u'{"Yuki": "5"}',
  u'{"Duncan": "6"}',
  u'{"Duncan": "4"}',
  u'{"Duncan": "5"}']]

In [11]:
def parse_record(line):
    """Convert one JSON line such as '{"Jane": "2"}' into a (name, int) pair.

    Each line is expected to hold a single-entry object; if there are more
    entries, only the first one yielded by the dict is used, as before.
    Raises IndexError for an empty object and ValueError for a non-integer
    value or malformed JSON.
    """
    record = json.loads(line)
    # list(...)[0] instead of d.keys()[0] / d.values()[0]: dict views are not
    # indexable on Python 3, and this reads key and value as one matched pair.
    name, value = list(record.items())[0]
    return name, int(value)


# (name, value) pairs; still a narrow transformation of file_rdd.
tup_rdd = file_rdd.map(parse_record)

# groupByKey introduces the shuffle shown in the lineage below. The grouped
# values object is itself iterable, so max consumes it directly instead of
# reaching into its .data attribute.
out = tup_rdd.groupByKey().mapValues(max)

In [12]:
# Lineage of the mapped RDD; the output below shows a single stage.
print(tup_rdd.toDebugString())

(1) PythonRDD[14] at RDD at PythonRDD.scala:43 []
 |  MapPartitionsRDD[8] at textFile at NativeMethodAccessorImpl.java:-2 []
 |  file:///Users/jonathandinu/Repositories/published/building-spark-applications-live-lessons/code/data/toy_data.txt HadoopRDD[7] at textFile at NativeMethodAccessorImpl.java:-2 []


In [13]:
# Lineage of the grouped result; the indented "+-" section in the output
# marks the stage boundary introduced by groupByKey.
print(out.toDebugString())

(1) PythonRDD[15] at RDD at PythonRDD.scala:43 []
 |  MapPartitionsRDD[13] at mapPartitions at PythonRDD.scala:346 []
 |  ShuffledRDD[12] at partitionBy at NativeMethodAccessorImpl.java:-2 []
 +-(1) PairwiseRDD[11] at groupByKey at <ipython-input-11-5dd19bb9f251>:3 []
    |  PythonRDD[10] at groupByKey at <ipython-input-11-5dd19bb9f251>:3 []
    |  MapPartitionsRDD[8] at textFile at NativeMethodAccessorImpl.java:-2 []
    |  file:///Users/jonathandinu/Repositories/published/building-spark-applications-live-lessons/code/data/toy_data.txt HadoopRDD[7] at textFile at NativeMethodAccessorImpl.java:-2 []


In [14]:
# Baseline lineage: just the text-file read, before any transformation.
print(file_rdd.toDebugString())

(1) MapPartitionsRDD[8] at textFile at NativeMethodAccessorImpl.java:-2 []
 |  file:///Users/jonathandinu/Repositories/published/building-spark-applications-live-lessons/code/data/toy_data.txt HadoopRDD[7] at textFile at NativeMethodAccessorImpl.java:-2 []


In [15]:
# Whether there is 1 map transformation or 100, the lineage gains only a
# single PythonRDD on top of the file read (see output below).
parsed_rdd = file_rdd.map(lambda raw: json.loads(raw))
print(parsed_rdd.toDebugString())

(1) PythonRDD[16] at RDD at PythonRDD.scala:43 []
 |  MapPartitionsRDD[8] at textFile at NativeMethodAccessorImpl.java:-2 []
 |  file:///Users/jonathandinu/Repositories/published/building-spark-applications-live-lessons/code/data/toy_data.txt HadoopRDD[7] at textFile at NativeMethodAccessorImpl.java:-2 []


In [16]:
# tup_rdd chains two maps, yet its lineage has the same shape as the
# single-map lineage printed in the previous cell.
print(tup_rdd.toDebugString())

(1) PythonRDD[14] at RDD at PythonRDD.scala:43 []
 |  MapPartitionsRDD[8] at textFile at NativeMethodAccessorImpl.java:-2 []
 |  file:///Users/jonathandinu/Repositories/published/building-spark-applications-live-lessons/code/data/toy_data.txt HadoopRDD[7] at textFile at NativeMethodAccessorImpl.java:-2 []


In [17]:
# Adding groupByKey is what introduces the ShuffledRDD and the second stage.
grouped_rdd = tup_rdd.groupByKey()
print(grouped_rdd.toDebugString())

(1) PythonRDD[21] at RDD at PythonRDD.scala:43 []
 |  MapPartitionsRDD[20] at mapPartitions at PythonRDD.scala:346 []
 |  ShuffledRDD[19] at partitionBy at NativeMethodAccessorImpl.java:-2 []
 +-(1) PairwiseRDD[18] at groupByKey at <ipython-input-17-62a481cc790d>:1 []
    |  PythonRDD[17] at groupByKey at <ipython-input-17-62a481cc790d>:1 []
    |  MapPartitionsRDD[8] at textFile at NativeMethodAccessorImpl.java:-2 []
    |  file:///Users/jonathandinu/Repositories/published/building-spark-applications-live-lessons/code/data/toy_data.txt HadoopRDD[7] at textFile at NativeMethodAccessorImpl.java:-2 []


In [18]:
# The trailing mapValues on `out` adds no further stage: the lineage below
# has the same shape as the bare groupByKey lineage in the previous cell.
print(out.toDebugString())

(1) PythonRDD[15] at RDD at PythonRDD.scala:43 []
 |  MapPartitionsRDD[13] at mapPartitions at PythonRDD.scala:346 []
 |  ShuffledRDD[12] at partitionBy at NativeMethodAccessorImpl.java:-2 []
 +-(1) PairwiseRDD[11] at groupByKey at <ipython-input-11-5dd19bb9f251>:3 []
    |  PythonRDD[10] at groupByKey at <ipython-input-11-5dd19bb9f251>:3 []
    |  MapPartitionsRDD[8] at textFile at NativeMethodAccessorImpl.java:-2 []
    |  file:///Users/jonathandinu/Repositories/published/building-spark-applications-live-lessons/code/data/toy_data.txt HadoopRDD[7] at textFile at NativeMethodAccessorImpl.java:-2 []


## Scala

```scala
// NOTE(review): the recorded result at the bottom lists keys (anna, jesse,
// jon, mary) that do not occur in toy_data.txt, and the code splits on a
// space, so this snippet presumably ran against a different,
// space-separated "name value" file -- confirm which input file_rdd was.

// Pair every record with the index of the partition that holds it.
file_rdd.mapPartitionsWithIndex { (partitionIndex, records) =>
  records.map(record => (partitionIndex, record))
}.collect()

// Split each line into (key, intValue), then total the values per key.
file_rdd.map(_.split(" "))
        .map(fields => (fields(0), fields(1).toInt))
        .groupByKey()
        .mapValues(_.reduce(_ + _))
        .collect()

// Same grouping, but report which partition each grouped key landed in.
file_rdd.map(_.split(" "))
        .map(fields => (fields(0), fields(1).toInt))
        .groupByKey()
        .mapPartitionsWithIndex { (partitionIndex, groups) =>
          groups.map(group => (partitionIndex, group))
        }
        .collect()

// res68: Array[(Int, (String, Iterable[Int]))] = Array((0,(anna,CompactBuffer(1))), (0,(jesse,CompactBuffer(3))), (1,(jon,CompactBuffer(2, 1))), (1,(mary,CompactBuffer(3, 5))))
```