In [1]:
from pyspark.sql import SparkSession

# Entry point for the DataFrame/SQL API: build a session, or reuse the one
# that already exists in this kernel.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.51:7077")
    .appName("TianruZ_lecture1_example2")
    # Dynamic allocation settings; idle executors are released after 30s.
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    # Upper bound on the total cores this application requests.
    .config("spark.cores.max", 4)
    .getOrCreate()
)

# Entry point for the lower-level RDD API, taken from the session.
spark_context = spark_session.sparkContext

# Only show messages at ERROR level from here on.
spark_context.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/02/12 18:34:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/02/12 18:34:07 WARN ExecutorAllocationManager: Dynamic allocation without a shuffle service is an experimental feature.


In [2]:
# Read the book into an RDD with one element per line of text.
# Alternative input that was used previously:
#   hdfs://192.168.2.70:9000/gutenberg.txt
book = spark_context.textFile('/home/ubuntu/20417.txt.utf-8')
book.take(5)

                                                                                

['The Project Gutenberg EBook of The Outline of Science, Vol. 1 (of 4), by ',
 'J. Arthur Thomson',
 '',
 'This eBook is for the use of anyone anywhere at no cost and with',
 'almost no restrictions whatsoever.  You may copy it, give it away or']

In [3]:
# rdd.map(): build a new RDD by applying a function to every element.
## Here: turn each line into the list of its space-separated words.
book_sp = book.map(lambda line: line.split(" "))
book_sp.take(5)

[['The',
  'Project',
  'Gutenberg',
  'EBook',
  'of',
  'The',
  'Outline',
  'of',
  'Science,',
  'Vol.',
  '1',
  '(of',
  '4),',
  'by',
  ''],
 ['J.', 'Arthur', 'Thomson'],
 [''],
 ['This',
  'eBook',
  'is',
  'for',
  'the',
  'use',
  'of',
  'anyone',
  'anywhere',
  'at',
  'no',
  'cost',
  'and',
  'with'],
 ['almost',
  'no',
  'restrictions',
  'whatsoever.',
  '',
  'You',
  'may',
  'copy',
  'it,',
  'give',
  'it',
  'away',
  'or']]

In [4]:
# rdd.filter(): build a new RDD holding only the elements that satisfy a predicate.
## Here: keep only the lines whose word list has more than one entry.
book_sp_1 = book_sp.filter(lambda words: len(words) > 1)
book_sp_1.take(5)

[['The',
  'Project',
  'Gutenberg',
  'EBook',
  'of',
  'The',
  'Outline',
  'of',
  'Science,',
  'Vol.',
  '1',
  '(of',
  '4),',
  'by',
  ''],
 ['J.', 'Arthur', 'Thomson'],
 ['This',
  'eBook',
  'is',
  'for',
  'the',
  'use',
  'of',
  'anyone',
  'anywhere',
  'at',
  'no',
  'cost',
  'and',
  'with'],
 ['almost',
  'no',
  'restrictions',
  'whatsoever.',
  '',
  'You',
  'may',
  'copy',
  'it,',
  'give',
  'it',
  'away',
  'or'],
 ['re-use',
  'it',
  'under',
  'the',
  'terms',
  'of',
  'the',
  'Project',
  'Gutenberg',
  'License',
  'included']]

In [5]:
# rdd.flatMap(): apply a function to every element, then flatten the results.
## Here: the identity function flattens the per-line word lists into one RDD of single words.
book_sw = book_sp_1.flatMap(lambda words: words)
book_sw.take(20)

['The',
 'Project',
 'Gutenberg',
 'EBook',
 'of',
 'The',
 'Outline',
 'of',
 'Science,',
 'Vol.',
 '1',
 '(of',
 '4),',
 'by',
 '',
 'J.',
 'Arthur',
 'Thomson',
 'This',
 'eBook']

In [6]:
# rdd.groupBy(): group the elements of an RDD by the value a function returns for them.
## Here: group the words by their length (the built-in len is the grouping function).
book_sw_fl = book_sw.groupBy(len)
book_sw_fl.take(2)

                                                                                

[(2, <pyspark.resultiterable.ResultIterable at 0x7f3c39d4eb90>),
 (8, <pyspark.resultiterable.ResultIterable at 0x7f3c39d4ebf0>)]

In [7]:
# Materialise each group's iterable as a list so its contents are displayed.
words_by_length = book_sw_fl.mapValues(list)
words_by_length.take(2)

[(2,
  ['of',
   'of',
   'by',
   'J.',
   'is',
   'of',
   'at',
   'no',
   'no',
   'it',
   'or',
   'it',
   'of',
   'or',
   'at',
   'of',
   '4)',
   'J.',
   'OF',
   'OF',
   'by',
   'at',
   'OF',
   'OF',
   'TO',
   'OF',
   'OF',
   'BY',
   'J.',
   'OF',
   'IN',
   'OF',
   'OF',
   '40',
   'IN',
   'IN',
   'G.',
   'P.',
   'G.',
   'P.',
   'in',
   'of',
   'By',
   'J.',
   'it',
   'it',
   'to',
   'it',
   'of',
   'is',
   'it',
   'of',
   'of',
   'is',
   'of',
   'of',
   'It',
   'be',
   'to',
   'in',
   'to',
   'it',
   'is',
   'it',
   'is',
   'in',
   'to',
   'be',
   'of',
   'of',
   'an',
   'at',
   'of',
   'is',
   'it',
   'of',
   'at',
   'of',
   'is',
   'is',
   'of',
   'It',
   'is',
   'to',
   'in',
   'of',
   'by',
   'to',
   'to',
   'he',
   'no',
   'of',
   'by',
   'an',
   'of',
   'of',
   'to',
   'up',
   'as',
   'on',
   'be',
   'to',
   'of',
   'To',
   'it',
   'in',
   'to',
   'be',
   'he',
   'he',
   'o

In [8]:
# rdd.groupByKey(): collect the values of each key into a single sequence.
## NOTE: the RDD elements must be (key, value) pairs.
## Here: build (word, 1) pairs first, then group them by key, i.e. by word.
book_sw_p = book_sw.map(lambda word: (word, 1))
book_wk = book_sw_p.groupByKey()
book_wk.take(5)

                                                                                

[('The', <pyspark.resultiterable.ResultIterable at 0x7f3c39d4fbb0>),
 ('Project', <pyspark.resultiterable.ResultIterable at 0x7f3c39d4fc70>),
 ('EBook', <pyspark.resultiterable.ResultIterable at 0x7f3c39d4f670>),
 ('of', <pyspark.resultiterable.ResultIterable at 0x7f3c39d4fd00>),
 ('Outline', <pyspark.resultiterable.ResultIterable at 0x7f3c39d4fd60>)]

In [9]:
# .mapValues() applies a function to the value of each (key, value) pair and
# leaves the key untouched; here each grouped iterable becomes a list.
ones_by_word = book_wk.mapValues(list)
ones_by_word.take(2)

[('The',
  [1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
 

In [10]:
# rdd.reduceByKey(): merge the values of each key with an associative and
# commutative reduce function.
## NOTE: the RDD elements must be (key, value) pairs.
## Here: summing the 1s of the (word, 1) pairs gives a word count.
from operator import add

book_wordcount = book_sw_p.reduceByKey(add)
book_wordcount.take(5)

                                                                                

[('The', 876), ('Project', 78), ('EBook', 2), ('of', 5425), ('Outline', 7)]

In [11]:
import os
# Frequency of the word 'Discovery'.
#
# Why not book_wordcount.lookup('Discovery')?
# On an RDD with a known partitioner (such as the result of reduceByKey),
# lookup() searches only the partition the key maps to, and that partition is
# chosen by hashing the key in the driver process. PYTHONHASHSEED is read by
# Python at interpreter start-up only, so assigning it in os.environ from a
# running notebook does not change how this process hashes strings. The
# driver could therefore pick a different partition than the one the
# executors wrote the key to, and lookup() would silently return [].
#
# Filtering on the key scans every partition and does not depend on hashing.
book_wordcount.filter(lambda kv: kv[0] == 'Discovery').values().collect()

[5]

In [12]:
# Show the first 20 keys (words) of the word-count RDD.
counted_words = book_wordcount.keys()
counted_words.take(20)

['The',
 'Project',
 'EBook',
 'of',
 'Outline',
 'Science,',
 'Vol.',
 '1',
 '4),',
 '',
 'Arthur',
 'Thomson',
 'is',
 'use',
 'anyone',
 'anywhere',
 'at',
 'no',
 'restrictions',
 'whatsoever.']

In [13]:
# rdd.distinct(): build a new RDD containing each distinct element once.
# Compare the number of words before and after removing duplicates.
total_words = book_sw.count()
print('Before .distinct():', total_words)

unique_words = book_sw.distinct().count()
print('After  .distinct():', unique_words)

Before .distinct(): 122004




After  .distinct(): 17955




In [14]:
# rdd.keyBy(): turn each element x into the tuple (f(x), x).
## Here: key every word by its first character, the first step of a FirstLetterCount.
## The slice word[:1] is used instead of word[0] because book_sw still contains
## empty strings (produced by splitting on consecutive or trailing spaces):
## ''[0] raises IndexError as soon as such an element is evaluated, whereas
## ''[:1] is simply ''. For non-empty words both forms give the same key.
book_sw.keyBy(lambda word: word[:1]).take(5)

[('T', 'The'),
 ('P', 'Project'),
 ('G', 'Gutenberg'),
 ('E', 'EBook'),
 ('o', 'of')]

In [15]:
# Pipelined operation: count the words by their (lower-cased) first character.
# The empty-line filter runs on the raw line, before splitting: "".split(" ")
# returns [''] (length 1), so a len(...) > 0 test on the split list can never
# remove anything.
sorted(                                        # sort the (first_letter, count) pairs
    book.filter(lambda line: len(line) > 0)    # filter out empty lines
    .map(lambda line: line.split(" "))         # split each line into separate words
    .flatMap(lambda words: words)              # flatMap to single words
    .filter(lambda word: len(word) > 0)        # filter out empty words
    .keyBy(lambda word: word[0].lower())       # key by the first character, converted to lower case
    .map(lambda pair: (pair[0], 1))            # create (first_letter, 1) pairs
    .reduceByKey(add)                          # reduce the key-value pairs by adding up
    .collect()                                 # collect the result on the driver
)

                                                                                

[('"', 418),
 ('#', 1),
 ('$', 1),
 ('&', 12),
 ("'", 5),
 ('(', 391),
 ('*', 8),
 ('+', 1),
 ('-', 18),
 ('.', 1),
 ('0', 4),
 ('1', 356),
 ('2', 197),
 ('3', 71),
 ('4', 62),
 ('5', 50),
 ('6', 38),
 ('7', 30),
 ('8', 27),
 ('9', 31),
 ('=', 2),
 ('[', 246),
 ('_', 473),
 ('a', 12898),
 ('b', 4861),
 ('c', 4299),
 ('d', 2743),
 ('e', 3492),
 ('f', 4130),
 ('g', 1770),
 ('h', 3106),
 ('i', 8691),
 ('j', 344),
 ('k', 465),
 ('l', 2689),
 ('m', 4819),
 ('n', 2063),
 ('o', 9720),
 ('p', 3880),
 ('q', 173),
 ('r', 2505),
 ('s', 7411),
 ('t', 18600),
 ('u', 1110),
 ('v', 936),
 ('w', 5886),
 ('x', 32),
 ('y', 479),
 ('z', 55),
 ('{', 19),
 ('|', 142),
 ('§', 77),
 ('æ', 6)]

In [16]:
# define a function and use it in spark
def key_pair(x):
    """Build a ``(first_character, word, 1)`` triple for one word.

    Args:
        x: The word, as a string.

    Returns:
        The tuple ``(x[:1], x, 1)``. For a non-empty word the first field is
        its first character; for an empty string it is ``''``.
    """
    # Slice instead of index: ''[0] raises IndexError, ''[:1] is ''.
    return (x[:1], x, 1)
# Pass the named function to map(), exactly as a lambda would be passed.
word_triples = book_sw.map(key_pair)
word_triples.take(5)

[('T', 'The', 1),
 ('P', 'Project', 1),
 ('G', 'Gutenberg', 1),
 ('E', 'EBook', 1),
 ('o', 'of', 1)]

In [18]:
# Stop the Spark session now that the notebook is finished with it.
spark_session.stop()