In [1]:
from pyspark.sql import SparkSession
from operator import add

# Build (or reuse) a session on the standalone cluster master.
# A parenthesized chain is used so no line depends on a trailing backslash.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.70:7077")
    .appName("TianruZ_lecture1_hdfs_example")
    # Dynamic allocation settings (executor count follows the workload).
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", True)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    # Cap on the cores this application may request from the cluster.
    .config("spark.cores.max", 4)
    .getOrCreate()
)

# Handle for the old RDD-based API.
spark_context = spark_session.sparkContext

# Raise the log threshold so only ERROR messages are reported from here on.
spark_context.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/02/10 10:52:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/02/10 10:52:26 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
23/02/10 10:52:28 WARN ExecutorAllocationManager: Dynamic allocation without a shuffle service is an experimental feature.


In [2]:
# Read the speech into an RDD of text lines, then show the first one as a sanity check.
lines = spark_context.textFile("/home/ubuntu/i_have_a_dream.txt")
lines.first()

                                                                                

'I am happy to join with you today in what will go down in history as the greatest demonstration for freedom in the history of our nation.'

In [3]:
# Number of partitions backing the `lines` RDD.
lines.getNumPartitions()

2

In [4]:
# Same word-count example, now expressed with map and reduce from the Spark API,
# with the text file loaded from HDFS.

lines = spark_context.textFile("hdfs://192.168.2.70:9000/king_dream.txt")
# Local-file alternative: spark_context.textFile("/home/ubuntu/i_have_a_dream.txt")
print(lines.first())

# One list of space-separated tokens per line.
words = lines.map(lambda text: text.split(' '))

# Tokens per line (the built-in len is passed directly).
word_counts = words.map(len)

# Add up the per-line counts.
total_words = word_counts.reduce(add)

print(f'total words= {total_words}')

# ... the same number of words?
# Note: ''.split(' ') returns [''], so every blank line adds 1 to this total.

I am happy to join with you today in what will go down in history as the greatest demonstration for freedom in the history of our nation.
total words= 1680


In [5]:
# Peek at the first 10 elements of `lines`; blank lines in the file appear as empty strings.
lines.take(10)

['I am happy to join with you today in what will go down in history as the greatest demonstration for freedom in the history of our nation.',
 '',
 'Five score years ago, a great American, in whose symbolic shadow we stand today, signed the Emancipation Proclamation. This momentous decree came as a great beacon light of hope to millions of Negro slaves who had been seared in the flames of withering injustice. It came as a joyous daybreak to end the long night of their captivity.',
 '',
 'But one hundred years later, the Negro still is not free. One hundred years later, the life of the Negro is still sadly crippled by the manacles of segregation and the chains of discrimination. One hundred years later, the Negro lives on a lonely island of poverty in the midst of a vast ocean of material prosperity. One hundred years later, the Negro is still languishing in the corners of American society and finds himself an exile in his own land. So we have come here today to dramatize a shameful con

In [6]:
# map yields one output element per input line: here, that line's list of tokens.
lines_splitted = lines.map(lambda text: text.split(' '))
print(lines_splitted.first())

['I', 'am', 'happy', 'to', 'join', 'with', 'you', 'today', 'in', 'what', 'will', 'go', 'down', 'in', 'history', 'as', 'the', 'greatest', 'demonstration', 'for', 'freedom', 'in', 'the', 'history', 'of', 'our', 'nation.']


In [7]:
# Note: this is Python, but the RDD API uses Java naming conventions (e.g. flatMap).

# flatMap flattens the per-line token lists into a single RDD of tokens.
all_words = lines.flatMap(lambda text: text.split(' '))
all_words.take(100)

['I',
 'am',
 'happy',
 'to',
 'join',
 'with',
 'you',
 'today',
 'in',
 'what',
 'will',
 'go',
 'down',
 'in',
 'history',
 'as',
 'the',
 'greatest',
 'demonstration',
 'for',
 'freedom',
 'in',
 'the',
 'history',
 'of',
 'our',
 'nation.',
 '',
 'Five',
 'score',
 'years',
 'ago,',
 'a',
 'great',
 'American,',
 'in',
 'whose',
 'symbolic',
 'shadow',
 'we',
 'stand',
 'today,',
 'signed',
 'the',
 'Emancipation',
 'Proclamation.',
 'This',
 'momentous',
 'decree',
 'came',
 'as',
 'a',
 'great',
 'beacon',
 'light',
 'of',
 'hope',
 'to',
 'millions',
 'of',
 'Negro',
 'slaves',
 'who',
 'had',
 'been',
 'seared',
 'in',
 'the',
 'flames',
 'of',
 'withering',
 'injustice.',
 'It',
 'came',
 'as',
 'a',
 'joyous',
 'daybreak',
 'to',
 'end',
 'the',
 'long',
 'night',
 'of',
 'their',
 'captivity.',
 '',
 'But',
 'one',
 'hundred',
 'years',
 'later,',
 'the',
 'Negro',
 'still',
 'is',
 'not',
 'free.',
 'One',
 'hundred']

In [8]:
# First 20 tokens that start with a lowercase 'd' (startswith is case-sensitive).
(
    all_words
    .filter(lambda token: token.startswith('d'))
    .take(20)
)

['down',
 'demonstration',
 'decree',
 'daybreak',
 'discrimination.',
 'dramatize',
 'defaulted',
 'demand',
 'drug',
 'democracy.',
 'dark',
 'desolate',
 'discontent',
 'day',
 'deeds.',
 'drinking',
 'dignity',
 'discipline.',
 'degenerate',
 'distrust']

In [9]:
# Release the cores for another application!
# Stop via the session rather than the bare context: this stops the underlying
# SparkContext and also clears the cached session, so re-running the setup
# cell builds a fresh session.
spark_session.stop()