In [6]:
import os
# absolute path to this file
cwd = os.getcwd()

from pyspark import SparkConf, SparkContext, SQLContext
# Create a spark configuration with 20 threads.
# This code will run locally on master
conf = (SparkConf ()
        . setMaster("local[20]")
        . setAppName("sample app for reading files")
        . set("spark.executor.memory", "2g"))

sc = SparkContext(conf=conf)

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=sample app for reading files, master=local[20]) created by __init__ at <ipython-input-1-0704b9424f5b>:13 

In [7]:
#get file from local paths instead of HDFS.
#lines.sc.textFile("hdfs:/user/cloudera/words.txt")
lines = sc.textFile(os.path.join (cwd, "big-data-3\spark-wordcount\words.txt"))
lines.count()

124456

In [8]:
#Split each line into words
words = lines.flatMap(lambda line: line.split(" "))
#Assign initial count value to each word. Next, we will create tuples for each word with an initial count of 1
tuples = words.map(lambda word : (word, 1))

In [9]:
#Sum all word count values. 
counts = tuples.reduceByKey(lambda a,b : (a+b))




For those working with a larger dataset:

- rdd.collect() should not be used in this case as it will collect all data as an Array in the driver, which is the easiest way to get out of memory.

- rdd.coalesce(1).saveAsTextFile() should also not be used as the parallelism of upstream stages will be lost to be performed on a single node, where data will be stored from.

- rdd.coalesce(1, shuffle = true).saveAsTextFile() is the best simple option as it will keep the processing of upstream tasks parallel and then only perform the shuffle to one node (rdd.repartition(1).saveAsTextFile() is an exact synonym).

- rdd.saveAsSingleTextFile() as provided bellow additionally allows one to store the rdd in a single file with a specific name while keeping the parallelism properties of rdd.coalesce(1, shuffle = true).saveAsTextFile().

In [10]:
#Write word counts to text file/HDFS.
#----------------------------------
# Method 1. Save file to local machine as a text file. Viable to small datasets only.
# Collect RDD if we want to output by text file.
arr = counts.collect()

with open(os.path.join (cwd,"big-data-3\spark-wordcount\count.txt"), "w") as output:
    output.write(str(arr))


In [None]:
# Method 2: save as HDFS structure to a directory. 
counts.coalesce(1, shuffle = True).saveAsTextFile("big-data-3/spark-wordcount/result")