In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!pip install -q pyspark

[K     |████████████████████████████████| 217.8MB 59kB/s 
[K     |████████████████████████████████| 204kB 43.7MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [0]:
import pyspark, os
import math
from math import sqrt
from pyspark import SparkConf, SparkContext
os.environ["PYSPARK_PYTHON"]="python3"
os.environ["JAVA_HOME"]="/usr/lib/jvm/java-8-openjdk-amd64/"

#connects our python driver to a local Spark JVM running on the Google Colab server virtual machine
try:
  conf = SparkConf().setMaster("local[*]").set("spark.executor.memory", "1g")
  sc = SparkContext(conf = conf)
except ValueError:
  #it's ok if the server is already started
  pass


#also include this short helper function for use later in this lab
def dbg(x):
  """ A helper function to print debugging information on RDDs """
  if isinstance(x, pyspark.RDD):
    print([(t[0], list(t[1]) if 
            isinstance(t[1], pyspark.resultiterable.ResultIterable) else t[1])
           if isinstance(t, tuple) else t
           for t in x.take(100)])
  else:
    print(x)


In [14]:
data = [1, 2, 3, 4, 5]
distData = sc.parallelize(data)

print(data)
print(distData)

value = distData.reduce(lambda a, b: a + b)
print(value)


[1, 2, 3, 4, 5]
ParallelCollectionRDD[12] at parallelize at PythonRDD.scala:195
15


In [17]:
A = range(1000)
pA = sc.parallelize(A) 

#prints sum
dbg(sum(A))

#prints sum but parrallised version 
dbg(pA.reduce(lambda a,b: a+b))

499500
499500


In [29]:
pB = sc.parallelize(range(1000))
pBsquares = pB.map(lambda b: sqrt(b))
pBresult = pBsquares.reduce(lambda a, b: a + b)

print(pBresult)


21065.833110879044


In [44]:
# Let us quickly review some basic transformations availble within Spark. Lets create a smaller list of numbers to play with. 

nums = sc.parallelize([1,2,3,4,5]) 

# retain elements passing a predicate 
evens = nums.filter(lambda x: x%2 == 0) 
#print("evens")
#print(dbg(evens))

# map each element to zero or more others 
x = nums.flatMap(lambda x: range(x)) 
#print("x")
#print(dpg(x))


# retrieve RDD contents as a local collection 
dbg(x.collect())

# return first 2 elements
dbg(evens.take(2))

# count number of elements 
dbg(nums.count())


[0, 0, 1, 0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 4]
[2, 4]
5


In [46]:
#def repeat_nums(numList):
#  pnumList = sc.parallelize(numList)
#  collection = 
#  return collection

def repeat_nums(numList):
  return numList.flatMap(lambda x: [x] * x)

dbg(repeat_nums(sc.parallelize([1, 3, 4, 2, 1])))
#> [1, 3, 3, 3, 4, 4, 4, 4, 2, 2, 1]

dbg(repeat_nums(sc.parallelize([5, 2, 6, 3, 1])))
#> [5, 5, 5, 5, 5, 2, 2, 6, 6, 6, 6, 6, 6, 3, 3, 3, 1]


[1, 3, 3, 3, 4, 4, 4, 4, 2, 2, 1]
[5, 5, 5, 5, 5, 2, 2, 6, 6, 6, 6, 6, 6, 3, 3, 3, 1]


In [49]:
#Due to the ease and performance of distributed hashing, 
#the original form MapReduce concept defaults to using (key,value) pairs as the data representation. 
#In python we use tuples to represent these

pair = ('a','b') 
print(pair[0], pair[1])

#Let’s consider a quick example
pets = sc.parallelize([('cat',1), ('dog',3), ('cat',2),('dog',1),('hamster',1)]) 
print("pets start")
dbg(pets.reduceByKey(lambda x,y: x+y))
dbg(pets.groupByKey())
dbg(pets.sortByKey())
print("pets end")

#Let’s try for a more complex example, word count and working with files. 
#First use a shell command to download the text of Peter Pan from the Guttenberg project
!wget -q -O peterpan.txt https://www.gutenberg.org/files/16/16-0.txt
# load the file into a distributed dataset of lines
file = sc.textFile("peterpan.txt")
# split each line into (word, 1) tuples
words = file.flatMap(lambda line: [(word.lower(), 1) for word in line.split(" ")])
# reduce by key (the word) the counts and sort descending
counts = words.reduceByKey(lambda a, b: a + b).sortBy(lambda x: x[1], False) 
dbg(counts.collect())


a b
pets start
[('cat', 3), ('dog', 4), ('hamster', 1)]
[('cat', [1, 2]), ('dog', [3, 1]), ('hamster', [1])]
[('cat', 1), ('cat', 2), ('dog', 3), ('dog', 1), ('hamster', 1)]
pets end


In [0]:
!wget -q -O peterpan.txt https://www.gutenberg.org/files/16/16-0.txt
!wget -q -O sherlockholmes.txt http://www.gutenberg.org/files/1661/1661-0.txt
!wget -q -O aliceinwonderland.txt http://www.gutenberg.org/files/11/11-0.txt
!wget -q -O huckleberryfinn.txt http://www.gutenberg.org/files/76/76-0.txt

In [0]:
def word_counts(filename):
    file = sc.textFile(filename)
    words = file.flatMap(lambda line: [(word.lower(), 1) for word in line.split(" ")])
    counts = words.reduceByKey(lambda a, b: a + b).sortBy(lambda x: x[1], False) 
    #return counts.collect()
    return counts

In [75]:
dbg(word_counts("peterpan.txt").collect())
dbg(word_counts("sherlockholmes.txt").collect())
dbg(word_counts("aliceinwonderland.txt").collect())
dbg(word_counts("huckleberryfinn.txt").collect())



In [0]:
panCounts = word_counts("peterpan.txt")
sherlockCounts = word_counts("sherlockholmes.txt")
aliceCounts = word_counts("aliceinwonderland.txt")
huckCounts = word_counts("huckleberryfinn.txt")

In [0]:
def computeTFji(scObject):
    max_count = scObject.take(1)[0][1]
    TFji = scObject.map(lambda x: (x[0], x[1]/max_count))
    return TFji

In [76]:
dbg(computeTFji(panCounts))
dbg(computeTFji(sherlockCounts))
dbg(computeTFji(aliceCounts))
dbg(computeTFji(huckCounts))

[('the', 1.0), ('', 0.899641577060932), ('and', 0.5675029868578255), ('to', 0.4942254082039028), ('he', 0.40979689366786143), ('a', 0.3906810035842294), ('of', 0.38868976503385105), ('was', 0.35961768219832735), ('in', 0.28634010354440464), ('it', 0.23655913978494625), ('that', 0.2309836718438869), ('she', 0.2285941855834329), ('they', 0.22580645161290322), ('had', 0.20111509358821186), ('his', 0.1875746714456392), ('you', 0.17801672640382318), ('but', 0.1768219832735962), ('for', 0.16248506571087215), ('not', 0.15452011150935882), ('with', 0.14854639585822382), ('her', 0.14774990043807248), ('is', 0.13978494623655913), ('on', 0.13580246913580246), ('at', 0.1354042214257268), ('as', 0.13500597371565112), ('i', 0.10075667064914377), ('be', 0.09996017522899243), ('have', 0.0991636798088411), ('peter', 0.0967741935483871), ('were', 0.0967741935483871), ('all', 0.09478295499800876), ('this', 0.0931899641577061), ('their', 0.08721624850657109), ('said', 0.08681800079649542), ('so', 0.085623