### Mike Whitley Lab 1

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!pip install -q pyspark

[K     |████████████████████████████████| 218.4MB 61kB/s 
[K     |████████████████████████████████| 204kB 46.3MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [0]:
import pyspark, os
from pyspark import SparkConf, SparkContext
os.environ["PYSPARK_PYTHON"]="python3"
os.environ["JAVA_HOME"]="/usr/lib/jvm/java-8-openjdk-amd64/"

#connects our python driver to a local Spark JVM running on the Google Colab server virtual machine
try:
  conf = SparkConf().setMaster("local[*]").set("spark.executor.memory", "1g")
  sc = SparkContext(conf = conf)
except ValueError:
  #it's ok if the server is already started
  pass


#also include this short helper function for use later in this lab
def dbg(x):
  """ A helper function to print debugging information on RDDs """
  if isinstance(x, pyspark.RDD):
    print([(t[0], list(t[1]) if 
            isinstance(t[1], pyspark.resultiterable.ResultIterable) else t[1])
           if isinstance(t, tuple) else t
           for t in x.take(100)])
  else:
    print(x)


In [0]:

# First lets create some simple data, say the integers from 1 to 1000. We use the python command range to do this. Note that in practice our data will most likely come from data files. 

A = range(1000)

In [0]:
# Now let us distribute this data across all our processes using the sc.parallelize function.
pA = sc.parallelize(A)

In [5]:
# Let us start with a simple task of computing the sum of the values in the array:
dbg(sum(A))

499500


In [6]:
# To do this for our distributed array, we call the reduce function with a lambda function that adds two values. 
dbg(pA.reduce(lambda a,b: a+b))

499500


# **Problem 1**

In [7]:
#part 1
import math
A = [1,2,3,4,5,6,7,8]

accum = math.sqrt(A[0])
for index in range (1, len(A)):
  accum += math.sqrt(A[index])
print(accum)



16.30600052603572


In [8]:
#part 2
import math
A = [1,2,3,4,5,6,7,8]
pA = sc.parallelize(A)

sqareroot = pA.map(lambda s: math.sqrt(s))

answer = sqareroot.reduce((lambda a, b: a + b))

print(answer)

16.30600052603572


#background 2

In [0]:
nums = sc.parallelize([1,2,3,4,5])

In [20]:
# retain elements passing a predicate 
evens = nums.filter(lambda x: x%2 == 0)  #returns even numbers online 
dbg(evens)

x = nums.flatMap(lambda x: range(x)) 

# retrieve RDD contents as a local collection 
dbg(x.collect())

# return first 2 elements
dbg(evens.take(2))

# count number of elements 
dbg(nums.count())

print(type(x.collect()))


[2, 4, 6]
[0, 0, 1, 0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 5]
[2, 4]
6
<class 'list'>


In [27]:
nums = sc.parallelize([1,2,3,4,5,6,7,8,9,10,11,12])
evens = nums.filter(lambda x: x%2 == 0)  #returns even numbers online 
dbg(evens.collect())
dbg(evens.take(3))



[2, 4, 6, 8, 10, 12]
[2, 4, 6]


# **Problem 2**

In [11]:
def repeat_nums(nums):
  return nums.flatMap(lambda x: [x]*x)

dbg(repeat_nums(sc.parallelize([1, 3, 4, 2, 1])))
#> [1, 3, 3, 3, 4, 4, 4, 4, 2, 2, 1]



[1, 3, 3, 3, 4, 4, 4, 4, 2, 2, 1]


In [12]:
import random
#Generate an array of 5 random numbers between 1 and 6 and parallelize it

dbg(repeat_nums(sc.parallelize(random.sample(range(1, 6), 5))))


[5, 5, 5, 5, 5, 4, 4, 4, 4, 1, 3, 3, 3, 2, 2]


In [13]:


dbg(repeat_nums(sc.parallelize([5, 2, 6, 3, 1])))
#> [5, 5, 5, 5, 5, 2, 2, 6, 6, 6, 6, 6, 6, 3, 3, 3, 1]


[5, 5, 5, 5, 5, 2, 2, 6, 6, 6, 6, 6, 6, 3, 3, 3, 1]


# Background 3


In [14]:
pair = ('a','b') 
print(pair[0], pair[1])

#Let’s consider a quick example
pets = sc.parallelize([('cat',1), ('dog',3), ('cat',2),('dog',1),('hamster',1)]) 
dbg(pets.reduceByKey(lambda x,y: x+y))
dbg(pets.groupByKey())
dbg(pets.sortByKey())

#Let’s try for a more complex example, word count and working with files. 
#First use a shell command to download the text of Peter Pan from the Guttenberg project
!wget -q -O peterpan.txt https://www.gutenberg.org/files/16/16-0.txt
# load the file into a distributed dataset of lines
file = sc.textFile("peterpan.txt")
# split each line into (word, 1) tuples
words = file.flatMap(lambda line: [(word.lower(), 1) for word in line.split(" ")])
# reduce by key (the word) the counts and sort descending
counts = words.reduceByKey(lambda a, b: a + b).sortBy(lambda x: x[1], False) 
dbg(counts.collect())


a b
[('cat', 3), ('dog', 4), ('hamster', 1)]
[('cat', [1, 2]), ('dog', [3, 1]), ('hamster', [1])]
[('cat', 1), ('cat', 2), ('dog', 3), ('dog', 1), ('hamster', 1)]


#**Problem 3**

##3a

In [15]:
!wget -q -O peterpan.txt https://www.gutenberg.org/files/16/16-0.txt
# load the file into a distributed dataset of lines
file = sc.textFile("peterpan.txt")
# split each line into (word, 1) tuples
words = file.flatMap(lambda line: [(word.lower(), 1) for word in line.split(" ")])
# reduce by key (the word) the counts and sort descending
counts = words.reduceByKey(lambda a, b: a + b).sortBy(lambda x: x[1], False) 
#dbg(counts.collect())
div = (counts.first()[1])

answer = counts.map(lambda a: (a[0], a[1] / div))
dbg(answer.collect())




##3b

In [16]:
#problem 3b load all the files in give them single value for question 3c
!wget -q -O sherlock.txt http://www.gutenberg.org/files/1661/1661-0.txt
!wget -q -O alice.txt http://www.gutenberg.org/files/11/11-0.txt
!wget -q -O huckleberry.txt http://www.gutenberg.org/files/76/76-0.txt
#removed the loading files from the function as it breaks half the time or fails to download

def itemsort(name):
  "tried to function this but keeps throwing errors"
  book = sc.textFile(name)
  words = book.flatMap(lambda line: [(word.lower(), 1) for word in line.split(" ")])
  obsolite_words_removed = words.distinct() #removes duplicates 
  return obsolite_words_removed #returns the single value words rdd

peterpan = itemsort("peterpan.txt")
sherlock = itemsort("sherlock.txt")
alice = itemsort("alice.txt")
huckleberry = itemsort("huckleberry.txt")

# print(len(peterpan.collect()))
# print(len(sherlock.collect()))
# print(len(alice.collect()))
# print(len(huckleberry.collect()))

dbg(peterpan)
dbg(sherlock)
dbg(alice)
dbg(huckleberry)



[('the', 1), ('peter', 1), ('pan,', 1), ('by', 1), ('m.', 1), ('for', 1), ('cost', 1), ('and', 1), ('with', 1), ('almost', 1), ('you', 1), ('copy', 1), ('it', 1), ('or', 1), ('under', 1), ('terms', 1), ('license', 1), ('included', 1), ('**', 1), ('a', 1), ('copyrighted', 1), ('please', 1), ('follow', 1), ('title:', 1), ('pan', 1), ('wendy', 1), ('25,', 1), ('2008', 1), ('[ebook', 1), ('#16]', 1), ('release', 1), ('july,', 1), ('updated:', 1), ('march', 1), ('10,', 1), ('2018', 1), ('english', 1), ('character', 1), ('encoding:', 1), ('[peter', 1), ('wendy]', 1), ('[james', 1), ('matthew', 1), ('barrie]', 1), ('(c)1991', 1), ('duncan', 1), ('contents:', 1), ('through', 1), ('2', 1), ('3', 1), ('come', 1), ('away,', 1), ('away!', 1), ('5', 1), ('island', 1), ('6', 1), ('little', 1), ('7', 1), ("mermaid's", 1), ('happy', 1), ('11', 1), ('children', 1), ('off', 1), ('13', 1), ('fairies?', 1), ('pirate', 1), ('ship', 1), ('15', 1), ('“hook', 1), ('me', 1), ('return', 1), ('grew', 1), ('up', 

#problem 3c IDFi (inverse document frequency)  = log2(N/ni)  where N = number of docs/4 and ni = how many documents have this term 


In [17]:
#files peterpan, sherlock, alice, huckleberry from question 3b all with a value of 1 with all duplicates removed 

#unions all files together could also reduceByKey 
union_file = sc.union([peterpan, sherlock, alice, huckleberry]).reduceByKey(lambda x,y : x+y)
# dbg(union_file)
IDFi = (union_file.map(lambda a: (a[0], math.log2(4/a[1]))).sortBy(lambda x: x[1],False))
dbg(union_file)
dbg(IDFi.collect())



[('gutenberg', 4), ('of', 4), ('james', 3), ('', 4), ('ebook,', 4), ('file.', 1), ('author:', 4), ('date:', 4), ('set', 4), ('start', 4), ('fulcrum', 2), ('research', 4), ('4', 4), ('bird', 4), ('10', 2), ('12', 1), ('are', 4), ('believe', 4), ('16', 1), ('when', 4), ('one,', 4), ('two', 4), ('garden,', 3), ('plucked', 2), ('flower', 2), ('i', 4), ('rather', 4), ('heart', 4), ('cried,', 3), ('why', 4), ('like', 4), ('them', 4), ('but', 4), ('came', 4), ('romantic', 1), ('mocking', 2), ('boxes,', 2), ('however', 2), ('mouth', 4), ('corner.', 3), ('he', 4), ('trying', 4), ('thought', 4), ('stocks', 1), ('really', 4), ('seemed', 4), ('would', 4), ('married', 3), ('cauliflowers', 1), ('dropped', 4), ('babies', 1), ("darling's", 1), ('doubtful', 2), ('frightfully', 1), ('honourable,', 1), ('imploringly.', 1), ('risk', 2), ('might,', 4), ('eight', 3), ('it!--did', 1), ('question', 4), ('seven?”', 1), ('george,”', 1), ('cried.', 4), ('mumps,”', 1), ('threateningly,', 1), ('thirty', 3), ('spea

#problem 3d Compute TF.IDF score for each term in Peter Pan and collect/display the top 100 terms by TF.IDF score   

In [18]:
#3d
#word count in the book / highest count in the  book * the number from combined  TF * IDF

def TF_IDF_score(peterpan, union_file):
  dbg(peterpan)
  #join the files so we get only what's needed
  result = peterpan.join(IDFi)
  #do the mathmatical calculation 
  result = result.map(lambda x : (x[0], x[1][0]* x[1][1]))
  return result

dbg(TF_IDF_score(answer, union_file).sortBy(lambda x: x[1], False).collect())

[('the', 1.0), ('', 0.899641577060932), ('and', 0.5675029868578255), ('to', 0.4942254082039028), ('he', 0.40979689366786143), ('a', 0.3906810035842294), ('of', 0.38868976503385105), ('was', 0.35961768219832735), ('in', 0.28634010354440464), ('it', 0.23655913978494625), ('that', 0.2309836718438869), ('she', 0.2285941855834329), ('they', 0.22580645161290322), ('had', 0.20111509358821186), ('his', 0.1875746714456392), ('you', 0.17801672640382318), ('but', 0.1768219832735962), ('for', 0.16248506571087215), ('not', 0.15452011150935882), ('with', 0.14854639585822382), ('her', 0.14774990043807248), ('is', 0.13978494623655913), ('on', 0.13580246913580246), ('at', 0.1354042214257268), ('as', 0.13500597371565112), ('i', 0.10075667064914377), ('be', 0.09996017522899243), ('have', 0.0991636798088411), ('peter', 0.0967741935483871), ('were', 0.0967741935483871), ('all', 0.09478295499800876), ('this', 0.0931899641577061), ('their', 0.08721624850657109), ('said', 0.08681800079649542), ('so', 0.085623

In [19]:
nums = sc.parallelize([1,2,3,4,5,6])
results = nums.map(lambda x: x + 1) 
dbg(results)

alpha = sc.parallelize(['the', 'poop', 'derp'])
results_alpha = alpha.map(lambda x: True if x == 'the' else False) 
dbg(results_alpha)

[2, 3, 4, 5, 6, 7]
[True, False, False]
