In [0]:
# Install a headless Java 8 JDK; -qq and the redirect to /dev/null silence apt's output.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Install PySpark quietly (-q).
!pip install -q pyspark

In [0]:
import pyspark, os
import math
from math import sqrt
from math import log
from random import seed
from random import randint
from pyspark import SparkConf, SparkContext
os.environ["PYSPARK_PYTHON"]="python3"
os.environ["JAVA_HOME"]="/usr/lib/jvm/java-8-openjdk-amd64/"

# Connects our python driver to a local Spark JVM running on the Google Colab
# server virtual machine.
# getOrCreate() hands back the already-running context when this cell is
# re-run, so `sc` is always bound afterwards (the previous try/except silently
# skipped the assignment when a context already existed).
conf = SparkConf().setMaster("local[*]").set("spark.executor.memory", "1g")
sc = SparkContext.getOrCreate(conf=conf)


#also include this short helper function for use later in this lab
def dbg(x):
  """Print debugging information for an RDD (or any other value).

  For an RDD, prints a list built from its first 100 elements. Tuple
  elements are shown as (first, second) pairs, with a grouped
  ResultIterable in the second slot expanded into a plain list so its
  contents are visible. Anything that is not an RDD is printed as-is.
  """
  if not isinstance(x, pyspark.RDD):
    print(x)
    return

  def _printable(item):
    # Non-tuple elements are shown unchanged.
    if not isinstance(item, tuple):
      return item
    first = item[0]
    second = item[1]
    if isinstance(second, pyspark.resultiterable.ResultIterable):
      second = list(second)
    return (first, second)

  print([_printable(item) for item in x.take(100)])


Create a Python Spark program that does the following:

1. Loads each line of the text file pg16328.txt as an entry in an RDD:  https://www.gutenberg.org/cache/epub/16328/pg16328.txt
hint: you can use this in your colab notebook to download the file automatically:
!wget "https://www.gutenberg.org/cache/epub/16328/pg16328.txt"

2. Maps each RDD entry so that each entry contains a tuple of (set of lowercase letters, the original line) (hint: use Python's set() together with the lower() and isalpha() string methods). For example, the following line:

"{The famous race of Spear-Danes.}"

would be represented as the tuple 

({'a', 'c', 'd', 'e', 'f', 'h', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u'}, "{The famous race of Spear-Danes.}")

3. Finds and prints the 5 lines in the file with the highest Jaccard Similarity to your full name. Recall that Jaccard Similarity is the size of the intersection / size of the union of two sets. For example using my name "James Atlas" the output should be:

['States.', '1859. Alliterative measures.', 'BATTLE-SARK.--Armor.', '{The gleeman sings}', '          Than the head and the handle handsome with jewels;']


Full credit for efficient solutions that use Spark functions

Here is an example colab starter for Spark:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!pip install -q pyspark
import os
os.environ["PYSPARK_PYTHON"]="python3"
os.environ["JAVA_HOME"]="/usr/lib/jvm/java-8-openjdk-amd64/"

import pyspark
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local[*]").set("spark.executor.memory", "1g")
sc = SparkContext(conf = conf)

In [47]:
!wget "https://www.gutenberg.org/cache/epub/16328/pg16328.txt"
df = sc.textFile("pg16328.txt")

# Print the rdd of lines for marker
dbg(file)

--2020-06-16 04:24:24--  https://www.gutenberg.org/cache/epub/16328/pg16328.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 301063 (294K) [text/plain]
Saving to: ‘pg16328.txt.6’


2020-06-16 04:24:24 (1.08 MB/s) - ‘pg16328.txt.6’ saved [301063/301063]

['The Project Gutenberg EBook of Beowulf ', '', 'This eBook is for the use of anyone anywhere at no cost and with', 'almost no restrictions whatsoever.  You may copy it, give it away or', 're-use it under the terms of the Project Gutenberg License included', 'with this eBook or online at www.gutenberg.net', '', '', 'Title: Beowulf', '       An Anglo-Saxon Epic Poem, Translated From The Heyne-Socin', '       Text by Lesslie Hall', '', 'Author:  ', '', 'Release Date: July 19, 2005 [EBook #16328]', '', 'Language: English', '', '', '*** START OF THIS 

In [0]:
def strip_words(file):
  """Return the set of distinct letters found in `file`, lowercased.

  `file` is iterated character by character; every character for which
  isalpha() is false (digits, punctuation, whitespace) is dropped.
  """
  return {char.lower() for char in file if char.isalpha()}

In [49]:
# Pair every line with the set of lowercase letters it contains.
values = df.map(lambda line: (strip_words(line), line))
# Print for marker
dbg(values)

[({'b', 'j', 'o', 't', 'w', 'g', 'p', 'f', 'u', 'e', 'n', 'k', 'r', 'c', 'l', 'h'}, 'The Project Gutenberg EBook of Beowulf '), (set(), ''), ({'s', 'b', 'a', 'o', 't', 'w', 'f', 'e', 'u', 'n', 'k', 'r', 'c', 'd', 'i', 'h', 'y'}, 'This eBook is for the use of anyone anywhere at no cost and with'), ({'s', 'm', 'a', 'o', 't', 'w', 'g', 'p', 'v', 'e', 'u', 'n', 'r', 'c', 'l', 'i', 'h', 'y'}, 'almost no restrictions whatsoever.  You may copy it, give it away or'), ({'s', 'b', 'm', 'j', 'o', 't', 'g', 'p', 'f', 'u', 'e', 'n', 'r', 'c', 'l', 'd', 'i', 'h'}, 're-use it under the terms of the Project Gutenberg License included'), ({'s', 'b', 'a', 'o', 't', 'w', 'g', 'u', 'e', 'n', 'k', 'r', 'l', 'i', 'h'}, 'with this eBook or online at www.gutenberg.net'), (set(), ''), (set(), ''), ({'b', 'o', 't', 'w', 'f', 'u', 'e', 'l', 'i'}, 'Title: Beowulf'), ({'s', 'm', 'a', 'o', 't', 'g', 'p', 'x', 'f', 'e', 'n', 'r', 'c', 'l', 'd', 'i', 'h', 'y'}, '       An Anglo-Saxon Epic Poem, Translated From The He

In [0]:
# Letters of the example name from the assignment text ("James Atlas").
name = list("jamesatlas")
# Letters of the name that the lines are scored against.
my_name = list("greyharris")

In [63]:
def jaccard_similarity(list1, list2):
  """Calculate the Jaccard similarity of two collections.

  Both arguments are converted to sets, so duplicates are ignored and any
  iterable of hashable items (list, set, string, ...) is accepted.

  Returns:
    len(intersection) / len(union) as a float in [0, 1]. When both inputs
    are empty the union is empty as well; 0.0 is returned in that case
    instead of raising ZeroDivisionError.
  """
  set_one = set(list1)
  set_two = set(list2)
  union_size = len(set_one | set_two)
  if union_size == 0:
    # Two empty sets share nothing; avoid dividing by zero.
    return 0.0
  return len(set_one & set_two) / union_size

# Score each line's letter set against the name, keeping
# (similarity, original line) pairs.
results = values.map(
    lambda entry: (jaccard_similarity(entry[0], my_name), entry[1]))
# Order by similarity, highest first.
results_sorted = results.sortByKey(ascending=False)
dbg(results_sorted)

[(0.7, 'BIGHT.--Bay, sea.'), (0.7, '{Hrothgar retires.}'), (0.7, '    he arose again_.'), (0.6666666666666666, '{He rules the Geats fifty years.}'), (0.6666666666666666, '        5 Then the trusty retainer treasure-gems many'), (0.6363636363636364, 'TARGE, TARGET.--Shield.'), (0.6363636363636364, "       45 High on his horse then Hrothgar's retainer"), (0.6363636363636364, '{The waters are gory.}'), (0.6153846153846154, '      Wiglaf the Trusty'), (0.6153846153846154, 'his goodness of heart, and his generosity._'), (0.6, 'see the strangers.}'), (0.5833333333333334, "      Hrothgar's Gratitude"), (0.5833333333333334, '       35 That greater strength in the waters I had then,'), (0.5833333333333334, "HROTHGAR'S GRATITUDE."), (0.5833333333333334, '    _Seized by the hair_.'), (0.5833333333333334, '          To the royal ring-hall, Hrothgar to greet there:'), (0.5833333333333334, '{All my gifts I lay at thy feet.}'), (0.5833333333333334, "{Higelac's death recalled.}"), (0.5833333333333334,

In [70]:
# Drop the similarity scores and show the five best-matching lines.
results_sorted.values().take(5)

['BIGHT.--Bay, sea.',
 '{Hrothgar retires.}',
 '    he arose again_.',
 '{He rules the Geats fifty years.}',
 '        5 Then the trusty retainer treasure-gems many']