In [None]:
%%html
<style>
/* Shared look for the narrative (prose) cells of this notebook. */
.h1_cell, .just_text {
    box-sizing: border-box;
    font-family: "Times New Roman", Georgia, Serif;
    font-size: 125%;
    line-height: 22px; /* 5px +12px + 5px */
    text-indent: 25px;
    background-color: #fbfbea;
    /* The shorthand sets all four sides; separate top/bottom values placed
       before it would be overridden, so none are declared. */
    padding: 10px;
}
/* Compact look for code excerpts. */
.code_block {
    box-sizing: border-box;
    font-size: 75%;
    line-height: 22px; /* 5px +12px + 5px */
    /* Disabled declarations, kept for reference. CSS has no '#' comments. */
    /* text-indent: 25px; */
    /* background-color: #fbfbea; */
    padding: 5px;
}

/* Horizontal rule: centred, with a small vertical margin and an inset border. */
hr {
    display: block;
    margin-top: 0.5em;
    margin-bottom: 0.5em;
    margin-left: auto;
    margin-right: auto;
    border-style: inset;
    border-width: 2px;
}
</style>

<h1>
<center>
Download and Installation
</center>
</h1>

In [1]:
import requests
import os
import sys
import subprocess

<div class=h1_cell>
<p>
Download the Spark tarball into the current directory. The URL is one of many mirrors listed on Spark's official website.
</div>

In [None]:
spark_url = "http://apache.osuosl.org/spark/spark-2.3.0/spark-2.3.0-bin-hadoop2.7.tgz"
# The archive name is the last path component of the URL.
filename = spark_url.rsplit('/', 1)[-1]

# stream=True defers the body download; the chunks are consumed below so the
# tarball is written to disk piece by piece instead of being held in memory.
r = requests.get(spark_url, stream=True)
# Stop here on an HTTP error rather than saving an error page as the tarball.
r.raise_for_status()
with open(filename, 'wb') as f:
    for chunk in r.iter_content(chunk_size=1024 * 1024):
        f.write(chunk)

<div class=h1_cell>
<p>
Extract the Spark tarball into the 'spark/' directory.
</div>

In [None]:
# Create the target directory; -p makes re-running this cell harmless when
# the directory already exists.
subprocess.check_call(['mkdir', '-p', 'spark'])
# Unpack into spark/, dropping the archive's top-level directory.
# check_call raises CalledProcessError if tar fails, so a broken or missing
# archive is not silently ignored.
subprocess.check_call([
    'tar', '-xf', 'spark-2.3.0-bin-hadoop2.7.tgz',
    '-C', 'spark', '--strip-components', '1',
])

<div class=h1_cell>
<p>
Add spark_home environment variable.
<p>
Add spark_home + '/bin' to run a pyspark console.
<p>
Add spark_home + '/python*' to environment to import pyspark.
<p>
NOTE: Run this chunk even if you already have Spark installed.
</div>

In [2]:
# Point SPARK_HOME at the 'spark' directory under the user's home directory.
spark_home = os.environ['HOME'] + '/spark'
os.environ['SPARK_HOME'] = spark_home

# Expose the Spark command-line tools. The membership check keeps PATH from
# growing every time this cell is re-run.
spark_bin = spark_home + '/bin'
if spark_bin not in os.environ['PATH'].split(':'):
    os.environ['PATH'] += ':' + spark_bin

# Make pyspark and its bundled py4j importable, again without duplicates.
for entry in (spark_home + '/python',
              spark_home + '/python/lib/py4j-0.10.6-src.zip'):
    if entry not in sys.path:
        sys.path.append(entry)

<div class=h1_cell>
<p>
Write the names of the slave nodes in the cluster. This script currently assumes the master machine is blue0.
</div>

In [None]:
# Hostnames of the worker machines, written one per line to conf/slaves.
machines = ["blue1", "blue3"]
slaves_path = os.environ['SPARK_HOME'] + "/conf/slaves"
with open(slaves_path, 'w') as slaves_file:
    slaves_file.write("\n".join(machines))

<div class=h1_cell>
<p>
We now need to do the same exact thing on all the slave nodes. We will:
<ul>
<li>
Convert this notebook to a python file.
<li>
Delete the marker comment in the code below and every line after it.
<li>
Run the edited Python script on each slave node.
</ul>
<p>
Note: These nodes must have password-less ssh tunneling configured.
</div>

In [None]:
# Convert this ipython notebook to python script
# NOTE: the comment line above doubles as a marker. The cell that trims
# setup_spark.py compares each line against its exact text, so do not reword it.
!jupyter nbconvert --to=python setup_spark.ipynb

In [None]:
# Rewrite setup_spark.py in place, keeping only the installation steps:
# everything before the marker comment that opens the conversion cell.
marker = '# Convert this ipython notebook to python script'

with open('setup_spark.py', 'r') as read_file:
    lines = read_file.readlines()

with open('setup_spark.py', 'w') as f:
    for line in lines:
        if line.strip() == marker:
            # The marker and everything after it are notebook-only steps.
            break
        # nbconvert's Python export renders magics (such as the %%html cell)
        # as get_ipython() calls. That name does not exist under a plain
        # `python` interpreter, so those lines are dropped.
        if line.lstrip().startswith('get_ipython()'):
            continue
        f.write(line)

<div class=h1_cell>
<p>
This will take a while. For each node, the script downloads Spark, extracts the package, and configures the environment.
</div>

In [None]:
!ssh blue1 python < setup_spark.py
!ssh blue3 python < setup_spark.py

<h1>
<center>
Using PySpark
</center>
</h1>

<div class=h1_cell>
<p>
Start the cluster.
</div>

In [3]:
# Run Spark's start-all.sh with this process's environment.
# The script's return code is shown as the cell output.
start_script = os.environ['SPARK_HOME'] + "/sbin/start-all.sh"
subprocess.call(start_script, env=os.environ)

0

<div class=h1_cell>
<p>
This is where most other programs regarding this project will start.
</div>

In [4]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Connect to the standalone master on blue0. getOrCreate() returns the
# already-active session when there is one, so re-running this cell does not
# fail by trying to start a second SparkContext in the same kernel.
spark = SparkSession.builder.master('spark://blue0:7077').getOrCreate()
spark

<div class=h1_cell>
<p>
Let's generate some semi-random data and run a Spark K-Means implementation on it. We'll create a set of 2D points grouped around 4 centers.
</div>

In [5]:
# 4 clusters
from random import random, seed, shuffle

# Seed the generator so the cell regenerates the same points on every run.
seed(0)

# Number of points drawn for each of the four clusters.
POINTS_PER_CLUSTER = 2500

# Each cluster is drawn from one quadrant of the unit square; every point is
# an [A, B] pair.
upper_left = [[random()*0.5, random()*0.5 + 0.5] for _ in range(POINTS_PER_CLUSTER)]
upper_right = [[random()*0.5 + 0.5 for _ in range(2)] for _ in range(POINTS_PER_CLUSTER)]
bottom_left = [[random()*0.5 for _ in range(2)] for _ in range(POINTS_PER_CLUSTER)]
bottom_right = [[random()*.5 + 0.5, random()*0.5] for _ in range(POINTS_PER_CLUSTER)]

# Mix the clusters together so row order carries no cluster information.
matrix = upper_left + upper_right + bottom_left + bottom_right
shuffle(matrix)

data = spark.createDataFrame(matrix, schema=["A", "B"])
data.show(5)

+-------------------+-------------------+
|                  A|                  B|
+-------------------+-------------------+
|0.37934741961671314|0.19842292770414172|
| 0.4827234474797606| 0.8368192843182927|
| 0.7645647616824591| 0.2901033322963689|
|0.04517736175807846| 0.2516191581625483|
| 0.0997401767803513| 0.8537358668875344|
+-------------------+-------------------+
only showing top 5 rows



In [6]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

# Combine every column of `data` into one vector column named "features".
assembler = VectorAssembler(inputCols=data.columns, outputCol="features")
vdf = assembler.transform(data)
vdf.show(5)

+-------------------+-------------------+--------------------+
|                  A|                  B|            features|
+-------------------+-------------------+--------------------+
|0.37934741961671314|0.19842292770414172|[0.37934741961671...|
| 0.4827234474797606| 0.8368192843182927|[0.48272344747976...|
| 0.7645647616824591| 0.2901033322963689|[0.76456476168245...|
|0.04517736175807846| 0.2516191581625483|[0.04517736175807...|
| 0.0997401767803513| 0.8537358668875344|[0.09974017678035...|
+-------------------+-------------------+--------------------+
only showing top 5 rows



In [7]:
# A fixed seed makes the random centre initialisation, and therefore the
# fitted model, repeatable across runs.
kmeans = KMeans(k=4, maxIter=10, initMode="random", seed=1)
model = kmeans.fit(vdf)

# Clustering cost of the fitted model on the training data.
wssse = model.computeCost(vdf)
print("Within Set Sum of Squared Errors = " + str(wssse))

print("Centers:")
model.clusterCenters()

Within Set Sum of Squared Errors = 416.826660058
Centers:


[array([ 0.26453331,  0.23473974]),
 array([ 0.23673167,  0.74518827]),
 array([ 0.7365823 ,  0.74851205]),
 array([ 0.76313915,  0.25215131])]

<div class=h1_cell>
<p>
You can compare this to a python-only single-machine k-means to get an idea of performance gain.
</div>

In [15]:
# Shut down the Spark session created above; the next cell then stops the cluster.
spark.stop()

In [16]:
# Run Spark's stop-all.sh; its return code is shown as the cell output.
stop_script = os.environ['SPARK_HOME'] + "/sbin/stop-all.sh"
subprocess.call(stop_script)

0