In [None]:
%%html
<style>
.h1_cell, .just_text {
    box-sizing: border-box;
    padding-top:5px;
    padding-bottom:5px;
    font-family: "Times New Roman", Georgia, Serif;
    font-size: 125%;
    line-height: 22px; /* 5px +12px + 5px */
    text-indent: 25px;
    background-color: #fbfbea;
    padding: 10px;
}
.code_block {
    box-sizing: border-box;
    padding-top:5px;
    padding-bottom:5px;
    font-size: 75%;
    line-height: 22px; /* 5px +12px + 5px */
    /* text-indent: 25px; */
    /* background-color: #fbfbea; */
    padding: 5px;
}

hr { 
    display: block;
    margin-top: 0.5em;
    margin-bottom: 0.5em;
    margin-left: auto;
    margin-right: auto;
    border-style: inset;
    border-width: 2px;
}
</style>

<h1>
<center>
Download and Installation
</center>
</h1>

In [None]:
import requests
import os
import sys
import subprocess

<div class=h1_cell>
<p>
Download the spark tarball in the current directory. The URL is one of many mirrors listed on spark's official website.
</div>

In [None]:
# Mirror URL of the Spark 2.3.0 (Hadoop 2.7 build) binary tarball.
spark_url = "http://apache.osuosl.org/spark/spark-2.3.0/spark-2.3.0-bin-hadoop2.7.tgz"
# The last path component of the URL becomes the local file name.
filename = spark_url.rsplit('/', 1)[-1]

# stream=True defers fetching the body so it can be written chunk by chunk
# instead of holding the whole tarball in memory.
r = requests.get(spark_url, stream=True)
# Stop here on a 4xx/5xx answer rather than saving an error page as the tarball.
r.raise_for_status()
with open(filename, 'wb') as f:
    for chunk in r.iter_content(chunk_size=1024 * 1024):
        f.write(chunk)

<div class=h1_cell>
<p>
Extract the spark tarball in the 'spark/' directory.
</div>

In [None]:
# Create ./spark (relative to the current working directory) only when it is
# missing, so re-running this cell does not fail on an existing directory.
if not os.path.isdir('spark'):
    os.makedirs('spark')
# Unpack the tarball into ./spark, dropping the archive's top-level folder.
# check_call raises CalledProcessError on a non-zero exit status, so a failed
# extraction is reported instead of being silently ignored.
subprocess.check_call(
    ['tar', '-xf', 'spark-2.3.0-bin-hadoop2.7.tgz', '-C', 'spark', '--strip-components', '1']
)

<div class=h1_cell>
<p>
Add spark_home environment variable.
<p>
Add spark_home + '/bin' to run a pyspark console.
<p>
Add spark_home + '/python*' to environment to import pyspark.
</div>

In [None]:
# Spark is expected under $HOME/spark.
# NOTE(review): the extraction step unpacks into ./spark relative to the working
# directory, so this assumes the notebook/script is run from $HOME -- confirm.
spark_home = os.environ['HOME'] + '/spark'
os.environ['SPARK_HOME'] = spark_home

# Put Spark's launcher scripts on PATH. The membership checks below keep this
# cell idempotent: re-running it does not append the same entries again.
spark_bin = spark_home + '/bin'
if spark_bin not in os.environ['PATH'].split(':'):
    os.environ['PATH'] += ':' + spark_bin

# Make `import pyspark` (and the py4j sources bundled with it) resolvable.
for entry in (spark_home + '/python', spark_home + '/python/lib/py4j-0.10.6-src.zip'):
    if entry not in sys.path:
        sys.path.append(entry)

<div class=h1_cell>
<p>
Write the names of the slave nodes in the cluster. This script currently assumes the master machine is blue0.
</div>

In [None]:
# Host names of the worker ("slave") nodes, one per line in conf/slaves.
machines = ["blue1", "blue3"]
content = "\n".join(machines)

# The file lives inside the Spark installation pointed to by SPARK_HOME.
slaves_path = os.environ['SPARK_HOME'] + "/conf/slaves"
with open(slaves_path, 'w') as slaves_file:
    slaves_file.write(content)

<div class=h1_cell>
<p>
We now need to do the same exact thing on all the slave nodes. We will:
<ul>
<li>
Convert this notebook to a python file.
<li>
Delete the lines after the comment in the code below.
<li>
Run the edited python script on each slave node.
</ul>
<p>
Note: These nodes must have password-less ssh tunneling configured.
</div>

In [None]:
# Convert this ipython notebook to python script
# NOTE: the comment line above is matched verbatim by the next cell as the
# cut-off marker when trimming setup_spark.py, so its text must not be changed.
# The next cell reads the result from setup_spark.py; this assumes the notebook
# is saved as setup_spark.ipynb in the current working directory.
!jupyter nbconvert --to=python setup_spark.ipynb

In [None]:
# Trim the converted script down to the setup steps every node has to run.
#  * Everything from the marker comment onward is dropped (the conversion
#    itself and all cells after it).
#  * Lines calling get_ipython() are dropped as well: nbconvert emits them for
#    notebook magics such as the %%html styling cell, and get_ipython is not
#    defined when the script is run by a plain `python` interpreter.
marker = '# Convert this ipython notebook to python script'

with open('setup_spark.py', 'r') as f:
    lines = f.readlines()

kept_lines = []
for line in lines:
    if line.strip() == marker:
        break
    if line.lstrip().startswith('get_ipython()'):
        continue
    kept_lines.append(line)

with open('setup_spark.py', 'w') as f:
    f.writelines(kept_lines)

<div class=h1_cell>
<p>
This will take a while. For each node, the script downloads spark, extracts the package, and configures the environment.
</div>

In [None]:
!ssh blue1 python < setup_spark.py
!ssh blue3 python < setup_spark.py

<h1>
<center>
Using PySpark
</center>
</h1>

<div class=h1_cell>
<p>
Start the cluster.
</div>

In [None]:
# Invoke Spark's sbin/start-all.sh, handing it the environment configured above
# (SPARK_HOME and the extended PATH). The exit status is the cell's output.
start_script = os.environ['SPARK_HOME'] + "/sbin/start-all.sh"
subprocess.call(start_script, env=os.environ)

<div class=h1_cell>
<p>
This is where most other programs regarding this project will start.
</div>

In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Connect to the standalone master on blue0. getOrCreate() returns the session
# that is already active when this cell is re-run, whereas constructing a second
# SparkContext in the same kernel raises an error.
spark = SparkSession.builder.master('spark://blue0:7077').getOrCreate()
spark

<div class=h1_cell>
<p>
Let's generate some semi-random data and run a Spark K-Means implementation on it. We'll create a 2D array with 4 centers.
</div>

In [None]:
# 4 clusters: 2,500 uniformly random points in each quadrant of the unit square.
from random import random, seed, shuffle

# Fix the seed so that re-running the notebook regenerates the same points.
seed(42)

upper_left = [[random()*0.5, random()*0.5 + 0.5] for _ in range(2500)]
upper_right = [[random()*0.5 + 0.5 for _ in range(2)] for _ in range(2500)]
bottom_left = [[random()*0.5 for _ in range(2)] for _ in range(2500)]
bottom_right = [[random()*.5 + 0.5, random()*0.5] for _ in range(2500)]

# Mix the quadrants so the row order carries no cluster information.
matrix = upper_left + upper_right + bottom_left + bottom_right
shuffle(matrix)

# Two-column Spark DataFrame: "A" is the x coordinate, "B" the y coordinate.
data = spark.createDataFrame(matrix, schema=["A", "B"])
data.show(5)

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

# Pack every column of `data` into a single vector column named "features".
assembler = VectorAssembler(inputCols=data.columns, outputCol="features")
vdf = assembler.transform(data)
vdf.show(5)

In [None]:
# K-Means with 4 clusters, at most 10 iterations and randomly chosen initial centers.
kmeans = KMeans().setK(4).setMaxIter(10).setInitMode("random")
model = kmeans.fit(vdf)

# Cost of the fitted model, evaluated on the same data it was trained on.
wssse = model.computeCost(vdf)
wssse_report = "Within Set Sum of Squared Errors = " + str(wssse)
print(wssse_report)

print("Centers:")
model.clusterCenters()

<div class=h1_cell>
<p>
You can compare this to a python-only single-machine k-means to get an idea of performance gain.
</div>

<h2>
<center>
Latent Semantic Analysis
</center>
</h2>

<div class=h1_cell>
<p>
We can look at all the words in a tweet dataset and determine which words may be related to each other, usually by topic.
</div>

In [None]:
# The CSV is read into pandas first and handed to Spark afterwards
# (the author found Spark's own CSV handling lacking).
ON_RASPBERRY_PI = True  # set to False on other platforms to install through pip

if ON_RASPBERRY_PI:
    # -y answers apt-get's confirmation prompt, which cannot be answered from a notebook.
    subprocess.call(['sudo', 'apt-get', 'install', '-y', 'python-pandas'])
else:
    # The command must be an argument list: a single string without shell=True is
    # taken as the name of the executable and fails to launch. Using
    # sys.executable installs into the interpreter that runs this kernel.
    subprocess.call([sys.executable, '-m', 'pip', 'install', 'pandas'])

import pandas as pd
tweets = pd.read_csv("https://docs.google.com/spreadsheets/d/1tw90jUqTQoRt-RNOqNWronMN46y7dxb2ciQwj1YFsTo/export?format=csv")
tweets.head()

In [None]:
# Text of the first tweet in the dataset.
tweets['tweet'].iloc[0]

<div class=h1_cell>
<p>
We need to take each tweet, break it down into words, and remove any unhelpful words (stopwords).
</div>

In [None]:
!pip install nltk

from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk import download
from string import punctuation

word_punct_tokenizer = WordPunctTokenizer()
download('stopwords')
swords = stopwords.words('english')

def sentence_wrangler(sentence):
    """Tokenize a sentence and separate the useful words from the rest.

    The sentence is lower-cased and split with ``word_punct_tokenizer``.
    A token is discarded when it is listed in ``swords`` or when it contains
    at least one character from ``string.punctuation``.

    Returns a tuple ``(kept, dropped)`` of two lists, each in token order.
    """
    kept, dropped = [], []
    for token in word_punct_tokenizer.tokenize(sentence.lower()):
        has_punctuation = any(ch in punctuation for ch in token)
        if token in swords or has_punctuation:
            dropped.append(token)
        else:
            kept.append(token)
    return kept, dropped

<div class=h1_cell>
<p>
Create a set of all unique words. Then represent each tweet as a RowVector of words.
<p>
For example, the vector representing "Clippers down 2 points at the second half #NBAFinals" would have a 1 in each of the columns of the words in the tweet.
</div>

In [None]:
# Keep only tweets that are pure ASCII (no special characters).
MAX_TWEETS = 100  # number of leading tweets to process

bag = set()      # every distinct word seen in the kept tweets
sentences = []   # one set of words per kept tweet
for t in tweets['tweet'].iloc[:MAX_TWEETS]:
    try:
        t.encode('ascii')
    except UnicodeError:
        # UnicodeError covers both UnicodeDecodeError (Python 2 byte strings)
        # and UnicodeEncodeError (Python 3 / unicode strings).
        continue
    words = set(sentence_wrangler(t)[0])
    bag.update(words)
    sentences.append(words)
bag = frozenset(bag)
print(sentences[0])

In [None]:
# One row per tweet, one column per distinct word; a cell is 1 when the word
# occurs in that tweet and 0 otherwise. The frame is built in a single call
# rather than grown row by row, which copies the whole frame on every append.
all_words = list(bag)
occur_matrix = pd.DataFrame(
    [[1 if word in sentence else 0 for word in all_words] for sentence in sentences],
    columns=all_words
)
occur_matrix.head()

<div class=h1_cell>
<p>
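Now create a co-occurrence matrix that counts how many times two words appear together in the same tweet. Multiplying the transpose of the occurrence matrix by the occurrence matrix itself gives this matrix; the diagonal (each word paired with itself) is then set to zero.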
</div>

In [None]:
import numpy as np

# (words x tweets) . (tweets x words) -> words x words co-occurrence counts.
comatrix = occur_matrix.T.dot(occur_matrix)

# Zero the diagonal (a word paired with itself) on an explicit copy and rebuild
# the frame: writing through `comatrix.values` only works when pandas happens to
# hand back a writable view of the underlying data.
co_counts = comatrix.values.copy()
np.fill_diagonal(co_counts, 0)
comatrix = pd.DataFrame(co_counts, index=comatrix.index, columns=comatrix.columns)
comatrix.head()

<div class=h1_cell>
<p>
Now, this matrix is incredibly sparse and takes up a lot of space. By decomposing this matrix with Singular Value Decomposition, we can find the K most significant words in the dataset and get the co-occurrence matrix of only those words. This significantly reduces the size of the matrix but retains the K most connected words in the data.
</div>

In [None]:
from pyspark.mllib.linalg import DenseVector
from pyspark.mllib.linalg.distributed import RowMatrix

# Distribute the co-occurrence matrix row by row as dense vectors.
rows = spark.sparkContext.parallelize([DenseVector(row) for row in comatrix.values.tolist()])
row_matrix = RowMatrix(rows)
# Keep half as many singular values as there are words. Floor division keeps k
# an integer on Python 3 as well, where `/` would produce a float.
svd = row_matrix.computeSVD(len(comatrix.index) // 2, computeU=True)

In [None]:
# Shut down the Spark session created earlier in this notebook.
spark.stop()

In [None]:
# Invoke Spark's sbin/stop-all.sh from the installation at SPARK_HOME.
# The exit status is the cell's output.
stop_script = os.environ['SPARK_HOME'] + "/sbin/stop-all.sh"
subprocess.call(stop_script)