In [1]:
import requests
import os
import sys
import subprocess

In [2]:
# Download spark tarball
spark_url = "http://apache.osuosl.org/spark/spark-2.3.0/spark-2.3.0-bin-hadoop2.7.tgz"
# The tarball is saved under the last path component of the URL.
filename = spark_url.rsplit('/', 1)[-1]
r = requests.get(spark_url, stream=True)
# Fail loudly on 4xx/5xx instead of silently saving an HTML error page as the tarball.
r.raise_for_status()
with open(filename, 'wb') as f:
    # stream=True only helps if the body is consumed in chunks; reading
    # r.content would buffer the whole archive in memory first.
    for chunk in r.iter_content(chunk_size=1024 * 1024):
        if chunk:  # skip keep-alive chunks
            f.write(chunk)
r.close()

In [3]:
# Extract spark from tarball
# Create the target directory (relative to the current working directory) ...
subprocess.call(['mkdir', 'spark'])
# ... and unpack the archive into it, dropping the archive's top-level folder.
subprocess.call([
    'tar', '-xf', 'spark-2.3.0-bin-hadoop2.7.tgz',
    '-C', 'spark',
    '--strip-components', '1',
])

0

In [2]:
# Point SPARK_HOME at $HOME/spark and expose Spark's binaries and Python
# bindings (pyspark + bundled py4j) to this process.
spark_home = os.environ['HOME'] + '/spark'
os.environ['SPARK_HOME'] = spark_home
os.environ['PATH'] = os.environ['PATH'] + ':' + spark_home + '/bin'
for python_entry in ('/python', '/python/lib/py4j-0.10.6-src.zip'):
    sys.path.append(spark_home + python_entry)

In [5]:
# write names of slave nodes to $SPARK_HOME/conf/slaves
machines = ["blue1", "blue3"]
# One host name per line, no trailing newline.
content = "\n".join(machines)
slaves_path = os.environ['SPARK_HOME'] + "/conf/slaves"
with open(slaves_path, 'w') as f:
    f.write(content)

In [None]:
# Convert this ipython notebook to python script
# NOTE: the comment line above doubles as a marker -- the next cell searches
# setup_spark.py for it verbatim and truncates the script there, so do not
# reword it. nbconvert writes its output next to the notebook as setup_spark.py.
!jupyter nbconvert --to=python setup_spark.ipynb

In [None]:
# Now remove everything after this in the python file
# Read the generated script, then rewrite it keeping only the lines that come
# before the "Convert this ipython notebook" marker comment (the setup steps).
with open('setup_spark.py', 'r') as read_file:
    lines = read_file.readlines()

marker = '# Convert this ipython notebook to python script'
kept_lines = []
for line in lines:
    if line.strip() == marker:
        break
    kept_lines.append(line)

with open('setup_spark.py', 'w') as f:
    f.writelines(kept_lines)

In [6]:
# run python script on slave nodes, might take awhile
!ssh blue1 python < setup_spark.py
!ssh blue3 python < setup_spark.py

/usr/local/bin:/usr/bin:/bin:/usr/games:/home/pi/spark/bin
/usr/local/bin:/usr/bin:/bin:/usr/games:/home/pi/spark/bin


In [4]:
# run start-all.sh
# Launch the standalone cluster, handing the script this process's environment
# (including the SPARK_HOME / PATH values set above). The exit code is the
# cell's output.
start_all_script = os.environ['SPARK_HOME'] + "/sbin/start-all.sh"
subprocess.call(start_all_script, env=os.environ)

0

In [3]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
# Use the builder so the cell is idempotent: constructing SparkContext directly
# raises "Cannot run multiple SparkContexts at once" when the cell is re-run,
# whereas getOrCreate() returns the already-running session.
spark = SparkSession.builder.master('spark://blue0:7077').getOrCreate()
spark

In [4]:
# 4 clusters
from random import random, seed, shuffle

# Fix the RNG so the generated points (and everything derived from them) are
# reproducible across kernel restarts.
seed(42)
POINTS_PER_CLUSTER = 2500


def _quadrant(x_offset, y_offset, count=POINTS_PER_CLUSTER):
    """Return `count` random [x, y] points inside one 0.5 x 0.5 quadrant.

    x is drawn uniformly from [x_offset, x_offset + 0.5) and y from
    [y_offset, y_offset + 0.5).
    """
    return [[random()*0.5 + x_offset, random()*0.5 + y_offset] for _ in range(count)]


# One blob per quadrant of the unit square.
upper_left = _quadrant(0.0, 0.5)
upper_right = _quadrant(0.5, 0.5)
bottom_left = _quadrant(0.0, 0.0)
bottom_right = _quadrant(0.5, 0.0)

# Mix the four blobs so row order carries no cluster information.
matrix = upper_left + upper_right + bottom_left + bottom_right
shuffle(matrix)

data = spark.createDataFrame(matrix, schema=["A", "B"])
data.show(5)

+------------------+--------------------+
|                 A|                   B|
+------------------+--------------------+
|0.6542853025002375|0.025554794928640767|
|0.8165091860527975|  0.5339993198226156|
|0.9177750618344158|  0.5229050336538614|
|0.3381873526211694|  0.2217757594060099|
|0.4260065726429769|  0.6059255983700225|
+------------------+--------------------+
only showing top 5 rows



In [22]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

# Pack every column of `data` into a single vector column named "features",
# which is the input the clustering step below is fitted on.
va = VectorAssembler(outputCol="features", inputCols=data.columns)
vdf = va.transform(dataset=data)
vdf.show(n=5)

+-------------------+--------------------+--------------------+
|                  A|                   B|            features|
+-------------------+--------------------+--------------------+
|0.14550559004782893| 0.23237891229051677|[0.14550559004782...|
| 0.5350185509078498|0.028800598360005925|[0.53501855090784...|
| 0.4449379899905069|  0.0708371577813296|[0.44493798999050...|
| 0.5896988092931466|  0.8138883379910317|[0.58969880929314...|
| 0.2810040967187046|  0.5526543932128559|[0.28100409671870...|
+-------------------+--------------------+--------------------+
only showing top 5 rows



In [23]:
# Build the model (cluster the data into the 4 expected groups).
# initMode="random" picks the starting centers at random, so pin the seed to
# make the fitted centers and the reported cost reproducible between runs.
kmeans = KMeans(k=4, maxIter=10, initMode="random", seed=42)
model = kmeans.fit(vdf)

# Sum of squared distances from each point to its assigned center.
# NOTE(review): computeCost is available in the Spark 2.3.0 build installed
# above; newer Spark releases are believed to replace it with
# ClusteringEvaluator -- confirm before upgrading.
wssse = model.computeCost(vdf)
print("Within Set Sum of Squared Errors = " + str(wssse))

model.clusterCenters()

Within Set Sum of Squared Errors = 418.406299562


[array([ 0.25452043,  0.75424167]),
 array([ 0.2344099 ,  0.25194634]),
 array([ 0.74898247,  0.74580616]),
 array([ 0.74128549,  0.24577662])]

In [5]:
# Latent Semantic Analysis

# Spark doesn't do csv files very well, so the CSV is loaded with pandas.
# Only install pandas when it is actually missing, so re-running the cell does
# not invoke the package manager every time.
try:
    import pandas as pd
except ImportError:
    if(True):  # if on raspberry pi, change if otherwise
        # -y: there is no terminal to answer apt-get's confirmation prompt from a notebook.
        subprocess.call('sudo apt-get install -y python-pandas'.split(' '))
    else:
        # Pass an argv list: a single unsplit string without shell=True is
        # treated as the program name and fails with "No such file or directory".
        # `sys.executable -m pip` installs into the interpreter running this kernel.
        subprocess.call([sys.executable, '-m', 'pip', 'install', 'pandas'])
    import pandas as pd

tweets = pd.read_csv("https://docs.google.com/spreadsheets/d/1tw90jUqTQoRt-RNOqNWronMN46y7dxb2ciQwj1YFsTo/export?format=csv")
tweets.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is so...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [6]:
# Peek at the raw text of the first tweet.
tweets['tweet'].iloc[0]

'@user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run'

In [7]:
# Install nltk via pip from the notebook's shell.
!pip install nltk

from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk import download
from string import punctuation

# Tokenizer instance shared by sentence_wrangler below.
word_punct_tokenizer = WordPunctTokenizer()
# The stopword corpus has to be downloaded before stopwords.words() can read it.
download('stopwords')
# English stopwords; sentence_wrangler tests each token for membership in this.
swords = stopwords.words('english')

def sentence_wrangler(sentence):
    """Split a sentence into kept and discarded tokens.

    The sentence is lower-cased and tokenized with `word_punct_tokenizer`.
    A token is discarded when it is listed in `swords` or contains at least
    one character from `punctuation`; every other token is kept.

    Returns a `(kept, discarded)` tuple of lists, each in token order.
    """
    kept, discarded = [], []
    for token in word_punct_tokenizer.tokenize(sentence.lower()):
        is_stopword = token in swords
        if is_stopword or any(ch in punctuation for ch in token):
            discarded.append(token)
        else:
            kept.append(token)
    return kept, discarded

[33mYou are using pip version 9.0.3, however version 10.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[nltk_data] Downloading package stopwords to /home/pi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Get tweets with no special characters (ascii)
MAX_TWEETS = 100  # only the first MAX_TWEETS rows are considered

bag = set()        # every distinct kept word seen across the selected tweets
sentences = []     # one set of kept words per ascii-only tweet
for t in tweets['tweet'].iloc[:MAX_TWEETS]:
    try:
        t.encode('ascii')
    except UnicodeError:
        # Non-ascii tweet: skip it. UnicodeError covers both the
        # UnicodeDecodeError raised for Python 2 byte strings and the
        # UnicodeEncodeError raised for unicode / Python 3 strings.
        continue
    # Kept outside the try block so a failure in the wrangler is not
    # mistaken for a non-ascii tweet and silently dropped.
    words = set(sentence_wrangler(t)[0])
    bag.update(words)
    sentences.append(words)
bag = frozenset(bag)
print(sentences[0])

set(['selfish', 'kids', 'run', 'father', 'drags', 'user', 'dysfunctional', 'dysfunction'])


In [33]:
# Binary sentence-by-word occurrence matrix: one row per sentence, one column
# per word in the bag, 1 where the sentence contains the word and 0 otherwise.
all_words = list(bag)
# Build every row up front and construct the frame once. Appending a one-row
# DataFrame per sentence copies the whole frame on each iteration, leaves the
# columns as object dtype, and DataFrame.append no longer exists in pandas 2.x.
occur_matrix = pd.DataFrame(
    [[1 if word in sentence else 0 for word in all_words] for sentence in sentences],
    columns=all_words
)
occur_matrix.head()

Unnamed: 0,shop,shot,show,tells,probe,ica16,leads,basilicabotanica,go,21st,...,hu,weekend,forever,calls,wife,sometimes,gr8,points,leak,personalised
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
import numpy as np
# Word-by-word co-occurrence counts: entry (i, j) is the number of sentences
# that contain both word i and word j.
occurrences = np.asarray(occur_matrix, dtype=int)
co_counts = occurrences.T.dot(occurrences)
# Zero the diagonal (a word's count with itself) on the plain ndarray.
# Doing this through `DataFrame.values` only works when pandas happens to
# hand back a writable view of its internal data, which is not guaranteed.
np.fill_diagonal(co_counts, 0)
comatrix = pd.DataFrame(co_counts, index=occur_matrix.columns, columns=occur_matrix.columns)
comatrix.head()

Unnamed: 0,shop,shot,show,tells,probe,ica16,leads,basilicabotanica,go,21st,...,hu,weekend,forever,calls,wife,sometimes,gr8,points,leak,personalised
shop,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
shot,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
show,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
tells,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
probe,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0


In [52]:
from pyspark.mllib.linalg import DenseVector
from pyspark.mllib.linalg.distributed import RowMatrix
# Distribute the co-occurrence matrix as one dense vector per row.
rows = spark.sparkContext.parallelize([DenseVector(row) for row in comatrix.values.tolist()])
row_matrix = RowMatrix(rows)
# Reduce the matrix by half: keep len/2 singular values. Floor division keeps
# the count an integer on Python 3 as well (plain `/` yields a float there),
# and max(1, ...) avoids asking for zero singular values on a tiny vocabulary.
num_singular_values = max(1, len(comatrix.index) // 2)
svd = row_matrix.computeSVD(num_singular_values, computeU=True)

In [14]:
# Stop the Spark session created earlier in the notebook.
spark.stop()

In [15]:
# Shut the standalone cluster down again (counterpart of start-all.sh).
# The exit code is the cell's output.
stop_all_script = os.environ['SPARK_HOME'] + "/sbin/stop-all.sh"
subprocess.call(stop_all_script)

0