In [None]:
%%html
<style>
/* Prose cells: serif text on a tinted background with an indented first line. */
.h1_cell, .just_text {
    box-sizing: border-box;
    font-family: "Times New Roman", Georgia, Serif;
    font-size: 125%;
    line-height: 22px; /* 5px +12px + 5px */
    text-indent: 25px;
    background-color: #fbfbea;
    /* The shorthand sets all four sides; the separate 5px top/bottom
       declarations it used to override have been removed as dead code. */
    padding: 10px;
}
/* Code samples: smaller text with a uniform 5px padding. */
.code_block {
    box-sizing: border-box;
    font-size: 75%;
    line-height: 22px; /* 5px +12px + 5px */
    /* Disabled declarations. '#' is not CSS comment syntax, so they are kept
       inside real comments instead of relying on the parser to discard them. */
    /* text-indent: 25px; */
    /* background-color: #fbfbea; */
    /* The shorthand covers all four sides, so no separate top/bottom padding is needed. */
    padding: 5px;
}

/* Horizontal rule: block element with 0.5em vertical margins, auto horizontal
   margins, and a 2px inset border. */
hr {
    display: block;
    margin: 0.5em auto;
    border-style: inset;
    border-width: 2px;
}
</style>

<h2 style="text-align: center;">
Can we Parallelize KNN?
</h2>

In [2]:
import os
import sys
import subprocess

In [3]:
# Point this kernel at the Spark install under $HOME/spark: export SPARK_HOME,
# put Spark's bin/ directory on PATH, and make the bundled pyspark and py4j
# sources importable. Every addition is guarded so that re-running the cell
# does not keep appending duplicate entries to PATH and sys.path.
os.environ['SPARK_HOME'] = os.environ['HOME'] + '/spark'

_spark_bin = os.environ['SPARK_HOME'] + '/bin'
if _spark_bin not in os.environ['PATH'].split(':'):
    os.environ['PATH'] += ':' + _spark_bin

for _spark_lib in (
    os.environ['SPARK_HOME'] + '/python',
    # NOTE(review): the py4j version is hard-coded; it presumably has to match
    # the zip shipped with the installed Spark release — confirm on upgrade.
    os.environ['SPARK_HOME'] + '/python/lib/py4j-0.10.6-src.zip',
):
    if _spark_lib not in sys.path:
        sys.path.append(_spark_lib)

In [4]:
# Run Spark's sbin/start-all.sh with the current environment passed through.
# The script's exit status is the last expression, so it is shown as the cell
# output. Presumably this brings up the standalone master that the session
# cell connects to — confirm against the Spark install.
start_all_script = os.environ['SPARK_HOME'] + "/sbin/start-all.sh"
subprocess.call(start_all_script, env=os.environ)

0

In [5]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Obtain the session through the builder so that re-running this cell reuses
# the already-running session. Constructing a SparkContext directly a second
# time fails because only one context may be active per process.
spark = SparkSession.builder.master('spark://instance-1:7077').getOrCreate()
spark

<div class=h1_cell>
<p>
Sometimes it's better to think about these problems from the bottom up. Is there a row-wise operation that can be run on the dataset in parallel?
<p>
Yes. Given a row, we can calculate the distance from every other row in the distributed dataset.
<p>
Let's import our dataset using pandas and use our NLP code from pyspark_lsa.ipynb to process the text.
</div>

In [None]:
# One-time setup: install pandas via the shell.
# NOTE(review): `!pip` runs whichever pip is first on PATH, which may not be
# this kernel's environment, and no version is pinned — confirm before sharing.
!pip install pandas

In [6]:
import pandas as pd

# Download the sentence dataset as UTF-8 CSV. The preview below shows an id,
# the sentence text, and an author code for each row.
GOTHIC_CSV_URL = 'https://bit.ly/2HVSx3X'
gothic_table = pd.read_csv(GOTHIC_CSV_URL, encoding='utf-8')
gothic_table.head(5)

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [None]:
# One-time setup: install nltk via the shell (the next cell's output shows an
# nltk stopwords download).
# NOTE(review): `!pip` runs whichever pip is first on PATH, which may not be
# this kernel's environment, and no version is pinned — confirm before sharing.
!pip install nltk

In [7]:
from nlp import get_bag_and_tokenize
from lnalg import occurence_matrix

# Process only the first 10 rows of the 'text' column to keep the example small.
# NOTE(review): `nlp` and `lnalg` are project-local modules; presumably `bag` is
# the vocabulary and `sentences` the tokenized rows — confirm against their source.
bag, sentences = get_bag_and_tokenize(gothic_table.head(10), 'text')
# Per the preview below, `omatrix` has one column per word and 0/1 entries per row.
omatrix = occurence_matrix(bag, sentences)  # TODO (original note): don't need this; build a window co-occurrence matrix instead.
omatrix.head(5)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jakeu123/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,code,gold,groundwork,desk,fantastic,years,attempts,terrace,mentioned,yet,...,without,looked,greek,cannot,greatest,steals,left,avoid,fact,usual
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
3,0,0,0,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,1,0,1,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [8]:
import numpy as np
from random import random
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

In [9]:
# Fixed reference point for the distance demos: one uniform-random coordinate
# per occurrence-matrix column. A dedicated seeded generator makes the vector
# (and every distance computed from it) reproducible across kernel restarts
# without touching NumPy's global random state.
randv = np.random.RandomState(42).rand(len(omatrix.columns))

def distance(x):
    """Return the Euclidean distance between ``x`` and the global ``randv``.

    Parameters
    ----------
    x : sequence of numbers, numpy.ndarray, or object with ``toArray()``
        The point to compare against ``randv``. Objects exposing
        ``toArray()`` (such as Spark ML vectors) are densified through that
        method; anything else is converted with ``np.asarray(..., dtype=float)``.
        A one-field Row wrapping a list becomes a ``(1, n)`` array, whose norm
        equals that of the flat vector.

    Returns
    -------
    float
        The norm of ``x - randv`` as a plain Python float, so it can be used
        as the return value of a Spark UDF.

    Raises
    ------
    ValueError
        If ``x`` cannot be converted to a rectangular float array (for
        example a Row mixing scalars and a vector) or cannot be broadcast
        against ``randv``.
    """
    # Densify explicitly instead of relying on ``x - randv`` to coerce ``x``:
    # the implicit path walks a Spark vector element by element.
    values = x.toArray() if hasattr(x, "toArray") else np.asarray(x, dtype=float)
    # float() alone is enough to turn the NumPy scalar into a Python float.
    return float(np.linalg.norm(values - randv))

# Smoke test: `distance` accepts a plain Python list of floats, one value per
# occurrence-matrix column.
n_features = len(omatrix.columns)
distance([random() for _ in range(n_features)])

5.433686995848435

In [10]:
# Smoke test with an actual NumPy ndarray. The previous form built a Python
# list of floats, so it never exercised the ndarray case its comment named.
distance(np.random.rand(len(omatrix.columns)))  # and numpy ndarray

6.041331869594297

In [10]:
# Wrap each occurrence-matrix row in a one-element record so that the whole
# row lands in a single array-typed 'features' column.
feature_records = [[row_values] for row_values in omatrix.values.tolist()]
df = spark.createDataFrame(feature_records, schema=["features"])
df.printSchema()
df.show(5)

root
 |-- features: array (nullable = true)
 |    |-- element: long (containsNull = true)

+--------------------+
|            features|
+--------------------+
|[0, 0, 0, 0, 0, 0...|
|[0, 0, 0, 0, 0, 0...|
|[0, 1, 0, 0, 1, 0...|
|[0, 0, 0, 0, 0, 1...|
|[0, 1, 0, 1, 0, 0...|
+--------------------+
only showing top 5 rows



In [11]:
type(df.head())  # head() with no argument returns a single Row, not a list

pyspark.sql.types.Row

In [12]:
distance(df.head())  # good sign, but as we'll see, this is misleading!

7.395528454706172

<div class=h1_cell>
<p>
Let's build a dataframe with a different row datatype. We'll create a Spark occurrence matrix and vectorize the rows.
</div>

In [13]:
# Mirror the pandas occurrence matrix as a Spark DataFrame, one column per word.
column_names = list(omatrix.columns)
omatrix_df = spark.createDataFrame(omatrix, schema=column_names)

# There are many, many columns, so preview only a handful of them.
preview_columns = ['code', 'gold', 'groundwork', 'desk', 'fantastic', 'years']
omatrix_df.select(*preview_columns).show(5)

+----+----+----------+----+---------+-----+
|code|gold|groundwork|desk|fantastic|years|
+----+----+----------+----+---------+-----+
|   0|   0|         0|   0|        0|    0|
|   0|   0|         0|   0|        0|    0|
|   0|   1|         0|   0|        1|    0|
|   0|   0|         0|   0|        0|    1|
|   0|   1|         0|   1|        0|    0|
+----+----+----------+----+---------+-----+
only showing top 5 rows



In [14]:
from pyspark.ml.feature import VectorAssembler

# Collapse every word column into a single vector-valued 'features' column,
# keeping the original columns alongside it.
assembler = VectorAssembler(inputCols=list(omatrix_df.columns), outputCol="features")
vdf = assembler.transform(omatrix_df)
vdf.select('features').show(5)

+--------------------+
|            features|
+--------------------+
|(180,[10,21,25,27...|
|(180,[21,70,99,12...|
|(180,[1,4,16,28,3...|
|(180,[5,7,17,29,3...|
|(180,[1,3,6,12,34...|
+--------------------+
only showing top 5 rows



In [15]:
type(vdf.head().features)  # the assembled column holds a SparseVector, not a list

pyspark.ml.linalg.SparseVector

In [16]:
type(vdf.head())  # same type as df.head() (a Row), so:

pyspark.sql.types.Row

In [17]:
# Passing the whole Row fails because this Row has more than one attribute:
# it carries every word column plus 'features', so it cannot be turned into
# a single numeric array. The error is the point of this cell, so it is
# caught and displayed instead of halting a "Run All".
try:
    distance(vdf.head())
except ValueError as err:
    print("ValueError:", err)

ValueError: setting an array element with a sequence.

In [18]:
distance(vdf.head().features)  # pass the vector column itself rather than the whole Row

7.395528454706172

<div class=h1_cell>
<p>
If we want to send each row of our dataframe to a function, that function has to access the features column.
<p>
Let's redefine the distance function to do this.
</div>

In [20]:
# Row-based variant of `distance`, left disabled. The UDF built below is
# applied to the 'features' column, so `distance` receives the column value
# itself rather than a Row and must not dereference `.features`.
# Kept as real comments so the cell no longer echoes a string literal.
#
# def distance(x):
#     return float(np.linalg.norm(x.features-randv).item())

In [19]:
# Register `distance` as a Spark user-defined function that yields a FloatType column.
dist = udf(distance, returnType=FloatType())

In [20]:
df = df.withColumn('distance', dist('features'))  # lazily evaluated: nothing runs until the next cell's show()

In [21]:
# NOTE(review): the original note here reported a Py4J error (Spark's pickling
# looking for numpy.linalg.linalg). The output recorded below shows the UDF
# succeeding, so that note appears to be stale — confirm on a fresh run.
df.select(df.distance).show(5)

+---------+
| distance|
+---------+
|7.3955283|
|7.3361974|
|7.5989823|
|7.2680264|
| 7.779583|
+---------+
only showing top 5 rows



In [22]:
vdf = vdf.withColumn('distance', dist('features'))   # lazy: evaluated by the next cell's show()

In [23]:
# NOTE(review): originally annotated "same Py4J Error", but the recorded output
# shows the same distances as the array-column DataFrame — confirm on a fresh run.
vdf.select('distance').show(5)

+---------+
| distance|
+---------+
|7.3955283|
|7.3361974|
|7.5989823|
|7.2680264|
| 7.779583|
+---------+
only showing top 5 rows

