In [1]:
import numpy as np
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.mllib import linalg
from pyspark.mllib.linalg import Vectors
from pyspark.sql.functions import *
from pyspark.sql.window import *
import pyspark.sql.functions as f
import pyspark.sql.types as t
from pyspark.sql import SQLContext

In [2]:
sc = SparkContext.getOrCreate()
ss = SparkSession.builder.getOrCreate()
sqlContext = SQLContext(sc)

In [85]:
df = ss.read.csv("./digit-recognizer/train.csv",header=True,inferSchema=True)
df.printSchema()
#df.withColumn("row_num", row_number().over(Window.partitionBy("Group").orderBy("Date")))

root
 |-- label: integer (nullable = true)
 |-- pixel0: integer (nullable = true)
 |-- pixel1: integer (nullable = true)
 |-- pixel2: integer (nullable = true)
 |-- pixel3: integer (nullable = true)
 |-- pixel4: integer (nullable = true)
 |-- pixel5: integer (nullable = true)
 |-- pixel6: integer (nullable = true)
 |-- pixel7: integer (nullable = true)
 |-- pixel8: integer (nullable = true)
 |-- pixel9: integer (nullable = true)
 |-- pixel10: integer (nullable = true)
 |-- pixel11: integer (nullable = true)
 |-- pixel12: integer (nullable = true)
 |-- pixel13: integer (nullable = true)
 |-- pixel14: integer (nullable = true)
 |-- pixel15: integer (nullable = true)
 |-- pixel16: integer (nullable = true)
 |-- pixel17: integer (nullable = true)
 |-- pixel18: integer (nullable = true)
 |-- pixel19: integer (nullable = true)
 |-- pixel20: integer (nullable = true)
 |-- pixel21: integer (nullable = true)
 |-- pixel22: integer (nullable = true)
 |-- pixel23: integer (nullable = true)
 |-- pi

In [123]:
# create row numbers
df_with_row_num = df.select(row_number().over(Window.partitionBy("label").orderBy("pixel0")).alias('row_number'),
                            "*").limit(1000)
# convert pixel columns to dense vectors
row_num_vector_pairs = ss.createDataFrame(df_with_row_num.rdd.map(lambda x:[x[0],Vectors.dense(x[2:])]),['id','features'])

In [124]:
sqlContext.registerDataFrameAsTable(row_num_vector_pairs,'row_num_vector_pairs')

In [125]:
def norm(arr):
    return float(arr[0].dot(arr[1])**0.5)

In [126]:
sqlContext.registerFunction('NORM',norm,returnType=t.DoubleType())

<function __main__.norm(arr)>

In [127]:
knn = SQLContext.sql(sqlContext,'''
                                    SELECT
                                        *
                                    FROM
                                     (SELECT
                                         left_id,
                                         right_id,
                                         RANK() OVER (PARTITION BY left_id ORDER BY norm ) AS rank,
                                         norm
                                     FROM 
                                      (SELECT 
                                         lq.id as left_id,
                                         rq.id as right_id,
                                         lq.features as left_features,
                                         rq.features as right_features,
                                         NORM(ARRAY(lq.features,rq.features)) as norm
                                      FROM 
                                         row_num_vector_pairs as LQ
                                         JOIN
                                         row_num_vector_pairs as RQ
                                         on LQ.id != RQ.id))
                                    WHERE
                                        rank <= 5
                                   ''')

In [129]:
sqlContext.registerDataFrameAsTable(knn,'knn')

In [None]:
sqlContext.sql("SELECT * from knn limit 10").show()

In [128]:
knn.show(10)

+-------+--------+----+------------------+
|left_id|right_id|rank|              norm|
+-------+--------+----+------------------+
|     26|     865|   1| 694.4019009190571|
|     26|     214|   2| 748.8337599227215|
|     26|     597|   3| 764.3513589966332|
|     26|     171|   4| 766.8637427861614|
|     26|     524|   5| 783.5368529941652|
|     29|     865|   1|488.22433368278564|
|     29|     214|   2| 513.7557785563098|
|     29|     318|   3| 540.6052164010258|
|     29|     237|   4| 544.8788856250534|
|     29|     989|   5| 550.3426205555953|
+-------+--------+----+------------------+
only showing top 10 rows



In [200]:
left = row_num_vector_pairs.select(row_num_vector_pairs['id'].alias('left_id'),
                                   row_num_vector_pairs['features'].alias('left_features'))
right = row_num_vector_pairs.select(row_num_vector_pairs['id'].alias('right_id'),
                                   row_num_vector_pairs['features'].alias('right_features'))
joined = left.join(right,
                   left['left_id'] != right['right_id'],
                   how='inner').select('left_id','right_id','left_features','right_features')

In [207]:
joined.rdd.map(lambda x:[x[0],x[1],(x[2].dot(x[3]))**0.5]).collect()

[[1, 2, 929.0941825240324],
 [1, 3, 1052.1421006689163],
 [1, 4, 925.7045965101394],
 [1, 5, 794.0018891665183],
 [1, 6, 1140.6392944309782],
 [1, 7, 1373.1161640589626],
 [1, 8, 1056.3597871937382],
 [1, 9, 962.4214253641696],
 [1, 10, 1086.1855274307425],
 [1, 11, 991.4438965468495],
 [1, 12, 1543.893778729612],
 [1, 13, 1437.2459775556863],
 [1, 14, 808.0952914106108],
 [1, 15, 1333.3446666185005],
 [1, 16, 1036.8780063247557],
 [1, 17, 1107.6032683230942],
 [1, 18, 1903.9495791643224],
 [1, 19, 1710.3011430739325],
 [1, 20, 1224.3635081135014],
 [1, 21, 1687.2507223290793],
 [1, 22, 1162.6082745275814],
 [1, 23, 1805.4747298148484],
 [1, 24, 1060.2249761253504],
 [1, 25, 999.2262006172576],
 [1, 26, 1192.056206728525],
 [1, 27, 1634.122700411447],
 [1, 28, 1715.4497952432184],
 [1, 29, 1486.9825150283375],
 [1, 30, 842.1003503146165],
 [1, 31, 1486.7488019164502],
 [1, 32, 1196.381210150009],
 [1, 33, 826.1428447913835],
 [1, 34, 991.0050453958345],
 [1, 35, 1461.8816641575336],
 [

In [162]:
df_with_row_num

PythonRDD[342] at RDD at PythonRDD.scala:53

In [105]:
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- features: vector (nullable = true)



In [100]:
df.show(3)

+---+-----------+
| id|   features|
+---+-----------+
|  0|[-1.0,-1.0]|
|  1| [-1.0,1.0]|
|  2| [1.0,-1.0]|
+---+-----------+
only showing top 3 rows



In [40]:
path = "./digit-recognizer/train.csv"

In [41]:
ss = SparkSession.builder.getOrCreate()

In [47]:
mnist_train = ss.read.csv(path,header=True)

In [48]:
labs = mnist_train.select('label')
data = mnist_train.select(mnist_train.columns[1:])

In [113]:
data_rdd_list1 = ss.createDataFrame(mnist_train.rdd.map(lambda x:(int(x[0]),Vectors.dense(list(map(float,x[1:]))))), 
                                    ['id','features'])

In [125]:
data_rdd_list1.printSchema()

root
 |-- id: long (nullable = true)
 |-- features: vector (nullable = true)



In [121]:
test = data_rdd_list1.select('features').first()[0]

In [111]:
brp = BucketedRandomProjectionLSH(inputCol="features", outputCol="hashes", seed=1,bucketLength=1.0)

In [112]:
model = brp.fit(data_rdd_list1)

In [127]:
out = model.transform(data_rdd_list1)

In [134]:
out.printSchema()

root
 |-- id: long (nullable = true)
 |-- features: vector (nullable = true)
 |-- hashes: array (nullable = true)
 |    |-- element: vector (containsNull = true)



In [138]:
out.select('hashes')

DataFrame[hashes: array<vector>]

In [124]:
model.approxNearestNeighbors(data_rdd_list,,3).collect()

Py4JError: An error occurred while calling o3372.approxNearestNeighbors. Trace:
py4j.Py4JException: Method approxNearestNeighbors([class org.apache.spark.api.java.JavaRDD, class java.lang.Integer, class java.lang.Integer, class java.lang.String]) does not exist
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:326)
	at py4j.Gateway.invoke(Gateway.java:274)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)



In [None]:
BucketedRandomProjectionLSH

In [29]:
row_mat = RowMatrix(data.rdd.map(lambda x:list(x)))
ncols = row_mat.numCols()
nrows = row_mat.numRows()

In [34]:
dense_mat = DenseMatrix(nrows,ncols,data.rdd.flatMap(lambda x:list(x)).collect())

In [35]:
ml_mat = dense_mat.asML()

In [38]:
BlockMatrix(dense_mat,1024,1024)

TypeError: blocks should be an RDD of sub-matrix blocks as ((int, int), matrix) tuples, got <class 'pyspark.mllib.linalg.DenseMatrix'>

In [39]:
data.rdd.flmap(lambda x:list(x)).take(1)

AttributeError: 'RDD' object has no attribute 'flmap'

In [27]:
a = BlockMatrix(data.rdd.map(lambda x:list(x)),1024,1024)

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 8.0 failed 1 times, most recent failure: Lost task 0.0 in stage 8.0 (TID 22, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/Users/loftis/anaconda3/envs/distributedcomputing/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 377, in main
    process()
  File "/Users/loftis/anaconda3/envs/distributedcomputing/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 372, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/Users/loftis/anaconda3/envs/distributedcomputing/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 393, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/Users/loftis/anaconda3/envs/distributedcomputing/lib/python3.7/site-packages/pyspark/rdd.py", line 1354, in takeUpToNumLeft
    yield next(iterator)
  File "/Users/loftis/anaconda3/envs/distributedcomputing/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/util.py", line 99, in wrapper
    return f(*args, **kwargs)
  File "/Users/loftis/anaconda3/envs/distributedcomputing/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/mllib/linalg/distributed.py", line 975, in _convert_to_matrix_block_tuple
    raise TypeError("Cannot convert type %s into a sub-matrix block tuple" % type(block))
TypeError: Cannot convert type <class 'list'> into a sub-matrix block tuple

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$$anonfun$3.apply(PythonRDD.scala:153)
	at org.apache.spark.api.python.PythonRDD$$anonfun$3.apply(PythonRDD.scala:153)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:153)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/Users/loftis/anaconda3/envs/distributedcomputing/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 377, in main
    process()
  File "/Users/loftis/anaconda3/envs/distributedcomputing/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 372, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/Users/loftis/anaconda3/envs/distributedcomputing/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 393, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/Users/loftis/anaconda3/envs/distributedcomputing/lib/python3.7/site-packages/pyspark/rdd.py", line 1354, in takeUpToNumLeft
    yield next(iterator)
  File "/Users/loftis/anaconda3/envs/distributedcomputing/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/util.py", line 99, in wrapper
    return f(*args, **kwargs)
  File "/Users/loftis/anaconda3/envs/distributedcomputing/lib/python3.7/site-packages/pyspark/python/lib/pyspark.zip/pyspark/mllib/linalg/distributed.py", line 975, in _convert_to_matrix_block_tuple
    raise TypeError("Cannot convert type %s into a sub-matrix block tuple" % type(block))
TypeError: Cannot convert type <class 'list'> into a sub-matrix block tuple

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:456)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:592)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:575)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:410)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:891)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$$anonfun$3.apply(PythonRDD.scala:153)
	at org.apache.spark.api.python.PythonRDD$$anonfun$3.apply(PythonRDD.scala:153)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2101)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:123)
	at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1360)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [7]:
a = (data_rdd_list)
numRows = a.numRows()
numCols = a.numCols()

In [14]:
a.transpose()

AttributeError: 'RowMatrix' object has no attribute 'transpose'

In [25]:
a.rows

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:53831)
Traceback (most recent call last):
  File "/Users/loftis/anaconda3/envs/distributedcomputing/lib/python3.7/site-packages/py4j/java_gateway.py", line 929, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/loftis/anaconda3/envs/distributedcomputing/lib/python3.7/site-packages/py4j/java_gateway.py", line 1067, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 61] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:53831)

In [24]:
from pyspark.mllib.linalg.distributed import *
def as_block_matrix(rdd, rowsPerBlock=1024, colsPerBlock=1024):
    return IndexedRowMatrix(rdd.zipWithIndex().map(lambda xi: IndexedRow(xi[1], xi[0])))\
               .toBlockMatrix(rowsPerBlock, colsPerBlock)


as_block_matrix(data_rdd_list).multiply(as_block_matrix(data_rdd_list).transpose())

Py4JJavaError: An error occurred while calling o1236.multiply.
: org.apache.spark.SparkException: Job 43 cancelled because SparkContext was shut down
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:932)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:930)
	at scala.collection.mutable.HashSet.foreach(HashSet.scala:78)
	at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:930)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:2128)
	at org.apache.spark.util.EventLoop.stop(EventLoop.scala:84)
	at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:2041)
	at org.apache.spark.SparkContext$$anonfun$stop$6.apply$mcV$sp(SparkContext.scala:1949)
	at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1340)
	at org.apache.spark.SparkContext.stop(SparkContext.scala:1948)
	at org.apache.spark.SparkContext$$anonfun$2.apply$mcV$sp(SparkContext.scala:575)
	at org.apache.spark.util.SparkShutdownHook.run(ShutdownHookManager.scala:216)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ShutdownHookManager.scala:188)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1945)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply$mcV$sp(ShutdownHookManager.scala:188)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.util.SparkShutdownHookManager.runAll(ShutdownHookManager.scala:188)
	at org.apache.spark.util.SparkShutdownHookManager$$anon$2.run(ShutdownHookManager.scala:178)
	at org.apache.hadoop.util.ShutdownHookManager$1.run(ShutdownHookManager.java:54)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:944)
	at org.apache.spark.mllib.linalg.distributed.BlockMatrix.simulateMultiply(BlockMatrix.scala:430)
	at org.apache.spark.mllib.linalg.distributed.BlockMatrix.multiply(BlockMatrix.scala:499)
	at org.apache.spark.mllib.linalg.distributed.BlockMatrix.multiply(BlockMatrix.scala:467)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 53840)
Traceback (most recent call last):
  File "/Users/loftis/anaconda3/envs/distributedcomputing/lib/python3.7/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/Users/loftis/anaconda3/envs/distributedcomputing/lib/python3.7/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/Users/loftis/anaconda3/envs/distributedcomputing/lib/python3.7/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/Users/loftis/anaconda3/envs/distributedcomputing/lib/python3.7/socketserver.py", line 720, in __init__
    self.handle()
  File "/Users/loftis/anaconda3/envs/distributedcomputing/lib/python3.7/site-packages/pyspark/accumulators.py", line 269, in handle
    poll(accum_updates)
  File "/Users/loftis/anaconda3/envs/dist

In [63]:
a.toBlockMatrix()

AttributeError: 'RowMatrix' object has no attribute 'toBlockMatrix'

In [60]:
b = DenseMatrix(numRows,numCols, mnist_train.rdd.flatMap(lambda x:list(x)).collect())

42000