In [1]:
# Show the SparkContext handle. `sc` is not created by any cell shown here --
# presumably it is injected by the PySpark kernel/shell at startup (confirm).
sc

<pyspark.context.SparkContext at 0x7f0a7d2fe5f8>

In [2]:
from pyspark.sql import SQLContext
# Build a SQLContext on top of the existing SparkContext.
# NOTE(review): `sqlc` is not referenced by any later cell in view; it may be
# kept only so that DataFrame-backed conversions work -- confirm before removing.
sqlc = SQLContext(sc)

## Local Matrices

In [3]:
from pyspark.rdd import RDD
from pyspark.mllib.linalg import Vectors, Vector, Matrix, Matrices

In [11]:
# Local 3x2 dense matrix. MLlib reads the flat value list column by column,
# so the columns are (1, 3, 5) and (2, 4, 6).
md = Matrices.dense(numRows=3, numCols=2, values=[1, 3, 5, 2, 4, 6])
md

DenseMatrix(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0], False)

In [5]:
# Local 5x4 sparse matrix in compressed-sparse-column form:
#   colPtrs[c]:colPtrs[c + 1] selects the slice of rowIndices/values that
#   belongs to column c, so column 1 holds 34 at row 1 and column 2 holds 55
#   at row 2; columns 0 and 3 are empty.
ms = Matrices.sparse(
    numRows=5,
    numCols=4,
    colPtrs=[0, 0, 1, 2, 2],
    rowIndices=[1, 2],
    values=[34, 55],
)
ms

SparseMatrix(5, 4, [0, 0, 1, 2, 2], [1, 2], [34.0, 55.0], False)

In [14]:
# Densify the sparse matrix and show its flat value buffer (20 entries for 5x4).
# The two non-zeros sit at flat offsets 6 and 12, i.e. (row 1, col 1) and
# (row 2, col 2) when the buffer is read column by column with 5 rows each.
ms.toDense().values

array([  0.,   0.,   0.,   0.,   0.,   0.,  34.,   0.,   0.,   0.,   0.,
         0.,  55.,   0.,   0.,   0.,   0.,   0.,   0.,   0.])

## Distributed Matrices

In [15]:
from pyspark.mllib.linalg.distributed import RowMatrix, IndexedRow, IndexedRowMatrix, MatrixEntry, CoordinateMatrix

### RowMatrix

In [16]:
# RDD of three 2-element dense vectors; each vector becomes one matrix row.
rows = sc.parallelize(
    [Vectors.dense(*pair) for pair in [(1.0, 2.0), (4.0, 5.0), (7.0, 8.0)]]
)

In [17]:
# Bring the RDD contents back to the driver to inspect them
# (only three small vectors here).
rows.collect()

[DenseVector([1.0, 2.0]), DenseVector([4.0, 5.0]), DenseVector([7.0, 8.0])]

In [18]:
# Wrap the RDD of vectors as a distributed RowMatrix (rows carry no explicit index).
mat = RowMatrix(rows)

In [19]:
# Report the matrix dimensions: row count first, column count on the next line.
print(mat.numRows(), mat.numCols(), sep="\n")

3
2


In [21]:
# Covariance between the matrix columns, returned as a local 2x2 DenseMatrix.
# Both columns (1, 4, 7) and (2, 5, 8) have the same spread, hence every
# entry is 9.0.
mat.computeCovariance()

DenseMatrix(2, 2, [9.0, 9.0, 9.0, 9.0], 0)

### IndexedRowMatrix

In [22]:
# RDD of IndexedRow(index, vector) pairs; indices 0..2 are assigned in order.
idx_rows = sc.parallelize(
    [
        IndexedRow(i, Vectors.dense(*vals))
        for i, vals in enumerate([(1.0, 2.0), (4.0, 5.0), (7.0, 8.0)])
    ]
)

In [23]:
# Wrap the RDD of IndexedRow objects as a distributed IndexedRowMatrix.
idx_mat = IndexedRowMatrix(idx_rows)

In [30]:
# `.rows` is itself an RDD, so displaying it shows only the RDD handle,
# not the row data.
idx_mat.rows

PythonRDD[40] at RDD at PythonRDD.scala:48

In [31]:
# Collect the indexed rows to the driver to see the (index, vector) pairs.
idx_mat.rows.collect()

[IndexedRow(0, [1.0,2.0]), IndexedRow(1, [4.0,5.0]), IndexedRow(2, [7.0,8.0])]

### CoordinateMatrix

In [45]:
# RDD of MatrixEntry(row, col, value) triples describing the non-zero cells.
entries = sc.parallelize(
    [
        MatrixEntry(*cell)
        for cell in [(0, 0, 9.0), (1, 1, 8.0), (2, 1, 6.0), (2, 0, 1.0)]
    ]
)

In [46]:
# Build a distributed CoordinateMatrix from the (row, col, value) entries;
# no explicit dimensions are passed here.
coord_mat = CoordinateMatrix(entries)

In [47]:
# Regroup the entries by row index and collect them; each row comes back as a
# sparse vector shown as (size, [column indices], [values]).
coord_mat.toIndexedRowMatrix().rows.collect()

[IndexedRow(0, (2,[0],[9.0])),
 IndexedRow(1, (2,[1],[8.0])),
 IndexedRow(2, (2,[0,1],[1.0,6.0]))]

In [48]:
# Print "<rows> <cols>". No dimensions were given to CoordinateMatrix, so these
# are presumably derived from the largest row/column index present (2 and 1).
print(coord_mat.numRows(),coord_mat.numCols())

3 2
