# This notebook illustrates how to compute SVD and PCA using PySpark

Install the necessary packages (`findspark`, `numpy` and `pyspark`) and import them as shown below:

In [1]:
# findspark.init() is called before any pyspark import in this notebook;
# presumably it locates the local Spark installation and makes pyspark
# importable -- confirm against the findspark documentation.
import findspark
findspark.init()

In [2]:
import numpy as np

from pyspark import SparkConf, SparkContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix

## Load the data from the text file and store it in distributed form using RowMatrix

In [3]:
# Read the whitespace-delimited matrix from disk. ndmin=2 keeps the result
# two-dimensional even when the file holds a single row, so every element
# handed to Spark below is a row vector rather than a bare scalar.
data = np.loadtxt('testmatrix.txt', dtype=float, ndmin=2)
print('finish loading the data set rows with size')
print(data.shape)
print('\n')

# Reuse the active SparkContext when one already exists: constructing a second
# context in the same kernel raises ValueError, which made this cell fail
# whenever it was re-run.
sc = SparkContext.getOrCreate(SparkConf().setAppName("PythonSVDExample"))
print('Start the context')
# One RDD element per matrix row; RowMatrix wraps that RDD as a distributed matrix.
rows = sc.parallelize(data)
mat = RowMatrix(rows)
print('finish distributed the data by RowMatrix')

finish loading the data set rows with size
(10, 10)


Start the context
finish distributed the data by RowMatrix


## Compute SVD

In [4]:
print("Start computing SVD::\n")

# Ask for the 5 leading singular values; computeU=True also requests the U factor.
svd = mat.computeSVD(k=5, computeU=True)
# U is a distributed RowMatrix, s is a local dense vector of singular values,
# and V is a local dense matrix.
U, s, V = svd.U, svd.s, svd.V

# Bring the rows of U back to the driver; they are collected but not printed.
collected = U.rows.collect()
print("Finish computing U factor:")

print("Singular values are: %s" % s)
print("Finish computing V factor\n")

Start computing SVD::

Finish computing U factor:
Singular values are: [11.0704477758,8.37264168736,5.31396563411,3.52897317326,0.735628240984]
Finish computing V factor



## Compute PCA

In [5]:
print("Start computing PCA::\n")
# Compute the top 4 principal components of the distributed matrix.
pc = mat.computePrincipalComponents(4)
# Project every row of the data onto those components.
projected = mat.multiply(pc)

# Bring the projected rows back to the driver so they can be displayed.
collected1 = projected.rows.collect()
print("\n\nFinish Projected Row Matrix of principal component:")
# Print each projected row. The saved output of this cell lists these vectors,
# so the loop has to be active for a fresh run to reproduce it.
for vector in collected1:
    print(vector)

Start computing PCA::



Finish Projected Row Matrix of principal component:
[-6.75127768515,3.44675019906,-0.938969881937,0.675878752993]
[0.0,0.0,0.0,0.0]
[1.50238362474,-0.769268315778,-0.176475125497,0.921105397875]
[2.73928506663,-5.00219025081,-2.14921902001,0.810028880651]
[3.66105284191,1.36657697234,2.51733382326,-1.10657138917]
[-5.55196289961,-2.76466774952,-0.47753747748,-2.05850482731]
[-2.04698940475,0.555264405988,-1.57171251428,1.2947563867]
[3.71990287804,4.72561872628,-2.15011973248,-0.303519716182]
[-0.529464576834,-0.60280632099,0.949039218102,0.565780143946]
[-2.15551180957,-0.30803511162,2.85391649935,1.71639561656]


In [7]:
# Shut down the SparkContext created earlier in the notebook now that all
# computations are finished.
sc.stop()