# This notebook illustrates how to compute SVD and PCA using PySpark

Install the necessary packages (`findspark`, `numpy`, and `pyspark`) and import them as shown below:

In [1]:
# findspark.init() is called here, ahead of the pyspark imports in the next
# cell; per findspark's documentation it makes the pyspark package importable
# from a local Spark installation.
import findspark
findspark.init()

In [2]:
import numpy as np

from pyspark import SparkConf, SparkContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix

## Load the data from the text file and store it in distributed form as a RowMatrix

In [None]:
# Read the matrix from disk into a local NumPy array.
# ndmin=2 keeps the result two-dimensional even when the file holds a single
# row, so every element handed to Spark below is a full row vector rather
# than a bare scalar.
rows_local = np.loadtxt('testmatrix.txt', dtype=float, ndmin=2)
print('finish loading the data set rows with size')
print(rows_local.shape)
print('\n')

# getOrCreate() reuses an already-running context, so re-executing this cell
# in a live kernel no longer fails with "Cannot run multiple SparkContexts at
# once". The app name only takes effect when a new context is created.
sc = SparkContext.getOrCreate(SparkConf().setAppName("PythonSVDExample"))
print('Start the context')

# Distribute the rows as an RDD and wrap them in a RowMatrix.
# `rows` ends up bound to the RDD, exactly as before; the local array keeps
# its own name so one variable is not reused for two different kinds of object.
rows = sc.parallelize(rows_local)
mat = RowMatrix(rows)
print('finish distributed the data by RowMatrix')

finish loading the data set rows with size
(4000, 4000)




## Compute SVD

In [None]:
print("Start computing SVD::\n")

# Truncated SVD of the distributed matrix, keeping the 5 leading singular
# values; computeU=True requests the left factor in addition to s and V.
svd = mat.computeSVD(k=5, computeU=True)

# Unpack the three factors:
#   U - RowMatrix, s - local dense vector of singular values,
#   V - local dense matrix.
U, s, V = svd.U, svd.s, svd.V

# Pull the rows of U back to the driver.
collected = U.rows.collect()
print("Finish computing U factor:")

print("Singular values are: %s" % s)
print("Finish computing V factor\n")

## Compute PCA

In [None]:
print("Start computing PCA::\n")

# Compute the 4 leading principal components of the distributed matrix.
pc = mat.computePrincipalComponents(k=4)

# Multiply the distributed matrix by the components to project its rows
# onto them.
projected = mat.multiply(matrix=pc)

# Pull the projected rows back to the driver.
collected = projected.rows.collect()
print("\n\nFinish Projected Row Matrix of principal component:")



In [None]:
# Stop the SparkContext started in the data-loading cell now that the
# SVD and PCA computations are done.
sc.stop()