In [544]:
import numpy as np
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.mllib import linalg
from pyspark.sql.functions import *
from pyspark.sql.window import *
import pyspark.sql.functions as f
import pyspark.sql.types as t
from pyspark.sql import SQLContext
from pyspark.ml.linalg import Vectors
from pyspark.mllib.linalg.distributed import RowMatrix
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import udf

In [549]:
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()
sqlContext = SQLContext(sc)

In [550]:
df = ss.read.csv("./digit-recognizer/train.csv",header=True,inferSchema=True)

In [551]:
def cosine_similarity(arr):
    return float(arr[0].dot(arr[1])/
                 ((arr[0].dot(arr[0])**0.5) * (arr[1].dot(arr[1])**0.5)))

def laplacian_vector(row_id,arr,size,k):
    lap_vec = np.zeros(size,dtype=int)
    lap_vec[np.array(arr)] = 1
    lap_vec[row_id] = -k
    return list([int(item) for item in lap_vec])

cosine_similarity_udf = udf(cosine_similarity, t.DoubleType())
laplacian_vector_udf = udf(laplacian_vector, t.ArrayType(t.IntegerType()))

#sqlContext.registerFunction('COS_SIM',cosine_similarity,returnType=t.DoubleType())
#sqlContext.registerFunction("LAP_VECTOR",laplacian_vector,returnType=t.ArrayType(t.IntegerType()))

class Spectral_Clustering():
    def __init__(self,k=2,k_nearest=7,num_eigenvectors = 10,featureCol='features',predictionCol='predictions'):
        self.k = k
        self.k_nearest = k_nearest
        self.num_eigenvectors = num_eigenvectors
        self.featureCol = featureCol
        self.predictionCol = predictionCol
    def cluster(self, df):
        #sqlContext = SQLContext(SparkContext.getOrCreate())
        n = df.count()
        # index rows
        df_index = df.select((row_number().over(Window.partitionBy().orderBy(self.featureCol)) - 1).alias('id'),"*")
        df_features = df_index.select('id',self.featureCol)
        
        
        # prep for joining
        left_df = df_features.select(df_features['id'].alias('left_id'),
                                     df_features[self.featureCol].alias('left_features'))
        right_df = df_features.select(df_features['id'].alias('right_id'),
                                      df_features[self.featureCol].alias('right_features'))
        # join on self where left_id does not equal right_id
        joined_df = left_df.join(right_df,left_df['left_id'] != right_df['right_id'])
        
        # comupte cosine similarity between vectors
        joined_df = joined_df.select('left_id','right_id',
                                     cosine_similarity_udf(array(joined_df['left_features'],
                                                                 joined_df['right_features'])).alias('norm'))
        ranked = joined_df.select('left_id','right_id',rank().over(Window.partitionBy('left_id').orderBy('norm')).alias('rank'))
        knn = ranked.where(ranked['rank'] <= 5)
        knn_grouped = knn.groupBy('left_id').agg(f.collect_list('right_id').alias('nn'))
        laplacian = knn_grouped.select('left_id', laplacian_vector_udf(knn_grouped['left_id'], knn_grouped['nn'], 
                                                                       lit(n), lit(self.k_nearest)).alias('lap_vector'))

        laplacian_matrix = RowMatrix(laplacian.select('lap_vector').rdd.map(lambda x:x[0]))
        eigenvectors = laplacian_matrix.computePrincipalComponents(k=self.num_eigenvectors)
        
        eigenvectors = [(idx,Vectors.dense([float(item) for item in row])) 
                        for idx, row in enumerate(eigenvectors.toArray().tolist())]
        
        eigen_df = spark.createDataFrame(eigenvectors,['id',self.featureCol])
        model = KMeans(featuresCol=self.featureCol,predictionCol=self.predictionCol,k=self.k).fit(eigen_df)
        predictions = model.transform(eigen_df).join(df_index,on='id')
        return predictions

In [552]:
spec_clust = Spectral_Clustering()
assembler = VectorAssembler(outputCol='features',inputCols=df.columns[1:])
df_features = assembler.transform(df).limit(100)
spec_clust.cluster(df_features).select('id','predictions','label').show(100)

+---+-----------+-----+
| id|predictions|label|
+---+-----------+-----+
|  0|          0|    6|
|  1|          0|    1|
|  2|          0|    6|
|  3|          0|    6|
|  4|          0|    6|
|  5|          0|    6|
|  6|          0|    6|
|  7|          0|    6|
|  8|          0|    2|
|  9|          0|    6|
| 10|          0|    6|
| 11|          0|    3|
| 12|          1|    2|
| 13|          0|    5|
| 14|          0|    5|
| 15|          1|    0|
| 16|          0|    5|
| 17|          0|    0|
| 18|          0|    3|
| 19|          1|    3|
| 20|          0|    2|
| 21|          0|    0|
| 22|          0|    2|
| 23|          1|    0|
| 24|          1|    1|
| 25|          0|    1|
| 26|          1|    0|
| 27|          1|    1|
| 28|          0|    3|
| 29|          0|    0|
| 30|          0|    1|
| 31|          1|    1|
| 32|          0|    1|
| 33|          0|    1|
| 34|          0|    0|
| 35|          1|    1|
| 36|          0|    4|
| 37|          1|    1|
| 38|          1

In [None]:
spark.udf.register()