In [16]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import col

In [80]:
# Read the PLUTO CSV, treating its first row as the column header.
# No schema is supplied here; the numeric cast happens in a later cell.
df = spark.read.option("header", True).csv('data/pluto.csv')

In [81]:
# Names of the CSV columns used for modelling. The order matters: it is the
# order in which the values are packed into the "features" vector.
cols = [
    # names ending in "area", plus the area source code
    'lotarea', 'bldgarea', 'comarea', 'resarea', 'officearea',
    'retailarea', 'garagearea', 'strgearea', 'factryarea', 'otherarea',
    'areasource',
    # building / floor / unit counts
    'numbldgs', 'numfloors', 'unitsres', 'unitstotal',
    # lot and building frontage / depth
    'lotfront', 'lotdepth', 'bldgfront', 'bldgdepth',
    # assessment and exemption amounts
    'assessland', 'assesstot', 'exemptland', 'exempttot',
    # year built and the "far" columns
    'yearbuilt', 'builtfar', 'residfar', 'commfar', 'facilfar',
]

In [82]:
# Keep only the modelling columns, each converted to double, and then discard
# every row that contains a null in any of them.
df = (
    df
    .select(*(col(name).cast(DoubleType()) for name in cols))
    .dropna()
)

In [83]:
# Pack the numeric columns listed in `cols` into one vector column named
# "features"; the source columns are kept alongside it in `new_df`.
vecAssembler = VectorAssembler(
    outputCol="features",
    inputCols=cols,
)
new_df = vecAssembler.transform(df)

In [32]:
# Sweep the cluster count k over 2..19: fit a k-means model for each k, score
# the resulting cluster assignment with ClusteringEvaluator, and collect the
# scores in `silhuette_scores` (index 0 corresponds to k=2).
# NOTE(review): "features" is assembled from the raw, unscaled columns, so the
# largest-magnitude columns probably dominate the distances and push every
# score towards 1.0 — confirm (e.g. by scaling first) before choosing k.
silhuette_scores = []
# The evaluator is built with defaults only and does not depend on k, so one
# instance is created up front and reused for every iteration.
evaluator = ClusteringEvaluator()
for i in range(2, 20):
    kmeans = KMeans(k=i, seed=1)  # i clusters on this pass; seed fixed at 1
    model = kmeans.fit(new_df.select('features'))
    # Adds the model's cluster assignment to the full frame for scoring.
    transformed = model.transform(new_df)
    silhouette = evaluator.evaluate(transformed)
    silhuette_scores.append(silhouette)
    print("Silhouette with {} clusters = ".format(i) + str(silhouette))

Silhouette with 2 clusters = 0.9999891613792797
Silhouette with 3 clusters = 0.9997347484586968
Silhouette with 4 clusters = 0.9997347891466806
Silhouette with 5 clusters = 0.9978423180139647
Silhouette with 6 clusters = 0.9977768216857009
Silhouette with 7 clusters = 0.9979956260911077
Silhouette with 8 clusters = 0.9978777742031587
Silhouette with 9 clusters = 0.9977512949654372
Silhouette with 10 clusters = 0.9972133246990466
Silhouette with 11 clusters = 0.9930501731287215
Silhouette with 12 clusters = 0.9953466402936372
Silhouette with 13 clusters = 0.9953480171720165
Silhouette with 14 clusters = 0.9935756886033832
Silhouette with 15 clusters = 0.9937739691710925
Silhouette with 16 clusters = 0.9896525707538683
Silhouette with 17 clusters = 0.9910882997615047
Silhouette with 18 clusters = 0.9899316095034388
Silhouette with 19 clusters = 0.9832976786616847


In [65]:
from pyspark.ml.feature import PCA as PCAml
# Fit PCA on the assembled feature vectors. k is set to the number of input
# columns, so every component is kept and written to the "pca" column.
# NOTE(review): `model` was also the name of the k-means model in the sweep
# cell; after this cell runs it refers to the fitted PCA model instead.
pca = PCAml(
    k=len(cols),
    inputCol="features",
    outputCol="pca",
)
model = pca.fit(new_df.select('features'))
transformed1 = model.transform(new_df)

In [78]:
from pyspark.ml.feature import Normalizer

In [85]:
# Normalise each row's "features" vector with p=1.0 and store the result in a
# new "normFeatures" column; the original columns are left in place.
normalizer = Normalizer(
    p=1.0,
    inputCol="features",
    outputCol="normFeatures",
)
dfn = normalizer.transform(new_df)

In [86]:
# Preview the first five rows, including the "features" and "normFeatures" columns.
dfn.show(n=5)

+-------+--------+-------+-------+----------+----------+----------+---------+----------+---------+----------+--------+---------+--------+----------+--------+--------+---------+---------+----------+---------+----------+---------+---------+--------+--------+-------+--------+--------------------+--------------------+
|lotarea|bldgarea|comarea|resarea|officearea|retailarea|garagearea|strgearea|factryarea|otherarea|areasource|numbldgs|numfloors|unitsres|unitstotal|lotfront|lotdepth|bldgfront|bldgdepth|assessland|assesstot|exemptland|exempttot|yearbuilt|builtfar|residfar|commfar|facilfar|            features|        normFeatures|
+-------+--------+-------+-------+----------+----------+----------+---------+----------+---------+----------+--------+---------+--------+----------+--------+--------+---------+---------+----------+---------+----------+---------+---------+--------+--------+-------+--------+--------------------+--------------------+
| 9750.0|  1000.0|    0.0| 1000.0|       0.0|       