In [0]:
# Build a five-row demo DataFrame with the columns "id", "lat" and "long".
# NOTE(review): `spark` is not defined in this part of the notebook; it is
# assumed to be a SparkSession supplied by the runtime -- confirm.
df = spark.createDataFrame(
    data=[
        [0, 33.3, -17.5],
        [1, 40.4, -20.5],
        [2, 28.0, -23.9],
        [3, 29.5, -19.0],
        [4, 32.8, -18.84],
    ],
    schema=["id", "lat", "long"],
)

# Print the rows as a text table to eyeball the contents.
df.show()

+---+----+------+
| id| lat|  long|
+---+----+------+
|  0|33.3| -17.5|
|  1|40.4| -20.5|
|  2|28.0| -23.9|
|  3|29.5| -19.0|
|  4|32.8|-18.84|
+---+----+------+



In [0]:
from pyspark.ml.feature import VectorAssembler

# Combine the "lat" and "long" columns into a single vector column called
# "features"; the original columns are kept alongside it.
vecAssembler = (
    VectorAssembler()
    .setInputCols(["lat", "long"])
    .setOutputCol("features")
)
new_df = vecAssembler.transform(df)

# Show the frame with the extra "features" column.
new_df.show()

+---+----+------+-------------+
| id| lat|  long|     features|
+---+----+------+-------------+
|  0|33.3| -17.5| [33.3,-17.5]|
|  1|40.4| -20.5| [40.4,-20.5]|
|  2|28.0| -23.9| [28.0,-23.9]|
|  3|29.5| -19.0| [29.5,-19.0]|
|  4|32.8|-18.84|[32.8,-18.84]|
+---+----+------+-------------+



In [0]:
from pyspark.ml.clustering import KMeans

# Configure k-means with 5 clusters and a fixed seed.
# NOTE(review): k equals the number of rows in the five-row demo frame, so
# every point ends up as its own cluster (the recorded training cost is 0.0).
# Pick a smaller k if a meaningful grouping is wanted.
kmeans = KMeans().setK(5).setSeed(1)

# Fit on the assembled vector column only.
model = kmeans.fit(new_df.select("features"))

In [0]:
# Run the fitted model over the assembled frame; the result carries an extra
# "prediction" column holding the cluster index assigned to each row.
transformed = model.transform(dataset=new_df)

# Print the rows together with their cluster assignment.
transformed.show()

+---+----+------+-------------+----------+
| id| lat|  long|     features|prediction|
+---+----+------+-------------+----------+
|  0|33.3| -17.5| [33.3,-17.5]|         1|
|  1|40.4| -20.5| [40.4,-20.5]|         2|
|  2|28.0| -23.9| [28.0,-23.9]|         0|
|  3|29.5| -19.0| [29.5,-19.0]|         3|
|  4|32.8|-18.84|[32.8,-18.84]|         4|
+---+----+------+-------------+----------+



In [0]:
# Collect everything to report first: the training summary, its cost
# (printed below as the within-set sum of squared errors) and the centers.
summary = model.summary
wssse = summary.trainingCost
centers = model.clusterCenters()

# Report the training cost.
print("Within Set Sum of Squared Errors = ", wssse, sep="")

# Report one cluster center per line.
print("Cluster Centers: ")
for center in centers:
    print(center)

Within Set Sum of Squared Errors = 0.0
Cluster Centers: 
[ 28.  -23.9]
[ 33.3 -17.5]
[ 40.4 -20.5]
[ 29.5 -19. ]
[ 32.8  -18.84]


In [0]:
from pyspark.sql.types import DoubleType

# Three sample records; note that every "age" value is a string.
data_list = [
    {'name': 'Alice', 'age': '1'},
    {'name': 'Mike', 'age': '2'},
    {'name': 'json', 'age': '3'},
]

# Let Spark infer the schema from the dictionaries.
df_new = spark.createDataFrame(data_list)

# Replace the string "age" column with the same values cast to double.
age_as_double = df_new.age.cast(DoubleType())
changedTypedf = df_new.withColumn("age", age_as_double)

changedTypedf.show()

+---+-----+
|age| name|
+---+-----+
|1.0|Alice|
|2.0| Mike|
|3.0| json|
+---+-----+

