In [4]:
from pyspark.sql import SparkSession

# Create a Spark session
spark = SparkSession.builder.appName("VectorAssemblerExample").getOrCreate()

In [8]:
from pyspark.ml.feature import Normalizer
from pyspark.ml.clustering import KMeans
from pyspark.ml import Pipeline

# Sample data
data = [(1.60193653, -1.8679101),
        (-1.1328963, -1.9607465),
        (2.40675869, -1.5994823),
        (0.09330145, -2.9446696),
        (1.3795901, -2.5489864),
        (-0.42065496, -2.8165693),
        (0.55753398, -2.0145494),
        (1.3066549, -2.3208153),
        (0.66224722, -0.9406476),
        (1.19072851, -2.4178092),
        (4.67961769, 1.6375689),
        (5.03015133, 1.4575724),
        (6.1003413, 2.1673923),
        (4.20259176, 1.8237144),
        (4.93339445, 1.8983999),
        (6.70975052, 1.5899655),
        (5.01461979, 1.6051478),
        (5.00005277, 1.6855351),
        (4.12926186, 2.0582398)]

columns = ["x1", "x2"]

df2 = spark.createDataFrame(data, columns)
# Split the data into training and testing sets
split = df2.randomSplit([0.7, 0.3])
training = split[0]
test = split[1]

# Create the Vector Assembler
assembler = VectorAssembler(inputCols=["x1", "x2"], outputCol="features")

# Create a Normalizer
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)

# Create a K-Means model
kmeans = KMeans(featuresCol="features", k=2)

# Create a Pipeline
pipeline = Pipeline(stages=[assembler,normalizer, kmeans])

# Fit the Pipeline on the training data
model = pipeline.fit(training)

# Transform the test data and show the results
transformed_data = model.transform(test)
transformed_data.show(truncate=False)

+----------+----------+-----------------------+-----------------------------------------+----------+
|x1        |x2        |features               |normFeatures                             |prediction|
+----------+----------+-----------------------+-----------------------------------------+----------+
|-1.1328963|-1.9607465|[-1.1328963,-1.9607465]|[-0.3662013920934893,-0.6337986079065107]|1         |
|1.3795901 |-2.5489864|[1.3795901,-2.5489864] |[0.35116793576502836,-0.6488320642349716]|1         |
|1.19072851|-2.4178092|[1.19072851,-2.4178092]|[0.3299753544767584,-0.6700246455232416] |1         |
|1.3066549 |-2.3208153|[1.3066549,-2.3208153] |[0.3602110638979198,-0.6397889361020801] |1         |
|4.20259176|1.8237144 |[4.20259176,1.8237144] |[0.6973744194901641,0.3026255805098359]  |0         |
|4.12926186|2.0582398 |[4.12926186,2.0582398] |[0.667355273081252,0.332644726918748]    |0         |
+----------+----------+-----------------------+-----------------------------------------+--