In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

# Obtain the Spark session used by every cell below; getOrCreate() hands back
# an already-running session if one exists instead of building a second one.
spark = (
    SparkSession.builder
    .appName("VectorAssemblerExample")
    .getOrCreate()
)

23/12/15 11:38:23 WARN Utils: Your hostname, user-HP-EliteBook-840-G7-Notebook-PC resolves to a loopback address: 127.0.1.1; using 192.168.1.141 instead (on interface wlp0s20f3)


In [3]:
# Eight sample rows of three floats each, in the order given by `columns`
data = [
    (-3.0965012, 5.2371198, -0.7370271),
    (-0.2100299, -0.7810844, -1.3284768),
    (8.3525083, 5.3337562, 21.8897181),
    (-3.0380369, 6.5357180, 0.3469820),
    (5.9354651, 6.0223208, 17.9566144),
    (-6.8357707, 5.6629804, -8.1598308),
    (8.8919844, -2.5149762, 15.3622538),
    (6.3404984, 4.1778706, 16.7931822),
]
columns = ["x1", "x2", "y"]
df = spark.createDataFrame(data, schema=columns)

# Combine x1 and x2 into a single vector column called "features";
# the original columns (including y) are carried through unchanged.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=["x1", "x2"],
)
output = assembler.transform(df)

# Print all rows without truncating the wide vector column
output.show(truncate=False)


+----------+----------+----------+-----------------------+
|x1        |x2        |y         |features               |
+----------+----------+----------+-----------------------+
|-3.0965012|5.2371198 |-0.7370271|[-3.0965012,5.2371198] |
|-0.2100299|-0.7810844|-1.3284768|[-0.2100299,-0.7810844]|
|8.3525083 |5.3337562 |21.8897181|[8.3525083,5.3337562]  |
|-3.0380369|6.535718  |0.346982  |[-3.0380369,6.535718]  |
|5.9354651 |6.0223208 |17.9566144|[5.9354651,6.0223208]  |
|-6.8357707|5.6629804 |-8.1598308|[-6.8357707,5.6629804] |
|8.8919844 |-2.5149762|15.3622538|[8.8919844,-2.5149762] |
|6.3404984 |4.1778706 |16.7931822|[6.3404984,4.1778706]  |
+----------+----------+----------+-----------------------+



In [4]:
from pyspark.ml.feature import Normalizer

# Rescale each "features" vector with the p=1.0 norm and write the result
# to a new "normFeatures" column next to the existing ones.
normalizer = Normalizer(
    p=1.0,
    inputCol="features",
    outputCol="normFeatures",
)
l1NormData = normalizer.transform(output)

# Print the original columns alongside the normalised vectors, untruncated
l1NormData.show(truncate=False)

# Shut the Spark session down now that the example has finished
spark.stop()


+----------+----------+----------+-----------------------+-----------------------------------------+
|x1        |x2        |y         |features               |normFeatures                             |
+----------+----------+----------+-----------------------+-----------------------------------------+
|-3.0965012|5.2371198 |-0.7370271|[-3.0965012,5.2371198] |[-0.3715673174962,0.6284326825037999]    |
|-0.2100299|-0.7810844|-1.3284768|[-0.2100299,-0.7810844]|[-0.2119128944058218,-0.7880871055941782]|
|8.3525083 |5.3337562 |21.8897181|[8.3525083,5.3337562]  |[0.6102840040830718,0.38971599591692824] |
|-3.0380369|6.535718  |0.346982  |[-3.0380369,6.535718]  |[-0.3173297135484427,0.6826702864515573] |
|5.9354651 |6.0223208 |17.9566144|[5.9354651,6.0223208]  |[0.49636823653114576,0.5036317634688542] |
|-6.8357707|5.6629804 |-8.1598308|[-6.8357707,5.6629804] |[-0.5469162995013158,0.45308370049868424]|
|8.8919844 |-2.5149762|15.3622538|[8.8919844,-2.5149762] |[0.7795226714467656,-0.2204773285