In [18]:
import pyspark
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, ArrayType
from pyspark.sql import functions as F
from pyspark.sql.functions import lit
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.pipeline import Pipeline

In [2]:
# Obtain the SparkSession for this notebook (an existing session is reused,
# otherwise a new one is created under the application name "example").
spark = (
    SparkSession.builder
    .appName("example")
    .getOrCreate()
)

# Handles derived from the session: the underlying SparkContext and an
# SQLContext built on top of that same context.
sc = spark.sparkContext
sqlc = SQLContext(sc)

In [3]:
# Small demo table with integer columns uid, age and gender; the column types
# are inferred from the Python tuples.
# The frame is built through the SparkSession itself: SQLContext is only a
# wrapper around it and is deprecated in Spark 3.x.
df = spark.createDataFrame(
    [
        (0, 18, 1),
        (1, 25, 0),
        (2, 40, 1),
        (3, 36, 0),
        (3, 36, 1),
    ],
    ['uid', 'age', 'gender'],
)

In [4]:
# Two assemblers, each packing a pair of numeric columns of `df` into a
# single vector column (features1 and features2 respectively).
transformer1 = VectorAssembler(outputCol='features1', inputCols=['age', 'gender'])
transformer2 = VectorAssembler(outputCol='features2', inputCols=['uid', 'gender'])

# Chain both assemblers as pipeline stages, fit the pipeline on `df`, then
# display `df` with both vector columns appended.
pipeline = Pipeline(stages=[
    transformer1,
    transformer2,
])
model = pipeline.fit(df)
model.transform(df).show()

+---+---+------+----------+---------+
|uid|age|gender| features1|features2|
+---+---+------+----------+---------+
|  0| 18|     1|[18.0,1.0]|[0.0,1.0]|
|  1| 25|     0|[25.0,0.0]|[1.0,0.0]|
|  2| 40|     1|[40.0,1.0]|[2.0,1.0]|
|  3| 36|     0|[36.0,0.0]|[3.0,0.0]|
|  3| 36|     1|[36.0,1.0]|[3.0,1.0]|
+---+---+------+----------+---------+



## 把两个 vector 列拼接成一个 vector 列 (concatenate two vector columns into one)

In [11]:
# Demo table with two vector columns: c1 is a dense 3-element vector, c2 is a
# sparse vector of size 4. The final row leaves c2 as None so that the
# null-handling step can be demonstrated.
# The frame is built through the SparkSession itself: SQLContext is only a
# wrapper around it and is deprecated in Spark 3.x.
dense_part = Vectors.dense([1, 2, 3])
sparse_part = Vectors.sparse(4, [1, 3], [1.0, 5.5])

df = spark.createDataFrame(
    [
        (0, dense_part, sparse_part),
        (1, dense_part, sparse_part),
        (2, dense_part, sparse_part),
        (3, dense_part, sparse_part),
        (3, dense_part, None),
    ],
    ['id', 'c1', 'c2'],
)

In [20]:
# Zero-argument UDF that yields an empty dense vector; it serves as the
# fallback value wherever a vector column holds null.
empty_array_udf = F.udf(lambda: Vectors.dense([]), VectorUDT())

# Replace nulls in both vector columns with the empty vector.
# NOTE(review): a row filled this way contributes zero elements, so vectors
# assembled from these columns will not all have the same length - confirm
# that downstream consumers accept that, or fill with a zero vector of the
# expected size instead.
for vec_col in ('c1', 'c2'):
    df = df.withColumn(vec_col, F.coalesce(F.col(vec_col), empty_array_udf()))


In [21]:
# Concatenate the vector columns c1 and c2 into one output column `o`.
transformer = VectorAssembler(outputCol='o', inputCols=['c1', 'c2'])

# Display the assembled frame; `df` itself is left unchanged.
assembled = transformer.transform(df)
assembled.show()

+---+-------------+-------------------+--------------------+
| id|           c1|                 c2|                   o|
+---+-------------+-------------------+--------------------+
|  0|[1.0,2.0,3.0]|(4,[1,3],[1.0,5.5])|[1.0,2.0,3.0,0.0,...|
|  1|[1.0,2.0,3.0]|(4,[1,3],[1.0,5.5])|[1.0,2.0,3.0,0.0,...|
|  2|[1.0,2.0,3.0]|(4,[1,3],[1.0,5.5])|[1.0,2.0,3.0,0.0,...|
|  3|[1.0,2.0,3.0]|(4,[1,3],[1.0,5.5])|[1.0,2.0,3.0,0.0,...|
|  3|[1.0,2.0,3.0]|                 []|       [1.0,2.0,3.0]|
+---+-------------+-------------------+--------------------+



In [25]:
# Print the schema of the assembled frame. `df` on its own only carries
# id/c1/c2 after a fresh Restart & Run All (nothing assigns the assembler
# output back to it), so transform it here to include the output column `o`.
transformer.transform(df).printSchema()

root
 |-- id: long (nullable = true)
 |-- c1: vector (nullable = true)
 |-- c2: vector (nullable = true)
 |-- o: vector (nullable = true)

