In [9]:
import pandas as pd
# Small toy data set held in pandas: six rows, four x-columns and two
# y-columns. x1/x2 hold strings, x3/y1 integers, x4 floats, y2 'yes'/'no'.
pdf = pd.DataFrame(dict(
    x1=['a', 'a', 'b', 'b', 'b', 'c'],
    x2=['apple', 'orange', 'orange', 'orange', 'peach', 'peach'],
    x3=[1, 1, 2, 2, 2, 4],
    x4=[2.4, 2.5, 3.5, 1.4, 2.1, 1.5],
    y1=[1, 0, 1, 0, 0, 1],
    y2=['yes', 'no', 'no', 'yes', 'yes', 'yes'],
))
# Turn the pandas frame into a Spark DataFrame.
# NOTE(review): `spark` is not defined in the visible cells — presumably the
# SparkSession injected by the notebook environment; confirm before running
# on a fresh kernel.
df = spark.createDataFrame(pdf)

In [10]:
# Print the Spark DataFrame as a text table to check it was created correctly.
df.show()

+---+------+---+---+---+---+
| x1|    x2| x3| x4| y1| y2|
+---+------+---+---+---+---+
|  a| apple|  1|2.4|  1|yes|
|  a|orange|  1|2.5|  0| no|
|  b|orange|  2|3.5|  1| no|
|  b|orange|  2|1.4|  0|yes|
|  b| peach|  2|2.1|  0|yes|
|  c| peach|  4|1.5|  1|yes|
+---+------+---+---+---+---+



In [17]:
from pyspark.ml.feature import StringIndexer

# Configure the indexer: read the strings in x1, write numeric codes to
# a new column named indexed_x1.
string_indexer = StringIndexer(
    inputCol='x1',
    outputCol='indexed_x1',
)

# Fitting scans df to learn the string -> index mapping.
string_indexer_model = string_indexer.fit(df)

# Applying the fitted model appends the indexed_x1 column.
df_stringindexer = string_indexer_model.transform(df)

# Inspect the result. In the recorded output 'b' -> 0.0, 'a' -> 1.0 and
# 'c' -> 2.0, i.e. more frequent values received smaller indices.
df_stringindexer.show()

+---+------+---+---+---+---+----------+
| x1|    x2| x3| x4| y1| y2|indexed_x1|
+---+------+---+---+---+---+----------+
|  a| apple|  1|2.4|  1|yes|       1.0|
|  a|orange|  1|2.5|  0| no|       1.0|
|  b|orange|  2|3.5|  1| no|       0.0|
|  b|orange|  2|1.4|  0|yes|       0.0|
|  b| peach|  2|2.1|  0|yes|       0.0|
|  c| peach|  4|1.5|  1|yes|       2.0|
+---+------+---+---+---+---+----------+



In [18]:
from pyspark.ml.feature import OneHotEncoder

# Build the encoder that turns the numeric index column (indexed_x1) into a
# sparse one-hot vector column (onehotencoded_x1).
onehotencoder = OneHotEncoder(inputCol='indexed_x1', outputCol='onehotencoded_x1')

# Spark 2.x exposes OneHotEncoder as a plain Transformer (no fit step), while
# Spark 3.x exposes it as an Estimator that must be fitted before it can
# transform. Fit only when the API offers it so this cell runs on both.
if hasattr(onehotencoder, 'fit'):
    onehotencoder_model = onehotencoder.fit(df_stringindexer)
else:
    onehotencoder_model = onehotencoder

# Transform the data: appends the onehotencoded_x1 column.
df_onehotencoder = onehotencoder_model.transform(df_stringindexer)

# Resulting df. In the recorded output the vectors have size 2 for three
# categories: the highest index (2.0) is shown as the empty vector (2,[],[]).
df_onehotencoder.show()

+---+------+---+---+---+---+----------+----------------+
| x1|    x2| x3| x4| y1| y2|indexed_x1|onehotencoded_x1|
+---+------+---+---+---+---+----------+----------------+
|  a| apple|  1|2.4|  1|yes|       1.0|   (2,[1],[1.0])|
|  a|orange|  1|2.5|  0| no|       1.0|   (2,[1],[1.0])|
|  b|orange|  2|3.5|  1| no|       0.0|   (2,[0],[1.0])|
|  b|orange|  2|1.4|  0|yes|       0.0|   (2,[0],[1.0])|
|  b| peach|  2|2.1|  0|yes|       0.0|   (2,[0],[1.0])|
|  c| peach|  4|1.5|  1|yes|       2.0|       (2,[],[])|
+---+------+---+---+---+---+----------+----------------+



In [29]:
from pyspark.ml import Pipeline

# Columns that get the StringIndexer -> OneHotEncoder treatment.
categorical_columns = ['x1', 'x2', 'x3']

# One indexing stage per categorical column: <col> -> stringindexed_<col>.
stringindexer_stages = [
    StringIndexer(inputCol=name, outputCol='stringindexed_' + name)
    for name in categorical_columns
]

# One encoding stage per categorical column:
# stringindexed_<col> -> onehotencoded_<col>.
onehotencoder_stages = [
    OneHotEncoder(inputCol='stringindexed_' + name, outputCol='onehotencoded_' + name)
    for name in categorical_columns
]

# All indexers come first so every encoder finds its input column.
all_stages = stringindexer_stages + onehotencoder_stages

# Assemble the stages into a single pipeline, fit it on df, then apply it.
pipeline = Pipeline(stages=all_stages)
pipeline_mode = pipeline.fit(df)
df_coded = pipeline_mode.transform(df)

# Keep only the encoded columns plus x4, y1 and y2; the raw categorical
# columns and the intermediate stringindexed_* columns are dropped.
selected_columns = ['onehotencoded_' + name for name in categorical_columns] + ['x4', 'y1', 'y2']
df_coded = df_coded.select(selected_columns)

In [30]:
# Show the encoded frame: three one-hot vector columns followed by x4, y1, y2.
df_coded.show()

+----------------+----------------+----------------+---+---+---+
|onehotencoded_x1|onehotencoded_x2|onehotencoded_x3| x4| y1| y2|
+----------------+----------------+----------------+---+---+---+
|   (2,[1],[1.0])|       (2,[],[])|   (2,[1],[1.0])|2.4|  1|yes|
|   (2,[1],[1.0])|   (2,[0],[1.0])|   (2,[1],[1.0])|2.5|  0| no|
|   (2,[0],[1.0])|   (2,[0],[1.0])|   (2,[0],[1.0])|3.5|  1| no|
|   (2,[0],[1.0])|   (2,[0],[1.0])|   (2,[0],[1.0])|1.4|  0|yes|
|   (2,[0],[1.0])|   (2,[1],[1.0])|   (2,[0],[1.0])|2.1|  0|yes|
|       (2,[],[])|   (2,[1],[1.0])|       (2,[],[])|1.5|  1|yes|
+----------------+----------------+----------------+---+---+---+



In [25]:
# NOTE(review): leftover scratch expression (execution count 25 is out of
# order). It builds names with the prefix 'onehotencode_', which does not match
# the 'onehotencoded_' prefix used for the pipeline's output columns, and its
# value is not assigned to anything. Candidate for removal.
['onehotencode_' + c for c in categorical_columns] + ['y1', 'y2']

['onehotencode_x1', 'onehotencode_x2', 'onehotencode_x3', 'y1', 'y2']

In [32]:
from pyspark.ml.feature import VectorAssembler

# Columns that must stay out of the feature vector.
# NOTE(review): y1/y2 are presumably the labels — confirm.
label_columns = ['y1', 'y2']

# Feature columns: every column of df_coded except the ones listed above.
# Selecting by name rather than by the positional slice columns[0:4] keeps
# this correct when the number of encoded columns changes; a fixed slice would
# then silently drop x4 or pull y1 into the features.
feature_columns = [c for c in df_coded.columns if c not in label_columns]

# Build the VectorAssembler that concatenates the feature columns into a
# single vector column named 'features'.
vectorassembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Transform the data: appends the 'features' column to df_coded.
df_features = vectorassembler.transform(df_coded)

In [34]:
# Show the assembled frame; truncate=False prints the full 'features' vectors
# instead of cutting long cell values.
df_features.show(truncate=False)

+----------------+----------------+----------------+---+---+---+-----------------------------+
|onehotencoded_x1|onehotencoded_x2|onehotencoded_x3|x4 |y1 |y2 |features                     |
+----------------+----------------+----------------+---+---+---+-----------------------------+
|(2,[1],[1.0])   |(2,[],[])       |(2,[1],[1.0])   |2.4|1  |yes|(7,[1,5,6],[1.0,1.0,2.4])    |
|(2,[1],[1.0])   |(2,[0],[1.0])   |(2,[1],[1.0])   |2.5|0  |no |[0.0,1.0,1.0,0.0,0.0,1.0,2.5]|
|(2,[0],[1.0])   |(2,[0],[1.0])   |(2,[0],[1.0])   |3.5|1  |no |[1.0,0.0,1.0,0.0,1.0,0.0,3.5]|
|(2,[0],[1.0])   |(2,[0],[1.0])   |(2,[0],[1.0])   |1.4|0  |yes|[1.0,0.0,1.0,0.0,1.0,0.0,1.4]|
|(2,[0],[1.0])   |(2,[1],[1.0])   |(2,[0],[1.0])   |2.1|0  |yes|[1.0,0.0,0.0,1.0,1.0,0.0,2.1]|
|(2,[],[])       |(2,[1],[1.0])   |(2,[],[])       |1.5|1  |yes|(7,[3,6],[1.0,1.5])          |
+----------------+----------------+----------------+---+---+---+-----------------------------+

