In [0]:
# Toy frame with two float columns that mixes the kinds of "missing" values
# exercised by the cells below: Python None (shown as null), float NaN, and
# ordinary numbers.
df_with_nulls = spark.createDataFrame(
    [
        (None, None),
        (1.0, None),
        (float("nan"), 2.0),
        (11.0, 22.0),
    ],
    schema=["A", "B"],
)
df_with_nulls.show()

+----+----+
|   A|   B|
+----+----+
|null|null|
| 1.0|null|
| NaN| 2.0|
|11.0|22.0|
+----+----+



In [0]:
# how="all": discard a row only when every column in it is missing.
# Here that removes just the (null, null) row.
df_with_nulls.na.drop(how="all").show()

+----+----+
|   A|   B|
+----+----+
| 1.0|null|
| NaN| 2.0|
|11.0|22.0|
+----+----+



In [0]:
# how="any": discard a row as soon as one column is missing.
# NaN counts as missing too, so only the fully populated row survives.
df_with_nulls.na.drop(how="any").show()

+----+----+
|   A|   B|
+----+----+
|11.0|22.0|
+----+----+



In [0]:
# Replace every missing entry (null as well as NaN) with the constant 0.
df_with_nulls.fillna(0).show()

+----+----+
|   A|   B|
+----+----+
| 0.0| 0.0|
| 1.0| 0.0|
| 0.0| 2.0|
|11.0|22.0|
+----+----+



In [0]:
from pyspark.ml.feature import Imputer

# Mean-impute the missing entries of A and B into new columns out_a / out_b.
# The statistic is computed from the non-missing values only, e.g. column A
# becomes (1.0 + 11.0) / 2 = 6.0 wherever it was null or NaN.
imputer = Imputer(
    strategy="mean",  # the Imputer default, spelled out for the reader
    inputCols=["A", "B"],
    outputCols=["out_a", "out_b"],
)
model = imputer.fit(df_with_nulls)
df_mean = model.transform(df_with_nulls)
df_mean.show()

+----+----+-----+-----+
|   A|   B|out_a|out_b|
+----+----+-----+-----+
|null|null|  6.0| 12.0|
| 1.0|null|  1.0| 12.0|
| NaN| 2.0|  6.0|  2.0|
|11.0|22.0| 11.0| 22.0|
+----+----+-----+-----+



In [0]:
# Keep only the imputed columns; the raw A / B columns are no longer needed.
# NOTE: this rebinds df_mean, so the four-column frame shown above is gone
# after this cell runs.
df_mean = df_mean.select(["out_a", "out_b"])

In [0]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.linalg import DenseVector
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import Normalizer
from math import *

In [0]:
# Pack the two imputed scalar columns into a single vector column, which is
# the input format the Normalizer below is configured to read.
veca = VectorAssembler(
    inputCols=["out_a", "out_b"],
    outputCol="feature_vector",
)
df_feat = veca.transform(df_mean)
df_feat.show()

+-----+-----+--------------+
|out_a|out_b|feature_vector|
+-----+-----+--------------+
|  6.0| 12.0|    [6.0,12.0]|
|  1.0| 12.0|    [1.0,12.0]|
|  6.0|  2.0|     [6.0,2.0]|
| 11.0| 22.0|   [11.0,22.0]|
+-----+-----+--------------+



In [0]:
# Scale each row's feature vector to unit L2 norm (p=2.0 is the Normalizer
# default, written out explicitly). Rows are normalised independently, so
# [6, 12] and [11, 22] map to the same direction.
normalizer = Normalizer(
    inputCol="feature_vector",
    outputCol="features",
    p=2.0,
)
normalizer.transform(df_feat).collect()

Out[9]: [Row(out_a=6.0, out_b=12.0, feature_vector=DenseVector([6.0, 12.0]), features=DenseVector([0.4472, 0.8944])),
 Row(out_a=1.0, out_b=12.0, feature_vector=DenseVector([1.0, 12.0]), features=DenseVector([0.083, 0.9965])),
 Row(out_a=6.0, out_b=2.0, feature_vector=DenseVector([6.0, 2.0]), features=DenseVector([0.9487, 0.3162])),
 Row(out_a=11.0, out_b=22.0, feature_vector=DenseVector([11.0, 22.0]), features=DenseVector([0.4472, 0.8944]))]