In [1]:
#     >>> from pyspark.ml.linalg import Vectors
#     >>> from pyspark.ml.stat import ChiSquareTest
#     >>> dataset = [[0, Vectors.dense([0, 0, 1])],
#     ...            [0, Vectors.dense([1, 0, 1])],
#     ...            [1, Vectors.dense([2, 1, 1])],
#     ...            [1, Vectors.dense([3, 1, 1])]]
#     >>> dataset = spark.createDataFrame(dataset, ["label", "features"])
#     >>> chiSqResult = ChiSquareTest.test(dataset, 'features', 'label')
#     >>> chiSqResult.select("degreesOfFreedom").collect()[0]
#     Row(degreesOfFreedom=[3, 1, 0])

In [15]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import ChiSquareTest
from pyspark.ml.feature import ChiSqSelector

import numpy as np
from scipy import stats

In [4]:
# Four labelled rows in the form [label, features]; every features entry is a
# dense vector with three components.
dataset = [
    [0, Vectors.dense([0.0, 0.0, 1.0])],
    [0, Vectors.dense([1.0, 0.0, 1.0])],
    [1, Vectors.dense([2.0, 1.0, 1.0])],
    [1, Vectors.dense([3.0, 1.0, 1.0])],
]

In [5]:
# Turn the local rows into a Spark DataFrame with "label" and "features" columns.
# NOTE(review): this rebinds `dataset` from a Python list to a DataFrame, so the
# cell that builds the list presumably has to be re-run before this one is
# executed a second time -- confirm.
dataset = spark.createDataFrame(dataset, schema=["label", "features"])

# Chi-square test of each component of "features" against "label".
chiSqResult = ChiSquareTest.test(dataset, featuresCol="features", labelCol="label")

In [8]:
# Print the test result (pValues, degreesOfFreedom, statistics) without
# truncating the wide vector columns.
chiSqResult.show(truncate=False)

+--------------------------------------------+----------------+-------------+
|pValues                                     |degreesOfFreedom|statistics   |
+--------------------------------------------+----------------+-------------+
|[0.2614641299491107,0.04550026389635764,1.0]|[3, 1, 0]       |[4.0,4.0,0.0]|
+--------------------------------------------+----------------+-------------+



In [11]:
# Hand-built 2 x 4 table of observed counts, used to cross-check the Spark
# result with SciPy.
obs = np.array([
    [1, 1, 0, 0],
    [0, 0, 1, 1],
])

In [12]:
# Run SciPy's chi-square contingency test on the observed counts; the returned
# tuple is (statistic, p-value, degrees of freedom, expected frequencies).
stats.chi2_contingency(obs)

(4.0, 0.26146412994911117, 3, array([[ 0.5,  0.5,  0.5,  0.5],
        [ 0.5,  0.5,  0.5,  0.5]]))

In [44]:
# Chi-squared feature selector in "fpr" mode; `fpr` is the highest p-value a
# feature may have and still be kept (0.5 here).
selector = ChiSqSelector(
    selectorType="fpr",
    fpr=0.5,
    featuresCol="features",
    labelCol="label",
    outputCol="selectedFeatures",
)

In [45]:
# Fit and transform in a single expression. As the last expression of the cell,
# only the resulting DataFrame's schema repr is displayed (no rows are shown).
selector.fit(dataset).transform(dataset)

DataFrame[label: bigint, features: vector, selectedFeatures: vector]

In [47]:
# Keep the fitted selector model in a variable so it can be inspected later.
model = selector.fit(dataset)

# Show each input row next to the features the selector kept.
model.transform(dataset).show()

+-----+-------------+----------------+
|label|     features|selectedFeatures|
+-----+-------------+----------------+
|    0|[0.0,0.0,1.0]|       [0.0,0.0]|
|    0|[1.0,0.0,1.0]|       [1.0,0.0]|
|    1|[2.0,1.0,1.0]|       [2.0,1.0]|
|    1|[3.0,1.0,1.0]|       [3.0,1.0]|
+-----+-------------+----------------+



In [50]:
# NOTE(review): this displays the Param descriptor (parent, name, doc), not the
# configured threshold value. To read the value itself, something like
# model.getOrDefault(model.fpr) is presumably needed -- TODO confirm.
model.fpr

Param(parent=u'ChiSqSelector_424c8837ce5b75800f25', name='fpr', doc='The highest p-value for features to be kept.')