In [66]:
from pyspark.ml.linalg import Vectors, Vector, VectorUDT
from pyspark.sql.types import ArrayType, DoubleType, IntegerType
from pyspark.ml.stat import ChiSquareTest
from pyspark.ml.feature import ChiSqSelector
from pyspark.sql.functions import udf, col
from pyspark.sql import Row

import numpy as np
from scipy import stats

## Case study test f the chi square test

In [4]:
dataset = [[0, Vectors.dense([0, 0, 1])],
           [0, Vectors.dense([1, 0, 1])],
           [1, Vectors.dense([2, 1, 1])],
           [1, Vectors.dense([3, 1, 1])]]

In [5]:
dataset = spark.createDataFrame(dataset, ["label", "features"])
chiSqResult = ChiSquareTest.test(dataset, 'features', 'label')

In [8]:
chiSqResult.show(truncate=False)

+--------------------------------------------+----------------+-------------+
|pValues                                     |degreesOfFreedom|statistics   |
+--------------------------------------------+----------------+-------------+
|[0.2614641299491107,0.04550026389635764,1.0]|[3, 1, 0]       |[4.0,4.0,0.0]|
+--------------------------------------------+----------------+-------------+



In [11]:
obs = np.array([[1,1,0,0], [0,0,1,1]])

In [12]:
stats.chi2_contingency(obs)

(4.0, 0.26146412994911117, 3, array([[ 0.5,  0.5,  0.5,  0.5],
        [ 0.5,  0.5,  0.5,  0.5]]))

In [44]:
selector = (ChiSqSelector()
            .setSelectorType("fpr")
            .setFpr(0.5)
            .setFeaturesCol("features")
            .setLabelCol("label")
            .setOutputCol("selectedFeatures"))

In [45]:
selector.fit(dataset).transform(dataset)

DataFrame[label: bigint, features: vector, selectedFeatures: vector]

In [47]:
model = selector.fit(dataset)

model.transform(dataset).show()

+-----+-------------+----------------+
|label|     features|selectedFeatures|
+-----+-------------+----------------+
|    0|[0.0,0.0,1.0]|       [0.0,0.0]|
|    0|[1.0,0.0,1.0]|       [1.0,0.0]|
|    1|[2.0,1.0,1.0]|       [2.0,1.0]|
|    1|[3.0,1.0,1.0]|       [3.0,1.0]|
+-----+-------------+----------------+



In [50]:
model.fpr

Param(parent=u'ChiSqSelector_424c8837ce5b75800f25', name='fpr', doc='The highest p-value for features to be kept.')

## Application on the costa poverty train data set on specific features

In [97]:
train = spark.read.csv("../../../../data/train/train.csv", header=True, inferSchema=True)

In [98]:
selected_features = ["paredblolad", "paredzocalo", "paredpreb", "pareddes", "paredmad", "paredzinc", "paredfibras", "paredother"]
features_broadcast = spark.sparkContext.broadcast(selected_features)

In [99]:
train.select(selected_features + ["Target"]).show()

+-----------+-----------+---------+--------+--------+---------+-----------+----------+------+
|paredblolad|paredzocalo|paredpreb|pareddes|paredmad|paredzinc|paredfibras|paredother|Target|
+-----------+-----------+---------+--------+--------+---------+-----------+----------+------+
|          1|          0|        0|       0|       0|        0|          0|         0|     4|
|          0|          0|        0|       0|       1|        0|          0|         0|     4|
|          0|          0|        0|       0|       1|        0|          0|         0|     4|
|          1|          0|        0|       0|       0|        0|          0|         0|     4|
|          1|          0|        0|       0|       0|        0|          0|         0|     4|
|          1|          0|        0|       0|       0|        0|          0|         0|     4|
|          1|          0|        0|       0|       0|        0|          0|         0|     4|
|          1|          0|        0|       0|       0|       

In [100]:
train.select(selected_features + ["Target"]).printSchema()

root
 |-- paredblolad: integer (nullable = true)
 |-- paredzocalo: integer (nullable = true)
 |-- paredpreb: integer (nullable = true)
 |-- pareddes: integer (nullable = true)
 |-- paredmad: integer (nullable = true)
 |-- paredzinc: integer (nullable = true)
 |-- paredfibras: integer (nullable = true)
 |-- paredother: integer (nullable = true)
 |-- Target: integer (nullable = true)



In [101]:
# train.stat.crosstab("Target", "paredblolad").show()

In [102]:
train.select(selected_features).dropDuplicates().show()

+-----------+-----------+---------+--------+--------+---------+-----------+----------+
|paredblolad|paredzocalo|paredpreb|pareddes|paredmad|paredzinc|paredfibras|paredother|
+-----------+-----------+---------+--------+--------+---------+-----------+----------+
|          0|          0|        0|       1|       0|        0|          0|         0|
|          0|          0|        1|       0|       0|        0|          0|         0|
|          1|          0|        0|       0|       0|        0|          0|         0|
|          0|          0|        0|       0|       0|        0|          0|         1|
|          0|          1|        0|       0|       0|        0|          0|         0|
|          0|          0|        0|       0|       0|        1|          0|         0|
|          0|          0|        0|       0|       0|        0|          1|         0|
|          0|          0|        0|       0|       1|        0|          0|         0|
+-----------+-----------+---------+--------

In [103]:
train.select(selected_features).dropDuplicates().count()

8

In [110]:
udf_create_dense_vector = udf(lambda values: Vectors.dense(values), VectorUDT())
udf_get_target = udf(lambda values: Vectors.dense([values.index(1)]), VectorUDT())

In [111]:
Person = Row("Target", "values")

rdd = (train
 .select(selected_features + ["Target"])
 .rdd
 .map(lambda row: (Person(row["Target"], [row[target] for target in features_broadcast.value]))))

In [115]:
df = (spark.createDataFrame(rdd)
      .withColumn("Features", udf_create_dense_vector(col("values")))
      .select(col("Target").cast(DoubleType()).alias("label"), col("values"), col("Features").alias("features"))
      .withColumn("target", udf_get_target(col("values")))
     )

In [116]:
df.first()

Row(label=4.0, values=[1, 0, 0, 0, 0, 0, 0, 0], features=DenseVector([1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), target=DenseVector([0.0]))

#### ChiSquareTest

In [117]:
chiSqResult_features = ChiSquareTest.test(df, 'features', 'label')
chiSqResult_target = ChiSquareTest.test(df, 'target', 'label')

In [46]:
chiSqResult.printSchema()

root
 |-- pValues: vector (nullable = true)
 |-- degreesOfFreedom: array (nullable = true)
 |    |-- element: integer (containsNull = false)
 |-- statistics: vector (nullable = true)



In [49]:
chiSqResult_features.select("pValues").show(truncate=False)

chiSqResult_features.select("degreesOfFreedom").show(truncate=False)

chiSqResult_features.select("statistics").show(truncate=False)

+----------------------------------------------------------------------------------------------------+
|pValues                                                                                             |
+----------------------------------------------------------------------------------------------------+
|[0.0,7.66053886991358E-15,0.0,0.0,0.0,7.11292535982011E-9,0.0037971881924065976,0.11572771545742688]|
+----------------------------------------------------------------------------------------------------+

+------------------------+
|degreesOfFreedom        |
+------------------------+
|[3, 3, 3, 3, 3, 3, 3, 3]|
+------------------------+

+------------------------------------------------------------------------------------------------------------------------------------------------+
|statistics                                                                                                                                      |
+-----------------------------------------------------

In [118]:
chiSqResult_target.select("pValues").show(truncate=False)

chiSqResult_target.select("degreesOfFreedom").show(truncate=False)

chiSqResult_target.select("statistics").show(truncate=False)

+-------+
|pValues|
+-------+
|[0.0]  |
+-------+

+----------------+
|degreesOfFreedom|
+----------------+
|[21]            |
+----------------+

+------------------+
|statistics        |
+------------------+
|[837.425781559185]|
+------------------+



#### ChiSqSelector

In [122]:
alpha = 0.05

chiSelector = (ChiSqSelector()
               .setSelectorType("fpr")
               .setFpr(alpha)
               .setLabelCol("label")
               .setFeaturesCol("features")
               .setOutputCol("$featureColumn-Selected"))

chiSelector.fit(df).transform(df).show(truncate=False)

+-----+------------------------+---------------------------------+------+-----------------------------+
|label|values                  |features                         |target|$featureColumn-Selected      |
+-----+------------------------+---------------------------------+------+-----------------------------+
|4.0  |[1, 0, 0, 0, 0, 0, 0, 0]|[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]|[0.0] |[1.0,0.0,0.0,0.0,0.0,0.0,0.0]|
|4.0  |[0, 0, 0, 0, 1, 0, 0, 0]|[0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0]|[4.0] |[0.0,0.0,0.0,0.0,1.0,0.0,0.0]|
|4.0  |[0, 0, 0, 0, 1, 0, 0, 0]|[0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0]|[4.0] |[0.0,0.0,0.0,0.0,1.0,0.0,0.0]|
|4.0  |[1, 0, 0, 0, 0, 0, 0, 0]|[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]|[0.0] |[1.0,0.0,0.0,0.0,0.0,0.0,0.0]|
|4.0  |[1, 0, 0, 0, 0, 0, 0, 0]|[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]|[0.0] |[1.0,0.0,0.0,0.0,0.0,0.0,0.0]|
|4.0  |[1, 0, 0, 0, 0, 0, 0, 0]|[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]|[0.0] |[1.0,0.0,0.0,0.0,0.0,0.0,0.0]|
|4.0  |[1, 0, 0, 0, 0, 0, 0, 0]|[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0

In [124]:
alpha = 0.05

chiSelector = (ChiSqSelector()
               .setSelectorType("fpr")
               .setFpr(alpha)
               .setLabelCol("label")
               .setFeaturesCol("target")
               .setOutputCol("$featureColumn-Selected"))

chiSelector.fit(df).transform(df).show(truncate=False)

+-----+------------------------+---------------------------------+------+-----------------------+
|label|values                  |features                         |target|$featureColumn-Selected|
+-----+------------------------+---------------------------------+------+-----------------------+
|4.0  |[1, 0, 0, 0, 0, 0, 0, 0]|[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]|[0.0] |[0.0]                  |
|4.0  |[0, 0, 0, 0, 1, 0, 0, 0]|[0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0]|[4.0] |[4.0]                  |
|4.0  |[0, 0, 0, 0, 1, 0, 0, 0]|[0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0]|[4.0] |[4.0]                  |
|4.0  |[1, 0, 0, 0, 0, 0, 0, 0]|[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]|[0.0] |[0.0]                  |
|4.0  |[1, 0, 0, 0, 0, 0, 0, 0]|[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]|[0.0] |[0.0]                  |
|4.0  |[1, 0, 0, 0, 0, 0, 0, 0]|[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]|[0.0] |[0.0]                  |
|4.0  |[1, 0, 0, 0, 0, 0, 0, 0]|[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]|[0.0] |[0.0]                  |
|4.0  |[1, 0, 0, 0, 

+-----+------------------------+---------------------------------+------+-----------------------------+
|label|values                  |features                         |target|$featureColumn-Selected      |
+-----+------------------------+---------------------------------+------+-----------------------------+
|4.0  |[1, 0, 0, 0, 0, 0, 0, 0]|[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]|[0.0] |[1.0,0.0,0.0,0.0,0.0,0.0,0.0]|
|4.0  |[0, 0, 0, 0, 1, 0, 0, 0]|[0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0]|[4.0] |[0.0,0.0,0.0,0.0,1.0,0.0,0.0]|
|4.0  |[0, 0, 0, 0, 1, 0, 0, 0]|[0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0]|[4.0] |[0.0,0.0,0.0,0.0,1.0,0.0,0.0]|
|4.0  |[1, 0, 0, 0, 0, 0, 0, 0]|[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]|[0.0] |[1.0,0.0,0.0,0.0,0.0,0.0,0.0]|
|4.0  |[1, 0, 0, 0, 0, 0, 0, 0]|[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]|[0.0] |[1.0,0.0,0.0,0.0,0.0,0.0,0.0]|
|4.0  |[1, 0, 0, 0, 0, 0, 0, 0]|[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]|[0.0] |[1.0,0.0,0.0,0.0,0.0,0.0,0.0]|
|4.0  |[1, 0, 0, 0, 0, 0, 0, 0]|[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0