In [1]:
sc

<pyspark.context.SparkContext at 0x7f5b779645f8>

In [2]:
from pyspark.sql import SQLContext
sqlc = SQLContext(sc)

## Hypothesis Testing

### Goodness of Fit

In [3]:
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.mllib.stat import Statistics

vec = Vectors.dense([0.3, 0.2, 0.15, 0.1, 0.1, 0.1, 0.05])

In [8]:
goodnessOfFitTestResult = Statistics.chiSqTest(vec)

In [9]:
goodnessOfFitTestResult.statistic

0.295

In [10]:
goodnessOfFitTestResult.pValue

0.999520973435643

In [11]:
goodnessOfFitTestResult.nullHypothesis

'observed follows the same distribution as expected.'

### Independence

In [12]:
from pyspark.mllib.linalg import Matrices

mat = Matrices.dense(3, 2, [13.0, 47.0, 40.0, 80.0, 11.0, 9.0])

In [13]:
independenceTestResult = Statistics.chiSqTest(mat)

In [14]:
independenceTestResult.statistic

90.22588968846716

In [15]:
independenceTestResult.pValue

0.0

In [16]:
independenceTestResult.nullHypothesis

'the occurrence of the outcomes is statistically independent.'

### Independence - Labeled Points

In [17]:
from pyspark.mllib.regression import LabeledPoint

obs = sc.parallelize([LabeledPoint(0,Vectors.dense(1.0,2.0)),
                      LabeledPoint(0,Vectors.dense(0.5,1.5)),
                      LabeledPoint(1,Vectors.dense(1.0,8.0))])

In [18]:
featTestResults = Statistics.chiSqTest(obs)

In [19]:
featTestResults

[<pyspark.mllib.stat.test.ChiSqTestResult at 0x7f5b4b90bf98>,
 <pyspark.mllib.stat.test.ChiSqTestResult at 0x7f5b4b90bcc0>]

In [20]:
map(lambda r: {r.statistic, r.pValue, r.nullHypothesis}, featTestResults)

<map at 0x7f5b4b90b160>

### Distribution

In [21]:
from pyspark.mllib.random import RandomRDDs

data = RandomRDDs.normalRDD(sc, size=100, numPartitions=1, seed=13)

In [22]:
ks_result = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)

In [23]:
ks_result.statistic

0.12019890461912125

In [24]:
ks_result.pValue

0.10230385223938121

In [25]:
ks_result.nullHypothesis

'Sample follows theoretical distribution'

### Kernel Density Estimation

In [26]:
from pyspark.mllib.stat import KernelDensity

In [27]:
kd = KernelDensity()
kd.setSample(data)
kd.setBandwidth(0.1)

In [28]:
kd.estimate([-1.5, -1.0, -0.5, 0.0, 0.5, 1.0, 1.5])

array([ 0.1023487 ,  0.15699217,  0.2957955 ,  0.51760411,  0.38091952,
        0.30242779,  0.1841904 ])