In [1]:
from pyspark import SparkConf, SparkContext
# Application configuration; "test" is the name this application is
# registered under.
conf = SparkConf().setAppName("test")
# getOrCreate() hands back the already-running context when there is one, so
# re-running this cell in a live kernel no longer fails with
# "Cannot run multiple SparkContexts at once" the way a second
# SparkContext(conf=conf) call does. Note that `conf` is only applied when a
# new context actually has to be created.
sc = SparkContext.getOrCreate(conf=conf)

In [2]:
import numpy as np

TOTAL = 1000000
SEED = 42  # fixed so that Restart & Run All reproduces the same points

# Draw all TOTAL points with one vectorised call instead of TOTAL separate
# two-element draws. random_sample() is uniform on [0, 1), so every coordinate
# ends up uniform on [-1, 1); each row of `points` is one (x, y) point.
rng = np.random.RandomState(SEED)
points = 2.0 * rng.random_sample((TOTAL, 2)) - 1.0

# list(points) yields one 2-element array per point, so the RDD elements have
# the same layout as before. The RDD is cached because it is reused below.
dots = sc.parallelize(list(points)) \
         .cache()
print("Number of random points:", dots.count())

# Element-wise summary statistics: one value per coordinate.
stats = dots.stats()
print('Mean:', stats.mean())
print('stdev:', stats.stdev())

Number of random points: 1000000
Mean: [ 0.00015477 -0.0005326 ]
stdev: [0.57721785 0.57704843]


In [None]:
%matplotlib inline
from operator import itemgetter
from matplotlib import pyplot as plt

# 1% sample of the points, drawn without replacement.
sample = dots.sample(False, 0.01)

# Predicate: True when a point lies inside (or on) the unit circle.
inCircle = lambda v: np.linalg.norm(v) <= 1.0

# Distributed inside/outside subsets. They are only declared here (no Spark
# job runs on them in this cell); the plot data below comes from one collect.
# NOTE(review): kept in case later cells use them -- drop if they do not.
dotsIn = sample.filter(inCircle) \
               .cache()
dotsOut = sample.filter(lambda v: not inCircle(v)) \
                .cache()

# Bring the sampled points to the driver with a single Spark job instead of
# one job per coordinate list, then split them locally. reshape(-1, 2) keeps
# the (n, 2) layout even if the sample happens to be empty.
sampled = np.array(sample.collect()).reshape(-1, 2)
X, Y = sampled[:, 0], sampled[:, 1]

# Same test as inCircle, applied to every row at once.
inside = np.linalg.norm(sampled, axis=1) <= 1.0
Xin, Yin = X[inside], Y[inside]
Xout, Yout = X[~inside], Y[~inside]

fig, (axAll, axSplit) = plt.subplots(1, 2, figsize=(10, 5))
for ax in (axAll, axSplit):
    ax.set_xlim((-1.0, 1.0))
    ax.set_ylim((-1.0, 1.0))
    # Equal scaling on both axes so the unit circle is not drawn as an ellipse.
    ax.set_aspect('equal')
    ax.set_xlabel('x')
    ax.set_ylabel('y')

# Plot 1: every sampled point
axAll.scatter(X, Y)
axAll.set_title('Sampled points')

# Plot 2: points inside the circle in red, points outside in the default colour
axSplit.scatter(Xin, Yin, color='r')
axSplit.scatter(Xout, Yout)
axSplit.set_title('Inside (red) vs. outside the unit circle')

plt.show()