In [9]:
# Bootstrap Spark for this notebook. findspark.init() is called before
# `import pyspark` so that the pyspark package can be found on the import path.
import findspark
findspark.init()
import pyspark
# Start a SparkContext against the 'local[*]' master (local mode, no cluster).
sc = pyspark.SparkContext('local[*]')
from pyspark.sql import SQLContext
# SQL entry point wrapping the context above; it is not referenced again in
# the cells visible here.
sqlContext = SQLContext(sc)

In [3]:
import numpy as np
from sklearn import datasets

In [4]:
# Load the Iris data set and hold out a shuffled 30% of its rows for testing.
iris = datasets.load_iris()
size = len(iris.target)

# Shuffle the row indices with a private, seeded generator so the split (and
# every accuracy figure derived from it) is identical on each run. A dedicated
# RandomState is used instead of np.random.seed() so NumPy's global random
# state, which other cells may draw from, is left untouched.
indices = np.random.RandomState(42).permutation(size)

# Number of rows reserved for the test set (30% of the data, truncated).
cutoff = int(size * .30)

# The first `cutoff` shuffled indices form the test set, the rest the training set.
testIndices, trainIndices = indices[:cutoff], indices[cutoff:]

testX, testY = iris.data[testIndices, :], iris.target[testIndices]
trainX, trainY = iris.data[trainIndices, :], iris.target[trainIndices]

In [5]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a nearest-neighbours classifier (constructor defaults) on the training rows.
knn = KNeighborsClassifier()
knn.fit(trainX, trainY)

# Label the held-out rows.
predictions = knn.predict(testX)

# Accuracy: the fraction of held-out rows whose predicted label matches.
print((predictions == testY).sum() / float(len(testY)))

0.977777777778


In [6]:
from sklearn.cross_validation import train_test_split

def runNearestNeighbors(k):
    """Train and score a k-nearest-neighbours classifier on the Iris data.

    Every call loads the Iris data set itself, splits it with
    ``train_test_split`` (no ``random_state`` is passed, so the split is not
    pinned to a seed), fits a classifier that uses ``k`` neighbours and
    measures its accuracy on the test part of the split.

    Args:
        k: number of neighbours handed to ``KNeighborsClassifier``.

    Returns:
        A ``(k, accuracy)`` tuple, where accuracy is the fraction of test
        rows that were labelled correctly.
    """
    bunch = datasets.load_iris()

    # The labels are passed first, so the label halves come back first too.
    labelsTrain, labelsTest, featuresTrain, featuresTest = train_test_split(
        bunch.target, bunch.data)

    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(featuresTrain, labelsTrain)

    # Boolean mask of the test rows that were predicted correctly.
    hits = model.predict(featuresTest) == labelsTest

    return k, hits.sum() / float(len(labelsTest))

In [11]:
# Distribute the candidate neighbour counts 1..10 and score each one on the workers.
k = sc.parallelize(xrange(1, 11))
results = k.map(runNearestNeighbors)

# Bring the (k, accuracy) pairs back to the driver and print one per line.
print('\n'.join(str(pair) for pair in results.collect()))

(1, 0.94736842105263153)
(2, 0.97368421052631582)
(3, 0.97368421052631582)
(4, 0.97368421052631582)
(5, 0.94736842105263153)
(6, 0.97368421052631582)
(7, 0.94736842105263153)
(8, 0.94736842105263153)
(9, 1.0)
(10, 0.97368421052631582)


In [12]:
# Wrap the already-loaded Iris data set in a Spark broadcast variable; code
# running on the workers reads it through irisBroadcast.value.
irisBroadcast = sc.broadcast(iris)

In [13]:
def runNearestNeighborsBroadcast(k):
    """Train and score a k-nearest-neighbours classifier on the broadcast Iris data.

    The data set is read from the ``irisBroadcast`` broadcast variable rather
    than being loaded inside the function. It is split with
    ``train_test_split`` (no ``random_state`` is passed, so the split is not
    pinned to a seed), a classifier using ``k`` neighbours is fitted on the
    training part, and its accuracy on the test part is measured.

    Args:
        k: number of neighbours handed to ``KNeighborsClassifier``.

    Returns:
        A ``(k, accuracy)`` tuple, where accuracy is the fraction of test
        rows that were labelled correctly.
    """
    shared = irisBroadcast.value

    # The labels are passed first, so the label halves come back first too.
    labelsTrain, labelsTest, featuresTrain, featuresTest = train_test_split(
        shared.target, shared.data)

    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(featuresTrain, labelsTrain)

    # Boolean mask of the test rows that were predicted correctly.
    hits = model.predict(featuresTest) == labelsTest

    return k, hits.sum() / float(len(labelsTest))

# Same sweep over 1..10 neighbours, this time scoring against the broadcast data.
k = sc.parallelize(xrange(1, 11))
results = k.map(runNearestNeighborsBroadcast)

# Bring the (k, accuracy) pairs back to the driver and print one per line.
print('\n'.join(str(pair) for pair in results.collect()))

(1, 0.94736842105263153)
(2, 0.97368421052631582)
(3, 0.97368421052631582)
(4, 0.97368421052631582)
(5, 0.94736842105263153)
(6, 0.97368421052631582)
(7, 0.94736842105263153)
(8, 0.94736842105263153)
(9, 1.0)
(10, 0.97368421052631582)


In [16]:
from sklearn.cross_validation import KFold

# Ten cross-validation folds over the `size` row indices of the data set.
kf = KFold(size, n_folds=10)

# Distribute the folds as an RDD so each one can be scored on a worker.
folds = sc.parallelize(kf)

# Peek at the first two folds.
print(folds.take(2))

[(array([ 15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
        28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,
        41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,
        54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,
        67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
        80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,
        93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105,
       106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118,
       119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
       132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144,
       145, 146, 147, 148, 149]), array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])), (array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,
        41,  42,  43,  44,  4

In [17]:
import numpy as np
def runNearestNeighborsWithFolds(fold):
    """Score a 5-nearest-neighbours classifier on one cross-validation fold.

    The fold arrives as a single argument and is unpacked inside the body.
    Tuple parameters written in the signature (``def f((a, b))``) were removed
    from the language by PEP 3113, whereas this form is valid on both
    Python 2 and Python 3. Callers still pass one ``(trainIndex, testIndex)``
    tuple, exactly as before.

    Args:
        fold: a ``(trainIndex, testIndex)`` pair of row-index arrays used to
            select rows from the data held in ``irisBroadcast``.

    Returns:
        ``np.array([correct, total])``: the number of correctly labelled test
        rows and the number of test rows in this fold.
    """
    trainIndex, testIndex = fold
    iris_data = irisBroadcast.value

    # Assign training and test sets from the broadcast data using the fold indices
    XTrain = iris_data.data[trainIndex]
    yTrain = iris_data.target[trainIndex]
    XTest = iris_data.data[testIndex]
    yTest = iris_data.target[testIndex]

    # Build the model
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(XTrain, yTrain)

    # Calculate predictions
    predictions = knn.predict(XTest)

    # Compute the number of correct predictions and total predictions
    correct = (predictions == yTest).sum()
    total = len(testIndex)

    # Counts (not a ratio) are returned so results from several folds can be
    # added together element-wise before dividing.
    return np.array([correct, total])

# Score every fold on the workers; each element is np.array([correct, total]).
foldResults = folds.map(runNearestNeighborsWithFolds)
print('correct / total\n' + '\n'.join(str(row) for row in foldResults.collect()))

# Summing the per-fold arrays yields the overall correct and total counts,
# from which the cross-validated accuracy is printed.
correct, total = foldResults.sum()
print(correct / float(total))

correct / total
[15 15]
[15 15]
[15 15]
[15 15]
[12 15]
[13 15]
[15 15]
[13 15]
[12 15]
[15 15]
0.933333333333


In [19]:
# Pair each label with its feature row and distribute the pairs over four partitions.
irisData = sc.parallelize(zip(iris.target, iris.data), 4)
# First two (label, features) pairs, followed by a blank line.
print(str(irisData.take(2)) + ' \n')

# View the number of elements found in each of the four partitions
print(irisData.mapPartitions(lambda rows: [len(list(rows))]).collect())

# View the target (y) stored by partition, preceded by a blank line
print('\n' + str(irisData.keys().glom().collect()))

[(0, array([ 5.1,  3.5,  1.4,  0.2])), (0, array([ 4.9,  3. ,  1.4,  0.2]))] 

[37, 37, 37, 39]

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]]
