In [1]:
# Echo the SparkContext to confirm it is available.
# NOTE(review): `sc` is not created anywhere in this notebook; presumably it is
# injected by the PySpark kernel/shell -- confirm before running elsewhere.
sc

<pyspark.context.SparkContext at 0x7fda71230630>

In [2]:
# Build a SQLContext on top of the existing SparkContext `sc`.
# NOTE(review): `sqlc` is not referenced by any cell visible here -- confirm it
# is still needed.
from pyspark.sql import SQLContext
sqlc = SQLContext(sc)

## Random Forests

### Classification

In [3]:
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils

# Load the LibSVM-formatted sample file into an RDD of labeled points
# (each record exposes `.label` and `.features`, used below).
data = MLUtils.loadLibSVMFile(sc, 'sample_libsvm_data.txt')

# Hold out roughly 30% of the rows for evaluation. The fixed seed keeps the
# train/test split repeatable across kernel restarts; without it every run
# draws a different split and the reported metrics cannot be reproduced.
trainingData, testData = data.randomSplit([0.7, 0.3], seed=42)

# Split the held-out rows into true labels and feature vectors so that
# predictions can later be zipped back against the labels.
labels = testData.map(lambda x: x.label)
features = testData.map(lambda x: x.features)

In [4]:
# Train a small random-forest classifier on the training split.
#   numClasses=2               -> binary labels
#   categoricalFeaturesInfo={} -> treat every feature as continuous
#   numTrees=3, maxDepth=4     -> deliberately tiny forest for a readable dump
#   featureSubsetStrategy      -> "auto" lets MLlib pick the per-node subset
#   seed                       -> fixed so bootstrapping / feature subsetting
#                                 is repeatable from run to run
model = RandomForest.trainClassifier(trainingData,
                                     numClasses=2,
                                     categoricalFeaturesInfo={},
                                     numTrees=3,
                                     featureSubsetStrategy="auto",
                                     impurity='gini',
                                     maxDepth=4,
                                     maxBins=32,
                                     seed=42)

# Dump the learned trees as nested if/else rules.
print(model.toDebugString())

TreeEnsembleModel classifier with 3 trees

  Tree 0:
    If (feature 482 <= 0.0)
     If (feature 466 <= 0.0)
      If (feature 300 <= 3.0)
       Predict: 1.0
      Else (feature 300 > 3.0)
       Predict: 0.0
     Else (feature 466 > 0.0)
      Predict: 0.0
    Else (feature 482 > 0.0)
     Predict: 0.0
  Tree 1:
    If (feature 243 <= 71.0)
     If (feature 608 <= 40.0)
      Predict: 1.0
     Else (feature 608 > 40.0)
      Predict: 0.0
    Else (feature 243 > 71.0)
     If (feature 407 <= 0.0)
      Predict: 0.0
     Else (feature 407 > 0.0)
      Predict: 1.0
  Tree 2:
    If (feature 400 <= 0.0)
     If (feature 323 <= 38.0)
      If (feature 408 <= 0.0)
       Predict: 0.0
      Else (feature 408 > 0.0)
       Predict: 1.0
     Else (feature 323 > 38.0)
      Predict: 1.0
    Else (feature 400 > 0.0)
     Predict: 0.0



In [12]:
# Score the held-out feature vectors with the current `model`.
# NOTE(review): `model` is rebound by the regression cell further down, and the
# recorded execution counts (In [12] here vs. In [9]/In [11] below) suggest this
# cell was last run after that rebinding -- re-run top to bottom to confirm the
# reported error belongs to the classifier.
predictions = model.predict(features)

# Pair every true label with the prediction for the same row.
labelsAndPredictions = labels.zip(predictions)

# Test error = share of held-out rows whose prediction differs from the label.
numMisclassified = labelsAndPredictions.filter(lambda pair: pair[0] != pair[1]).count()
numTestRows = testData.count()
testErr = numMisclassified / float(numTestRows)
print('Test Error = ' + str(testErr))

Test Error = 0.0


### Regression

In [9]:
# Train a small random-forest regressor on the same training split.
# NOTE: this rebinds `model`, replacing the classifier trained earlier.
#   categoricalFeaturesInfo={} -> treat every feature as continuous
#   impurity='variance'        -> split criterion used for regression
#   numTrees=3, maxDepth=4     -> deliberately tiny forest for a readable dump
#   seed                       -> fixed so bootstrapping / feature subsetting
#                                 is repeatable from run to run
model = RandomForest.trainRegressor(trainingData,
                                    categoricalFeaturesInfo={},
                                    numTrees=3,
                                    featureSubsetStrategy="auto",
                                    impurity='variance',
                                    maxDepth=4,
                                    maxBins=32,
                                    seed=42)

# Dump the learned trees as nested if/else rules.
print(model.toDebugString())

TreeEnsembleModel regressor with 3 trees

  Tree 0:
    If (feature 406 <= 72.0)
     If (feature 293 <= 253.0)
      Predict: 0.0
     Else (feature 293 > 253.0)
      Predict: 1.0
    Else (feature 406 > 72.0)
     Predict: 1.0
  Tree 1:
    If (feature 540 <= 41.0)
     Predict: 1.0
    Else (feature 540 > 41.0)
     Predict: 0.0
  Tree 2:
    If (feature 540 <= 65.0)
     Predict: 1.0
    Else (feature 540 > 65.0)
     Predict: 0.0



In [11]:
# Score the held-out feature vectors with the current `model`.
predictions = model.predict(features)

# Pair every true label with the prediction for the same row.
labelsAndPredictions = labels.zip(predictions)

# Mean squared error over the held-out rows. The lambda indexes into the
# (label, prediction) pair rather than unpacking it in the parameter list:
# tuple-parameter unpacking (`lambda (v, p): ...`) is a SyntaxError on
# Python 3, while this form runs on both Python 2 and Python 3.
testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() / float(testData.count())
print('Test Mean Squared Error = ' + str(testMSE))

Test Mean Squared Error = 0.0292397660819
