In [1]:
import pyspark
from pyspark import SparkContext
from pyspark import SparkConf

In [2]:
conf=SparkConf().setAppName("miniProject").setMaster("local").set("spark.executor.memory","3g")\
        .set("spark.executor.instances","2")
sc=SparkContext.getOrCreate(conf)


In [3]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.util import MLUtils

In [4]:
data = MLUtils.loadLibSVMFile(sc, "hdfs://node1:9000/user/root/exp4/procd_train_real")

# Split data into training (60%) and test (40%)
training, test = data.randomSplit([0.6, 0.4], seed=11)
training.cache()


PythonRDD[4] at RDD at PythonRDD.scala:53

## Logistic Regression

In [5]:
# Logistic Regression
# Run training algorithm to build the model
model = LogisticRegressionWithLBFGS.train(training)

In [6]:
# Compute raw scores on the test set
predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))

# Instantiate metrics object
metrics = BinaryClassificationMetrics(predictionAndLabels)

# Area under precision-recall curve
print("Area under PR = %s" % metrics.areaUnderPR)

# Area under ROC curve
print("Area under ROC = %s" % metrics.areaUnderROC)

Area under PR = 0.12141015007401622
Area under ROC = 0.5016382862362231


In [7]:
testErr = predictionAndLabels.filter(lambda lp: lp[0] != lp[1]).count() / float(test.count())
print("Test Error = " + str(testErr))

Test Error = 0.0625555970042182


## SVM

In [8]:
# SVM
# Build the model
from pyspark.mllib.classification import SVMWithSGD, SVMModel
svm_model = SVMWithSGD.train(training, iterations=100)

In [9]:
# Evaluating the model on training data
labelsAndPreds = training.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(training.count())
print("Training Error = " + str(trainErr))

Training Error = 0.06195103539602219


In [10]:
# Evaluating the model on test data
labelsAndPreds = test.map(lambda p: (p.label, svm_model.predict(p.features)))
testErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(test.count())
print("Test Error = " + str(testErr))

Test Error = 0.06154169894879815


In [None]:
# model.save(sc, "target/tmp/pythonSVMWithSGDModel")
# sameModel = SVMModel.load(sc, "target/tmp/pythonSVMWithSGDModel")

## Decision Tree

In [11]:
# DecisionTree
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
# Train a DecisionTree model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(training, numClasses=2, categoricalFeaturesInfo={},
                                         impurity='gini', maxDepth=5, maxBins=32)

In [12]:
# Evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(
    lambda lp: lp[0] != lp[1]).count() / float(test.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())

Test Error = 0.061589524328770795
Learned classification tree model:
DecisionTreeModel classifier of depth 5 with 21 nodes
  If (feature 3 <= 6.5)
   If (feature 8 <= 1.5)
    If (feature 3 <= 2.5)
     If (feature 4 <= 3.5)
      Predict: 0.0
     Else (feature 4 > 3.5)
      Predict: 1.0
    Else (feature 3 > 2.5)
     Predict: 0.0
   Else (feature 8 > 1.5)
    Predict: 0.0
  Else (feature 3 > 6.5)
   If (feature 3 <= 19.5)
    Predict: 0.0
   Else (feature 3 > 19.5)
    If (feature 4 <= 5.5)
     If (feature 8 <= 8.5)
      Predict: 0.0
     Else (feature 8 > 8.5)
      If (feature 5 <= 1.5)
       Predict: 0.0
      Else (feature 5 > 1.5)
       Predict: 1.0
    Else (feature 4 > 5.5)
     If (feature 8 <= 2.5)
      Predict: 0.0
     Else (feature 8 > 2.5)
      If (feature 5 <= 17.5)
       Predict: 0.0
      Else (feature 5 > 17.5)
       Predict: 1.0



In [None]:
# Save and load model
# model.save(sc, "target/tmp/myDecisionTreeClassificationModel")
# sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeClassificationModel")

## Naive Bayes

In [13]:
import shutil
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
# Train a naive Bayes model.
model = NaiveBayes.train(training, 1.0)

In [14]:
# Make prediction and test accuracy.
predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
accuracy = 1.0 * predictionAndLabel.filter(lambda pl: pl[0] == pl[1]).count() / test.count()
print('model accuracy {}'.format(accuracy))

model accuracy 0.8911876954862407


In [None]:
# Save and load model
# output_dir = 'target/tmp/myNaiveBayesModel'
# shutil.rmtree(output_dir, ignore_errors=True)
# model.save(sc, output_dir)
# sameModel = NaiveBayesModel.load(sc, output_dir)
# predictionAndLabel = test.map(lambda p: (sameModel.predict(p.features), p.label))
# accuracy = 1.0 * predictionAndLabel.filter(lambda pl: pl[0] == pl[1]).count() / test.count()
# print('sameModel accuracy {}'.format(accuracy))

## Random Forest

In [16]:
from pyspark.mllib.tree import RandomForest, RandomForestModel
# Train a RandomForest model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
#  Note: Use larger numTrees in practice.
#  Setting featureSubsetStrategy="auto" lets the algorithm choose.
model = RandomForest.trainClassifier(training, numClasses=2, categoricalFeaturesInfo={},
                                        numTrees=3, featureSubsetStrategy="auto",
                                        impurity='gini', maxDepth=4, maxBins=32)

In [17]:
# Evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(
    lambda lp: lp[0] != lp[1]).count() / float(test.count())
print('Test Error = ' + str(testErr))
print('Learned classification forest model:')
print(model.toDebugString())

Test Error = 0.061551264024792676
Learned classification forest model:
TreeEnsembleModel classifier with 3 trees

  Tree 0:
    If (feature 3 <= 8.5)
     If (feature 3 <= 2.5)
      If (feature 3 <= 1.5)
       Predict: 0.0
      Else (feature 3 > 1.5)
       If (feature 6 <= 10.5)
        Predict: 0.0
       Else (feature 6 > 10.5)
        Predict: 1.0
     Else (feature 3 > 2.5)
      Predict: 0.0
    Else (feature 3 > 8.5)
     If (feature 6 <= 8.5)
      Predict: 0.0
     Else (feature 6 > 8.5)
      If (feature 8 <= 3.5)
       Predict: 0.0
      Else (feature 8 > 3.5)
       If (feature 5 <= 11.5)
        Predict: 0.0
       Else (feature 5 > 11.5)
        Predict: 1.0
  Tree 1:
    If (feature 8 <= 1.5)
     Predict: 0.0
    Else (feature 8 > 1.5)
     If (feature 3 <= 18.5)
      Predict: 0.0
     Else (feature 3 > 18.5)
      If (feature 8 <= 8.5)
       Predict: 0.0
      Else (feature 8 > 8.5)
       If (feature 6 <= 11.5)
        Predict: 0.0
       Else (feature 6 > 11.5)

In [None]:
# Save and load model
# model.save(sc, "target/tmp/myRandomForestClassificationModel")
# sameModel = RandomForestModel.load(sc, "target/tmp/myRandomForestClassificationModel")

## Gradient Boosted Trees

In [18]:
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
# Train a GradientBoostedTrees model.
#  Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
#         (b) Use more iterations in practice.
model = GradientBoostedTrees.trainClassifier(training,
                                                categoricalFeaturesInfo={}, numIterations=3)

In [19]:
# Evaluate model on test instances and compute test error
predictions = model.predict(test.map(lambda x: x.features))
labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(
    lambda lp: lp[0] != lp[1]).count() / float(test.count())
print('Test Error = ' + str(testErr))
print('Learned classification GBT model:')
print(model.toDebugString())

Test Error = 0.06154169894879815
Learned classification GBT model:
TreeEnsembleModel classifier with 3 trees

  Tree 0:
    If (feature 3 <= 6.5)
     If (feature 8 <= 1.5)
      If (feature 3 <= 2.5)
       Predict: -0.9135673567356736
      Else (feature 3 > 2.5)
       Predict: -0.8897163920860603
     Else (feature 8 > 1.5)
      If (feature 8 <= 2.5)
       Predict: -0.8504097017371354
      Else (feature 8 > 2.5)
       Predict: -0.8043789645999591
    Else (feature 3 > 6.5)
     If (feature 3 <= 19.5)
      If (feature 8 <= 1.5)
       Predict: -0.8410674825769165
      Else (feature 8 > 1.5)
       Predict: -0.7775098640946954
     Else (feature 3 > 19.5)
      If (feature 4 <= 5.5)
       Predict: -0.7669565217391304
      Else (feature 4 > 5.5)
       Predict: -0.6267470539873938
  Tree 1:
    If (feature 4 <= 2.5)
     If (feature 8 <= 1.5)
      If (feature 0 <= 3.5)
       Predict: -0.3896669891412022
      Else (feature 0 > 3.5)
       Predict: -0.3561260039709395
     El

In [None]:
# Save and load model
# model.save(sc, "target/tmp/myGradientBoostingClassificationModel")
# sameModel = GradientBoostedTreesModel.load(sc,
#                                             "target/tmp/myGradientBoostingClassificationModel")

# Don't forget to stop the SparkContext!

In [20]:
sc.stop()