# 用test_format1训练模型并保存

In [1]:
import pyspark
from pyspark import SparkContext
from pyspark import SparkConf

In [2]:
conf=SparkConf().setAppName("miniProject").setMaster("local").set("spark.executor.memory","3g")\
        .set("spark.executor.instances","2")
sc=SparkContext.getOrCreate(conf)


In [3]:
from pyspark.mllib.util import MLUtils

In [4]:
training = MLUtils.loadLibSVMFile(sc, "hdfs://node1:9000/user/root/exp4/procd_train_real")

# Split data into training (60%) and test (40%)
#training, test = data.randomSplit([0.6, 0.4], seed=11)
training.cache()


PythonRDD[4] at RDD at PythonRDD.scala:53

## Logistic Regression

In [5]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel
#from pyspark.mllib.evaluation import BinaryClassificationMetrics
# Logistic Regression
# Run training algorithm to build the model
model = LogisticRegressionWithLBFGS.train(training)

In [6]:
# Evaluating the model on training data
labelsAndPreds = training.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(training.count())
print("Training Error = " + str(trainErr))

Training Error = 0.06215499263984298


In [7]:
model.save(sc, "hdfs://node1:9000/user/root/exp4/models/LogisticRegressionModel")
# sameModel = LogisticRegressionModel.load(sc, "hdfs://node1:9000/user/root/exp4/models/LogisticRegressionModel")

## SVM

In [5]:
# SVM
# Build the model
from pyspark.mllib.classification import SVMWithSGD, SVMModel
model = SVMWithSGD.train(training, iterations=200)

In [6]:
# Evaluating the model on training data
labelsAndPreds = training.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(training.count())
print("Training Error = " + str(trainErr))

Training Error = 0.08762036923454367


In [7]:
model.save(sc, "hdfs://node1:9000/user/root/exp4/models/SVMWithSGDModel")
# sameModel = SVMModel.load(sc, "hdfs://node1:9000/user/root/exp4/models/SVMWithSGDModel")

## Decision Tree

In [17]:
# DecisionTree
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
# Train a DecisionTree model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainClassifier(training, numClasses=2, categoricalFeaturesInfo={},
                                         impurity='gini', maxDepth=5, maxBins=32)

In [19]:
# Evaluating the model on training data
predictions = model.predict(training.map(lambda x: x.features))
labelsAndPreds = training.map(lambda lp: lp.label).zip(predictions)
trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(training.count())
print("Training Error = " + str(trainErr))

Training Error = 0.06113913763493621


In [20]:
print('Learned classification tree model:')
print(model.toDebugString())

Learned classification tree model:
DecisionTreeModel classifier of depth 5 with 25 nodes
  If (feature 3 <= 8.5)
   If (feature 8 <= 1.5)
    If (feature 3 <= 2.5)
     If (feature 0 <= 3.5)
      Predict: 0.0
     Else (feature 0 > 3.5)
      If (feature 4 <= 3.5)
       Predict: 0.0
      Else (feature 4 > 3.5)
       Predict: 1.0
    Else (feature 3 > 2.5)
     Predict: 0.0
   Else (feature 8 > 1.5)
    Predict: 0.0
  Else (feature 3 > 8.5)
   If (feature 3 <= 19.5)
    If (feature 8 <= 1.5)
     Predict: 0.0
    Else (feature 8 > 1.5)
     If (feature 5 <= 4.5)
      Predict: 0.0
     Else (feature 5 > 4.5)
      If (feature 2 <= 12.5)
       Predict: 1.0
      Else (feature 2 > 12.5)
       Predict: 0.0
   Else (feature 3 > 19.5)
    If (feature 5 <= 7.5)
     Predict: 0.0
    Else (feature 5 > 7.5)
     If (feature 3 <= 65.0)
      Predict: 0.0
     Else (feature 3 > 65.0)
      If (feature 8 <= 4.5)
       Predict: 0.0
      Else (feature 8 > 4.5)
       Predict: 1.0



In [21]:
# Save and load model
model.save(sc, "hdfs://node1:9000/user/root/exp4/models/myDecisionTreeClassificationModel")
# sameModel = DecisionTreeModel.load(sc, "hdfs://node1:9000/user/root/exp4/models/myDecisionTreeClassificationModel")

## Naive Bayes

In [22]:
import shutil
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
# Train a naive Bayes model.
model = NaiveBayes.train(training, 1.0)

In [23]:
# Evaluating the model on training data
labelsAndPreds = training.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(training.count())
print("Training Error = " + str(trainErr))

Training Error = 0.10636576913640824


In [24]:
# Save and load model
model.save(sc, "hdfs://node1:9000/user/root/exp4/models/NaiveBayesModel")
# sameModel = NaiveBayesModel.load(sc, "hdfs://node1:9000/user/root/exp4/models/NaiveBayesModel")

## Random Forest

In [25]:
from pyspark.mllib.tree import RandomForest, RandomForestModel
# Train a RandomForest model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
#  Note: Use larger numTrees in practice.
#  Setting featureSubsetStrategy="auto" lets the algorithm choose.
model = RandomForest.trainClassifier(training, numClasses=2, categoricalFeaturesInfo={},
                                        numTrees=3, featureSubsetStrategy="auto",
                                        impurity='gini', maxDepth=4, maxBins=32)

In [26]:
# Evaluating the model on training data
predictions = model.predict(training.map(lambda x: x.features))
labelsAndPreds = training.map(lambda lp: lp.label).zip(predictions)
trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(training.count())
print("Training Error = " + str(trainErr))

Training Error = 0.06115063788027478


In [27]:
print('Learned classification forest model:')
print(model.toDebugString())

Learned classification forest model:
TreeEnsembleModel classifier with 3 trees

  Tree 0:
    Predict: 0.0
  Tree 1:
    Predict: 0.0
  Tree 2:
    If (feature 4 <= 2.5)
     Predict: 0.0
    Else (feature 4 > 2.5)
     If (feature 3 <= 19.5)
      If (feature 3 <= 8.5)
       Predict: 0.0
      Else (feature 3 > 8.5)
       If (feature 5 <= 18.5)
        Predict: 0.0
       Else (feature 5 > 18.5)
        Predict: 1.0
     Else (feature 3 > 19.5)
      Predict: 0.0



In [28]:
# Save and load model
model.save(sc, "hdfs://node1:9000/user/root/exp4/models/myRandomForestClassificationModel")
# sameModel = RandomForestModel.load(sc, "hdfs://node1:9000/user/root/exp4/models/myRandomForestClassificationModel")

## Gradient Boosted Trees

In [8]:
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
# Train a GradientBoostedTrees model.
#  Notes: (a) Empty categoricalFeaturesInfo indicates all features are continuous.
#         (b) Use more iterations in practice.
model = GradientBoostedTrees.trainClassifier(training,
                                                categoricalFeaturesInfo={}, numIterations=30)

In [9]:
# Evaluating the model on training data
predictions = model.predict(training.map(lambda x: x.features))
labelsAndPreds = training.map(lambda lp: lp.label).zip(predictions)
trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(training.count())
print("Training Error = " + str(trainErr))

Training Error = 0.061142971050049066


In [10]:
print('Learned classification GBT model:')
print(model.toDebugString())


     Else (feature 8 > 1.5)
      If (feature 6 <= 10.5)
       Predict: -0.32301894307771395
      Else (feature 6 > 10.5)
       Predict: 0.16717204043018258
    Else (feature 4 > 2.5)
     If (feature 4 <= 7.5)
      If (feature 5 <= 3.5)
       Predict: -0.2941068285134676
      Else (feature 5 > 3.5)
       Predict: -0.20809792196806162
     Else (feature 4 > 7.5)
      If (feature 5 <= 26.5)
       Predict: -0.11963992385736985
      Else (feature 5 > 26.5)
       Predict: 1.1170996374172817
  Tree 2:
    If (feature 4 <= 2.5)
     If (feature 8 <= 1.5)
      If (feature 0 <= 3.5)
       Predict: -0.35234887156188743
      Else (feature 0 > 3.5)
       Predict: -0.3227571761025556
     Else (feature 8 > 1.5)
      If (feature 6 <= 10.5)
       Predict: -0.2890697339853412
      Else (feature 6 > 10.5)
       Predict: 0.1467771630807103
    Else (feature 4 > 2.5)
     If (feature 8 <= 3.5)
      If (feature 5 <= 3.5)
       Predict: -0.27113921230756444
      Else (feature 5 > 3.

In [11]:
# Save and load model
model.save(sc, "hdfs://node1:9000/user/root/exp4/models/myGradientBoostingClassificationModel")
# sameModel = GradientBoostedTreesModel.load(sc,
#                                             "hdfs://node1:9000/user/root/exp4/models/myGradientBoostingClassificationModel")

# Don't forget to stop the SparkContext!

In [33]:
sc.stop()