# Decision Tree

1. Decision tree classifier
2. Decision tree regression <br/>

The evaluatiors are "BinaryClassificationEvaluator", "RegressionEvaluator", and "MulticlassClassificationEvaluator". All of them take two inputs (label, and prediction), and one output (metric). The model prediction performance is evluated by the performance indicator (metric). See detail:<br/>
Binary:https://spark.apache.org/docs/2.3.0/api/java/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.html#metricName--
Regreesion:https://spark.apache.org/docs/2.3.0/api/java/org/apache/spark/ml/evaluation/RegressionEvaluator.html#metricName--
Multiclass:https://spark.apache.org/docs/2.3.0/api/java/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.html#metricName--

In [3]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.regression.{DecisionTreeRegressor, DecisionTreeRegressionModel}
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.ml.classification.{DecisionTreeClassifier, DecisionTreeClassificationModel}
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, RegressionEvaluator}

val sparkSession = SparkSession.builder.
    master("local[4]").
    appName("Decision Tree").
    getOrCreate()

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.regression.{DecisionTreeRegressor, DecisionTreeRegressionModel}
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.ml.classification.{DecisionTreeClassifier, DecisionTreeClassificationModel}
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, RegressionEvaluator}
sparkSession: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@50b1565a


In [4]:
// Read Data
val rawRDD = sparkSession.sparkContext.textFile("files/credit_card_clients.csv", 1)
val fileEnd = rawRDD.count.toInt // convert long into int for arithmetic operation
val noHeaderRDD = rawRDD.mapPartitions(_.take(fileEnd-1)).mapPartitions(_.drop(2)) // remove the last empty line and the first two lines (header)
val dataRDD = noHeaderRDD.map(_.replace("\"", "").split(",").map(_.toDouble)).repartition(20)
//*dataRDD.partitions.size // check partition size
//*dataRDD.groupBy("label").count.show // check the labels
val dataDF = dataRDD.map(x => (x(24), Vectors.dense(x.take(24)))).toDF("label", "features")
val Array(trainingData, testData) = dataDF.randomSplit(Array(0.7, 0.3), seed = 123)

// Build pipeline about decision tree classification
val labelIndexer = new StringIndexer().
  setInputCol("label").
  setOutputCol("indexedLabel").
  fit(dataDF)

val featureIndexer = new VectorIndexer().
  setInputCol("features").
  setOutputCol("indexedFeatures").
  setMaxCategories(10). // features with > 10 distinct values are treated as continuous.
  fit(dataDF)

val dt = new DecisionTreeClassifier().
  setLabelCol("indexedLabel").
  setFeaturesCol("indexedFeatures")

val labelConverter = new IndexToString().
  setInputCol("prediction").
  setOutputCol("predictedLabel").
  setLabels(labelIndexer.labels)

val pipeline = new Pipeline().
  setStages(Array(labelIndexer, featureIndexer, dt, labelConverter))

// Cross validation
val paramGrid = new ParamGridBuilder().
  addGrid(dt.maxBins, Array(16, 32, 64)).
  addGrid(dt.maxDepth, Array(5, 7, 9)).
  addGrid(dt.impurity, Array("gini", "entropy")).
  build()

val cv = new CrossValidator().
  setEstimator(pipeline).
  setEvaluator(new BinaryClassificationEvaluator).
  setEstimatorParamMaps(paramGrid).
  setNumFolds(5)  // Use 3+ in practice

val cvModel = cv.fit(trainingData)

// Evaluation
val evaluator = new BinaryClassificationEvaluator().
    setLabelCol("indexedLabel").
    setRawPredictionCol("prediction").
    setMetricName("areaUnderROC")

val predictions = cvModel.transform(testData)
val roc = evaluator.evaluate(predictions)
println("The ROC on the test data is " + roc)

// Check the model property
val bestDTM = cvModel.bestModel.asInstanceOf[PipelineModel].stages(2).asInstanceOf[DecisionTreeClassificationModel]
//*bestDTM.getImpurity
//*bestDTM.getMaxBins
//*bestDTM.getMaxDepth

The ROC on the test data is 0.6484120120797818


rawRDD: org.apache.spark.rdd.RDD[String] = files/credit_card_clients.csv MapPartitionsRDD[1] at textFile at <console>:43
fileEnd: Int = 30003
noHeaderRDD: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[3] at mapPartitions at <console>:45
dataRDD: org.apache.spark.rdd.RDD[Array[Double]] = MapPartitionsRDD[8] at repartition at <console>:46
dataDF: org.apache.spark.sql.DataFrame = [label: double, features: vector]
trainingData: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: double, features: vector]
testData: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: double, features: vector]
labelIndexer: org.apache.spark.ml.feature.StringIndexerModel = strIdx_8e203391ea78
featureIndexer: org.apache.spark.ml.feature.VectorIndexerModel = vecIdx_009e55388bba
dt:...

In [5]:
// Read data
val rawRDD = sparkSession.sparkContext.textFile("files/winequality-white.csv", 1)
val noHeaderRDD = rawRDD.mapPartitions(_.drop(1)) // remove first line (header)
val dataRDD = noHeaderRDD.map(_.split(";").map(_.toDouble)).repartition(20)
//*dataRDD.partitions.size // check partition size
//*dataRDD.groupBy("label").count.show // check the labels
val dataDF = dataRDD.map(x => (x(11), Vectors.dense(x.take(11)))).toDF("label", "features")
val Array(trainingData, testData) = dataDF.randomSplit(Array(0.7, 0.3), seed = 123)

// Build pipeline about decision tree regression
val featureIndexer = new VectorIndexer().
  setInputCol("features").
  setOutputCol("indexedFeatures").
  setMaxCategories(4). //we treat features with > 4 distinct values as continuous.
  fit(dataDF)

val dt = new DecisionTreeRegressor().
  setLabelCol("label").
  setFeaturesCol("indexedFeatures")

val pipeline = new Pipeline().
  setStages(Array(featureIndexer, dt))

// Cross validation
val paramGrid = new ParamGridBuilder().
  addGrid(dt.maxBins, Array(10, 15, 20)).
  addGrid(dt.maxDepth, Array(5, 7, 9)).
  build()

val cv = new CrossValidator().
  setEstimator(pipeline).
  setEvaluator(new RegressionEvaluator).
  setEstimatorParamMaps(paramGrid).
  setNumFolds(5)  // Use 3+ in practice

val cvModel = cv.fit(trainingData)

// Evaluation
val evaluator = new RegressionEvaluator().
    setLabelCol("label").
    setPredictionCol("prediction").
    setMetricName("rmse")

val predictions = cvModel.transform(testData)
val rmse = evaluator.evaluate(predictions)
println("The RMSE on the test data is " + rmse)

// Check the model property
val bestDTM = cvModel.bestModel.asInstanceOf[PipelineModel].stages(1).asInstanceOf[DecisionTreeRegressionModel]
//*bestDTM.getImpurity
//*bestDTM.getMaxBins
//*bestDTM.getMaxDepth

The RMSE on the test data is 0.7285336650923595


rawRDD: org.apache.spark.rdd.RDD[String] = files/winequality-white.csv MapPartitionsRDD[4899] at textFile at <console>:61
noHeaderRDD: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[4900] at mapPartitions at <console>:62
dataRDD: org.apache.spark.rdd.RDD[Array[Double]] = MapPartitionsRDD[4905] at repartition at <console>:63
dataDF: org.apache.spark.sql.DataFrame = [label: double, features: vector]
trainingData: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: double, features: vector]
testData: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: double, features: vector]
featureIndexer: org.apache.spark.ml.feature.VectorIndexerModel = vecIdx_a6bd7a8e7f61
dt: org.apache.spark.ml.regression.DecisionTreeRegressor = dtr_ad4c21322509
pipeline: org.apache.spa...