# Decision Tree

1. Decision tree classifier
2. Decision tree regression <br/>

The evaluatiors are "BinaryClassificationEvaluator", "RegressionEvaluator", and "MulticlassClassificationEvaluator". All of them take two inputs (label, and prediction), and one output (metric). The model prediction performance is evluated by the performance indicator (metric). See detail:<br/>
Binary:https://spark.apache.org/docs/2.3.0/api/java/org/apache/spark/ml/evaluation/BinaryClassificationEvaluator.html#metricName--
Regreesion:https://spark.apache.org/docs/2.3.0/api/java/org/apache/spark/ml/evaluation/RegressionEvaluator.html#metricName--
Multiclass:https://spark.apache.org/docs/2.3.0/api/java/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.html#metricName--<br/><br/>
Supported impurity for classfier and regressor<br/>
Classifier:https://spark.apache.org/docs/2.3.0/api/java/org/apache/spark/ml/classification/DecisionTreeClassifier.html#supportedImpurities--
Regressor:https://spark.apache.org/docs/2.3.0/api/java/org/apache/spark/ml/regression/DecisionTreeRegressor.html#supportedImpurities--

In [3]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.regression.{DecisionTreeRegressor, DecisionTreeRegressionModel}
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.ml.classification.{DecisionTreeClassifier, DecisionTreeClassificationModel}
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, RegressionEvaluator}
import org.apache.spark.sql.Row

val sparkSession = SparkSession.builder.
    master("local[4]").
    appName("Decision Tree").
    getOrCreate()

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.regression.{DecisionTreeRegressor, DecisionTreeRegressionModel}
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.ml.classification.{DecisionTreeClassifier, DecisionTreeClassificationModel}
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, RegressionEvaluator}
import org.apache.spark.sql.Row
sparkSession: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@69954667


## Classification Decision Tree 1

In [2]:
// Read Data
val rawRDD = sparkSession.sparkContext.textFile("files/credit_card_clients.csv", 1)
val fileEnd = rawRDD.count.toInt // convert long into int for arithmetic operation
val noHeaderRDD = rawRDD.mapPartitions(_.take(fileEnd-1)).mapPartitions(_.drop(2)) // remove the last empty line and the first two lines (header)
val dataRDD = noHeaderRDD.map(_.replace("\"", "").split(",").map(_.toDouble)).repartition(20)
//*dataRDD.partitions.size // check partition size
//*dataRDD.groupBy("label").count.show // check the labels
val dataDF = dataRDD.map(x => (x(24), Vectors.dense(x.take(24)))).toDF("label", "features")
val Array(trainingData, testData) = dataDF.randomSplit(Array(0.7, 0.3), seed = 123)

// Build pipeline about decision tree classification
val labelIndexer = new StringIndexer().
  setInputCol("label").
  setOutputCol("indexedLabel").
  fit(dataDF)

val featureIndexer = new VectorIndexer().
  setInputCol("features").
  setOutputCol("indexedFeatures").
  setMaxCategories(10). // features with > 10 distinct values are treated as continuous.
  fit(dataDF)

val dt = new DecisionTreeClassifier().
  setLabelCol("indexedLabel").
  setFeaturesCol("indexedFeatures")

val labelConverter = new IndexToString().
  setInputCol("prediction").
  setOutputCol("predictedLabel").
  setLabels(labelIndexer.labels)

val pipeline = new Pipeline().
  setStages(Array(labelIndexer, featureIndexer, dt, labelConverter))

// Cross validation
val paramGrid = new ParamGridBuilder().
  addGrid(dt.maxBins, Array(16, 32, 64)).
  addGrid(dt.maxDepth, Array(5, 7, 9)).
  addGrid(dt.impurity, Array("gini", "entropy")).
  build()

val cv = new CrossValidator().
  setEstimator(pipeline).
  setEvaluator(new BinaryClassificationEvaluator).
  setEstimatorParamMaps(paramGrid).
  setNumFolds(5)  // Use 3+ in practice

val cvModel = cv.fit(trainingData)

// Evaluation
val evaluator = new BinaryClassificationEvaluator().
    setLabelCol("indexedLabel").
    setRawPredictionCol("prediction").
    setMetricName("areaUnderROC")

val predictions = cvModel.transform(testData)
// Check the prediction
// predictions.filter(_.getDouble(0) > 0.5).count
// predictions.filter(_(5).asInstanceOf[Vector](0) > 0.5).count
// Note: both "Vector" and "Vectors" are needed to be imported, i.e.
// import org.apache.spark.ml.linalg.{Vector, Vectors}
// Vector for the data type in "asInstanceOf[]"
// Vectors for creating a vector in "Vectors.dense()"

val roc = evaluator.evaluate(predictions)
println("The ROC on the test data is " + roc)

// Check the model property
val bestDTM = cvModel.bestModel.asInstanceOf[PipelineModel].stages(2).asInstanceOf[DecisionTreeClassificationModel]
//*bestDTM.getImpurity
//*bestDTM.getMaxBins
//*bestDTM.getMaxDepth

The ROC on the test data is 0.6484120120797818


rawRDD: org.apache.spark.rdd.RDD[String] = files/credit_card_clients.csv MapPartitionsRDD[1] at textFile at <console>:35
fileEnd: Int = 30003
noHeaderRDD: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[3] at mapPartitions at <console>:37
dataRDD: org.apache.spark.rdd.RDD[Array[Double]] = MapPartitionsRDD[8] at repartition at <console>:38
dataDF: org.apache.spark.sql.DataFrame = [label: double, features: vector]
trainingData: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: double, features: vector]
testData: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: double, features: vector]
labelIndexer: org.apache.spark.ml.feature.StringIndexerModel = strIdx_fbffeb218290
featureIndexer: org.apache.spark.ml.feature.VectorIndexerModel = vecIdx_c4f7c9863390
dt:...

## Classification Decision Tree 2
1. Drop lines containing <tt>NaN</tt>
2. Replace <tt>NaN</tt> with the average value from that column<br><br>

### Read data with 1:

In [None]:
// Read data Drop lines containing NaN
val rawRDD = sparkSession.sparkContext.textFile("files/subject101.dat")
val dataRDD = rawRDD.filter(!_.contains("NaN")).map(_.split(" ").map(_.toDouble)).repartition(20) // "!" operator can not used in "!a.contains()"
val dataDF = dataRDD.map(x => (x(1), Vectors.dense(x.take(54).drop(2)))).toDF("label", "features")

### Or read data with 2:

In [None]:
// Replace NaN with the average value from that column
val rawRDD = sparkSession.sparkContext.textFile("files/subject101.dat")
val rawDF = rawRDD.repartition(20).toDF
// val auxStr = (1 to 54).map(x => "$\"_tmp\"(" + (x-1).toString + ").as(" + "\"col" + x.toString + "\")")
// println(auxStr)
val dataNaNDF = rawDF.withColumn("_tmp", split($"value", " ")).select(
  $"_tmp"(0).as("col1"), $"_tmp"(1).as("col2"), $"_tmp"(2).as("col3"), $"_tmp"(3).as("col4"), $"_tmp"(4).as("col5"), $"_tmp"(5).as("col6"), $"_tmp"(6).as("col7"), $"_tmp"(7).as("col8"), $"_tmp"(8).as("col9"), $"_tmp"(9).as("col10"), $"_tmp"(10).as("col11"), $"_tmp"(11).as("col12"), $"_tmp"(12).as("col13"), $"_tmp"(13).as("col14"), $"_tmp"(14).as("col15"), $"_tmp"(15).as("col16"), $"_tmp"(16).as("col17"), $"_tmp"(17).as("col18"), $"_tmp"(18).as("col19"), $"_tmp"(19).as("col20"), $"_tmp"(20).as("col21"), $"_tmp"(21).as("col22"), $"_tmp"(22).as("col23"), $"_tmp"(23).as("col24"), $"_tmp"(24).as("col25"), $"_tmp"(25).as("col26"), $"_tmp"(26).as("col27"), $"_tmp"(27).as("col28"), $"_tmp"(28).as("col29"), $"_tmp"(29).as("col30"), $"_tmp"(30).as("col31"), $"_tmp"(31).as("col32"), $"_tmp"(32).as("col33"), $"_tmp"(33).as("col34"), $"_tmp"(34).as("col35"), $"_tmp"(35).as("col36"), $"_tmp"(36).as("col37"), $"_tmp"(37).as("col38"), $"_tmp"(38).as("col39"), $"_tmp"(39).as("col40"), $"_tmp"(40).as("col41"), $"_tmp"(41).as("col42"), $"_tmp"(42).as("col43"), $"_tmp"(43).as("col44"), $"_tmp"(44).as("col45"), $"_tmp"(45).as("col46"), $"_tmp"(46).as("col47"), $"_tmp"(47).as("col48"), $"_tmp"(48).as("col49"), $"_tmp"(49).as("col50"), $"_tmp"(50).as("col51"), $"_tmp"(51).as("col52"), $"_tmp"(52).as("col53"), $"_tmp"(53).as("col54")
  ).drop("_tmp")
val colLen = dataNaNDF.columns.length
val nonNaN = (0 until colLen).map(i => dataNaNDF.select(dataNaNDF.columns(i)).filter(x => x.getAs[String](0) != "NaN"))
val colMean = (0 until colLen).map(i => nonNaN(i).rdd.map(_(0).toString.toDouble).mean)
// Compute mean value without NaN
// val data = Seq((1,2,3), (3,4,5), (1,2,4)).toDF("A", "B", "C")
// data.select(data.columns.map(mean(_)): _*).show() // ":_*" means unpack a Array

// Creat column objectives
val columnsNaNReplace = dataNaNDF.columns.map(col).map(colName => {
    val colNum = colName.toString.drop(3).toInt-1;
    when(colName.isNaN,colMean(colNum).toString).otherwise(colName).as(s"${colName}")
})
val allDataDF = dataNaNDF.select(columnsNaNReplace: _*)

val dataDF = allDataDF.rdd.map(y => {
    val a = y.toSeq.map(x => x.toString.toDouble).toArray; 
    (a(1), Vectors.dense(a.take(54).drop(2)))}).
    toDF("label", "features")

In [None]:
val Array(trainingData, testData) = dataDF.randomSplit(Array(0.7, 0.3), seed = 123)

// Build pipeline about decision tree classification
val featureIndexer = new VectorIndexer().
  setInputCol("features").
  setOutputCol("indexedFeatures").
  setMaxCategories(10). // features with > 10 distinct values are treated as continuous.
  fit(dataDF)

val dt = new DecisionTreeClassifier().
  setLabelCol("label").
  setFeaturesCol("indexedFeatures")

val pl = new Pipeline().
  setStages(Array(featureIndexer, dt))

val dtModel = pl.fit(trainingData)
val predictions = dtModel.transform(testData)

val testErr = predictions.select("label", "prediction").
            filter(r => r.getDouble(0) != r.getDouble(1)).
            count().
            toDouble / testData.count()
println("Test Error = " + testErr)

## Regression Decision Tree 1

In [3]:
// Read data
val rawRDD = sparkSession.sparkContext.textFile("files/winequality-white.csv", 1)
val noHeaderRDD = rawRDD.mapPartitions(_.drop(1)) // remove first line (header)
val dataRDD = noHeaderRDD.map(_.split(";").map(_.toDouble)).repartition(20)
//*dataRDD.partitions.size // check partition size
//*dataRDD.groupBy("label").count.show // check the labels
val dataDF = dataRDD.map(x => (x(11), Vectors.dense(x.take(11)))).toDF("label", "features")
val Array(trainingData, testData) = dataDF.randomSplit(Array(0.7, 0.3), seed = 123)

// Build pipeline about decision tree regression
val featureIndexer = new VectorIndexer().
  setInputCol("features").
  setOutputCol("indexedFeatures").
  setMaxCategories(4). //we treat features with > 4 distinct values as continuous.
  fit(dataDF)

val dt = new DecisionTreeRegressor().
  setLabelCol("label").
  setFeaturesCol("indexedFeatures")

val pipeline = new Pipeline().
  setStages(Array(featureIndexer, dt))

// Cross validation
val paramGrid = new ParamGridBuilder().
  addGrid(dt.maxBins, Array(10, 15, 20)).
  addGrid(dt.maxDepth, Array(5, 7, 9)).
  build()

val cv = new CrossValidator().
  setEstimator(pipeline).
  setEvaluator(new RegressionEvaluator).
  setEstimatorParamMaps(paramGrid).
  setNumFolds(5)  // Use 3+ in practice

val cvModel = cv.fit(trainingData)

// Evaluation
val evaluator = new RegressionEvaluator().
    setLabelCol("label").
    setPredictionCol("prediction").
    setMetricName("rmse")

val predictions = cvModel.transform(testData)
val rmse = evaluator.evaluate(predictions)
println("The RMSE on the test data is " + rmse)

// Check the model property
val bestDTM = cvModel.bestModel.asInstanceOf[PipelineModel].stages(1).asInstanceOf[DecisionTreeRegressionModel]
//*bestDTM.getImpurity
//*bestDTM.getMaxBins
//*bestDTM.getMaxDepth

The RMSE on the test data is 0.7285336650923595


rawRDD: org.apache.spark.rdd.RDD[String] = files/winequality-white.csv MapPartitionsRDD[4899] at textFile at <console>:53
noHeaderRDD: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[4900] at mapPartitions at <console>:54
dataRDD: org.apache.spark.rdd.RDD[Array[Double]] = MapPartitionsRDD[4905] at repartition at <console>:55
dataDF: org.apache.spark.sql.DataFrame = [label: double, features: vector]
trainingData: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: double, features: vector]
testData: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: double, features: vector]
featureIndexer: org.apache.spark.ml.feature.VectorIndexerModel = vecIdx_4efdb9ef68aa
dt: org.apache.spark.ml.regression.DecisionTreeRegressor = dtr_73cad0b094ca
pipeline: org.apache.spa...

## Regression Decision Tree 2

In [9]:
// Read Data
val rawRDD = sparkSession.sparkContext.textFile("files/spambase.data", 20)
val dataDF = rawRDD.map(line => line.split(',').map(_.toDouble)).
                    map(x => (x(57), org.apache.spark.ml.linalg.Vectors.dense(x.take(57)))).
                    toDF("label", "features")

val Array(trainingData, testData) = dataDF.randomSplit(Array(0.7, 0.3), seed = 123)

// Build pipeline about decision tree classification
val featureIndexer = new VectorIndexer().
  setInputCol("features").
  setOutputCol("indexedFeatures").
  setMaxCategories(10). // features with > 10 distinct values are treated as continuous.
  fit(dataDF)

val dt = new DecisionTreeRegressor().
  setLabelCol("label").
  setFeaturesCol("indexedFeatures").
  setImpurity("variance").
  setMaxBins(32).
  setMaxDepth(5)


val pipeline = new Pipeline().
  setStages(Array(featureIndexer, dt))

val dtModel = pipeline.fit(trainingData)
val predictions = dtModel.transform(testData)

// Evaluate model on test instances and compute test error
val testMSE = predictions.select("label", "prediction").
    map{ case Row(v: Double, p: Double) => math.pow(v - p, 2) }.
    rdd.mean()
println("Test Mean Squared Error = " + testMSE)
println("Learned regression tree model:\n" + dtModel.stages(1).asInstanceOf[DecisionTreeRegressionModel].toDebugString)

Test Mean Squared Error = 0.07879840636625587
Learned regression tree model:
DecisionTreeRegressionModel (uid=dtr_b605f8eb0a48) of depth 5 with 51 nodes
  If (feature 51 <= 0.059)
   If (feature 6 <= 0.0)
    If (feature 23 <= 0.0)
     If (feature 15 <= 0.19)
      If (feature 52 <= 0.184)
       Predict: 0.049800796812749
      Else (feature 52 > 0.184)
       Predict: 0.5
     Else (feature 15 > 0.19)
      If (feature 4 <= 1.07)
       Predict: 0.22784810126582278
      Else (feature 4 > 1.07)
       Predict: 0.8181818181818182
    Else (feature 23 > 0.0)
     If (feature 52 <= 0.052)
      If (feature 54 <= 2.63)
       Predict: 0.2
      Else (feature 54 > 2.63)
       Predict: 0.8571428571428571
     Else (feature 52 > 0.052)
      If (feature 9 <= 0.21)
       Predict: 0.9411764705882353
      Else (feature 9 > 0.21)
       Predict: 0.25
   Else (feature 6 > 0.0)
    If (feature 26 <= 0.0)
     If (feature 45 <= 0.0)
      If (feature 49 <= 0.472)
       Predict: 0.938775510204

rawRDD: org.apache.spark.rdd.RDD[String] = files/spambase.data MapPartitionsRDD[46] at textFile at <console>:55
dataDF: org.apache.spark.sql.DataFrame = [label: double, features: vector]
trainingData: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: double, features: vector]
testData: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: double, features: vector]
featureIndexer: org.apache.spark.ml.feature.VectorIndexerModel = vecIdx_89d043b16003
dt: org.apache.spark.ml.regression.DecisionTreeRegressor = dtr_b605f8eb0a48
pipeline: org.apache.spark.ml.Pipeline = pipeline_a8294a84635b
dtModel: org.apache.spark.ml.PipelineModel = pipeline_a8294a84635b
predictions: org.apache.spark.sql.DataFrame = [label: double, features: vector ... 2 more fields]
testMSE: Double...