In [1]:
import org.apache.spark.ml.{PipelineModel, Pipeline}
import org.apache.spark.ml.classification.{DecisionTreeClassifier, RandomForestClassifier, RandomForestClassificationModel}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{VectorAssembler, VectorIndexer}
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._
import scala.util.Random

In [2]:
val dataWithoutHeader = spark.read.
    option("inferSchema",true).
    option("header",false).
    csv("covtype.data")

dataWithoutHeader = [_c0: int, _c1: int ... 53 more fields]


[_c0: int, _c1: int ... 53 more fields]

In [3]:
dataWithoutHeader.first

[2596,51,3,258,0,510,221,232,148,6279,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,5]

In [4]:
dataWithoutHeader.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- _c1: integer (nullable = true)
 |-- _c2: integer (nullable = true)
 |-- _c3: integer (nullable = true)
 |-- _c4: integer (nullable = true)
 |-- _c5: integer (nullable = true)
 |-- _c6: integer (nullable = true)
 |-- _c7: integer (nullable = true)
 |-- _c8: integer (nullable = true)
 |-- _c9: integer (nullable = true)
 |-- _c10: integer (nullable = true)
 |-- _c11: integer (nullable = true)
 |-- _c12: integer (nullable = true)
 |-- _c13: integer (nullable = true)
 |-- _c14: integer (nullable = true)
 |-- _c15: integer (nullable = true)
 |-- _c16: integer (nullable = true)
 |-- _c17: integer (nullable = true)
 |-- _c18: integer (nullable = true)
 |-- _c19: integer (nullable = true)
 |-- _c20: integer (nullable = true)
 |-- _c21: integer (nullable = true)
 |-- _c22: integer (nullable = true)
 |-- _c23: integer (nullable = true)
 |-- _c24: integer (nullable = true)
 |-- _c25: integer (nullable = true)
 |-- _c26: integer (nullable = true)
 |-- _

In [5]:
val colNames = Seq(
        "Elevation","Aspect","Slope","Horizontal_Distance_To_Hydrology","Vertical_Distance_To_Hydrology",
        "Horizontal_Distance_To_Roadways","Hillshae_9am","Hillshae_Noon","Hillshade_3pm",
        "Horizontal_Distance_To_Fire_Points"
    ) ++ (
     (0 until 4).map(i => s"Wilderness_Area_$i")
    ) ++ (
     (0 until 40).map(i => s"Soil_Type_$i")
    ) ++ Seq("Cover_Type")

colNames = List(Elevation, Aspect, Slope, Horizontal_Distance_To_Hydrology, Vertical_Distance_To_Hydrology, Horizontal_Distance_To_Roadways, Hillshae_9am, Hillshae_Noon, Hillshade_3pm, Horizontal_Distance_To_Fire_Points, Wilderness_Area_0, Wilderness_Area_1, Wilderness_Area_2, Wilderness_Area_3, Soil_Type_0, Soil_Type_1, Soil_Type_2, Soil_Type_3, Soil_Type_4, Soil_Type_5, Soil_Type_6, Soil_Type_7, Soil_Type_8, Soil_Type_9, Soil_Type_10, Soil_Type_11, Soil_Type_12, Soil_Type_13, Soil_Type_14, Soil_Type_15, Soil_Type_16, Soil_Type_17, Soil_Type_18, Soil_Type_19, Soil_Type_20, Soil_Type_21, Soil_Type_22, Soil_Type_23, Soil_Type_24, Soil_Type_25, Soil_Type_26, Soil_Type_27, Soil_Type_28, Soil_Type_29, Soil_Type_30, Soil_Type_31, Soil_Type_32, Soil_Type_33, Soil_Type_34, Soil_Ty...


List(Elevation, Aspect, Slope, Horizontal_Distance_To_Hydrology, Vertical_Distance_To_Hydrology, Horizontal_Distance_To_Roadways, Hillshae_9am, Hillshae_Noon, Hillshade_3pm, Horizontal_Distance_To_Fire_Points, Wilderness_Area_0, Wilderness_Area_1, Wilderness_Area_2, Wilderness_Area_3, Soil_Type_0, Soil_Type_1, Soil_Type_2, Soil_Type_3, Soil_Type_4, Soil_Type_5, Soil_Type_6, Soil_Type_7, Soil_Type_8, Soil_Type_9, Soil_Type_10, Soil_Type_11, Soil_Type_12, Soil_Type_13, Soil_Type_14, Soil_Type_15, Soil_Type_16, Soil_Type_17, Soil_Type_18, Soil_Type_19, Soil_Type_20, Soil_Type_21, Soil_Type_22, Soil_Type_23, Soil_Type_24, Soil_Type_25, Soil_Type_26, Soil_Type_27, Soil_Type_28, Soil_Type_29, Soil_Type_30, Soil_Type_31, Soil_Type_32, Soil_Type_33, Soil_Type_34, Soil_Type_35, Soil_Type_36, Soil_Type_37, Soil_Type_38, Soil_Type_39, Cover_Type)

In [6]:
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._

spark = org.apache.spark.sql.SparkSession@550c26b5


In [7]:
val data = dataWithoutHeader.toDF(colNames:_*).
    withColumn("Cover_Type", $"Cover_Type".cast("double"))

data = [Elevation: int, Aspect: int ... 53 more fields]


[Elevation: int, Aspect: int ... 53 more fields]

In [8]:
data.printSchema

root
 |-- Elevation: integer (nullable = true)
 |-- Aspect: integer (nullable = true)
 |-- Slope: integer (nullable = true)
 |-- Horizontal_Distance_To_Hydrology: integer (nullable = true)
 |-- Vertical_Distance_To_Hydrology: integer (nullable = true)
 |-- Horizontal_Distance_To_Roadways: integer (nullable = true)
 |-- Hillshae_9am: integer (nullable = true)
 |-- Hillshae_Noon: integer (nullable = true)
 |-- Hillshade_3pm: integer (nullable = true)
 |-- Horizontal_Distance_To_Fire_Points: integer (nullable = true)
 |-- Wilderness_Area_0: integer (nullable = true)
 |-- Wilderness_Area_1: integer (nullable = true)
 |-- Wilderness_Area_2: integer (nullable = true)
 |-- Wilderness_Area_3: integer (nullable = true)
 |-- Soil_Type_0: integer (nullable = true)
 |-- Soil_Type_1: integer (nullable = true)
 |-- Soil_Type_2: integer (nullable = true)
 |-- Soil_Type_3: integer (nullable = true)
 |-- Soil_Type_4: integer (nullable = true)
 |-- Soil_Type_5: integer (nullable = true)
 |-- Soil_Type_6

In [9]:
val Array(trainData, testData) = data.randomSplit(Array(0.9,0.1))
trainData.cache()
testData.cache()

trainData = [Elevation: int, Aspect: int ... 53 more fields]
testData = [Elevation: int, Aspect: int ... 53 more fields]


[Elevation: int, Aspect: int ... 53 more fields]

In [10]:
val inputCols = trainData.columns.filter(_!="Cover_Type")
var assembler = new VectorAssembler().
    setInputCols(inputCols).
    setOutputCol("featureVector")

inputCols = Array(Elevation, Aspect, Slope, Horizontal_Distance_To_Hydrology, Vertical_Distance_To_Hydrology, Horizontal_Distance_To_Roadways, Hillshae_9am, Hillshae_Noon, Hillshade_3pm, Horizontal_Distance_To_Fire_Points, Wilderness_Area_0, Wilderness_Area_1, Wilderness_Area_2, Wilderness_Area_3, Soil_Type_0, Soil_Type_1, Soil_Type_2, Soil_Type_3, Soil_Type_4, Soil_Type_5, Soil_Type_6, Soil_Type_7, Soil_Type_8, Soil_Type_9, Soil_Type_10, Soil_Type_11, Soil_Type_12, Soil_Type_13, Soil_Type_14, Soil_Type_15, Soil_Type_16, Soil_Type_17, Soil_Type_18, Soil_Type_19, Soil_Type_20, Soil_Type_21, Soil_Type_22, Soil_Type_23, Soil_Type_24, Soil_Type_25, Soil_Type_26, Soil_Type_27, Soil_Type_28, Soil_Type_29, Soil_Type_30, Soil_Type_31, Soil_Type_32, Soil_Type_33, Soil_Type_34, Soi...


[Elevation, Aspect, Slope, Horizontal_Distance_To_Hydrology, Vertical_Distance_To_Hydrology, Horizontal_Distance_To_Roadways, Hillshae_9am, Hillshae_Noon, Hillshade_3pm, Horizontal_Distance_To_Fire_Points, Wilderness_Area_0, Wilderness_Area_1, Wilderness_Area_2, Wilderness_Area_3, Soil_Type_0, Soil_Type_1, Soil_Type_2, Soil_Type_3, Soil_Type_4, Soil_Type_5, Soil_Type_6, Soil_Type_7, Soil_Type_8, Soil_Type_9, Soil_Type_10, Soil_Type_11, Soil_Type_12, Soil_Type_13, Soil_Type_14, Soil_Type_15, Soil_Type_16, Soil_Type_17, Soil_Type_18, Soil_Type_19, Soil_Type_20, Soil_Type_21, Soil_Type_22, Soil_Type_23, Soil_Type_24, Soil_Type_25, Soil_Type_26, Soil_Type_27, Soil_Type_28, Soil_Type_29, Soil_Type_30, Soil_Type_31, Soil_Type_32, Soil_Type_33, Soil_Type_34, Soil_Type_35, Soil_Type_36, Soil_Type_37, Soil_Type_38, Soil_Type_39]

In [11]:
var assembledTrainData = assembler.transform(trainData)
assembledTrainData.select("featureVector").show(truncate=false)

+----------------------------------------------------------------------------------------------------+
|featureVector                                                                                       |
+----------------------------------------------------------------------------------------------------+
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1859.0,18.0,12.0,67.0,11.0,90.0,211.0,215.0,139.0,792.0,1.0,1.0])  |
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1860.0,18.0,13.0,95.0,15.0,90.0,210.0,213.0,138.0,780.0,1.0,1.0])  |
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1861.0,35.0,14.0,60.0,11.0,85.0,218.0,209.0,124.0,832.0,1.0,1.0])  |
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1863.0,37.0,17.0,120.0,18.0,90.0,217.0,202.0,115.0,769.0,1.0,1.0]) |
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1866.0,23.0,14.0,85.0,16.0,108.0,212.0,210.0,133.0,819.0,1.0,1.0]) |
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1868.0,27.0,16.0,67.0,17.0,95.0,212.0,204.0,125.0,859.0,1.0,1.0])  |
|(54,[0,1,2,3,4,5,6,7,8,9,13,18],[1871.0,22.0,22.0,60.0,12.0,85.0,200.0,1

assembledTrainData = [Elevation: int, Aspect: int ... 54 more fields]


[Elevation: int, Aspect: int ... 54 more fields]

In [12]:
var classifier = new DecisionTreeClassifier().
    setSeed(Random.nextLong()).
    setLabelCol("Cover_Type").
    setFeaturesCol("featureVector").
    setPredictionCol("prediction")

classifier = dtc_afeeb7605f2b


dtc_afeeb7605f2b

In [13]:
var model = classifier.fit(assembledTrainData)
println(model.toDebugString)

DecisionTreeClassificationModel (uid=dtc_afeeb7605f2b) of depth 5 with 63 nodes
  If (feature 0 <= 3049.5)
   If (feature 0 <= 2555.5)
    If (feature 10 <= 0.5)
     If (feature 0 <= 2449.5)
      If (feature 3 <= 15.0)
       Predict: 4.0
      Else (feature 3 > 15.0)
       Predict: 3.0
     Else (feature 0 > 2449.5)
      If (feature 17 <= 0.5)
       Predict: 2.0
      Else (feature 17 > 0.5)
       Predict: 3.0
    Else (feature 10 > 0.5)
     If (feature 9 <= 5463.5)
      If (feature 22 <= 0.5)
       Predict: 2.0
      Else (feature 22 > 0.5)
       Predict: 2.0
     Else (feature 9 > 5463.5)
      If (feature 5 <= 572.0)
       Predict: 2.0
      Else (feature 5 > 572.0)
       Predict: 5.0
   Else (feature 0 > 2555.5)
    If (feature 0 <= 2956.5)
     If (feature 15 <= 0.5)
      If (feature 17 <= 0.5)
       Predict: 2.0
      Else (feature 17 > 0.5)
       Predict: 3.0
     Else (feature 15 > 0.5)
      If (feature 9 <= 1427.5)
       Predict: 3.0
      Else (feature 9 > 1

model = DecisionTreeClassificationModel (uid=dtc_afeeb7605f2b) of depth 5 with 63 nodes


DecisionTreeClassificationModel (uid=dtc_afeeb7605f2b) of depth 5 with 63 nodes

In [14]:
model.featureImportances.toArray.zip(inputCols).
    sorted.reverse.foreach(println)

(0.7869596411791475,Elevation)
(0.039920067119428145,Horizontal_Distance_To_Hydrology)
(0.031113994544377067,Hillshae_Noon)
(0.030161930010014843,Soil_Type_3)
(0.02726093146245442,Wilderness_Area_0)
(0.024044208405017763,Soil_Type_1)
(0.02319146702516172,Soil_Type_31)
(0.012532249953920679,Horizontal_Distance_To_Roadways)
(0.011140655100118852,Wilderness_Area_2)
(0.005980226714491093,Soil_Type_22)
(0.0028787848256090856,Wilderness_Area_1)
(0.002552828953956752,Hillshae_9am)
(0.001974745062831754,Horizontal_Distance_To_Fire_Points)
(2.882696434702999E-4,Soil_Type_8)
(0.0,Wilderness_Area_3)
(0.0,Vertical_Distance_To_Hydrology)
(0.0,Soil_Type_9)
(0.0,Soil_Type_7)
(0.0,Soil_Type_6)
(0.0,Soil_Type_5)
(0.0,Soil_Type_4)
(0.0,Soil_Type_39)
(0.0,Soil_Type_38)
(0.0,Soil_Type_37)
(0.0,Soil_Type_36)
(0.0,Soil_Type_35)
(0.0,Soil_Type_34)
(0.0,Soil_Type_33)
(0.0,Soil_Type_32)
(0.0,Soil_Type_30)
(0.0,Soil_Type_29)
(0.0,Soil_Type_28)
(0.0,Soil_Type_27)
(0.0,Soil_Type_26)
(0.0,Soil_Type_25)
(0.0,Soil_T

In [15]:
var predictions = model.transform(assembledTrainData)
predictions.select("Cover_Type","prediction","probability").show(truncate= false)

+----------+----------+------------------------------------------------------------------------------------------------+
|Cover_Type|prediction|probability                                                                                     |
+----------+----------+------------------------------------------------------------------------------------------------+
|3.0       |3.0       |[0.0,0.0,0.033393439212150634,0.631735904015535,0.05107843817185658,0.0,0.2837922186004577,0.0] |
|3.0       |3.0       |[0.0,0.0,0.033393439212150634,0.631735904015535,0.05107843817185658,0.0,0.2837922186004577,0.0] |
|3.0       |3.0       |[0.0,0.0,0.033393439212150634,0.631735904015535,0.05107843817185658,0.0,0.2837922186004577,0.0] |
|6.0       |3.0       |[0.0,0.0,0.033393439212150634,0.631735904015535,0.05107843817185658,0.0,0.2837922186004577,0.0] |
|3.0       |3.0       |[0.0,0.0,0.033393439212150634,0.631735904015535,0.05107843817185658,0.0,0.2837922186004577,0.0] |
|3.0       |3.0       |[0.0,0.0,

predictions = [Elevation: int, Aspect: int ... 57 more fields]


[Elevation: int, Aspect: int ... 57 more fields]

In [16]:
var evaluator = new MulticlassClassificationEvaluator().
    setLabelCol("Cover_Type").
    setPredictionCol("prediction")

evaluator = mcEval_8ee974a3672d


mcEval_8ee974a3672d

In [17]:
var accuracy = evaluator.setMetricName("accuracy").evaluate(predictions)

accuracy = 0.7027188366975988


0.7027188366975988

In [18]:
val predictionRDD = predictions.
    select("prediction","Cover_Type").
    as[(Double,Double)].rdd
val multiclassMetrics = new MulticlassMetrics(predictionRDD)
println(multiclassMetrics.confusionMatrix)

126669.0  58699.0   161.0    0.0    0.0   0.0  5042.0   
45980.0   203635.0  4329.0   95.0   32.0  0.0  710.0    
0.0       5744.0    25762.0  654.0  0.0   0.0  0.0      
0.0       20.0      1473.0   991.0  0.0   0.0  0.0      
3.0       7747.0    758.0    0.0    74.0  0.0  0.0      
0.0       6201.0    8916.0   535.0  0.0   0.0  0.0      
8019.0    200.0     56.0     0.0    0.0   0.0  10145.0  


predictionRDD = MapPartitionsRDD[82] at rdd at <console>:57
multiclassMetrics = org.apache.spark.mllib.evaluation.MulticlassMetrics@53d74aad


org.apache.spark.mllib.evaluation.MulticlassMetrics@53d74aad

In [19]:
val inputCols = trainData.columns.filter(_!="Cover_Type")
var assembler = new VectorAssembler().
    setInputCols(inputCols).
    setOutputCol("featureVector")
var classifier = new DecisionTreeClassifier().
    setSeed(Random.nextLong()).
    setLabelCol("Cover_Type").
    setFeaturesCol("featureVector").
    setPredictionCol("prediction")
var pipeline = new Pipeline().setStages(Array(assembler, classifier))

inputCols = Array(Elevation, Aspect, Slope, Horizontal_Distance_To_Hydrology, Vertical_Distance_To_Hydrology, Horizontal_Distance_To_Roadways, Hillshae_9am, Hillshae_Noon, Hillshade_3pm, Horizontal_Distance_To_Fire_Points, Wilderness_Area_0, Wilderness_Area_1, Wilderness_Area_2, Wilderness_Area_3, Soil_Type_0, Soil_Type_1, Soil_Type_2, Soil_Type_3, Soil_Type_4, Soil_Type_5, Soil_Type_6, Soil_Type_7, Soil_Type_8, Soil_Type_9, Soil_Type_10, Soil_Type_11, Soil_Type_12, Soil_Type_13, Soil_Type_14, Soil_Type_15, Soil_Type_16, Soil_Type_17, Soil_Type_18, Soil_Type_19, Soil_Type_20, Soil_Type_21, Soil_Type_22, Soil_Type_23, Soil_Type_24, Soil_Type_25, Soil_Type_26, Soil_Type_27, Soil_Type_28, Soil_Type_29, Soil_Type_30, Soil_Type_31, Soil_Type_32, Soil_Type_33, Soil_Type_34, Soi...


[Elevation, Aspect, Slope, Horizontal_Distance_To_Hydrology, Vertical_Distance_To_Hydrology, Horizontal_Distance_To_Roadways, Hillshae_9am, Hillshae_Noon, Hillshade_3pm, Horizontal_Distance_To_Fire_Points, Wilderness_Area_0, Wilderness_Area_1, Wilderness_Area_2, Wilderness_Area_3, Soil_Type_0, Soil_Type_1, Soil_Type_2, Soil_Type_3, Soil_Type_4, Soil_Type_5, Soil_Type_6, Soil_Type_7, Soil_Type_8, Soil_Type_9, Soil_Type_10, Soil_Type_11, Soil_Type_12, Soil_Type_13, Soil_Type_14, Soil_Type_15, Soil_Type_16, Soil_Type_17, Soil_Type_18, Soil_Type_19, Soil_Type_20, Soil_Type_21, Soil_Type_22, Soil_Type_23, Soil_Type_24, Soil_Type_25, Soil_Type_26, Soil_Type_27, Soil_Type_28, Soil_Type_29, Soil_Type_30, Soil_Type_31, Soil_Type_32, Soil_Type_33, Soil_Type_34, Soil_Type_35, Soil_Type_36, Soil_Type_37, Soil_Type_38, Soil_Type_39]

In [20]:
var paramGrid  = new ParamGridBuilder().
    addGrid(classifier.impurity, Seq("entropy")).
    addGrid(classifier.maxDepth, Seq(1,20)).
    addGrid(classifier.maxBins, Seq(40,300)).
    addGrid(classifier.minInfoGain, Seq(0.0,0.05)).
    build()

var multiclassEval = new MulticlassClassificationEvaluator().
    setLabelCol("Cover_Type").
    setPredictionCol("prediction").
    setMetricName("accuracy")

paramGrid = 


Array({
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 40,
	dtc_65a78540e07a-maxDepth: 1,
	dtc_65a78540e07a-minInfoGain: 0.0
}, {
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 300,
	dtc_65a78540e07a-maxDepth: 1,
	dtc_65a78540e07a-minInfoGain: 0.0
}, {
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 40,
	dtc_65a78540e07a-maxDepth: 20,
	dtc_65a78540e07a-minInfoGain: 0.0
}, {
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 300,
	dtc_65a78540e07a-maxDepth: 20,
	dtc_65a78540e07a-minInfoGain: 0.0
}, {
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 40,
	dtc_65a78540e07a-maxDepth: 1,
	dtc_65a78540e07a-minInfoGain: 0.05
}, {
	dtc_65a78540e07a-impurity: entrop...


[{
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 40,
	dtc_65a78540e07a-maxDepth: 1,
	dtc_65a78540e07a-minInfoGain: 0.0
}, {
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 300,
	dtc_65a78540e07a-maxDepth: 1,
	dtc_65a78540e07a-minInfoGain: 0.0
}, {
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 40,
	dtc_65a78540e07a-maxDepth: 20,
	dtc_65a78540e07a-minInfoGain: 0.0
}, {
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 300,
	dtc_65a78540e07a-maxDepth: 20,
	dtc_65a78540e07a-minInfoGain: 0.0
}, {
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 40,
	dtc_65a78540e07a-maxDepth: 1,
	dtc_65a78540e07a-minInfoGain: 0.05
}, {
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 300,
	dtc_65a78540e07a-maxDepth: 1,
	dtc_65a78540e07a-minInfoGain: 0.05
}, {
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 40,
	dtc_65a78540e07a-maxDepth: 20,
	dtc_65a78540e07a-minInfoGain: 0.05
}, {
	dtc_65a78540e0

In [21]:
var validator = new TrainValidationSplit().
    setSeed(Random.nextLong()).
    setEstimator(pipeline).
    setEvaluator(multiclassEval).
    setEstimatorParamMaps(paramGrid).
    setTrainRatio(0.9)

validator = tvs_e0dcf17a6c41


tvs_e0dcf17a6c41

In [22]:
var validatorModel = validator.fit(trainData)

validatorModel = tvs_e0dcf17a6c41


tvs_e0dcf17a6c41

In [23]:
val bestModel = validatorModel.bestModel
bestModel.asInstanceOf[PipelineModel].stages.last.extractParamMap

bestModel = pipeline_7a06ee253309


{
	dtc_65a78540e07a-cacheNodeIds: false,
	dtc_65a78540e07a-checkpointInterval: 10,
	dtc_65a78540e07a-featuresCol: featureVector,
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-labelCol: Cover_Type,
	dtc_65a78540e07a-maxBins: 300,
	dtc_65a78540e07a-maxDepth: 20,
	dtc_65a78540e07a-maxMemoryInMB: 256,
	dtc_65a78540e07a-minInfoGain: 0.0,
	dtc_65a78540e07a-minInstancesPerNode: 1,
	dtc_65a78540e07a-predictionCol: prediction,
	dtc_65a78540e07a-probabilityCol: probability,
	dtc_65a78540e07a-rawPredictionCol: rawPrediction,
	dtc_65a78540e07a-seed: 5722693384343268808
}

In [24]:
val paramsAndMetrics = validatorModel.validationMetrics.
zip(validatorModel.getEstimatorParamMaps).sortBy(-_._1)
paramsAndMetrics.foreach{case(metric,params)=> 
println(metric)
println(params)
println()
}

0.9140595119712385
{
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 300,
	dtc_65a78540e07a-maxDepth: 20,
	dtc_65a78540e07a-minInfoGain: 0.0
}

0.9118794461867972
{
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 40,
	dtc_65a78540e07a-maxDepth: 20,
	dtc_65a78540e07a-minInfoGain: 0.0
}

0.7247380096381856
{
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 300,
	dtc_65a78540e07a-maxDepth: 20,
	dtc_65a78540e07a-minInfoGain: 0.05
}

0.7234184961370764
{
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 40,
	dtc_65a78540e07a-maxDepth: 20,
	dtc_65a78540e07a-minInfoGain: 0.05
}

0.49221678268186336
{
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 40,
	dtc_65a78540e07a-maxDepth: 1,
	dtc_65a78540e07a-minInfoGain: 0.0
}

0.49221678268186336
{
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 300,
	dtc_65a78540e07a-maxDepth: 1,
	dtc_65a78540e07a-minInfoGain: 0.0
}

0.49221678268186336
{
	dtc_65a78540e07a-i

paramsAndMetrics = 


Array((0.9140595119712385,{
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 300,
	dtc_65a78540e07a-maxDepth: 20,
	dtc_65a78540e07a-minInfoGain: 0.0
}), (0.9118794461867972,{
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 40,
	dtc_65a78540e07a-maxDepth: 20,
	dtc_65a78540e07a-minInfoGain: 0.0
}), (0.7247380096381856,{
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 300,
	dtc_65a78540e07a-maxDepth: 20,
	dtc_65a78540e07a-minInfoGain: 0.05
}), (0.7234184961370764,{
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 40,
	dtc_65a78540e07a-maxDepth: 20,
	dtc_65a78540e07a-minInfoGain: 0.05
}), (0.49221678268186336,{
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a785...


[(0.9140595119712385,{
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 300,
	dtc_65a78540e07a-maxDepth: 20,
	dtc_65a78540e07a-minInfoGain: 0.0
}), (0.9118794461867972,{
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 40,
	dtc_65a78540e07a-maxDepth: 20,
	dtc_65a78540e07a-minInfoGain: 0.0
}), (0.7247380096381856,{
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 300,
	dtc_65a78540e07a-maxDepth: 20,
	dtc_65a78540e07a-minInfoGain: 0.05
}), (0.7234184961370764,{
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 40,
	dtc_65a78540e07a-maxDepth: 20,
	dtc_65a78540e07a-minInfoGain: 0.05
}), (0.49221678268186336,{
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 40,
	dtc_65a78540e07a-maxDepth: 1,
	dtc_65a78540e07a-minInfoGain: 0.0
}), (0.49221678268186336,{
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 300,
	dtc_65a78540e07a-maxDepth: 1,
	dtc_65a78540e07a-minInfoGain: 0.0
}), (0.49221678268186336,{
	dtc_

In [25]:
var paramGrid  = new ParamGridBuilder().
    addGrid(classifier.impurity, Seq("entropy")).
    addGrid(classifier.maxDepth, Seq(20)).
    addGrid(classifier.maxBins, Seq(300)).
    addGrid(classifier.minInfoGain, Seq(0.0)).
    build()

paramGrid = 


Array({
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 300,
	dtc_65a78540e07a-maxDepth: 20,
	dtc_65a78540e07a-minInfoGain: 0.0
})


[{
	dtc_65a78540e07a-impurity: entropy,
	dtc_65a78540e07a-maxBins: 300,
	dtc_65a78540e07a-maxDepth: 20,
	dtc_65a78540e07a-minInfoGain: 0.0
}]

In [26]:
var validator = new TrainValidationSplit().
    setSeed(Random.nextLong()).
    setEstimator(pipeline).
    setEvaluator(multiclassEval).
    setEstimatorParamMaps(paramGrid)

validator = tvs_e5cc88cca230


tvs_e5cc88cca230

In [28]:
var goodModel = validator.fit(trainData)

goodModel = tvs_e5cc88cca230


lastException: Throwable = null


tvs_e5cc88cca230

In [29]:
multiclassEval.evaluate(goodModel.transform(testData))

0.9143963537918509