In [1]:
/*
     Step #1 :  Import Libraries
*/
import org.joda.time.{DateTime, DateTimeZone}
println("Import libraries started by = " +DateTime.now(DateTimeZone.UTC))

import org.apache.spark._
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Row, SparkSession, SQLContext}
import spark.sqlContext.implicits._
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.tree.model.DecisionTreeModel
import org.apache.spark.mllib.linalg.{Vectors,Vector}

println("Import libraries ended by = " +DateTime.now(DateTimeZone.UTC))


Intitializing Scala interpreter ...

Spark Web UI available at http://dsclusterbq-m:8088/proxy/application_1620947548405_0001
SparkContext available as 'sc' (version = 3.1.1, master = yarn, app id = application_1620947548405_0001)
SparkSession available as 'spark'


Import libraries started by = 2021-05-13T23:17:33.664Z
Import libraries ended by = 2021-05-13T23:17:33.712Z


import org.joda.time.{DateTime, DateTimeZone}
import org.apache.spark._
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Row, SparkSession, SQLContext}
import spark.sqlContext.implicits._
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.tree.model.DecisionTreeModel
import org.apache.spark.mllib.linalg.{Vectors, Vector}


In [34]:
/*
Step #2 :  Load Training Data
*/

/* Reading csv data as RDD using .textFile method[takes the path and number of partitions as arguments]
   maping each row with split. LabeledPoint assembles the target vector and the features of a dataset. 
   Cache stores all the RDD in-memory. */
println("Data load started by: " + DateTime.now(DateTimeZone.UTC))
val vAR_SalesData_csv_rdd = sc.textFile("gs://dssparkbucket/DS.AI_SalesData_RDD.csv").map { line =>
      val p = line.split(',')
      LabeledPoint(p(6).toDouble, Vectors.dense(p(0).toDouble,p(1).toDouble,p(2).toDouble,p(3).toDouble,p(4).toDouble,p(5).toDouble,p(7).toDouble,p(8).toDouble,p(9).toDouble,p(10).toDouble,p(11).toDouble,p(12).toDouble))
    }.cache()

println("Data load ended by: " + DateTime.now(DateTimeZone.UTC))
println("Default partition applied by system =  = " + vAR_SalesData_csv_rdd.partitions.size)

Data load started by: 2021-05-12T20:50:06.085Z
Data load ended by: 2021-05-12T20:50:06.122Z
Default partition applied by system =  = 2


vAR_SalesData_csv_rdd: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[167] at map at <console>:105


In [35]:
/* 
Step #3 :Training and testing subsets by spliting using randomSplit method at 7:3 ratio. seed makes sure the subsets are mutualy exclusive 
*/
println("Split the loaded data into training and test")
val Array(vAR_training, vAR_test) = vAR_SalesData_csv_rdd.randomSplit(Array(0.7, 0.3), seed = 11L)


Split the loaded data into training and test


vAR_training: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[168] at randomSplit at <console>:102
vAR_test: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[169] at randomSplit at <console>:102


In [36]:
/*
  Step #4 :  Setup Parameters
*/
println("Parameter Update started by = " + DateTime.now(DateTimeZone.UTC))
val vAR_categoricalFeaturesInfo = Map[Int, Int]()
val vAR_impurity =  "variance"
val vAR_maxBins = 32
val vAR_numTrees = 15 // number of trees to run in parallel
val vAR_featureSubsetStrategy = "auto" // Let the algorithm choose.
println("Parameter Update Ended by = " + DateTime.now(DateTimeZone.UTC))



Parameter Update started by = 2021-05-12T20:50:07.042Z
Parameter Update Ended by = 2021-05-12T20:50:07.042Z


vAR_categoricalFeaturesInfo: scala.collection.immutable.Map[Int,Int] = Map()
vAR_impurity: String = variance
vAR_maxBins: Int = 32
vAR_numTrees: Int = 15
vAR_featureSubsetStrategy: String = auto


In [37]:
/*
  Step #5 :  Setup Pipeline, Train , Test , Validate and Predict
*/
println("Train, Test , Validate and Prediction of DecisionTree Algorithm started by = " + DateTime.now(DateTimeZone.UTC))

val vAR_model = DecisionTree.trainRegressor(vAR_training, vAR_categoricalFeaturesInfo, vAR_impurity,
  vAR_maxDepth, vAR_maxBins)
val vAR_labelsAndPredictions = vAR_test.map { point =>
  val vAR_prediction = vAR_model.predict(point.features)
  (point.label, vAR_prediction)
}
val vAR_testMSE = vAR_labelsAndPredictions.map{ case (v, p) => math.pow(v - p, 2) }.mean()
println(s"Test Mean Squared Error = $vAR_testMSE")

println("Train, Test , Validate and Prediction of DecisionTree Algorithm ended by = " + DateTime.now(DateTimeZone.UTC))


Train, Test , Validate and Prediction of RandomForest Algorithm started by = 2021-05-12T20:50:07.606Z
Test Mean Squared Error = 367.07838749999996
Train, Test , Validate and Prediction of RandomForest Algorithm ended by = 2021-05-12T20:50:08.523Z


vAR_model: org.apache.spark.mllib.tree.model.DecisionTreeModel = DecisionTreeModel regressor of depth 4 with 19 nodes
vAR_labelsAndPredictions: org.apache.spark.rdd.RDD[(Double, Double)] = MapPartitionsRDD[194] at map at <console>:111
vAR_testMSE: Double = 367.07838749999996


In [38]:
/*
Step #6 : Apply Repartition count
*/
println("Apply Repartition started by= " + DateTime.now(DateTimeZone.UTC))
//Apply Repartition to compare performance with default partition
val vAR_repart_count_set = 4
println("Repartition count set as = " + vAR_repart_count_set)
val vAR_SalesData_csv_rdd_part = vAR_SalesData_csv_rdd.repartition(4)
println("Apply Repartition Ended by= " + DateTime.now(DateTimeZone.UTC))
println("Repartition Applied count = " + vAR_SalesData_csv_rdd_part.partitions.size)



Apply Repartition started by= 2021-05-12T20:50:08.917Z
Repartition count set as = 4
Apply Repartition Ended by= 2021-05-12T20:50:08.925Z
Repartition Applied count = 4


vAR_repart_count_set: Int = 4
vAR_SalesData_csv_rdd_part: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[200] at repartition at <console>:105


In [39]:
/* 
Step #7 :Training and testing subsets by spliting using randomSplit method at 7:3 ratio. seed makes sure the subsets are mutualy exclusive 
*/
println("After Repartition Split the loaded data into training and test")
val Array(vAR_training_part,vAR_test_part) = vAR_SalesData_csv_rdd_part.randomSplit(Array(0.7, 0.3), seed = 11L)

After Repartition Split the loaded data into training and test


vAR_training_part: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[201] at randomSplit at <console>:102
vAR_test_part: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[202] at randomSplit at <console>:102


In [40]:
/* 
Step #8 : Setup parameters
*/
println("After Repartition parameter setup started by: " +DateTime.now(DateTimeZone.UTC))

val vAR_categoricalFeaturesInfo_part = Map[Int, Int]()
val vAR_numTrees_part = 30 // number of trees to run in parallel  after repartition
val vAR_featureSubsetStrategy_part = "auto" // Let the algorithm choose.
val vAR_impurity_part = "variance"
val vAR_maxBins_part = 32

println("After Repartition parameter setup ended by: " +DateTime.now(DateTimeZone.UTC))



After Repartition parameter setup started by: 2021-05-12T20:50:09.834Z
After Repartition parameter setup ended by: 2021-05-12T20:50:09.834Z


vAR_categoricalFeaturesInfo_part: scala.collection.immutable.Map[Int,Int] = Map()
vAR_numTrees_part: Int = 30
vAR_featureSubsetStrategy_part: String = auto
vAR_impurity_part: String = variance
vAR_maxBins_part: Int = 32


In [41]:
/*
Step #9 : Train Data  after repartition
*/
println("After Repartition Model Fit, Prediction  of DecisionTree Algorithm  started by: " + DateTime.now(DateTimeZone.UTC))
val vAR_model_part = DecisionTree.trainRegressor(vAR_training_part, vAR_categoricalFeaturesInfo_part, vAR_impurity_part,
  vAR_maxDepth_part, vAR_maxBins_part)

/*
Step #10 : Evaluate model on test instances and compute test error
*/
val vAR_labelsAndPredictions_part = vAR_test_part.map { point =>
  val vAR_prediction_part = vAR_model_part.predict(point.features)
  (point.label, vAR_prediction_part)
}

val vAR_testMSE_part = vAR_labelsAndPredictions_part.map{ case(v, p) => math.pow((v - p), 2)}.mean()
println(s"Test Mean Squared Error = $vAR_testMSE_part")
println(s"Learned regression forest model:\n ${vAR_model_part.toDebugString}")

println("After Repartition Model Fit, Prediction  of DecisionTree Algorithm  ended by: " + DateTime.now(DateTimeZone.UTC))

After Repartition Model Fit, Prediction  of RandomForest Algorithm  started by: 2021-05-12T20:50:10.546Z
Test Mean Squared Error = 184.345905
Learned regression forest model:
 DecisionTreeModel regressor of depth 4 with 11 nodes
  If (feature 4 <= 14366.5)
   If (feature 1 <= 875.0)
    Predict: 13.639999999999999
   Else (feature 1 > 875.0)
    If (feature 8 <= 5.5)
     If (feature 3 <= 6.470000000000001)
      Predict: 3.64
     Else (feature 3 > 6.470000000000001)
      Predict: 10.055
    Else (feature 8 > 5.5)
     If (feature 3 <= 6.470000000000001)
      Predict: 13.639999999999999
     Else (feature 3 > 6.470000000000001)
      Predict: 6.470000000000001
  Else (feature 4 > 14366.5)
   Predict: 18.295

After Repartition Model Fit, Prediction  of RandomForest Algorithm  ended by: 2021-05-12T20:50:11.447Z


vAR_model_part: org.apache.spark.mllib.tree.model.DecisionTreeModel = DecisionTreeModel regressor of depth 4 with 11 nodes
vAR_labelsAndPredictions_part: org.apache.spark.rdd.RDD[(Double, Double)] = MapPartitionsRDD[224] at map at <console>:114
vAR_testMSE_part: Double = 184.345905
