In [37]:
/*
Step #1 :  Import Required Libraries
*/
import org.joda.time.{DateTime, DateTimeZone}
println("Import libraries started by " + DateTime.now(DateTimeZone.UTC))
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.sql.SparkSession
println("Import libraries ended by " + DateTime.now(DateTimeZone.UTC))


Import libraries started by 2021-05-12T17:49:15.312Z
Import libraries ended by 2021-05-12T17:49:15.312Z


import org.joda.time.{DateTime, DateTimeZone}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.sql.SparkSession


In [38]:
/*
Step #2 :  Load Training Data
*/
println("Data load Started by: " + DateTime.now(DateTimeZone.UTC))
val vAR_SalesData_csv_rdd = sc.textFile("gs://dssparkbucket/DS.AI_SalesData_RDD.csv").map { line =>
      val p = line.split(',')
      LabeledPoint(p(6).toDouble, Vectors.dense(p(0).toDouble,p(1).toDouble,p(2).toDouble,p(3).toDouble,p(4).toDouble,p(5).toDouble,p(7).toDouble,p(8).toDouble,p(9).toDouble,p(10).toDouble,p(11).toDouble,p(12).toDouble))
    }.cache()
println("Data load Ended by: " + DateTime.now(DateTimeZone.UTC))

println("Partition applied default by system = " + vAR_SalesData_csv_rdd.getNumPartitions)


Data load Started by: 2021-05-12T17:49:15.738Z
Data load Ended by: 2021-05-12T17:49:15.792Z
Partition applied default by system = 2


vAR_SalesData_csv_rdd: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[204] at map at <console>:46


In [39]:
/* 
Step #3 :Training and testing subsets by spliting using randomSplit method at 7:3 ratio. seed makes sure the subsets are mutualy exclusive 
*/
val Array(vAR_training, vAR_test) = vAR_txt_RDD.randomSplit(Array(0.7, 0.3), seed = 12345)



vAR_training: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[205] at randomSplit at <console>:46
vAR_test: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[206] at randomSplit at <console>:46


In [40]:
/*
Step #4 : Setup parameters
*/
println("Parameter Update started by = " + DateTime.now(DateTimeZone.UTC))
val vAR_numClasses = 3
val vAR_categoricalFeaturesInfo = Map[Int, Int]()
val vAR_numTrees = 15 // number of trees to run in parallel
val vAR_featureSubsetStrategy = "auto"
val vAR_impurity = "variance"
val vAR_maxDepth = 3
val vAR_maxBins = 32

println("Parameter Update ended by = " + DateTime.now(DateTimeZone.UTC))


Parameter Update started by = 2021-05-12T17:49:16.638Z
Parameter Update ended by = 2021-05-12T17:49:16.647Z


vAR_numClasses: Int = 3
vAR_categoricalFeaturesInfo: scala.collection.immutable.Map[Int,Int] = Map()
vAR_numTrees: Int = 15
vAR_featureSubsetStrategy: String = auto
vAR_impurity: String = variance
vAR_maxDepth: Int = 3
vAR_maxBins: Int = 32


In [41]:
/*
  Step #5 :  Setup Pipeline, Train , Test , Validate and Predict
*/
println("Train, Test , Validate and Prediction of RandomForest Algorithm started by = " + DateTime.now(DateTimeZone.UTC))

val vAR_model = RandomForest.trainRegressor(vAR_training, vAR_categoricalFeaturesInfo,
  vAR_numTrees, vAR_featureSubsetStrategy, vAR_impurity, vAR_maxDepth, vAR_maxBins)
val vAR_labelAndPreds = vAR_test.map { point =>
    val vAR_prediction = vAR_model.predict(point.features)
      (point.label, vAR_prediction)
    }
val vAR_testMSE = vAR_labelAndPreds.map{ case(v, p) => math.pow((v - p), 2)}.mean()
println(s"Test Mean Squared Error = $vAR_testMSE")
val vAR_testErr = vAR_labelAndPreds.filter(r => r._1 != r._2).count.toDouble / vAR_test.count()
println(s"Test Error = $vAR_testErr")

println("Train, Test , Validate and Prediction of RandomForest Algorithm ended by = " + DateTime.now(DateTimeZone.UTC))


Train, Test , Validate and Prediction of RandomForest Algorithm started by = 2021-05-12T17:49:16.990Z
Test Mean Squared Error = 141.3716633454979
Test Error = 1.0
Train, Test , Validate and Prediction of RandomForest Algorithm ended by = 2021-05-12T17:49:17.694Z


vAR_model: org.apache.spark.mllib.tree.model.RandomForestModel =
TreeEnsembleModel regressor with 15 trees

vAR_labelAndPreds: org.apache.spark.rdd.RDD[(Double, Double)] = MapPartitionsRDD[225] at map at <console>:59
vAR_testMSE: Double = 141.3716633454979
vAR_testErr: Double = 1.0


In [42]:
/*
Step #6 : Apply Repartition count
*/
println("Apply Repartition started by= " + DateTime.now(DateTimeZone.UTC))
//Apply Repartition to compare performance with default partition
val vAR_repart_count_set = 4
println("Repartition count set as = " + vAR_repart_count_set)
val vAR_SalesData_csv_rdd_part = vAR_SalesData_csv_rdd.repartition(4)
println("Apply Repartition Ended by= " + DateTime.now(DateTimeZone.UTC))
println("Repartition Applied count = " + vAR_SalesData_csv_rdd_part.partitions.size)

/*
Step #7 :  Select  Train and Test Data
*/
println("After Repartition Split the loaded data into training and test")
val Array(vAR_training_part, vAR_test_part) = vAR_SalesData_csv_rdd_part.randomSplit(Array(0.7, 0.3), seed = 12345)



Apply Repartition started by= 2021-05-12T17:49:18.142Z
Repartition count set as = 4
Apply Repartition Ended by= 2021-05-12T17:49:18.169Z
Repartition Applied count = 4
After Repartition Split the loaded data into training and test


vAR_repart_count_set: Int = 4
vAR_SalesData_csv_rdd_part: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[232] at repartition at <console>:52
vAR_training_part: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[233] at randomSplit at <console>:60
vAR_test_part: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[234] at randomSplit at <console>:60


In [43]:
/*
Step #8 : Setup parameters
*/
println("After Repartition parameter setup started by: " +DateTime.now(DateTimeZone.UTC))

val vAR_numClasses_part = 3
val vAR_categoricalFeaturesInfo_part = Map[Int, Int]()
val vAR_numTrees_part = 30 // number of trees to run in parallel after repartition
val vAR_featureSubsetStrategy_part = "auto"
val vAR_impurity_part = "variance"
val vAR_maxDepth_part = 4
val vAR_maxBins_part = 32

println("After Repartition parameter setup ended by: " +DateTime.now(DateTimeZone.UTC))

After Repartition parameter setup started by: 2021-05-12T17:49:18.432Z
After Repartition parameter setup ended by: 2021-05-12T17:49:18.433Z


vAR_numClasses_part: Int = 3
vAR_categoricalFeaturesInfo_part: scala.collection.immutable.Map[Int,Int] = Map()
vAR_numTrees_part: Int = 30
vAR_featureSubsetStrategy_part: String = auto
vAR_impurity_part: String = variance
vAR_maxDepth_part: Int = 4
vAR_maxBins_part: Int = 32


In [1]:
/*
Step #9 : Train Data  after repartition
*/
println("After Repartition Model Fit, Prediction  of RandomForest Algorithm  started by: " + DateTime.now(DateTimeZone.UTC))
val vAR_model_part = RandomForest.trainRegressor(vAR_training_part, vAR_categoricalFeaturesInfo_part,
  vAR_numTrees_part, vAR_featureSubsetStrategy_part, vAR_impurity_part, vAR_maxDepth_part, vAR_maxBins_part)

/*
Step #10 : Evaluate model on test instances and compute test error
*/
val vAR_labelAndPreds_part = vAR_test_part.map { point =>
    val prediction_part = vAR_model_part.predict(point.features)
    (point.label, prediction_part)
  }

val vAR_testMSE_part = vAR_labelAndPreds_part.map{ case(v, p) => math.pow((v - p), 2)}.mean()
println(s"After Repartition, Test Mean Squared Error = $vAR_testMSE_part")
val vAR_testErr_part = vAR_labelAndPreds_part.filter(r => r._1 != r._2).count.toDouble / vAR_test_part.count()
println(s"After Repartition, Test Error = $vAR_testErr_part")

println("After Repartition Model Fit, Prediction  of RandomForest Algorithm  ended by: " + DateTime.now(DateTimeZone.UTC))

Intitializing Scala interpreter ...

Spark Web UI available at http://dsclusterbq-m:8088/proxy/application_1620850277284_0002
SparkContext available as 'sc' (version = 3.1.1, master = yarn, app id = application_1620850277284_0002)
SparkSession available as 'spark'


<console>: 26: error: not found: value DateTime