In [1]:
/*
     Step #1 :  Import Libraries
*/
import org.joda.time.{DateTime, DateTimeZone}
println("Import libraries started by = " +DateTime.now(DateTimeZone.UTC))

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.{VectorAssembler, VectorIndexer}
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer}

println("Import libraries ended by = " +DateTime.now(DateTimeZone.UTC))

Intitializing Scala interpreter ...

Spark Web UI available at http://dsclusterbq-m:8088/proxy/application_1620786445671_0001
SparkContext available as 'sc' (version = 3.1.1, master = yarn, app id = application_1620786445671_0001)
SparkSession available as 'spark'


Import libraries started by = 2021-05-12T02:31:21.291Z
Import libraries ended by = 2021-05-12T02:31:21.347Z


import org.joda.time.{DateTime, DateTimeZone}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.{VectorAssembler, VectorIndexer}
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer}


In [2]:
/*
  Step #2 :  Load Training Data
*/
  println("Data load Started by: " + DateTime.now(DateTimeZone.UTC))
  val vAR_SalesData_csv_df = spark.read
    .option("header", true)
    .option("inferSchema", "true")
    .csv("gs://dssparkbucket/DS.AI_SalesData.csv")
  println("Data load  Ended by: " + DateTime.now(DateTimeZone.UTC))
  println("Default partition applied by system = " + vAR_SalesData_csv_df.rdd.getNumPartitions)

Data load Started by: 2021-05-12T02:31:22.006Z
Data load  Ended by: 2021-05-12T02:31:34.826Z
Default partitions applied by system = 1


vAR_SalesData_csv_df: org.apache.spark.sql.DataFrame = [TransactionID: int, ProductID: int ... 11 more fields]


In [3]:

/*
  Step #3 : transform data for training
*/

  println("Data transform Started by: " + DateTime.now(DateTimeZone.UTC))

 /*
   VectorAssembler is a transformer as it takes the input dataframe and returns the transformed dataframe
   with a new column which is vector representation of all the features.
   Select Features columns as "TransactionID", "Quantity", "ActualCost", "CustomerID", "TotalDue", "LineTotal", "FinishedGoodsFlag", "SalesReasonID", "AverageRate", "EndOfDayRate", "SalesLastYear", "ProductID"
 */
  val vAR_assembler = new VectorAssembler()
    .setInputCols(Array("TransactionID", "Quantity", "ActualCost", "CustomerID", "TotalDue", "LineTotal", "FinishedGoodsFlag", "SalesReasonID", "AverageRate", "EndOfDayRate", "SalesLastYear", "ProductID"))
    .setOutputCol("features")
  val vAR_transformed_output = vAR_assembler.transform(vAR_SalesData_csv_df)
   
  val vAR_labelIndexer = new StringIndexer()
  .setInputCol("MakeFlag")
  .setOutputCol("indexedLabel")
  .fit(vAR_transformed_output)

 // Set maxCategories so features with > 4 distinct values are treated as continuous.
 val vAR_featureIndexer = new VectorIndexer()
  .setInputCol("features")
  .setOutputCol("indexedFeatures")
  .setMaxCategories(4)
  .fit(vAR_transformed_output)

println("Data transform Ended by: " + DateTime.now(DateTimeZone.UTC))



Data transform Started by: 2021-05-12T02:31:35.946Z
Data transform Ended by: 2021-05-12T02:31:40.254Z


vAR_assembler: org.apache.spark.ml.feature.VectorAssembler = VectorAssembler: uid=vecAssembler_b0bd7f0e24fd, handleInvalid=error, numInputCols=12
vAR_transformed_output: org.apache.spark.sql.DataFrame = [TransactionID: int, ProductID: int ... 12 more fields]
vAR_labelIndexer: org.apache.spark.ml.feature.StringIndexerModel = StringIndexerModel: uid=strIdx_d5cc61ec71e6, handleInvalid=error
vAR_featureIndexer: org.apache.spark.ml.feature.VectorIndexerModel = VectorIndexerModel: uid=vecIdx_81d5ec82ceae, numFeatures=12, handleInvalid=error


In [4]:
/*
  Step #4 :  Select  Training,Test Data as 70:30
*/
  println("Split the loaded data into training and test")
  val vAR_splitupdata = vAR_transformed_output.randomSplit(Array(0.70, 0.30))
  val vAR_training_data = vAR_splitupdata(0)
  val vAR_test_data = vAR_splitupdata(1)

Split the loaded data into training and test


vAR_splitupdata: Array[org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]] = Array([TransactionID: int, ProductID: int ... 12 more fields], [TransactionID: int, ProductID: int ... 12 more fields])
vAR_training_data: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [TransactionID: int, ProductID: int ... 12 more fields]
vAR_test_data: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [TransactionID: int, ProductID: int ... 12 more fields]


In [5]:
/*
  Step #5 :  Setup Pipeline
*/
println("Data prep started by = " + DateTime.now(DateTimeZone.UTC))
// Train a RandomForest model.
val vAR_rf = new RandomForestRegressor()
  .setLabelCol("indexedLabel")
  .setFeaturesCol("indexedFeatures")
  .setNumTrees(15)  // number of trees to run in parallel  

// Convert indexed labels back to original labels.
val vAR_labelConverter = new IndexToString()
  .setInputCol("prediction")
  .setOutputCol("predictedLabel")
  .setLabels(vAR_labelIndexer.labels)

// Chain indexers and forest in a Pipeline
val vAR_pipeline = new Pipeline()
  .setStages(Array(vAR_labelIndexer, vAR_featureIndexer, vAR_rf, vAR_labelConverter))

println("Data Prep Ended by = " + DateTime.now(DateTimeZone.UTC))

Data prep started by = 2021-05-12T02:31:41.766Z
Data Prep Ended by = 2021-05-12T02:31:41.840Z


vAR_rf: org.apache.spark.ml.regression.RandomForestRegressor = rfr_811e9c94c94a
vAR_labelConverter: org.apache.spark.ml.feature.IndexToString = idxToStr_3aca36832261
vAR_pipeline: org.apache.spark.ml.Pipeline = pipeline_6f3afd6381b5


In [6]:

/*
  Step #6 :  Train , Test , Validate and Predict
  */

println("Train, Test , Validate and Predict started by = " + DateTime.now(DateTimeZone.UTC))

// Train model.  This also runs the indexers.
val vAR_model = vAR_pipeline.fit(vAR_training_data)

// Make predictions.
val vAR_predictions = vAR_model.transform(vAR_test_data)

// Select example rows to display.
vAR_predictions.select("prediction", "MakeFlag", "features").show(5)

// Select (prediction, true label) and compute test error.
val vAR_evaluator = new RegressionEvaluator()
  .setLabelCol("MakeFlag")
  .setPredictionCol("prediction")
  .setMetricName("rmse")

val vAR_rmse = vAR_evaluator.evaluate(vAR_predictions)
println(s"Root Mean Squared Error (RMSE) on test data = $vAR_rmse")

println("Train, Test , Validate and Predict ended by = " + DateTime.now(DateTimeZone.UTC))






Train, Test , Validate and Predict started by = 2021-05-12T02:31:42.401Z
+----------+--------+--------------------+
|prediction|MakeFlag|            features|
+----------+--------+--------------------+
|       0.0|       0|[103860.0,1.0,7.9...|
|       0.0|       0|[103861.0,1.0,3.9...|
|       0.0|       0|[103861.0,1.0,3.9...|
|       0.0|       0|[103868.0,1.0,2.2...|
|       0.0|       0|[103868.0,1.0,2.2...|
+----------+--------+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 0.0
Train, Test , Validate and Predict ended by = 2021-05-12T02:31:55.002Z


vAR_model: org.apache.spark.ml.PipelineModel = pipeline_6f3afd6381b5
vAR_predictions: org.apache.spark.sql.DataFrame = [TransactionID: int, ProductID: int ... 16 more fields]
vAR_evaluator: org.apache.spark.ml.evaluation.RegressionEvaluator = RegressionEvaluator: uid=regEval_df316fb9d76e, metricName=rmse, throughOrigin=false
vAR_rmse: Double = 0.0


In [7]:
/*
Step #7 : Apply Repartition count
*/

  println("Partition Applied default by System  = " + vAR_SalesData_csv_df.rdd.getNumPartitions)
  println("Apply Repartition Started by : " + DateTime.now(DateTimeZone.UTC))
 
  //Apply Repartition to compare performance with default partition
  val vAR_repart_count_set = 4
  println("Repartition count set as = " + vAR_repart_count_set)
  val vAR_SalesData_csv_df_part = vAR_SalesData_csv_df.repartition(vAR_repart_count_set)
  println("Apply Repartition Ended by= " + DateTime.now(DateTimeZone.UTC))

  println("Repartition Applied count = " + vAR_SalesData_csv_df_part.rdd.getNumPartitions)

Partition Applied default by System  = 1
Apply Repartition Started by : 2021-05-12T02:31:55.417Z
Repartition count set as = 4
Apply Repartition Ended by= 2021-05-12T02:31:55.422Z
Repartition Applied count = 4


vAR_repart_count_set: Int = 4
vAR_SalesData_csv_df_part: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [TransactionID: int, ProductID: int ... 11 more fields]


In [8]:
/*
Step #8 : VectorAssembler is a transformer as it takes the input dataframe and returns the transformed dataframe
               with a new column which is vector representation of all the features.
               Select Features columns as "TransactionID", "Quantity", "ActualCost", "CustomerID", "TotalDue", "LineTotal", "FinishedGoodsFlag", "SalesReasonID", "AverageRate", "EndOfDayRate", "SalesLastYear", "ProductID"
*/
  println("After Repartition Data transform Started by: " + DateTime.now(DateTimeZone.UTC))
  val vAR_assembler_part = new VectorAssembler()
    .setInputCols(Array("TransactionID", "Quantity", "ActualCost", "CustomerID", "TotalDue", "LineTotal", "FinishedGoodsFlag", "SalesReasonID", "AverageRate", "EndOfDayRate", "SalesLastYear", "ProductID"))
    .setOutputCol("features")
  val vAR_transformed_output_part = vAR_assembler_part.transform(vAR_SalesData_csv_df_part)
  /*
  Step #9 :  Select  Training,Test Data after repartition
  */
  val vAR_splitupdata_part = vAR_transformed_output_part.randomSplit(Array(0.80, 0.20))
  val vAR_training_data_part = vAR_splitupdata_part(0)
  val vAR_test_data_part = vAR_splitupdata_part(1)

println("After Repartition Data transform Ended by: " + DateTime.now(DateTimeZone.UTC))


/*
 Step #10 :  Index the features Data after repartition
*/
println("After Repartition Data Prep started by: " + DateTime.now(DateTimeZone.UTC))

  val vAR_featureIndexer_part = new VectorIndexer()
    .setInputCol("features")
    .setOutputCol("indexedFeatures")
    .setMaxCategories(13)
    .fit(vAR_transformed_output)
/*
Step #11 :  select the Algorithm with columns indexedFeatures, MakeFlag after repartition
*/
  val vAR_rf_part = new RandomForestRegressor( )
    .setLabelCol("MakeFlag")
    .setFeaturesCol("indexedFeatures")
    .setNumTrees(30)   // number of trees to run in parallel  after repartition

/*  
Step #12 :  Configure an ML pipeline, which consists of two stages after repartition 
*/
  val vAR_pipeline_part = new Pipeline()
    .setStages(Array(vAR_featureIndexer_part, vAR_rf_part))
/*
Step #13 :  setup Evaluator after repartition for testing
*/
  val vAR_evaluator_part = new RegressionEvaluator()
    .setLabelCol("MakeFlag")
    .setPredictionCol("prediction")
    .setMetricName("rmse")
  println("After Repartition Data prep ended by: " + DateTime.now(DateTimeZone.UTC))

  

After Repartition Data transform Started by: 2021-05-12T02:31:56.213Z
After Repartition Data transform Ended by: 2021-05-12T02:31:56.285Z
After Repartition Data Prep started by: 2021-05-12T02:31:56.290Z
After Repartition Data prep ended by: 2021-05-12T02:31:56.561Z


vAR_assembler_part: org.apache.spark.ml.feature.VectorAssembler = VectorAssembler: uid=vecAssembler_9c72138ac573, handleInvalid=error, numInputCols=12
vAR_transformed_output_part: org.apache.spark.sql.DataFrame = [TransactionID: int, ProductID: int ... 12 more fields]
vAR_splitupdata_part: Array[org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]] = Array([TransactionID: int, ProductID: int ... 12 more fields], [TransactionID: int, ProductID: int ... 12 more fields])
vAR_training_data_part: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [TransactionID: int, ProductID: int ... 12 more fields]
vAR_test_data_part: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [TransactionID: int, ProductID: int ... 12 more fields]
vAR_featureIndexer_part: org.apache.spark.ml.feature...


In [9]:
/*
Step #14 : Train Data  after repartition
*/
  println("After Repartition Model Fit, Predict Started by: " + DateTime.now(DateTimeZone.UTC))
  val vAR_Model_part = vAR_pipeline_part.fit(vAR_training_data_part)

/*
Step #15 : Predict on Test Data after repartition
*/
  val vAR_predictions_part = vAR_Model_part.transform(vAR_test_data_part)
  vAR_predictions_part.show()

  println("After Repartition Model Train completed by:", DateTime.now(DateTimeZone.UTC))



After Repartition Model Fit, Predict Started by: 2021-05-12T02:31:56.970Z
+-------------+---------+--------+----------+----------+--------+---------+--------+-----------------+-------------+-----------+------------+-------------+--------------------+--------------------+----------+
|TransactionID|ProductID|Quantity|ActualCost|CustomerID|TotalDue|LineTotal|MakeFlag|FinishedGoodsFlag|SalesReasonID|AverageRate|EndOfDayRate|SalesLastYear|            features|     indexedFeatures|prediction|
+-------------+---------+--------+----------+----------+--------+---------+--------+-----------------+-------------+-----------+------------+-------------+--------------------+--------------------+----------+
|       103869|      921|       1|      4.99|     12107| 35.6584|    24.99|       0|                1|           10|     1.5955|      1.5969|      5693989|[103869.0,1.0,4.9...|[4.0,0.0,2.0,2.0,...|       0.0|
+-------------+---------+--------+----------+----------+--------+---------+--------+------

vAR_Model_part: org.apache.spark.ml.PipelineModel = pipeline_a0f2990afac6
vAR_predictions_part: org.apache.spark.sql.DataFrame = [TransactionID: int, ProductID: int ... 14 more fields]
