## Group 8 Assignment Phase 3

In [1]:
//Start a simple Spark Session
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql._

//Import VectorAssembler for the Feature Vector and Linear Algebra data structures
import org.apache.spark.ml.feature.{VectorAssembler,StringIndexer,VectorIndexer,OneHotEncoder}
import org.apache.spark.ml.linalg.{Vector,Vectors, Matrix, Matrices}

//Model Building Pipeline
import org.apache.spark.ml.{Pipeline, PipelineModel}

//Binary Classification
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel,
                                           RandomForestClassifier, GBTClassifier,DecisionTreeClassifier}
//Model Training
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, 
                                   ParamGridBuilder, TrainValidationSplit}


//Model Evaluation
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator,MulticlassClassificationEvaluator}

//Optional: Use the following code below to set the Error reporting
import org.apache.log4j._
Logger.getLogger("org").setLevel(Level.ERROR)


//For Cleaning
//import scala.util.matching.Regex

val spark = SparkSession.builder().getOrCreate()

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.0.4:4041
SparkContext available as 'sc' (version = 2.4.5, master = local[*], app id = local-1590822159355)
SparkSession available as 'spark'


import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import org.apache.spark.ml.feature.{VectorAssembler, StringIndexer, VectorIndexer, OneHotEncoder}
import org.apache.spark.ml.linalg.{Vector, Vectors, Matrix, Matrices}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel, RandomForestClassifier, GBTClassifier, DecisionTreeClassifier}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, MulticlassClassificationEvaluator}
import org.apache.l...

## Read in a parquet file of flight delay, fuel-price and meteorological data

In [2]:
// Take a random sample (without replacement) of the data (to reduce memory requirements)
val sampleFraction = 0.1

//Concatenate rows df2 and df3 and drop any rows with missing data
val flights = (spark
            .read.parquet("flightDelay.parquet")
            .withColumn("Month_Num1", $"Month_Num" cast "Int")
            .withColumn("Date_Num",  ($"Year"-2004)*12 + $"Month_Num1")
            .drop("Sectors_Flown", "Month_Num1", "Change")
            .withColumnRenamed("Departures_Delayed","label")
            .withColumnRenamed("Price","Fuel_Price")
            .sample(false,sampleFraction)
            .na.drop())

flights.printSchema()

root
 |-- Departing_Port: string (nullable = true)
 |-- Arriving_Port: string (nullable = true)
 |-- Airline: string (nullable = true)
 |-- label: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month_Num: string (nullable = true)
 |-- Fuel_Price: double (nullable = true)
 |-- Departing_Port_station_ID: string (nullable = true)
 |-- Departing_Port_station_name: string (nullable = true)
 |-- Arriving_Port_station_ID: string (nullable = true)
 |-- Arriving_Port_station_name: string (nullable = true)
 |-- Mean_3pm_cloud_cover_oktas_Depart: double (nullable = true)
 |-- Mean_3pm_dew_point_temperature_Degrees_C_Depart: double (nullable = true)
 |-- Mean_3pm_relative_humidity_%_Depart: double (nullable = true)
 |-- Mean_3pm_temperature_Degrees_C_Depart: double (nullable = true)
 |-- Mean_3pm_wet_bulb_temperature_Degrees_C_Depart: double (nullable = true)
 |-- Mean_3pm_wind_speed_km/h_Depart: double (nullable = true)
 |-- Mean_9am_cloud_cover_okas_Depart: double (nullable 

sampleFraction: Double = 0.1
flights: org.apache.spark.sql.DataFrame = [Departing_Port: string, Arriving_Port: string ... 72 more fields]


## Take a look at the proportion of lates in the dataset

In [3]:
val counts = flights.groupBy("label").count()

println("proportion of lates (label=1) in the sample")
counts.show()

proportion of lates (label=1) in the sample
+-----+------+
|label| count|
+-----+------+
|    1|107848|
|    0|520844|
+-----+------+



counts: org.apache.spark.sql.DataFrame = [label: int, count: bigint]


## Set up a Logistic Regression Pipleline

In [4]:
//////////////////////////////////////////////////
//// Setting Up DataFrame for Machine Learning ///
//////////////////////////////////////////////////

// Deal with Categorical Columns
val categoricalVariables = Array(
    "Departing_Port", "Arriving_Port", "Airline")
val categoricalIndexers = categoricalVariables
  .map(i => new StringIndexer().setInputCol(i).setOutputCol(i+"_Index"))
val categoricalEncoders = categoricalVariables
  .map(e => new OneHotEncoder().setInputCol(e + "_Index").setOutputCol(e + "_Vec"))


// columns that need to added to the features vector
val cols = Array("Date_Num",  "Airline_Vec", "Fuel_Price",
    "Departing_Port_Vec", "Mean_9am_wind_speed_km/h_Depart", "Mean_rainfall_mm_Depart",
    "Arriving_Port_Vec", "Mean_9am_wind_speed_km/h_Arrive","Mean_rainfall_mm_Arrive")

// Assemble everything together to be ("label","features") format
val assembler = (new VectorAssembler()
                 .setInputCols(cols)
                 .setOutputCol("features") )


/////////////////////////////
// Set Up the Pipeline //////
/////////////////////////////

val lr = new LogisticRegression()
        //.setFitIntercept(true)
        //.setRegParam(0.01)
        .setMaxIter(100)
        //.setTol(0.001)
        //.setThreshold(0.2)
        .setStandardization(true)
        //.setWeightCol("classWeightCol")
        .setLabelCol("label")
        .setFeaturesCol("features")
        //.setFamily("multinomial")

// Print out the parameters, documentation, and any default values.
println(s"LogisticRegression parameters:\n ${lr.explainParams()}\n")
//val stages = Array(departureIndexer,arrivalIndexer,airlineIndexer,
//          departureEncoder,arrivalEncoder, airlineEncoder, 
//          assembler, lr)

val stages: Array[org.apache.spark.ml.PipelineStage] = categoricalIndexers ++ categoricalEncoders ++ Array(assembler, lr)



// build the pipeline
val pipeline = new Pipeline().setStages(stages)


LogisticRegression parameters:
 aggregationDepth: suggested depth for treeAggregate (>= 2) (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty (default: 0.0)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial. (default: auto)
featuresCol: features column name (default: features, current: features)
fitIntercept: whether to fit an intercept term (default: true)
labelCol: label column name (default: label, current: label)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. (undefined)
lowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrained optimization. (undefined)
maxIter: maximum number of iterations (>= 0) (default: 100, current: 100)
predictionCol: prediction column name (default: prediction)


categoricalVariables: Array[String] = Array(Departing_Port, Arriving_Port, Airline)
categoricalIndexers: Array[org.apache.spark.ml.feature.StringIndexer] = Array(strIdx_49fafce624df, strIdx_7d8d6517a672, strIdx_c0482b8a3b70)
categoricalEncoders: Array[org.apache.spark.ml.feature.OneHotEncoder] = Array(oneHot_15496b16c59c, oneHot_1dfa95aaa786, oneHot_f9a5ea8950f1)
cols: Array[String] = Array(Date_Num, Airline_Vec, Fuel_Price, Departing_Port_Vec, Mean_9am_wind_speed_km/h_Depart, Mean_rainfall_mm_Depart, Arriving_Port_Vec, Mean_9am_wind_speed_km/h_Arrive, Mean_rainfall_mm_Arrive)
assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_b366717a25d2
lr: org.apache.spark.ml.classification.LogisticRegression = logreg_169df198af27
stages: Array[org.apache.spark.ml.PipelineStage] =...

## Train Pipleine using a Train - Validation Split

In [4]:
val paramGrid = new ParamGridBuilder()
  .addGrid(lr.regParam, Array(1.0, 0.3, 0.2, 0.1))
  .addGrid(lr.threshold, Array(0.1,0.2,0.3,0.4,0.5))
  .addGrid(lr.tol, Array(0.001,0.00001))
  .addGrid(lr.elasticNetParam, Array(0.0, 0.5, 1.0))
  .build()

val tvs = new TrainValidationSplit()
  .setEstimator(pipeline) // the estimator can also just be an individual model rather than a pipeline
  .setEvaluator(new BinaryClassificationEvaluator)
  .setEstimatorParamMaps(paramGrid)
  .setTrainRatio(0.75)

//////////////////////////
/// Split the Data ///////
//////////////////////////
val Array(training, test) = flights.randomSplit(Array(0.7, 0.3), seed = 12345)

//Create a holdout test set
val model = tvs.fit(training)

println(s"***model was fit using parameters: ${model.parent.extractParamMap}")


// Get Results on Test Set
val results = model.transform(test)

// Make Predictions on the Test Dataset
val predictions = results.select ("features", "label", "prediction")



predictions.show(10)


2020-05-27 07:47:20,552 WARN  [Thread-4] netlib.BLAS (BLAS.java:<clinit>(61)) - Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
2020-05-27 07:47:20,553 WARN  [Thread-4] netlib.BLAS (BLAS.java:<clinit>(61)) - Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
***model was fit using parameters: {
	tvs_8154792a39a5-collectSubModels: false,
	tvs_8154792a39a5-estimator: pipeline_134793f1313b,
	tvs_8154792a39a5-estimatorParamMaps: [Lorg.apache.spark.ml.param.ParamMap;@f9facf,
	tvs_8154792a39a5-evaluator: binEval_14deb6311477,
	tvs_8154792a39a5-parallelism: 1,
	tvs_8154792a39a5-seed: -1772833110,
	tvs_8154792a39a5-trainRatio: 0.75
}
+--------------------+-----+----------+
|            features|label|prediction|
+--------------------+-----+----------+
|(101,[0,4,10,14,5...|    0|       0.0|
|(101,[0,4,10,14,5...|    0|       0.0|
|(101,[0,4,10,14,5...|    0|       0.0|
|(101,[0,4,10,14,5...|    0|       0.0|
|(101,[0,4,10,14,5...|    0|    

paramGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	logreg_1f564ab5e3fc-elasticNetParam: 0.0,
	logreg_1f564ab5e3fc-regParam: 1.0,
	logreg_1f564ab5e3fc-threshold: 0.1,
	logreg_1f564ab5e3fc-tol: 0.001
}, {
	logreg_1f564ab5e3fc-elasticNetParam: 0.5,
	logreg_1f564ab5e3fc-regParam: 1.0,
	logreg_1f564ab5e3fc-threshold: 0.1,
	logreg_1f564ab5e3fc-tol: 0.001
}, {
	logreg_1f564ab5e3fc-elasticNetParam: 1.0,
	logreg_1f564ab5e3fc-regParam: 1.0,
	logreg_1f564ab5e3fc-threshold: 0.1,
	logreg_1f564ab5e3fc-tol: 0.001
}, {
	logreg_1f564ab5e3fc-elasticNetParam: 0.0,
	logreg_1f564ab5e3fc-regParam: 1.0,
	logreg_1f564ab5e3fc-threshold: 0.2,
	logreg_1f564ab5e3fc-tol: 0.001
}, {
	logreg_1f564ab5e3fc-elasticNetParam: 0.5,
	logreg_1f564ab5e3fc-regParam: 1.0,
	logreg_1f564ab5e3fc-threshold: 0.2,
	logre...

## Evaluate the Train - Validation Split Model

In [None]:
val eval = new MulticlassClassificationEvaluator().setLabelCol("label").setPredictionCol("prediction")
println(s"Accuracy: ${eval.setMetricName("accuracy").evaluate(results)}")
println(s"Precision: ${eval.setMetricName("weightedPrecision").evaluate(results)}")
println(s"Recall: ${eval.setMetricName("weightedRecall").evaluate(results)}")
println(s"F1: ${eval.setMetricName("f1").evaluate(results)}")

val TP = results.select("label", "prediction").filter("label = 1 and prediction = 1").count
val TN = results.select("label", "prediction").filter("label = 0 and prediction = 0").count
val FP = results.select("label", "prediction").filter("label = 0 and prediction = 1").count
val FN = results.select("label", "prediction").filter("label = 1 and prediction = 0").count
val total = results.select("label").count.toDouble

// Confusion matrix
println("Confusion matrix\n(Predict N, Predict P):")
val confusion: Matrix = Matrices.dense(2, 2, Array(TN, FP, FN, TP))

val accuracy    = (TP + TN) / total
val precision   = (TP + FP) / total
val recall      = (TP + FN) / total
val F1 = 2/(1/precision + 1/recall)

## Train the Pipeline using Cross Validation

In [5]:
// We use a ParamGridBuilder to construct a grid of parameters to search over.
// With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
// this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.
val paramGrid = new ParamGridBuilder()
  .addGrid(lr.regParam, Array(0.1))
  .addGrid(lr.threshold, Array(0.19,0.20,0.21,0.22))
  .addGrid(lr.tol, Array(0.000001))
  .addGrid(lr.elasticNetParam, Array(0.0))
  .build()

// We now treat the Pipeline as an Estimator, wrapping it in a CrossValidator instance.
// This will allow us to jointly choose parameters for all Pipeline stages.
// A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
// Note that the evaluator here is a BinaryClassificationEvaluator and its default metric
// is areaUnderROC.
val cv = new CrossValidator()
  .setEstimator(pipeline)
  .setEvaluator(new BinaryClassificationEvaluator)
  .setEstimatorParamMaps(paramGrid)
  .setNumFolds(5)  // Use 3+ in practice
  .setParallelism(2)  // Evaluate up to 2 parameter settings in parallel



//////////////////////////
/// Split the Data ///////
//////////////////////////
val Array(training, test) = flights.randomSplit(Array(0.7, 0.3), seed = 12345)

// Run cross-validation, and choose the best set of parameters.
val model = cv.fit(training)

// Print the coefficients and intercept for logistic regression
//println(s"*******************************************\nCoefficients: ${model.coefficients} Intercept: ${model.intercept}")
// Since model is a Model (i.e., a Transformer produced by an Estimator),
// we can view the parameters it used during fit().
// This prints the parameter (name: value) pairs, where names are unique IDs for this LogisticRegression instance.

println(s"***model was fit using parameters: ${model.parent.extractParamMap}")


***model was fit using parameters: {
	cv_f68da6dbf65b-collectSubModels: false,
	cv_f68da6dbf65b-estimator: pipeline_683ced8320b6,
	cv_f68da6dbf65b-estimatorParamMaps: [Lorg.apache.spark.ml.param.ParamMap;@32dcc51e,
	cv_f68da6dbf65b-evaluator: binEval_f8d0feda317b,
	cv_f68da6dbf65b-numFolds: 5,
	cv_f68da6dbf65b-parallelism: 2,
	cv_f68da6dbf65b-seed: -1191137437
}


paramGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	logreg_169df198af27-elasticNetParam: 0.0,
	logreg_169df198af27-regParam: 0.1,
	logreg_169df198af27-threshold: 0.19,
	logreg_169df198af27-tol: 1.0E-6
}, {
	logreg_169df198af27-elasticNetParam: 0.0,
	logreg_169df198af27-regParam: 0.1,
	logreg_169df198af27-threshold: 0.2,
	logreg_169df198af27-tol: 1.0E-6
}, {
	logreg_169df198af27-elasticNetParam: 0.0,
	logreg_169df198af27-regParam: 0.1,
	logreg_169df198af27-threshold: 0.21,
	logreg_169df198af27-tol: 1.0E-6
}, {
	logreg_169df198af27-elasticNetParam: 0.0,
	logreg_169df198af27-regParam: 0.1,
	logreg_169df198af27-threshold: 0.22,
	logreg_169df198af27-tol: 1.0E-6
})
cv: org.apache.spark.ml.tuning.CrossValidator = cv_f68da6dbf65b
training: org.apache.spark.sql.Dataset[org.apache.spa...

In [6]:
//Persist the Model
model.write.overwrite().save("./flightDelayModel/")

// Get Results on Test Set
//val results = model.transform(test)

//results.select ("features", "label", "prediction").show(10)

val results: DataFrame = CrossValidatorModel
  .load("./flightDelayModel/")
  .transform(test)
  .select(
    col("features"),
    col("label"),
    col("prediction")
  )

results.show()

## Get Logistic Regression Coeffs of the Best Model

In [11]:
val bestModel = model.bestModel match {
  case pm: PipelineModel => Some(pm)
  case _ => None
}

val lrm = bestModel
  .map(_.stages.collect { case lrm: LogisticRegressionModel => lrm })
  .flatMap(_.headOption)

lrm.map(m => (m.intercept, m.coefficients))
//lrm.map(m => (m.summary.rootMeanSquaredError))

bestModel: Option[org.apache.spark.ml.PipelineModel] = Some(pipeline_683ced8320b6)
lrm: Option[org.apache.spark.ml.classification.LogisticRegressionModel] = Some(LogisticRegressionModel: uid = logreg_169df198af27, numClasses = 2, numFeatures = 101)
res6: Option[(Double, org.apache.spark.ml.linalg.Vector)] = Some((-2.206564991260743,[9.309599045065873E-4,-0.0730739101269188,-0.08026598473733182,0.018755568099392207,0.22030202099288065,-0.1370242307032936,0.2288225311959318,-0.06613872737800723,0.05259047538014665,0.1855574283365176,0.14516083076942676,0.0701572698255225,0.022381406657896008,-0.04559810900602169,-0.14300035458269922,0.00808300803610624,-0.15728711891298264,0.0686603062257141,-0.08564519279432477,0.10724356585061354,-0.07252316495737186,0.0831512204780029,0.062258962830634...

In [13]:
lrm.map(m => (m.summary.truePositiveRateByLabel))

res8: Option[Array[Double]] = Some([D@6b8a9c38)


## Get the Parameters of the best model

In [14]:
lrm.map(m => m.extractParamMap())

res9: Option[org.apache.spark.ml.param.ParamMap] =
Some({
	logreg_169df198af27-aggregationDepth: 2,
	logreg_169df198af27-elasticNetParam: 0.0,
	logreg_169df198af27-family: auto,
	logreg_169df198af27-featuresCol: features,
	logreg_169df198af27-fitIntercept: true,
	logreg_169df198af27-labelCol: label,
	logreg_169df198af27-maxIter: 100,
	logreg_169df198af27-predictionCol: prediction,
	logreg_169df198af27-probabilityCol: probability,
	logreg_169df198af27-rawPredictionCol: rawPrediction,
	logreg_169df198af27-regParam: 0.1,
	logreg_169df198af27-standardization: true,
	logreg_169df198af27-threshold: 0.2,
	logreg_169df198af27-tol: 1.0E-6
})


In [20]:
val eval = new MulticlassClassificationEvaluator().setLabelCol("label").setPredictionCol("prediction")
println(s"Accuracy: ${eval.setMetricName("accuracy").evaluate(results)}")
println(s"Precision: ${eval.setMetricName("weightedPrecision").evaluate(results)}")
println(s"Recall: ${eval.setMetricName("weightedRecall").evaluate(results)}")
println(s"F1: ${eval.setMetricName("f1").evaluate(results)}")

val TP = results.select("label", "prediction").filter("label = 1 and prediction = 1").count
val TN = results.select("label", "prediction").filter("label = 0 and prediction = 0").count
val FP = results.select("label", "prediction").filter("label = 0 and prediction = 1").count
val FN = results.select("label", "prediction").filter("label = 1 and prediction = 0").count
val total = results.select("label").count.toDouble

// Confusion matrix
println("Confusion matrix\n(Predict N, Predict P):")
val confusion: Matrix = Matrices.dense(2, 2, Array(TN, FN, FP, TP))

val accuracy    = (TP + TN) / total
val precision   = (TP + FP) / total
val recall      = (TP + FN) / total
val F1 = 2/(1/precision + 1/recall)

Accuracy: 0.7420380141122624
Precision: 0.7418617489714237
Recall: 0.7420380141122624
F1: 0.7419498271311218
Confusion matrix
(Predict N, Predict P):


eval: org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator = mcEval_833848ff0157
TP: Long = 8103
TN: Long = 131973
FP: Long = 24320
FN: Long = 24376
total: Double = 188772.0
confusion: org.apache.spark.ml.linalg.Matrix =
131973.0  24320.0
24376.0   8103.0
accuracy: Double = 0.7420380141122624
precision: Double = 0.1717574640306825
recall: Double = 0.1720541181954951
F1: Double = 0.17190566313064426


In [19]:
import org.apache.spark.mllib.evaluation.MulticlassMetrics

val eval_rdd =  results.select($"prediction",$"label").as[(Double, Double)].rdd
val eval = new MulticlassMetrics(eval_rdd)

println(eval.confusionMatrix)

131973.0  24320.0  
24376.0   8103.0   


import org.apache.spark.mllib.evaluation.MulticlassMetrics
eval_rdd: org.apache.spark.rdd.RDD[(Double, Double)] = MapPartitionsRDD[6627] at rdd at <console>:50
eval: org.apache.spark.mllib.evaluation.MulticlassMetrics = org.apache.spark.mllib.evaluation.MulticlassMetrics@27c576a6
