## Group 8 Assignment Phase 3 Logistic Regression

* Do the followings in HDFS:

* Remove any folder/files in /tmp that starts with flightData_,

* Create folder /tmp/flightData_in/,

* Put the parquet dataset file into /tmp/flightData_in/,

* Make sure put was successfull (it should have the same size as the local file)!

In [5]:
! hadoop fs -chmod -R 777 hdfs://localhost:9000/tmp
! hadoop fs -rm    -r  hdfs://localhost:9000/tmp/flightData_*
! hadoop fs -mkdir -p  hdfs://localhost:9000/tmp/flightData_in
! hadoop fs -put   -p  flightDelay.parquet             hdfs://localhost:9000/tmp/flightData_in
! hadoop fs -ls        hdfs://localhost:9000/tmp/flightData_in/

20/06/05 03:44:58 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.


Deleted hdfs://localhost:9000/tmp/flightData_in


Found 1 items


drwxrwxr-x   - root root          0 2020-05-24 03:38 hdfs://localhost:9000/tmp/flightData_in/flightDelay.parquet




In [6]:
!hdfs getconf -confKey fs.defaultFS

hdfs://localhost:9000



## Load Regquisite Libraries and Start a Spark Session

In [7]:
//Start a simple Spark Session
import org.apache.spark.sql.{SparkSession, DataFrame}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql._

import org.apache.spark.ml.attribute._

//Feature pre-Processing Classes
import org.apache.spark.ml.feature.{VectorAssembler,StringIndexer,
                                    VectorIndexer,OneHotEncoder, PCA, Normalizer}

//Linear Algebra Data Structures
import org.apache.spark.ml.linalg.{Vector,Vectors}

//Model Building Pipeline
import org.apache.spark.ml.{Pipeline, PipelineStage, PipelineModel}

//Binary Classification
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel, BinaryLogisticRegressionSummary,
                                           RandomForestClassifier, GBTClassifier,
                                           DecisionTreeClassifier, DecisionTreeClassificationModel}
//Model Training
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, 
                                   ParamGridBuilder, TrainValidationSplit}

//Model Evaluation
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator,MulticlassClassificationEvaluator}

//Optional: Use the following code below to set the Error reporting
import org.apache.log4j._
Logger.getLogger("org").setLevel(Level.ERROR)


//For Cleaning
//import scala.util.matching.Regex

val spark = SparkSession.builder().appName("Group 8 ML Phase 3").getOrCreate()

Intitializing Scala interpreter ...

Spark Web UI available at http://a45c2dc34def:4040
SparkContext available as 'sc' (version = 2.4.5, master = local[*], app id = local-1591328726437)
SparkSession available as 'spark'


import org.apache.spark.sql.{SparkSession, DataFrame}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import org.apache.spark.ml.attribute._
import org.apache.spark.ml.feature.{VectorAssembler, StringIndexer, VectorIndexer, OneHotEncoder, PCA, Normalizer}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.{Pipeline, PipelineStage, PipelineModel}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel, BinaryLogisticRegressionSummary, RandomForestClassifier, GBTClassifier, DecisionTreeClassifier, DecisionTreeClassificationModel}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel, ParamGridBuilder, TrainValidatio...

## Read in a parquet file of flight delay, fuel-price and meteorological data

In [8]:
val flights = (spark
            .read.parquet("hdfs://localhost:9000/tmp/flightData_in/flightDelay.parquet")
            .withColumn("Month_Num1", $"Month_Num" cast "Int")
            //convert month and year to integer index starting Jan 2004
            .withColumn("Date_Num",  ($"Year"-2004)*12 + $"Month_Num1")
            .drop("Sectors_Flown", "Month_Num1", "Change")
            .withColumnRenamed("Departures_Delayed","label")
            .withColumnRenamed("Price","Fuel_Price")
            //drop NA's even though none were found!
            .na.drop()
            //.cache
              )

flights.printSchema()

root
 |-- Departing_Port: string (nullable = true)
 |-- Arriving_Port: string (nullable = true)
 |-- Airline: string (nullable = true)
 |-- label: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month_Num: string (nullable = true)
 |-- Fuel_Price: double (nullable = true)
 |-- Departing_Port_station_ID: string (nullable = true)
 |-- Departing_Port_station_name: string (nullable = true)
 |-- Arriving_Port_station_ID: string (nullable = true)
 |-- Arriving_Port_station_name: string (nullable = true)
 |-- Mean_3pm_cloud_cover_oktas_Depart: double (nullable = true)
 |-- Mean_3pm_dew_point_temperature_Degrees_C_Depart: double (nullable = true)
 |-- Mean_3pm_relative_humidity_%_Depart: double (nullable = true)
 |-- Mean_3pm_temperature_Degrees_C_Depart: double (nullable = true)
 |-- Mean_3pm_wet_bulb_temperature_Degrees_C_Depart: double (nullable = true)
 |-- Mean_3pm_wind_speed_km/h_Depart: double (nullable = true)
 |-- Mean_9am_cloud_cover_okas_Depart: double (nullable 

flights: org.apache.spark.sql.DataFrame = [Departing_Port: string, Arriving_Port: string ... 72 more fields]


## Take a look at the degree of imbalance in the dataset

In [9]:
val counts = flights.groupBy("label").count()

println("proportion of lates (label=1) in the sample")
counts.show()

proportion of lates (label=1) in the sample
+-----+-------+
|label|  count|
+-----+-------+
|    1|1072071|
|    0|5224826|
+-----+-------+



counts: org.apache.spark.sql.DataFrame = [label: int, count: bigint]


## Split The Data into training and testing dataframes

In [10]:
//Filter out the most recent 12 months of flight data as the test dataset
//Dates after March 2019 have Date_Num > 183
val testing = flights.filter($"Date_Num"> 183).cache()
println(s"Test Set of the Most Recent 12 Months has ${testing.count()} records")

//Filter out rows prior to the most recent 12 months of flight data as the training dataset
val rawTraining = flights.filter($"Date_Num" < 184)


Test Set of the Most Recent 12 Months has 435479 records


testing: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Departing_Port: string, Arriving_Port: string ... 72 more fields]
rawTraining: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Departing_Port: string, Arriving_Port: string ... 72 more fields]


## Down sample the Ontime Departures To Balance The Training data

In [11]:
val ontimeTrainingFlights = rawTraining.filter($"label"===0)
println(s"On time Training Flights: ${ontimeTrainingFlights.count()}")

val delayedTrainingFlights = rawTraining.filter($"label"===1)
println(s"Delayed Training Flights: ${delayedTrainingFlights.count()}")

//ontime:delayed approx 5:1 so take a random sample of size fifth of the ontime departures
val downSampleFraction = 0.2
val sampledOntimeTrainingFlights = ontimeTrainingFlights.sample(false, downSampleFraction)  

println(s"Down Sampled ontime Training Flights: ${sampledOntimeTrainingFlights.count()}")

//down sample resulting training set for the purposes of local testing
val localTestingSampleFraction = 0.1
//Concatenate rows of ontimeTrainingFlights and delayedTrainingFlights
val training = (sampledOntimeTrainingFlights
                .union(delayedTrainingFlights)
                .sample(false, localTestingSampleFraction)
                .cache())
               
val resampledCounts = training.groupBy("label").count()
println("proportion of lates (label=1) in the sample")
resampledCounts.show()

On time Training Flights: 4884963
Delayed Training Flights: 976455
Down Sampled ontime Training Flights: 976969
proportion of lates (label=1) in the sample
+-----+-----+
|label|count|
+-----+-----+
|    1|98109|
|    0|97607|
+-----+-----+



ontimeTrainingFlights: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Departing_Port: string, Arriving_Port: string ... 72 more fields]
delayedTrainingFlights: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Departing_Port: string, Arriving_Port: string ... 72 more fields]
downSampleFraction: Double = 0.2
sampledOntimeTrainingFlights: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Departing_Port: string, Arriving_Port: string ... 72 more fields]
localTestingSampleFraction: Double = 0.1
training: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Departing_Port: string, Arriving_Port: string ... 72 more fields]
resampledCounts: org.apache.spark.sql.DataFrame = [label: int, count: bigint]


## Contruct a Confusion Matrix for Model Assessment

In [21]:
def getConfusionMatrix(predictionDF: DataFrame): Unit = {
    
    println("========================Model Assessment Metrics==================================================")
    // Define Binary Classification Evaluator
    val binaryEval = new BinaryClassificationEvaluator().setLabelCol("label").setRawPredictionCol("rawPrediction")
    // Run Evaluation.  The area under the ROC curve ranges from 0.5 and 1.0 with larger values indicative of better fit
    println(s"Area under ROC: ${binaryEval.setMetricName("areaUnderROC").evaluate(predictionDF)}")
    // Define Multiclass Classification Evaluator
    val multiEval = new MulticlassClassificationEvaluator().setLabelCol("label").setPredictionCol("prediction")
    println(s"Accuracy: ${multiEval.setMetricName("accuracy").evaluate(predictionDF)}")
    println(s"Weighted Precision: ${multiEval.setMetricName("weightedPrecision").evaluate(predictionDF)}")
    println(s"Weighted Recall: ${multiEval.setMetricName("weightedRecall").evaluate(predictionDF)}")
    println(s"F1: ${multiEval.setMetricName("f1").evaluate(predictionDF)}")

    val TP = predictionDF.select("label", "prediction").filter("label = 1 and prediction = 1").count
    val TN = predictionDF.select("label", "prediction").filter("label = 0 and prediction = 0").count
    val FP = predictionDF.select("label", "prediction").filter("label = 0 and prediction = 1").count
    val FN = predictionDF.select("label", "prediction").filter("label = 1 and prediction = 0").count
    val total = predictionDF.select("label").count.toDouble
    // Unweighted Metrics
    val accuracy    = (TP + TN) / total
    val precision   = (TP + FP) / total
    val recall      = (TP + FN) / total
    val F1 = 2/(1/precision + 1/recall)
    println(s"Accuracy: ${accuracy}")
    println(s"Precision: ${precision}")
    println(s"Recall: ${recall}")
    println(s"F1: ${F1}")

    // Confusion matrix
    printf(s"""|=================== Confusion Matrix ==========================
           |##########| %-15s                     %-15s
           |----------+----------------------------------------------------
           |Actual = 0| %-15d                     %-15d
           |Actual = 1| %-15d                     %-15d
           |===============================================================
         """.stripMargin, "Predicted = 0", "Predicted = 1", TN, FP, FN, TP)

    println("==================================================================================================")
}


getConfusionMatrix: (predictionDF: org.apache.spark.sql.DataFrame)Unit


## Set up Flight Data Feature Processing Pipleline Stages for Arbitrary Estimator

In [13]:
// Deal with Categorical Columns
val categoricalVariables = Array(
    "Departing_Port", "Arriving_Port", "Airline")
val categoricalIndexers = categoricalVariables
  .map(i => new StringIndexer().setInputCol(i).setOutputCol(i+"_Index"))
val categoricalEncoders = categoricalVariables
  .map(e => new OneHotEncoder().setInputCol(e + "_Index").setOutputCol(e + "_Vec"))


// select the flight data explanatory fields that will predict flight delay
val explanatoryFields = Array("Date_Num",  "Airline_Vec", "Fuel_Price",
    "Departing_Port_Vec", "Mean_daily_wind_run_km_Depart", "Mean_rainfall_mm_Depart",
    "Mean_number_of_days_of_rain_Depart","Mean_number_of_days_>_40_Degrees_C_Depart",
    "Arriving_Port_Vec")

// Assemble everything together to be ("label","features") format
val assembler = (new VectorAssembler()
                 .setInputCols(explanatoryFields)
                 //.setOutputCol("indexedFeatures")
                 .setOutputCol("features")
                )

// Normalize each Vector using $L^1$ norm.
val normalizer = new Normalizer()
  .setInputCol(assembler.getOutputCol)
  .setOutputCol("normedFeatures")
  .setP(1.0)


// Choose linear combinations of explanatory variables that explain the most variance in the training data
//PCA DOF
val pcaDOF = 14
val pca = new PCA()
    .setInputCol(assembler.getOutputCol)
    .setOutputCol("features")
    .setK(pcaDOF)

///////////////////////////////////////////////////////////////////////////
//   Define Feature Preprocessing Stages suitable for all candidate models  ///
///////////////////////////////////////////////////////////////////////////

//val featureProcessingStages: Array[PipelineStage] = categoricalIndexers ++ categoricalEncoders ++ Array(assembler, normalizer, pca)

val featureProcessingStages: Array[PipelineStage] = categoricalIndexers ++ categoricalEncoders ++ Array(assembler)


categoricalVariables: Array[String] = Array(Departing_Port, Arriving_Port, Airline)
categoricalIndexers: Array[org.apache.spark.ml.feature.StringIndexer] = Array(strIdx_5e1154b95779, strIdx_8169f45e0912, strIdx_876fee818f32)
categoricalEncoders: Array[org.apache.spark.ml.feature.OneHotEncoder] = Array(oneHot_a39e512d0c95, oneHot_cc8ceb3b059a, oneHot_04ed4b7fe449)
explanatoryFields: Array[String] = Array(Date_Num, Airline_Vec, Fuel_Price, Departing_Port_Vec, Mean_daily_wind_run_km_Depart, Mean_rainfall_mm_Depart, Mean_number_of_days_of_rain_Depart, Mean_number_of_days_>_40_Degrees_C_Depart, Arriving_Port_Vec)
assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_dcb98d29439a
normalizer: org.apache.spark.ml.feature.Normalizer = normalizer_fe279c982b15
pcaDOF: Int = 14
pca:...

## Setup a Cross Validated Logistic Regression Pipeline

In [14]:
// Define the Logistic Regression Estimator.
val lr = new LogisticRegression()
        .setFeaturesCol(assembler.getOutputCol)
        //.setFeaturesCol(pca.getOutputCol)
        .setLabelCol("label")

// Print out the parameters, documentation, and any default values.
//println(s"LogisticRegression parameters:\n ${lr.explainParams()}\n")

// We use a ParamGridBuilder to construct a grid of parameters to search over.
val lrParamGrid = new ParamGridBuilder()
  .addGrid(lr.regParam, Array(0.01))
  .addGrid(lr.threshold, (for (i <- 46 to 48) yield i.toDouble / 100).toArray)
  .addGrid(lr.tol, Array(0.000001))
  .addGrid(lr.elasticNetParam, Array(0.0))
  .build()



// A CrossValidator requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
// Note that the evaluator here is a BinaryClassificationEvaluator and its default metric is areaUnderROC.
val cv = new CrossValidator()
  .setEstimator(new Pipeline().setStages(featureProcessingStages ++ Array(lr)))
  .setEvaluator(new BinaryClassificationEvaluator)
  .setEstimatorParamMaps(lrParamGrid)
  .setNumFolds(3)  // Use 3+ in practice
  //.setParallelism(2)  // Evaluate up to 2 parameter settings in parallel


lr: org.apache.spark.ml.classification.LogisticRegression = logreg_eaa0837ec626
lrParamGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	logreg_eaa0837ec626-elasticNetParam: 0.0,
	logreg_eaa0837ec626-regParam: 0.01,
	logreg_eaa0837ec626-threshold: 0.46,
	logreg_eaa0837ec626-tol: 1.0E-6
}, {
	logreg_eaa0837ec626-elasticNetParam: 0.0,
	logreg_eaa0837ec626-regParam: 0.01,
	logreg_eaa0837ec626-threshold: 0.47,
	logreg_eaa0837ec626-tol: 1.0E-6
}, {
	logreg_eaa0837ec626-elasticNetParam: 0.0,
	logreg_eaa0837ec626-regParam: 0.01,
	logreg_eaa0837ec626-threshold: 0.48,
	logreg_eaa0837ec626-tol: 1.0E-6
})
cv: org.apache.spark.ml.tuning.CrossValidator = cv_efa90acf25fe


## Train the Logistic Regression Model using Cross Valiation Tuning for Hyperparameters

In [15]:
// Run cross-validation, and choose the best set of parameters.
val pipelineModel = cv.fit(training)


val bestModel = pipelineModel.bestModel match {
  case pm: PipelineModel => Some(pm)
  case _ => None
}

val ml = bestModel
    .map(_.stages.collect { case ml: LogisticRegressionModel => ml })
    .flatMap(_.headOption)

// Get fitted logistic regression model
val lrModel = ml.get.asInstanceOf[LogisticRegressionModel]

//Get Coeffs of the Best Logistic Regression Model
//println(s"Intercept: ${lrModel.intercept}")
//println(s"Coefficients: ${lrModel.coefficients}")
println(s"ElasticNetParam: ${lrModel.getElasticNetParam}")
println(s"Threshold: ${lrModel.getThreshold}")


val lrModelSummary = ml.get.summary.asInstanceOf[BinaryLogisticRegressionSummary]
println(s"areaUnderCurve: ${lrModelSummary.areaUnderROC}")
val fMeasure = lrModelSummary.fMeasureByThreshold
val maxFMeasure = fMeasure.agg("F-Measure" -> "Max").head().getDouble(0)
val bestThreshold = fMeasure.where($"F-Measure" === maxFMeasure).select("threshold").head().getDouble(0)
println(s"MaxFMeasure: $maxFMeasure & bestThreshold: $bestThreshold")


2020-06-05 03:49:17,594 WARN  [Thread-4] netlib.BLAS (BLAS.java:<clinit>(61)) - Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
2020-06-05 03:49:17,596 WARN  [Thread-4] netlib.BLAS (BLAS.java:<clinit>(61)) - Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
ElasticNetParam: 0.0
Threshold: 0.47
areaUnderCurve: 0.5948465368862681
MaxFMeasure: 0.6687893260556441 & bestThreshold: 0.33485248667659895


pipelineModel: org.apache.spark.ml.tuning.CrossValidatorModel = cv_efa90acf25fe
bestModel: Option[org.apache.spark.ml.PipelineModel] = Some(pipeline_e1a379f68b05)
ml: Option[org.apache.spark.ml.classification.LogisticRegressionModel] = Some(LogisticRegressionModel: uid = logreg_eaa0837ec626, numClasses = 2, numFeatures = 101)
lrModel: org.apache.spark.ml.classification.LogisticRegressionModel = LogisticRegressionModel: uid = logreg_eaa0837ec626, numClasses = 2, numFeatures = 101
lrModelSummary: org.apache.spark.ml.classification.BinaryLogisticRegressionSummary = org.apache.spark.ml.classification.BinaryLogisticRegressionTrainingSummaryImpl@71b16a57
fMeasure: org.apache.spark.sql.DataFrame = [threshold: double, F-Measure: double]
maxFMeasure: Double = 0.6687893260556441
bestThreshold: Do...

## Test the Logistic Regression Pipeline and Report on the performance

In [16]:
//Test the model
val lrPredictions = pipelineModel.transform(testing)

// Get output schema of our fitted pipeline
val schema = lrPredictions.schema
// Extract the attributes of the input (features) column to our logistic regression model

val featureAttrs = AttributeGroup.fromStructField(schema(lrModel.getFeaturesCol)).attributes.get

val features = featureAttrs.map(_.name.get)

// Add "(Intercept)" to list of feature names if the model was fit with an intercept
val featureNames: Array[String] = if (lrModel.getFitIntercept) {
  Array("(Intercept)") ++ features
} else {
  features
}

// Get array of coefficients
val lrModelCoeffs = lrModel.coefficients.toArray
val coeffs = if (lrModel.getFitIntercept) 
        {lrModelCoeffs ++ Array(lrModel.intercept)} 
    else 
        {lrModelCoeffs}

// Print feature names & coefficients together
println("Coefficient   Feature")
println("====================================================================")
featureNames.zip(coeffs)
    .foreach { case (feature, coeff) => 
println(f"$coeff%4.4f        $feature%-50.50s")}



//lrPredictions.select("prediction", "label", "features").show(20)
getConfusionMatrix(lrPredictions)

Coefficient   Feature
0.0008        (Intercept)                                       
-0.1253        Date_Num                                          
-0.1480        Airline_Vec_Virgin Australia                      
0.3251        Airline_Vec_Qantas                                
0.0571        Airline_Vec_Jetstar                               
0.3517        Airline_Vec_QantasLink                            
-0.2521        Airline_Vec_Tigerair Australia                    
-0.0515        Airline_Vec_Regional Express                      
0.2328        Airline_Vec_Virgin Australia Regional Airlines    
0.4137        Airline_Vec_Virgin Australia - ATR/F100 Operations
0.2433        Airline_Vec_Skywest                               
0.0471        Fuel_Price                                        
-0.0404        Departing_Port_Vec_Sydney                         
-0.0223        Departing_Port_Vec_Melbourne                      
0.0871        Departing_Port_Vec_Brisbane                     

lrPredictions: org.apache.spark.sql.DataFrame = [Departing_Port: string, Arriving_Port: string ... 82 more fields]
schema: org.apache.spark.sql.types.StructType = StructType(StructField(Departing_Port,StringType,true), StructField(Arriving_Port,StringType,true), StructField(Airline,StringType,true), StructField(label,IntegerType,true), StructField(Year,IntegerType,true), StructField(Month_Num,StringType,true), StructField(Fuel_Price,DoubleType,true), StructField(Departing_Port_station_ID,StringType,true), StructField(Departing_Port_station_name,StringType,true), StructField(Arriving_Port_station_ID,StringType,true), StructField(Arriving_Port_station_name,StringType,true), StructField(Mean_3pm_cloud_cover_oktas_Depart,DoubleType,true), StructField(Mean_3pm_dew_point_temperature_Degrees_C...

## Store The Best CV Trained Logistic Model to the hdfs

In [18]:
! hadoop fs -rm    -r  hdfs://localhost:9000/tmp/flightDelayModel_*
! hadoop fs -mkdir -p  hdfs://localhost:9000/tmp/flightDelayModel__out

20/06/05 03:53:47 INFO fs.TrashPolicyDefault: Namenode trash configuration: Deletion interval = 0 minutes, Emptier interval = 0 minutes.


Deleted hdfs://localhost:9000/tmp/flightDelayModel__out




In [19]:
//Persist the Model to the hdfs
pipelineModel
    .write
    .overwrite()
    .save("hdfs://localhost:9000/tmp/flightDelayModel__out")


In [20]:
//Check the stored model, by reading it back in, and running a prediciton
val results: DataFrame = CrossValidatorModel
.load("hdfs://localhost:9000/tmp/flightDelayModel__out")
.transform(testing)
.select(
    col("features"),
    col("label"),
    col("prediction")
)

results.show(2,truncate = false)

+---------------------------------------------------------------------+-----+----------+
|features                                                             |label|prediction|
+---------------------------------------------------------------------+-----+----------+
|(101,[0,4,10,29,54,55,56,58],[184.0,1.0,2.78,1.0,241.0,40.4,6.7,1.0])|1    |1.0       |
|(101,[0,4,10,29,54,55,56,58],[184.0,1.0,2.78,1.0,241.0,40.4,6.7,1.0])|1    |1.0       |
+---------------------------------------------------------------------+-----+----------+
only showing top 2 rows



results: org.apache.spark.sql.DataFrame = [features: vector, label: int ... 1 more field]
