# Multi-Classification Model: Logistic Regression with One vs. All


We ran an additional model to predict flight delay! 

In [1]:
#read in file as dataframe 
# import pyspark modules
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *       # for datatype conversion
from pyspark.sql.functions import *   # for col() function
from pyspark.ml.feature import StandardScaler
from pyspark.ml.regression import LinearRegression
import pandas as pd
import os
import pyspark.sql.types as typ
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, Row
from pyspark.ml.feature import OneHotEncoder, StringIndexer, Bucketizer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
import pyspark.mllib.regression as reg
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.linalg import SparseVector, DenseVector
import functools 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
import seaborn as sns

In [2]:
from pyspark.sql import SparkSession 
spark = SparkSession \
    .builder \
    .master("local") \
    .appName("app") \
    .config("spark.executor.memory", '2g') \
    .config('spark.executor.cores', '2') \
    .config('spark.cores.max', '2') \
    .config("spark.driver.memory",'4g') \
    .getOrCreate()

sc = SparkContext.getOrCreate()
sqlCtx = SQLContext(sc)

## Load in the data 

In [3]:
path_to_data = os.path.join("/home/jovyan/FlightDelay/clean_data_no_hot_2")

In [4]:
df = spark.read.format("csv") \
    .option("header", "true").option("inferschema","true").load(path_to_data)

In [5]:
df.count()

278016

In [6]:
a= df.count()
b = df.filter(df.CANCELLED == 1).count()
b/a

0.016178924953959485

In [7]:
pd_df = df.toPandas()

In [8]:
pd_df.head()

Unnamed: 0,_c0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,DISTANCE,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,B_SCHEDULED_ARRIVAL,B_ARRIVAL_TIME,B_SCHEDULED_DEPARTURE,B_DEPARTURE_TIME
0,0,2015,1,1,4,US,840,SFO,CLT,20,...,2296,806,811,5,0,0,2.0,2.0,0.0,0.0
1,1,2015,1,1,4,AA,1674,LAS,MIA,35,...,2174,803,753,-10,0,0,2.0,2.0,0.0,0.0
2,2,2015,1,1,4,AS,136,ANC,SEA,135,...,1448,600,1476,4,0,1,2.0,4.0,0.0,4.0
3,3,2015,1,1,4,AA,2459,PHX,DFW,200,...,868,500,1476,4,0,1,1.0,4.0,0.0,4.0
4,4,2015,1,1,4,B6,1990,SJU,EWR,206,...,1608,512,516,4,0,0,1.0,1.0,0.0,0.0


## Apply OneHotEncoder 

In [9]:
# for each level, count freq. val=0 for most freq, then 1, ...

stringIndexer = StringIndexer(inputCol="AIRLINE", outputCol="AIRLINE_Index")
model = stringIndexer.fit(df)
indexed = model.transform(df)

encoder = OneHotEncoder(inputCol="AIRLINE_Index", outputCol="AIRLINE_Vec")
encoded = encoder.transform(indexed)



In [10]:
# for each level, count freq. val=0 for most freq, then 1, ...

stringIndexer2 = StringIndexer(inputCol="ORIGIN_AIRPORT", outputCol="ORIGIN_AIRPORT_Index")
model2 = stringIndexer2.fit(encoded)
indexed2 = model2.transform(encoded)

encoder2 = OneHotEncoder(inputCol="ORIGIN_AIRPORT_Index", outputCol="ORIGIN_AIRPORT_Vec")
encoded2 = encoder2.transform(indexed2)


In [11]:
# for each level, count freq. val=0 for most freq, then 1, ...

stringIndexer3 = StringIndexer(inputCol="DESTINATION_AIRPORT", outputCol="DESTINATION_AIRPORT_Index")
model3 = stringIndexer3.fit(encoded2)
indexed3 = model3.transform(encoded2)

encoder3 = OneHotEncoder(inputCol="DESTINATION_AIRPORT_Index", outputCol="DESTINATION_AIRPORT_Vec")
encoded3 = encoder3.transform(indexed3)
encoded3.select('DESTINATION_AIRPORT','DESTINATION_AIRPORT_Index', "DESTINATION_AIRPORT_Vec").show()
#encoded3.cache()b

+-------------------+-------------------------+-----------------------+
|DESTINATION_AIRPORT|DESTINATION_AIRPORT_Index|DESTINATION_AIRPORT_Vec|
+-------------------+-------------------------+-----------------------+
|                CLT|                     14.0|       (618,[14],[1.0])|
|                MIA|                     24.0|       (618,[24],[1.0])|
|                SEA|                     10.0|       (618,[10],[1.0])|
|                DFW|                      2.0|        (618,[2],[1.0])|
|                EWR|                     15.0|       (618,[15],[1.0])|
|                CLT|                     14.0|       (618,[14],[1.0])|
|                MCO|                     11.0|       (618,[11],[1.0])|
|                JFK|                     18.0|       (618,[18],[1.0])|
|                DEN|                      3.0|        (618,[3],[1.0])|
|                ATL|                      0.0|        (618,[0],[1.0])|
|                LAX|                      4.0|        (618,[4],

In [12]:
new_cols_to_drop = ['AIRLINE_Index', 'AIRLINE', 'ORIGIN_AIRPORT_Index', 
                    'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT_Index',
                    'DESTINATION_AIRPORT', 'FLIGHT_NUMBER']

final_encoded = encoded3.drop(*new_cols_to_drop)

## Bucketize Departure Delay

In [13]:
# minimum departure delays
min_DD = final_encoded.agg({"DEPARTURE_DELAY": "max"}).collect()[0][0]
min_DD

131

In [14]:
#maximum departure delays 
max_DD = final_encoded.agg({"DEPARTURE_DELAY": "min"}).collect()[0][0]
max_DD

-48

In [15]:
delay_splits = [-48, 5, 45, 131] 

deptime_bucketizer = Bucketizer() \
  .setInputCol("DEPARTURE_DELAY") \
  .setOutputCol("B_DEPARTURE_DELAY") \
  .setSplits(delay_splits)

#Transform original data into its bucket index.
final_encoded_b = deptime_bucketizer\
               .transform(final_encoded)

In [16]:
final_encoded_b.columns

['_c0',
 'YEAR',
 'MONTH',
 'DAY',
 'DAY_OF_WEEK',
 'SCHEDULED_DEPARTURE',
 'DEPARTURE_TIME',
 'DEPARTURE_DELAY',
 'SCHEDULED_TIME',
 'ELAPSED_TIME',
 'DISTANCE',
 'SCHEDULED_ARRIVAL',
 'ARRIVAL_TIME',
 'ARRIVAL_DELAY',
 'DIVERTED',
 'CANCELLED',
 'B_SCHEDULED_ARRIVAL',
 'B_ARRIVAL_TIME',
 'B_SCHEDULED_DEPARTURE',
 'B_DEPARTURE_TIME',
 'AIRLINE_Vec',
 'ORIGIN_AIRPORT_Vec',
 'DESTINATION_AIRPORT_Vec',
 'B_DEPARTURE_DELAY']

In [17]:
# take out columns for predicting departure_delay
final_df = final_encoded_b.drop('_c0', 'YEAR', 'ARRIVAL_DELAY', 'ARRIVAL_TIME', 
                              'ELAPSED_TIME', 'ARRIVAL_DELAY', 'B_DEPARTURE_TIME', 'B_ARRIVAL_TIME','DEPARTURE_TIME',
                               'SCHEDULED_TIME', 'CANCELLED', 'DEPARTURE_DELAY')

In [18]:
final_df.columns

['MONTH',
 'DAY',
 'DAY_OF_WEEK',
 'SCHEDULED_DEPARTURE',
 'DISTANCE',
 'SCHEDULED_ARRIVAL',
 'DIVERTED',
 'B_SCHEDULED_ARRIVAL',
 'B_SCHEDULED_DEPARTURE',
 'AIRLINE_Vec',
 'ORIGIN_AIRPORT_Vec',
 'DESTINATION_AIRPORT_Vec',
 'B_DEPARTURE_DELAY']

In [None]:
final_df.show(5)

+-----+---+-----------+-------------------+--------+-----------------+--------+-------------------+---------------------+--------------+------------------+-----------------------+-----------------+
|MONTH|DAY|DAY_OF_WEEK|SCHEDULED_DEPARTURE|DISTANCE|SCHEDULED_ARRIVAL|DIVERTED|B_SCHEDULED_ARRIVAL|B_SCHEDULED_DEPARTURE|   AIRLINE_Vec|ORIGIN_AIRPORT_Vec|DESTINATION_AIRPORT_Vec|B_DEPARTURE_DELAY|
+-----+---+-----------+-------------------+--------+-----------------+--------+-------------------+---------------------+--------------+------------------+-----------------------+-----------------+
|    1|  1|          4|                 20|    2296|              806|       0|                2.0|                  0.0|(13,[8],[1.0])|   (614,[5],[1.0])|       (618,[14],[1.0])|              0.0|
|    1|  1|          4|                 35|    2174|              803|       0|                2.0|                  0.0|(13,[2],[1.0])|   (614,[8],[1.0])|       (618,[24],[1.0])|              0.0|
|    1|  1

In [None]:
pd2 = final_df.toPandas()
pd2.head()

Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,SCHEDULED_DEPARTURE,DISTANCE,SCHEDULED_ARRIVAL,DIVERTED,B_SCHEDULED_ARRIVAL,B_SCHEDULED_DEPARTURE,AIRLINE_Vec,ORIGIN_AIRPORT_Vec,DESTINATION_AIRPORT_Vec,B_DEPARTURE_DELAY
0,1,1,4,20,2296,806,0,2.0,0.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
1,1,1,4,35,2174,803,0,2.0,0.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0
2,1,1,4,135,1448,600,0,2.0,0.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0
3,1,1,4,200,868,500,0,1.0,0.0,"(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","(0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0
4,1,1,4,206,1608,512,0,1.0,0.0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0


## Scale the Data

In [None]:
#convery final_encoded to rdd 
# we cannpot scale bucketized or vec columns, so we omit those form the scaling process
input_data = final_df.rdd.map(lambda x: (x[12], DenseVector(x[0:7])))

In [None]:
input_data.take(3)

[(0.0, DenseVector([1.0, 1.0, 4.0, 20.0, 2296.0, 806.0, 0.0])),
 (0.0, DenseVector([1.0, 1.0, 4.0, 35.0, 2174.0, 803.0, 0.0])),
 (1.0, DenseVector([1.0, 1.0, 4.0, 135.0, 1448.0, 600.0, 0.0]))]

In [None]:
df2 = sqlCtx.createDataFrame(input_data, ["label","features2"])

In [None]:
SS = StandardScaler(inputCol = "features2", outputCol = "features_scaled")

scaler = SS.fit(df2)

In [None]:
#transform the data in df2 with our scaler 
scaled_df = scaler.transform(df2)


In [None]:
scaled_df.show(4)

+-----+--------------------+--------------------+
|label|           features2|     features_scaled|
+-----+--------------------+--------------------+
|  0.0|[1.0,1.0,4.0,20.0...|[0.29360021733560...|
|  0.0|[1.0,1.0,4.0,35.0...|[0.29360021733560...|
|  1.0|[1.0,1.0,4.0,135....|[0.29360021733560...|
|  1.0|[1.0,1.0,4.0,200....|[0.29360021733560...|
+-----+--------------------+--------------------+
only showing top 4 rows



In [None]:
# since there is no common column between these two dataframes add row_index so that it can be joined
scaled_df = scaled_df.withColumn('row_index', F.monotonically_increasing_id())
final_df = final_df.withColumn('row_index', F.monotonically_increasing_id())

In [None]:
# combine scaled df and final_df
total_df = scaled_df.join(final_df, scaled_df.row_index == final_df.row_index)

In [None]:
total_df.columns

['label',
 'features2',
 'features_scaled',
 'row_index',
 'MONTH',
 'DAY',
 'DAY_OF_WEEK',
 'SCHEDULED_DEPARTURE',
 'DISTANCE',
 'SCHEDULED_ARRIVAL',
 'DIVERTED',
 'B_SCHEDULED_ARRIVAL',
 'B_SCHEDULED_DEPARTURE',
 'AIRLINE_Vec',
 'ORIGIN_AIRPORT_Vec',
 'DESTINATION_AIRPORT_Vec',
 'B_DEPARTURE_DELAY',
 'row_index']

In [None]:
#drop columns of already scaled predictors 
total_df = total_df.drop(*['MONTH', 'DAY', 'DAY_OF_WEEK', 'SCHEDULED_DEPARTURE', 'SCHEDULED_TIME', 'DISTANCE',
                           'SCHEDULED_ARRIVAL', 'DIVERTED', 
                           'CANCELLED2', 'row_index']) 

In [None]:
total_df.columns

['label',
 'features2',
 'features_scaled',
 'B_SCHEDULED_ARRIVAL',
 'B_SCHEDULED_DEPARTURE',
 'AIRLINE_Vec',
 'ORIGIN_AIRPORT_Vec',
 'DESTINATION_AIRPORT_Vec',
 'B_DEPARTURE_DELAY']

## Split Data

In [None]:
def unionAll(dfs):
    return functools.reduce(lambda df1,df2: df1.union(df2.select(df1.columns)), dfs) 

In [None]:
(train_data_scaled, test_data_scaled) = total_df.randomSplit([0.8, 0.2], seed = 314) 

In [None]:
train_data_scaled.where(train_data_scaled.B_DEPARTURE_DELAY == 2).count()

10025

In [None]:
train_data_scaled.where(train_data_scaled.B_DEPARTURE_DELAY == 1).count()

50623

In [None]:
train_data_scaled.where(train_data_scaled.B_DEPARTURE_DELAY == 0).count()

161898

## One vs. All Multiclassification with Logisic regresison

In [None]:
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

In [None]:
vars_to_keep = [
 'features_scaled',
 'B_SCHEDULED_ARRIVAL',
 'B_SCHEDULED_DEPARTURE',
 'AIRLINE_Vec',
 'ORIGIN_AIRPORT_Vec',
 'DESTINATION_AIRPORT_Vec'
]

target = 'label'
SEED = 314
ITERS = 10
FOLDS = 5

In [None]:
assembler = VectorAssembler(
                            inputCols=[c for c in vars_to_keep],
                            outputCol='features')

In [None]:
train = (assembler.transform(train_data_scaled).select(target, "features"))

In [None]:
train.show(3)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(1254,[0,1,2,3,4,...|
|  0.0|(1254,[0,1,2,3,4,...|
|  0.0|(1254,[0,1,2,3,4,...|
+-----+--------------------+
only showing top 3 rows



In [None]:
# instantiate the base classifier.
lr = LogisticRegression(maxIter=ITERS, featuresCol="features", labelCol = "label", fitIntercept=True)

In [None]:
# instantiate the One Vs Rest Classifier.
ovr = OneVsRest(classifier=lr)

In [None]:
ovr

OneVsRest_99afcdb3fe59

In [None]:
pipeline = Pipeline(stages=[lr,ovr])

In [None]:
pipeline

Pipeline_6f4fc35fc4d1

In [None]:
paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.5, 0.1, 0.01]) \
    .build()

In [None]:
paramGrid

[{Param(parent='LogisticRegression_ac21001f76c9', name='regParam', doc='regularization parameter (>= 0).'): 0.5},
 {Param(parent='LogisticRegression_ac21001f76c9', name='regParam', doc='regularization parameter (>= 0).'): 0.1},
 {Param(parent='LogisticRegression_ac21001f76c9', name='regParam', doc='regularization parameter (>= 0).'): 0.01}]

In [None]:
# k-fold cross validation
crossval = CrossValidator(
            estimator=ovr, 
            estimatorParamMaps=paramGrid,
            evaluator= MulticlassClassificationEvaluator(), 
            numFolds=FOLDS)

In [None]:
# train the multiclass model.
ovrModel = crossval.fit(train)

In [None]:
train.show(2)

In [None]:
# score the model on test data.
#predictions = model.transform(assembler.transform(test_data_scaled.select(vars_to_keep)).select("features"))

predictions = ovrModel.transform(assembler.transform(test_data_scaled.select(vars_to_keep)).select("features")) 


In [None]:
predictions.show(4)

In [None]:
predictions = predictions.withColumn('row_index', F.monotonically_increasing_id())
test_data_scaled = test_data_scaled.withColumn('row_index', F.monotonically_increasing_id())

In [None]:
# combine scaled df and final_df
tot_preds = predictions.join(test_data_scaled, predictions.row_index == test_data_scaled.row_index)

In [None]:
tot_preds = tot_preds.drop('row_index', 'features2', 'B_SCHEDULED_ARRIVAL','B_SCHEDULED_DEPARTURE',  
               'AIRLINE_Vec', 'ORIGIN_AIRPORT_Vec', 'DESTINATION_AIRPORT_Vec', 'B_DEPARTURE_DELAY', 'features_scaled')

In [None]:
tot_preds.show(5)

## Model Evaluation: 

In [None]:
model.bestModel

# obtain evaluator.
evaluator1 = MulticlassClassificationEvaluator(metricName="accuracy")

# compute the classification error on test data.
accuracy = evaluator1.evaluate(tot_preds)
print("Test Error = %g" % (1.0 - accuracy))

In [None]:
print('Accuracy = %g' % accuracy)

In [None]:
evaluator3 = MulticlassClassificationEvaluator()
F1 = evaluator3.evaluate(tot_preds)

In [None]:
print('F1 = %g' % F1)

In [None]:
evaluator4 = MulticlassClassificationEvaluator(metricName="weightedPrecision")

In [None]:
precision = evaluator4.evaluate(tot_preds)
print("Weighted Precision = %g" % (precision))

In [None]:
evaluator5 = MulticlassClassificationEvaluator(metricName="weightedRecall")

In [None]:
recall = evaluator5.evaluate(tot_preds)
print("Weighted Recall = %g" % (recall)) 

In [None]:
evaluator6 = MulticlassClassificationEvaluator(metricName="confusionMatrix")
cm = evaluator6.evaluate(tot_preds)
cm
# tp = df[(df.target_index == 1) & (df.prediction == 1)].count()
#tp = tot_preds[(tot_preds.label == 1) & (tot_preds.prediction == 1)].count()
#tn = tot_preds[(tot_preds.label == 0) & (tot_preds.prediction == 0)].count()
#fp = tot_preds[(tot_preds.label == 0) & (tot_preds.prediction == 1)].count()
#fn = tot_preds[(tot_preds.label == 1) & (tot_preds.prediction == 0)].count()

In [None]:
# Confusion matrix 
#print("True Positives:", tp)
#print("True Negatives:", tn)
#print("False Positives:", fp)
#print("False Negatives:", fn)
#print("Total", df.count())