#  Set up 

In [1]:
#read in file as dataframe 
# import pyspark modules
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *       # for datatype conversion
from pyspark.sql.functions import *   # for col() function
from pyspark.ml.feature import StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.evaluation import MulticlassMetrics
import pandas as pd
import os
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml import Pipeline
import pyspark.sql.types as typ
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, Row
from pyspark.ml.feature import OneHotEncoder, StringIndexer, Bucketizer
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
import pyspark.mllib.regression as reg
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.linalg import SparseVector, DenseVector
import functools 
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
from pyspark.sql import SparkSession 
spark = SparkSession \
    .builder \
    .master("local") \
    .appName("app") \
    .config("spark.executor.memory", '2g') \
    .config('spark.executor.cores', '2') \
    .config('spark.cores.max', '2') \
    .config("spark.driver.memory",'4g') \
    .getOrCreate()

sc = SparkContext.getOrCreate()
sqlCtx = SQLContext(sc)

In [3]:
path_to_data = os.path.join("/home/jovyan/FlightDelay/clean_data_no_hot_2.dms")

In [4]:
df = spark.read.format("csv") \
    .option("header", "true").option("inferschema","true").load(path_to_data)

# One Hot Encoder

In [5]:
# for each level, count freq. val=0 for most freq, then 1, ...

stringIndexer = StringIndexer(inputCol="AIRLINE", outputCol="AIRLINE_Index")
model = stringIndexer.fit(df)
indexed = model.transform(df)

encoder = OneHotEncoder(inputCol="AIRLINE_Index", outputCol="AIRLINE_Vec")
encoded = encoder.transform(indexed)


In [6]:
# for each level, count freq. val=0 for most freq, then 1, ...

stringIndexer2 = StringIndexer(inputCol="ORIGIN_AIRPORT", outputCol="ORIGIN_AIRPORT_Index")
model2 = stringIndexer2.fit(encoded)
indexed2 = model2.transform(encoded)

encoder2 = OneHotEncoder(inputCol="ORIGIN_AIRPORT_Index", outputCol="ORIGIN_AIRPORT_Vec")
encoded2 = encoder2.transform(indexed2)



In [7]:
# for each level, count freq. val=0 for most freq, then 1, ...

stringIndexer3 = StringIndexer(inputCol="DESTINATION_AIRPORT", outputCol="DESTINATION_AIRPORT_Index")
model3 = stringIndexer3.fit(encoded2)
indexed3 = model3.transform(encoded2)

encoder3 = OneHotEncoder(inputCol="DESTINATION_AIRPORT_Index", outputCol="DESTINATION_AIRPORT_Vec")
encoded3 = encoder3.transform(indexed3)
encoded3.select('DESTINATION_AIRPORT','DESTINATION_AIRPORT_Index', "DESTINATION_AIRPORT_Vec").show()
#encoded3.cache()b

+-------------------+-------------------------+-----------------------+
|DESTINATION_AIRPORT|DESTINATION_AIRPORT_Index|DESTINATION_AIRPORT_Vec|
+-------------------+-------------------------+-----------------------+
|                CLT|                     14.0|       (618,[14],[1.0])|
|                MIA|                     24.0|       (618,[24],[1.0])|
|                SEA|                     10.0|       (618,[10],[1.0])|
|                DFW|                      2.0|        (618,[2],[1.0])|
|                EWR|                     15.0|       (618,[15],[1.0])|
|                CLT|                     14.0|       (618,[14],[1.0])|
|                MCO|                     11.0|       (618,[11],[1.0])|
|                JFK|                     18.0|       (618,[18],[1.0])|
|                DEN|                      3.0|        (618,[3],[1.0])|
|                ATL|                      0.0|        (618,[0],[1.0])|
|                LAX|                      4.0|        (618,[4],

In [8]:
new_cols_to_drop = ['AIRLINE_Index', 'AIRLINE', 'ORIGIN_AIRPORT_Index', 
                    'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT_Index',
                    'DESTINATION_AIRPORT', 'FLIGHT_NUMBER']

final_encoded = encoded3.drop(*new_cols_to_drop)

In [9]:
final_encoded.columns

['_c0',
 'YEAR',
 'MONTH',
 'DAY',
 'DAY_OF_WEEK',
 'SCHEDULED_DEPARTURE',
 'DEPARTURE_TIME',
 'DEPARTURE_DELAY',
 'SCHEDULED_TIME',
 'ELAPSED_TIME',
 'DISTANCE',
 'SCHEDULED_ARRIVAL',
 'ARRIVAL_TIME',
 'ARRIVAL_DELAY',
 'DIVERTED',
 'CANCELLED',
 'B_SCHEDULED_ARRIVAL',
 'B_ARRIVAL_TIME',
 'B_SCHEDULED_DEPARTURE',
 'B_DEPARTURE_TIME',
 'AIRLINE_Vec',
 'ORIGIN_AIRPORT_Vec',
 'DESTINATION_AIRPORT_Vec']

In [10]:
final_df = final_encoded.drop('_c0', 'YEAR', 'DEPARTURE_DELAY', 'ARRIVAL_DELAY', 'ARRIVAL_TIME', 
                              'B_ARRIVAL_TIME', 'ELAPSED_TIME', 'B_ARRIVAL_TIME', 
                              'B_DEPARTURE_TIME', 'DEPARTURE_TIME')

# Scale data

In [11]:
#convery final_encoded to rdd 
# we cannpot scale bucketized or vec columns, so we omit those form the scaling process
input_data = final_df.rdd.map(lambda x: (x[8], DenseVector(x[1:8])))

In [12]:
df2 = sqlCtx.createDataFrame(input_data, ["label","features2"])

In [13]:
SS = StandardScaler(inputCol = "features2", outputCol = "features_scaled")

scaler = SS.fit(df2)

In [14]:
#transform the data in df2 with our scaler 
scaled_df = scaler.transform(df2)
#join scalable feature with columns  'AIRLINE_Vec', 'ORIGIN_AIRPORT_Vec', 
#'DESTINATION_AIRPORT_Vec', 'B_SCHEDULED_ARRIVAL', 'B_ARRIVAL_TIME', 
#'B_SCHEDULED_DEPARTURE','B_DEPARTURE_TIME'


In [15]:
scaled_df.show(4)

+-----+--------------------+--------------------+
|label|           features2|     features_scaled|
+-----+--------------------+--------------------+
|    0|[1.0,4.0,20.0,286...|[0.11379250643083...|
|    0|[1.0,4.0,35.0,268...|[0.11379250643083...|
|    1|[1.0,4.0,135.0,20...|[0.11379250643083...|
|    1|[1.0,4.0,200.0,12...|[0.11379250643083...|
+-----+--------------------+--------------------+
only showing top 4 rows



In [16]:
# since there is no common column between these two dataframes add row_index so that it can be joined
scaled_df = scaled_df.withColumn('row_index', F.monotonically_increasing_id())
final_df = final_df.withColumn('row_index', F.monotonically_increasing_id())

In [17]:
# combine scaled df and final_df
total_df = scaled_df.join(final_df, scaled_df.row_index == final_df.row_index)

In [18]:
total_df.count()

278016

In [19]:
#drop columns of already scaled predictors 
total_df = total_df.drop(*['MONTH', 'DAY', 'DAY_OF_WEEK', 'SCHEDULED_DEPARTURE', 'SCHEDULED_TIME', 'DISTANCE',
                           'SCHEDULED_ARRIVAL', 'DIVERTED', 
                           'CANCELLED2', 'row_index']) 

In [20]:
#final scaled dataframe for predicting Cancelled flights 
total_df.columns

['label',
 'features2',
 'features_scaled',
 'CANCELLED',
 'B_SCHEDULED_ARRIVAL',
 'B_SCHEDULED_DEPARTURE',
 'AIRLINE_Vec',
 'ORIGIN_AIRPORT_Vec',
 'DESTINATION_AIRPORT_Vec']

# SPlit data

In [21]:
def unionAll(dfs):
    return functools.reduce(lambda df1,df2: df1.union(df2.select(df1.columns)), dfs)

In [22]:
train_data_scaled, test_data_scaled = total_df.randomSplit([0.8, 0.2], seed=12)

In [23]:
c = train_data_scaled.where(total_df.label == 1)

In [24]:
## How to filter out large portion of non-canclelled flights in training data to make data more even 
not_c = train_data_scaled.where(total_df.label == 0).sample(False, .018, 454) 

In [25]:
not_c.count()

3926

In [26]:
c.count()

3542

In [27]:
train_data_scaled = unionAll([c, not_c])

In [28]:
train_data_scaled.columns

['label',
 'features2',
 'features_scaled',
 'CANCELLED',
 'B_SCHEDULED_ARRIVAL',
 'B_SCHEDULED_DEPARTURE',
 'AIRLINE_Vec',
 'ORIGIN_AIRPORT_Vec',
 'DESTINATION_AIRPORT_Vec']

# Baseline Regression 

only using ORIGIN_AIRPORT_Vec to predict cancellelation 

In [29]:
train_data_base = train_data_scaled.select(["ORIGIN_AIRPORT_Vec", "CANCELLED"])
test_data_base =  test_data_scaled.select(["ORIGIN_AIRPORT_Vec", "CANCELLED"])
lr = LogisticRegression(featuresCol = 'ORIGIN_AIRPORT_Vec', labelCol='CANCELLED', maxIter=10, regParam=0.3, elasticNetParam=0.8)
linearModel = lr.fit(train_data_base)
predicted = linearModel.transform(test_data_base)

In [30]:
predicted.show(4)

+------------------+---------+--------------------+--------------------+----------+
|ORIGIN_AIRPORT_Vec|CANCELLED|       rawPrediction|         probability|prediction|
+------------------+---------+--------------------+--------------------+----------+
|   (614,[4],[1.0])|        0|[0.10292955649362...|[0.52570969469737...|       0.0|
|  (614,[11],[1.0])|        0|[0.10292955649362...|[0.52570969469737...|       0.0|
| (614,[281],[1.0])|        0|[0.10292955649362...|[0.52570969469737...|       0.0|
|  (614,[77],[1.0])|        0|[0.10292955649362...|[0.52570969469737...|       0.0|
+------------------+---------+--------------------+--------------------+----------+
only showing top 4 rows



# Evaluate the baseline model

AUC

In [31]:
evaluator = BinaryClassificationEvaluator(labelCol="CANCELLED")
AUC = evaluator.evaluate(predicted)
print("AUC: %f" % AUC)

AUC: 0.500000


accuracy, precision, recall and f1

In [32]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="CANCELLED", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predicted)
print("accuracy: %f" % accuracy)

accuracy: 0.982847


In [33]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="CANCELLED", predictionCol="prediction")
f1 = evaluator.evaluate(predicted)
print("f1: %f" % f1)

f1: 0.974345


In [34]:
# tp = df[(df.target_index == 1) & (df.prediction == 1)].count()
tp = predicted[(predicted.CANCELLED == 1) & (predicted.prediction == 1.0)].count()
tn = predicted[(predicted.CANCELLED == 0) & (predicted.prediction == 0.0)].count()
fp = predicted[(predicted.CANCELLED == 0) & (predicted.prediction == 1.0)].count()
fn = predicted[(predicted.CANCELLED == 1) & (predicted.prediction == 0.0)].count()

Calculate Precision and recall:

In [35]:
# precision = tp / (tp + fp)
recall = tp / (tp + fn) 
# print("Precision = %g" % (precision))
print("Recall = %g" % (recall))

Recall = 0


Confusion matrix

In [36]:
# Confusion matrix 
print("True Positives:", tp)
print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
print("Total", df.count())

True Positives: 0
True Negatives: 54779
False Positives: 0
False Negatives: 956
Total 278016


# Logistic Regression 

In [37]:
train_data_scaled.columns

['label',
 'features2',
 'features_scaled',
 'CANCELLED',
 'B_SCHEDULED_ARRIVAL',
 'B_SCHEDULED_DEPARTURE',
 'AIRLINE_Vec',
 'ORIGIN_AIRPORT_Vec',
 'DESTINATION_AIRPORT_Vec']

In [38]:
# select variables excluding label, feature2 and CANCELLED
vars_to_keep = [
 'features_scaled',
 'B_SCHEDULED_ARRIVAL',
 'B_SCHEDULED_DEPARTURE',
 'AIRLINE_Vec',
 'ORIGIN_AIRPORT_Vec',
 'DESTINATION_AIRPORT_Vec'
]

In [39]:
assembler = VectorAssembler(
                            inputCols=[c for c in vars_to_keep],
                            outputCol='features')
train = assembler.transform(train_data_scaled).select("CANCELLED", "features")
test = assembler.transform(test_data_scaled).select("CANCELLED", "features")

In [40]:
# Fit logistic regression 
lr = LogisticRegression(labelCol="CANCELLED", featuresCol="features", maxIter=10)

In [41]:
# Evaluate model 
evaluator = BinaryClassificationEvaluator(labelCol="CANCELLED")

In [42]:
evaluator.getMetricName()

'areaUnderROC'

In [43]:
# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.5, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .addGrid(lr.maxIter, [1, 5, 10])
             .build())

In [44]:
# Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr, 
                    estimatorParamMaps=paramGrid, 
                    evaluator=evaluator, 
                    numFolds=5)

In [45]:
# Run cross validations
cvModel = cv.fit(train)

# Use test set to measure the accuracy of our model on new data
predicted = cvModel.transform(test)

In [46]:
predicted.show(3)

+---------+--------------------+--------------------+--------------------+----------+
|CANCELLED|            features|       rawPrediction|         probability|prediction|
+---------+--------------------+--------------------+--------------------+----------+
|        0|(1254,[0,1,2,3,4,...|[0.52522800938600...|[0.62836943489917...|       0.0|
|        0|(1254,[0,1,2,3,4,...|[-0.0479653814083...|[0.48801095313762...|       1.0|
|        0|(1254,[0,1,2,3,4,...|[-0.1683619566972...|[0.45800865367558...|       1.0|
+---------+--------------------+--------------------+--------------------+----------+
only showing top 3 rows



# Evaluate the logistic model

AUC

In [47]:
AUC = evaluator.evaluate(predicted)
print("AUC: %f" % AUC)

AUC: 0.727176


accuracy, precision, recall and f1

In [48]:
# cvModel uses the best model found from the Cross Validation
# Evaluate best model
evaluator = MulticlassClassificationEvaluator(
    labelCol="CANCELLED", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predicted)
print("accuracy: %f" % accuracy)

accuracy: 0.695236


In [49]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="CANCELLED", predictionCol="prediction")
f1 = evaluator.evaluate(predicted)
print("f1: %f" % f1)

f1: 0.804997


In [50]:
# tp = df[(df.target_index == 1) & (df.prediction == 1)].count()
tp = predicted[(predicted.CANCELLED == 1) & (predicted.prediction == 1.0)].count()
tn = predicted[(predicted.CANCELLED == 0) & (predicted.prediction == 0.0)].count()
fp = predicted[(predicted.CANCELLED == 0) & (predicted.prediction == 1.0)].count()
fn = predicted[(predicted.CANCELLED == 1) & (predicted.prediction == 0.0)].count()

Confusion matrix

In [51]:
# Confusion matrix 
print("True Positives:", tp)
print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
print("Total", df.count())

True Positives: 608
True Negatives: 38141
False Positives: 16638
False Negatives: 348
Total 278016


In [52]:
precision = tp / (tp + fp)
recall = tp / (tp + fn) 
print("Precision = %g" % (precision))
print("Recall = %g" % (recall))

Precision = 0.0352546
Recall = 0.635983
