# Predicting arrival delay using Logistic Regression

In [17]:
# !pip install pyspark

In [18]:
# !pip show pyspark

In [19]:
# importing necessary packages
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler

In [20]:
# Obtain the Spark session used by every later cell; getOrCreate() reuses a
# session that is already running instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("delay")
    .getOrCreate()
)


In [21]:
# Load the raw flight records into pandas.
pandas_df = pd.read_csv("../data/outdated/monthly_data/AllMonths_RAW.csv", header=0)

# Columns removed before modelling.
# Deliberately NOT dropped (they stay in the frame): CRS_ARR_TIME,
# CRS_DEP_TIME, CRS_ELAPSED_TIME, DAY_OF_WEEK, DEP_DELAY, DEST, DISTANCE,
# MONTH, OP_CARRIER_FL_NUM, OP_UNIQUE_CARRIER, ORIGIN, YEAR.
columns_to_drop = [
    'ACTUAL_ELAPSED_TIME',
    'AIR_TIME',
    'ARR_DEL15',
    'ARR_DELAY',
    'ARR_TIME',
    'ARR_TIME_BLK',
    'CANCELLATION_CODE',
    'CANCELLED',
    'CARRIER_DELAY',
    'DEP_DEL15',
    'DEP_TIME',
    'DEP_TIME_BLK',
    'DIVERTED',
    'FL_DATE',
    'LATE_AIRCRAFT_DELAY',
    'NAS_DELAY',
    'SECURITY_DELAY',
    'TAIL_NUM',
    'TAXI_IN',
    'TAXI_OUT',
    'WEATHER_DELAY',
    'WHEELS_OFF',
    'WHEELS_ON',
]

# drop() returns a new frame; pandas_df itself keeps every original column.
flights_dropped_df = pandas_df.drop(columns=columns_to_drop)

In [22]:
# (exclusive upper bound in minutes, label) pairs, in ascending order.
# The first bound that the delay is strictly below decides the label.
_DELAY_BUCKETS = (
    (0, 'No Delay'),
    (15, 'Late: < 15 mins'),
    (30, 'Late: < 30 mins'),
    (60, 'Late: < 1 hours'),
    (90, 'Late: < 1.5 hours'),
    (120, 'Late: < 2 hours'),
    (150, 'Late: < 2.5 hours'),
    (180, 'Late: < 3 hours'),
    (240, 'Late: < 4 hours'),
    (300, 'Late: < 5 hours'),
)


def classify(num):
    """Map a delay in minutes to a human-readable delay bucket label.

    Parameters
    ----------
    num : int or float or None
        Delay in minutes. Negative values fall in the 'No Delay' bucket;
        note that exactly 0 lands in 'Late: < 15 mins'.

    Returns
    -------
    str
        'No Delay', one of the 'Late: < ...' labels, 'Late: > 5 hours' for
        delays of 300 minutes or more, or 'Unknown' when the delay is
        missing (None or NaN).
    """
    # NaN compares False against every bound, so without this guard a missing
    # delay would fall through to 'Late: > 5 hours'. `num != num` is the
    # dependency-free NaN test and works for Python and numpy floats alike.
    if num is None or num != num:
        return 'Unknown'

    for upper_bound, label in _DELAY_BUCKETS:
        if num < upper_bound:
            return label
    return 'Late: > 5 hours'


# Replace the numeric departure delay (DEP_DELAY, in minutes) with the delay
# bucket label returned by classify(); the label is later string-indexed and
# used as the classification target.
# The labels are derived from the untouched raw column in pandas_df rather
# than from flights_dropped_df itself, so this cell can be re-run safely
# (re-classifying already-labelled strings would raise a TypeError).
flights_dropped_df = flights_dropped_df.assign(
    DEP_DELAY=pandas_df['DEP_DELAY'].apply(classify)
)

In [23]:
# Hand the prepared pandas frame over to Spark for the ML pipeline below.
spark_df = spark.createDataFrame(data=flights_dropped_df)

In [24]:
# TESTING CELL

# pandas_df.columns

In [25]:
# String-valued columns that must be turned into numeric indices before they
# can be assembled into a feature vector / used as the label. Each one gets a
# companion "<name>_numeric" column.
categorical_cols = ["OP_UNIQUE_CARRIER", "ORIGIN", "DEST", "DEP_DELAY"]
indexed_cols = [name + "_numeric" for name in categorical_cols]

# One StringIndexer per column, bound to the names indexer1..indexer4 so that
# other cells can still reach the individual indexers.
indexer1, indexer2, indexer3, indexer4 = (
    StringIndexer(inputCol=name, outputCol=indexed_name)
    for name, indexed_name in zip(categorical_cols, indexed_cols)
)

# Remove index columns left behind by a previous run of this cell (a no-op on
# a fresh frame); otherwise transform() fails because the output column
# already exists.
spark_df = spark_df.drop(*indexed_cols)

# Fit each indexer on the data and append its numeric column, in order.
for indexer in (indexer1, indexer2, indexer3, indexer4):
    spark_df = indexer.fit(spark_df).transform(spark_df)

spark_df.show()

22/10/11 15:22:07 WARN TaskSetManager: Stage 233 contains a task of very large size (1735 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/10/11 15:22:08 WARN TaskSetManager: Stage 236 contains a task of very large size (1735 KiB). The maximum recommended task size is 1000 KiB.
22/10/11 15:22:08 WARN TaskSetManager: Stage 239 contains a task of very large size (1735 KiB). The maximum recommended task size is 1000 KiB.
22/10/11 15:22:09 WARN TaskSetManager: Stage 242 contains a task of very large size (1735 KiB). The maximum recommended task size is 1000 KiB.
22/10/11 15:22:10 WARN TaskSetManager: Stage 245 contains a task of very large size (1735 KiB). The maximum recommended task size is 1000 KiB.
+----+-----+-----------+-----------------+-----------------+------+----+------------+-----------------+------------+----------------+--------+-------------------------+--------------+------------+-----------------+
|YEAR|MONTH|DAY_OF_WEEK|OP_UNIQUE_CARRIER|OP_CARRIER_FL_NUM|ORIGIN|DEST|CRS_DEP_TIME|        DEP_DELAY|CRS_ARR_TIME|CRS_ELAPSED_TIME|DISTANCE|OP_UNIQUE_CARRIER_numeric|ORIGIN_numeric|DEST_numeric|DEP_DELAY_numeric

In [26]:
# Input columns packed, in this order, into the single 'features' vector.
feature_columns = [
    'YEAR',
    'MONTH',
    'DAY_OF_WEEK',
    'OP_UNIQUE_CARRIER_numeric',
    'OP_CARRIER_FL_NUM',
    'ORIGIN_numeric',
    'DEST_numeric',
    'CRS_DEP_TIME',
    'CRS_ARR_TIME',
    'CRS_ELAPSED_TIME',
    'DISTANCE',
]

assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# transform() appends the 'features' column; spark_df itself is not modified.
output = assembler.transform(spark_df)

In [27]:
# Keep only what the model needs: the assembled feature vector and the
# indexed delay-bucket label.
model_columns = ['features', 'DEP_DELAY_numeric']
finalised_data = output.select(*model_columns)
finalised_data.show()

22/10/11 15:22:10 WARN TaskSetManager: Stage 246 contains a task of very large size (1735 KiB). The maximum recommended task size is 1000 KiB.
+--------------------+-----------------+
|            features|DEP_DELAY_numeric|
+--------------------+-----------------+
|[2022.0,1.0,3.0,0...|              0.0|
|[2022.0,1.0,4.0,0...|              0.0|
|[2022.0,1.0,5.0,0...|              2.0|
|[2022.0,1.0,7.0,0...|              0.0|
|[2022.0,1.0,1.0,0...|              1.0|
|[2022.0,1.0,2.0,0...|              0.0|
|[2022.0,1.0,3.0,0...|              0.0|
|[2022.0,1.0,4.0,0...|              5.0|
|[2022.0,1.0,5.0,0...|              6.0|
|[2022.0,1.0,7.0,0...|              0.0|
|[2022.0,1.0,1.0,0...|              0.0|
|[2022.0,1.0,2.0,0...|              3.0|
|[2022.0,1.0,3.0,0...|              0.0|
|[2022.0,1.0,4.0,0...|              0.0|
|[2022.0,1.0,5.0,0...|              0.0|
|[2022.0,1.0,7.0,0...|              4.0|
|[2022.0,1.0,1.0,0...|              0.0|
|[2022.0,1.0,6.0,0...|              2

In [28]:
# 80/20 train/test split. The seed is fixed so that the split, and every
# metric computed from it, is the same on each run of the notebook.
train, test = finalised_data.randomSplit([0.8, 0.2], seed=42)

In [29]:
# Configure the estimator: 'features' is the assembled input vector and
# 'DEP_DELAY_numeric' the indexed delay-bucket label.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='DEP_DELAY_numeric',
)

# Fit on the training split only; `lrn` is the resulting fitted model.
lrn = lr.fit(train)

22/10/11 15:22:11 WARN TaskSetManager: Stage 247 contains a task of very large size (1735 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/10/11 15:22:12 WARN TaskSetManager: Stage 249 contains a task of very large size (1735 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/10/11 15:22:13 WARN TaskSetManager: Stage 251 contains a task of very large size (1735 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/10/11 15:22:14 WARN TaskSetManager: Stage 253 contains a task of very large size (1735 KiB). The maximum recommended task size is 1000 KiB.
22/10/11 15:22:14 WARN TaskSetManager: Stage 255 contains a task of very large size (1735 KiB). The maximum recommended task size is 1000 KiB.
22/10/11 15:22:15 WARN TaskSetManager: Stage 257 contains a task of very large size (1735 KiB). The maximum recommended task size is 1000 KiB.
22/10/11 15:22:15 WARN TaskSetManager: Stage 259 contains a task of very large size (1735 KiB). The maximum recommended task size is 1000 KiB.
22/10/11 15:22:15 WARN TaskSetManager: Stage 261 contains a task of very large size (1735 KiB). The maximum recommended task size is 1000 KiB.
22/10/11 15:22:16 WARN TaskSetManager: Stage 263 contains a task of very large size (1735 KiB). The maximum recommended task size is 1000 KiB.
22/10/11 15:22:16 WARN TaskSetManager: Stage 265 contains a task of very large size (1735 KiB). The maximum recommended task size is 1000 KiB.

In [30]:
# Summary object attached to the fitted model.
# NOTE(review): presumably computed on the data passed to fit() (the training
# split), not on `test` - confirm before reading it as held-out performance.
lrn_summary = lrn.summary

# Show label, raw scores, class probabilities and predicted class per row.
summary_predictions = lrn_summary.predictions
summary_predictions.show()

22/10/11 15:22:49 WARN TaskSetManager: Stage 461 contains a task of very large size (1735 KiB). The maximum recommended task size is 1000 KiB.
+--------------------+-----------------+--------------------+--------------------+----------+
|            features|DEP_DELAY_numeric|       rawPrediction|         probability|prediction|
+--------------------+-----------------+--------------------+--------------------+----------+
|[2022.0,1.0,1.0,0...|              0.0|[2.95617314348748...|[0.59765719260495...|       0.0|
|[2022.0,1.0,1.0,0...|              3.0|[2.95617314348748...|[0.59765719260495...|       0.0|
|[2022.0,1.0,1.0,0...|              4.0|[2.95617314348748...|[0.59765719260495...|       0.0|
|[2022.0,1.0,1.0,0...|              6.0|[2.95277403588129...|[0.59688372559110...|       0.0|
|[2022.0,1.0,1.0,0...|              0.0|[3.83194137692418...|[0.76719165527217...|       0.0|
|[2022.0,1.0,1.0,0...|              0.0|[3.83194137692418...|[0.76719165527217...|       0.0|
|[2022.0,1.