In [1]:
import random

import findspark
findspark.init()
import pyspark

from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, Bucketizer
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import isnan, when, count, col, abs
from pyspark.sql import functions as sf 

In [2]:
# Path of the input CSV file that the loading cell reads.
filename = '2004.csv'

In [3]:
def _init_spark():
    """Create (or reuse) the SparkSession named "Project".

    Returns:
        A ``(session, context)`` tuple: the SparkSession obtained through
        ``getOrCreate()`` and the SparkContext attached to it.
    """
    session = SparkSession.builder.appName("Project").getOrCreate()
    return session, session.sparkContext

spark, sc = _init_spark()
# Kept only so that code still referring to `sqlContext` keeps working;
# the read below goes through the SparkSession instead.
sqlContext = SQLContext(sc)

# Read the CSV with a header row, letting Spark infer the column types.
df = spark.read.csv(filename, header=True, sep=',', inferSchema=True)

# Columns removed before modelling. Per the original author's note, the last
# three (Year, TailNum, CancellationCode) appear to be the author's own
# additions, while the others are dropped as required by the task statement.
col_to_drop = ['ArrTime', 'ActualElapsedTime', 'AirTime', 'TaxiIn', 'Diverted',
               'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay',
               'Year', 'TailNum', 'CancellationCode']
df = df.drop(*col_to_drop)

# Force the delay and clock-time columns to integer type.
# NOTE(review): presumably unparsable values become null here and are removed
# by the null-row drop below -- confirm against the Spark cast settings in use.
for int_col in ("ArrDelay", "DepDelay", "CRSDepTime", "CRSArrTime", "DepTime"):
    df = df.withColumn(int_col, df[int_col].cast(IntegerType()))

# Keep only the flights that actually happened; the flag is then constant.
df = df.filter("Cancelled == 0")
df = df.drop("Cancelled")

# Dropping unimportant categorical variables.
df = df.drop("UniqueCarrier", "DayofMonth", "FlightNum")

# Drop every ROW that has a null in any remaining column
# (original note: roughly 99% of the dataset remains).
df = df.na.drop("any")

# Cache the cleaned frame rather than the raw one, so that only the columns
# and rows used by the following cells are kept in memory.
df = df.cache()

In [4]:
# Preview the first five rows of the cleaned frame.
df.show(n=5)

+-----+---------+-------+----------+----------+--------------+--------+--------+------+----+--------+-------+
|Month|DayOfWeek|DepTime|CRSDepTime|CRSArrTime|CRSElapsedTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiOut|
+-----+---------+-------+----------+----------+--------------+--------+--------+------+----+--------+-------+
|    1|        1|    623|       630|       915|           105|     -14|      -7|   ORD| CLT|     599|     11|
|    1|        2|    621|       630|       915|           105|      -4|      -9|   ORD| CLT|     599|     16|
|    1|        3|    633|       630|       915|           105|       5|       3|   ORD| CLT|     599|     15|
|    1|        4|    627|       630|       915|           105|     -16|      -3|   ORD| CLT|     599|     10|
|    1|        5|    635|       630|       915|           105|       3|       5|   ORD| CLT|     599|     13|
+-----+---------+-------+----------+----------+--------------+--------+--------+------+----+--------+-------+
only showi

In [5]:
# Introducing interaction between the categorical variables: Origin and Dest

In [6]:
# Build the route key "<Origin>_<Dest>" and discard the two source columns.
route_key = sf.concat(sf.col('Origin'), sf.lit('_'), sf.col('Dest'))
df = df.withColumn('OrigDest', route_key).drop("Origin", "Dest")

In [7]:
# Preview one row to check the new OrigDest column.
df.show(n=1)

+-----+---------+-------+----------+----------+--------------+--------+--------+--------+-------+--------+
|Month|DayOfWeek|DepTime|CRSDepTime|CRSArrTime|CRSElapsedTime|ArrDelay|DepDelay|Distance|TaxiOut|OrigDest|
+-----+---------+-------+----------+----------+--------------+--------+--------+--------+-------+--------+
|    1|        1|    623|       630|       915|           105|     -14|      -7|     599|     11| ORD_CLT|
+-----+---------+-------+----------+----------+--------------+--------+--------+--------+-------+--------+
only showing top 1 row



In [8]:
# Converting continuous variables to categorical ones: "DepTime", "CRSDepTime" and "CRSArrTime" are times of day.
# We want to separate them into parts of the day (morning, afternoon, evening, night): https://www.learnersdictionary.com/qa/parts-of-the-day-early-morning-late-morning-etc

In [9]:
# Bucket edges shared by all three time columns: below 500, 500-1200,
# 1200-1700, and 1700 upwards.
# NOTE(review): the values look like HHMM clock times (e.g. 623, 915) -- confirm.
splits = [-float("inf"), 500, 1200, 1700, float("inf")]

time_cols = ["CRSDepTime", "CRSArrTime", "DepTime"]
bucketizer = Bucketizer(
    splitsArray=[splits] * len(time_cols),
    inputCols=time_cols,
    outputCols=["Cat" + name for name in time_cols],
)

# Only the two scheduled-time columns are removed; DepTime is kept alongside
# its bucketed counterpart.
df = bucketizer.transform(df).drop("CRSDepTime", "CRSArrTime")

In [10]:
# Map each OrigDest string to a numeric index, then drop the string column.
route_indexer = StringIndexer(inputCol="OrigDest", outputCol="IndOrigDest")
varIdxer = route_indexer.fit(df)
df = varIdxer.transform(df).drop("OrigDest")

In [11]:
# Preview the frame after bucketizing and indexing.
df.show(n=5)

+-----+---------+-------+--------------+--------+--------+--------+-------+-------------+-------------+----------+-----------+
|Month|DayOfWeek|DepTime|CRSElapsedTime|ArrDelay|DepDelay|Distance|TaxiOut|CatCRSDepTime|CatCRSArrTime|CatDepTime|IndOrigDest|
+-----+---------+-------+--------------+--------+--------+--------+-------+-------------+-------------+----------+-----------+
|    1|        1|    623|           105|     -14|      -7|     599|     11|          1.0|          1.0|       1.0|      178.0|
|    1|        2|    621|           105|      -4|      -9|     599|     16|          1.0|          1.0|       1.0|      178.0|
|    1|        3|    633|           105|       5|       3|     599|     15|          1.0|          1.0|       1.0|      178.0|
|    1|        4|    627|           105|     -16|      -3|     599|     10|          1.0|          1.0|       1.0|      178.0|
|    1|        5|    635|           105|       3|       5|     599|     13|          1.0|          1.0|       1

In [12]:
# One Hot encoding the categorical data:

In [13]:
# Categorical index column -> name of its one-hot encoded vector column.
encoded_names = {
    'Month': 'HotMonth',
    'DayOfWeek': 'HotDayOfWeek',
    'CatCRSDepTime': 'HotCRSCatDepTime',
    'CatCRSArrTime': 'HotCRSCatArrTime',
    'IndOrigDest': 'HotIndOrigDest',
    'CatDepTime': 'HotDepTime',
}

oneHot = OneHotEncoder(
    inputCols=list(encoded_names.keys()),
    outputCols=list(encoded_names.values()),
).fit(df)

# Encode, then remove the index columns that have just been encoded.
df = oneHot.transform(df).drop(*encoded_names.keys())

In [14]:
# Preview the frame with the one-hot encoded vector columns.
df.show(n=5)

+-------+--------------+--------+--------+--------+-------+------------------+--------------+----------------+----------------+-------------+-------------+
|DepTime|CRSElapsedTime|ArrDelay|DepDelay|Distance|TaxiOut|    HotIndOrigDest|      HotMonth|HotCRSCatDepTime|HotCRSCatArrTime| HotDayOfWeek|   HotDepTime|
+-------+--------------+--------+--------+--------+-------+------------------+--------------+----------------+----------------+-------------+-------------+
|    623|           105|     -14|      -7|     599|     11|(4408,[178],[1.0])|(12,[1],[1.0])|   (3,[1],[1.0])|   (3,[1],[1.0])|(7,[1],[1.0])|(3,[1],[1.0])|
|    621|           105|      -4|      -9|     599|     16|(4408,[178],[1.0])|(12,[1],[1.0])|   (3,[1],[1.0])|   (3,[1],[1.0])|(7,[2],[1.0])|(3,[1],[1.0])|
|    633|           105|       5|       3|     599|     15|(4408,[178],[1.0])|(12,[1],[1.0])|   (3,[1],[1.0])|   (3,[1],[1.0])|(7,[3],[1.0])|(3,[1],[1.0])|
|    627|           105|     -16|      -3|     599|     10|(4408

In [15]:
# Speed = Distance divided by the scheduled elapsed time, rounded to two
# decimals; the two source columns are removed afterwards.
speed_expr = col("Distance") / col("CRSElapsedTime")
df = df.withColumn("Speed", sf.round(speed_expr, 2)).drop("Distance", "CRSElapsedTime")

In [16]:
# Preview the frame with the derived Speed column.
df.show(n=5)

+-------+--------+--------+-------+------------------+--------------+----------------+----------------+-------------+-------------+-----+
|DepTime|ArrDelay|DepDelay|TaxiOut|    HotIndOrigDest|      HotMonth|HotCRSCatDepTime|HotCRSCatArrTime| HotDayOfWeek|   HotDepTime|Speed|
+-------+--------+--------+-------+------------------+--------------+----------------+----------------+-------------+-------------+-----+
|    623|     -14|      -7|     11|(4408,[178],[1.0])|(12,[1],[1.0])|   (3,[1],[1.0])|   (3,[1],[1.0])|(7,[1],[1.0])|(3,[1],[1.0])|  5.7|
|    621|      -4|      -9|     16|(4408,[178],[1.0])|(12,[1],[1.0])|   (3,[1],[1.0])|   (3,[1],[1.0])|(7,[2],[1.0])|(3,[1],[1.0])|  5.7|
|    633|       5|       3|     15|(4408,[178],[1.0])|(12,[1],[1.0])|   (3,[1],[1.0])|   (3,[1],[1.0])|(7,[3],[1.0])|(3,[1],[1.0])|  5.7|
|    627|     -16|      -3|     10|(4408,[178],[1.0])|(12,[1],[1.0])|   (3,[1],[1.0])|   (3,[1],[1.0])|(7,[4],[1.0])|(3,[1],[1.0])|  5.7|
|    635|       3|       5|     13

In [17]:
# FINAL SELECTION:
# Every candidate feature set starts from the same two numeric predictors.
base_features = ['DepDelay', 'TaxiOut']

X1 = list(base_features)
X2 = base_features + ['HotDepTime']
X3 = base_features + ['HotIndOrigDest', 'HotDepTime']
X4 = base_features + ['HotDayOfWeek', 'HotMonth', 'Speed']
X5 = base_features + ['HotDayOfWeek', 'HotIndOrigDest', 'Speed']
X6 = base_features + [
    'HotIndOrigDest',
    'Speed',
    'HotCRSCatDepTime',
    'HotCRSCatArrTime',
    'HotDepTime',
]

In [18]:
# Pack the chosen feature set (X1 here) into a single 'features' vector and
# keep only that vector together with the ArrDelay label.
assembler = VectorAssembler(outputCol='features', inputCols=X1)
assembled = assembler.transform(df)
output = assembled.select('features', 'ArrDelay')

In [19]:
# 75% / 25% train/test split. The seed is fixed so that re-running the
# notebook produces the same split, which keeps the metrics of different
# feature sets comparable from run to run.
train, test = output.randomSplit([0.75, 0.25], seed=42)

In [20]:
# Fit a linear regression of ArrDelay on the assembled feature vector,
# using the training split only.
lin_reg = LinearRegression(labelCol='ArrDelay', featuresCol='features')
linear_model = lin_reg.fit(train)

# Optional training-set diagnostics (left disabled):
#trainSummary = linear_model.summary
#print("RMSE: %f" % trainSummary.rootMeanSquaredError)
#print("MSE: %f" %  trainSummary.meanSquaredError)
#print("\nr2: %f" % trainSummary.r2)

In [21]:
# Score the held-out split; this adds a 'prediction' column.
predictions = linear_model.transform(test)

# Absolute percentage error of each prediction relative to the true delay.
# The column keeps its historical name 'Accuracy', but lower values are better.
abs_pct_error = sf.abs(
    (col('ArrDelay') - col('prediction')) / col('ArrDelay') * 100
)

# The ratio is undefined when ArrDelay is 0, so only compute it for non-zero
# delays and leave the value null otherwise. `when` without `otherwise` yields
# null, and the guard avoids a division by zero altogether.
predictions = predictions.withColumn(
    'Accuracy',
    when(col('ArrDelay') != 0, abs_pct_error),
)
#predictions.select("prediction","ArrDelay","Accuracy","features").show(10)

In [22]:
# Column settings shared by every evaluator below.
eval_columns = dict(labelCol="ArrDelay", predictionCol="prediction")

# Coefficient of determination.
pred_evaluator = RegressionEvaluator(metricName="r2", **eval_columns)
print("R Squared (R2) on test data = %g" % pred_evaluator.evaluate(predictions))

# Root mean squared error.
evaluator = RegressionEvaluator(metricName="rmse", **eval_columns)
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# Mean absolute error.
evaluator = RegressionEvaluator(metricName="mae", **eval_columns)
mae = evaluator.evaluate(predictions)
print("Mean Absolute Error (MAE) on test data = %g" % mae)

R Squared (R2) on test data = 0.851357
Root Mean Squared Error (RMSE) on test data = 12.786
Mean Absolute Error (MAE) on test data = 7.82713


X1:
R Squared (R2) on test data = 0.826373
Root Mean Squared Error (RMSE) on test data = 13.8497
Mean Squared Error (MSE) on test data = 191.815

X2:
R Squared (R2) on test data = 0.86015
Root Mean Squared Error (RMSE) on test data = 12.3931
Mean Squared Error (MSE) on test data = 153.59

X2_DepTime
R Squared (R2) on test data = 0.86645
Root Mean Squared Error (RMSE) on test data = 12.2173
Mean Squared Error (MSE) on test data = 149.264

X2_DepTimeCont
R Squared (R2) on test data = 0.865903
Root Mean Squared Error (RMSE) on test data = 12.217
Mean Squared Error (MSE) on test data = 149.254

X3:
R Squared (R2) on test data = 0.867128
Root Mean Squared Error (RMSE) on test data = 12.1511
Mean Squared Error (MSE) on test data = 147.649

X3_HotDepTime
R Squared (R2) on test data = 0.869852
Root Mean Squared Error (RMSE) on test data = 12.0166
Mean Squared Error (MSE) on test data = 144.399

X4:
R Squared (R2) on test data = 0.859676
Root Mean Squared Error (RMSE) on test data = 12.5005
Mean Squared Error (MSE) on test data = 156.263

X4_HotDepTime
R Squared (R2) on test data = 0.861924
Root Mean Squared Error (RMSE) on test data = 12.4194
Mean Squared Error (MSE) on test data = 154.242

X5: 
R Squared (R2) on test data = 0.863975
Root Mean Squared Error (RMSE) on test data = 12.2739
Mean Squared Error (MSE) on test data = 150.648

X5_HotDepTime
R Squared (R2) on test data = 0.873747
Root Mean Squared Error (RMSE) on test data = 11.8387
Mean Squared Error (MSE) on test data = 140.154

X6:
R Squared (R2) on test data = 0.870854
Root Mean Squared Error (RMSE) on test data = 12.0075
Mean Squared Error (MSE) on test data = 144.179

X6_NoWeekNoSpeed:
R Squared (R2) on test data = 0.869446
Root Mean Squared Error (RMSE) on test data = 12.0361
Mean Squared Error (MSE) on test data = 144.868

X6_NoSpeed:
R Squared (R2) on test data = 0.8693
Root Mean Squared Error (RMSE) on test data = 12.0816
Mean Squared Error (MSE) on test data = 145.966

X6_NoWeek:
R Squared (R2) on test data = 0.873866
Root Mean Squared Error (RMSE) on test data = 11.8401
Mean Squared Error (MSE) on test data = 140.188

X6_NoWeek_DepTime
R Squared (R2) on test data = 0.878756
Root Mean Squared Error (RMSE) on test data = 11.6798
Mean Squared Error (MSE) on test data = 136.418

X6_NoWeek_DepTimeCont
R Squared (R2) on test data = 0.872805
Root Mean Squared Error (RMSE) on test data = 11.9287
Mean Squared Error (MSE) on test data = 142.295

In [23]:
# Shut Spark down through the SparkSession, the entry point this notebook
# created; this stops the underlying SparkContext (`sc`) as well.
spark.stop()