In [3]:
import findspark
findspark.init()

import pyspark
import random

In [4]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import PCA
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from  pyspark.sql.functions import abs
from pyspark.ml.evaluation import RegressionEvaluator

In [5]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import isnan, when, count, col

In [6]:
filename = "2004.csv"

In [7]:
def _init_spark():
    spark = SparkSession.builder.appName("Project").getOrCreate()
    sc = spark.sparkContext
    return spark, sc

spark, sc = _init_spark()
sqlContext = SQLContext(sc)

df = sqlContext.read.load(filename, 
                      format='com.databricks.spark.csv', 
                      header='true',
                      delimiter=',',
                      inferSchema='true')
df.cache()

col_to_drop = ['ArrTime', 'ActualElapsedTime', 'AirTime', 'TaxiIn', 'Diverted',
               'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 
               'Year', 'TailNum', 'CancellationCode' ] # Only those 3 I added up to delay, others 
                                                       # are delayed as is stated in the task
df = df.drop(*col_to_drop)

df = df.withColumn("ArrDelay", df["ArrDelay"].cast(IntegerType()))
df = df.withColumn("DepDelay", df["DepDelay"].cast(IntegerType()))
df = df.na.drop("any")

In [10]:
# We need to work with only the flight which were not cancelled, as the flights which are cancelled cannot delay :)
df = df.filter("Cancelled == 0")

In [11]:
df.select([x[0] for x in df.dtypes if 'int' in x]).show(5) # Categorical

+-----+----------+---------+----------+----------+---------+--------------+--------+--------+--------+-------+---------+
|Month|DayofMonth|DayOfWeek|CRSDepTime|CRSArrTime|FlightNum|CRSElapsedTime|ArrDelay|DepDelay|Distance|TaxiOut|Cancelled|
+-----+----------+---------+----------+----------+---------+--------------+--------+--------+--------+-------+---------+
|    1|        12|        1|       630|       915|      462|           105|     -14|      -7|     599|     11|        0|
|    1|        13|        2|       630|       915|      462|           105|      -4|      -9|     599|     16|        0|
|    1|        14|        3|       630|       915|      462|           105|       5|       3|     599|     15|        0|
|    1|        15|        4|       630|       915|      462|           105|     -16|      -3|     599|     10|        0|
|    1|        16|        5|       630|       915|      462|           105|       3|       5|     599|     13|        0|
+-----+----------+---------+----

In [12]:
df.select([x[0] for x in df.dtypes if 'int' in x]).show(5) #Continuous

+-----+----------+---------+----------+----------+---------+--------------+--------+--------+--------+-------+---------+
|Month|DayofMonth|DayOfWeek|CRSDepTime|CRSArrTime|FlightNum|CRSElapsedTime|ArrDelay|DepDelay|Distance|TaxiOut|Cancelled|
+-----+----------+---------+----------+----------+---------+--------------+--------+--------+--------+-------+---------+
|    1|        12|        1|       630|       915|      462|           105|     -14|      -7|     599|     11|        0|
|    1|        13|        2|       630|       915|      462|           105|      -4|      -9|     599|     16|        0|
|    1|        14|        3|       630|       915|      462|           105|       5|       3|     599|     15|        0|
|    1|        15|        4|       630|       915|      462|           105|     -16|      -3|     599|     10|        0|
|    1|        16|        5|       630|       915|      462|           105|       3|       5|     599|     13|        0|
+-----+----------+---------+----

In [13]:
# Categorical data: Month, DayofMonth, DayOfWeek, FlightNum, UniqueCarrier, Origin, Dest

In [14]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder
from pyspark.ml.feature import VectorAssembler

varIdxer = StringIndexer(inputCols=['UniqueCarrier', 'Origin', 'Dest'],outputCols=['IndUniCar', 'IndOrig', 'IndDest']).fit(df)
df = varIdxer.transform(df)

In [15]:
df.select([x[0] for x in df.dtypes if 'int' not in x]).show(5) # SOME OF Categorical

+-------+-------------+------+----+---------+-------+-------+
|DepTime|UniqueCarrier|Origin|Dest|IndUniCar|IndOrig|IndDest|
+-------+-------------+------+----+---------+-------+-------+
|    623|           UA|   ORD| CLT|      3.0|    1.0|   20.0|
|    621|           UA|   ORD| CLT|      3.0|    1.0|   20.0|
|    633|           UA|   ORD| CLT|      3.0|    1.0|   20.0|
|    627|           UA|   ORD| CLT|      3.0|    1.0|   20.0|
|    635|           UA|   ORD| CLT|      3.0|    1.0|   20.0|
+-------+-------------+------+----+---------+-------+-------+
only showing top 5 rows



In [16]:
oneHot = OneHotEncoder(inputCols=['IndUniCar', 'IndOrig', 'IndDest','Month', 'DayofMonth', 'DayOfWeek', 'FlightNum'], outputCols=['HotUniCar', 'HotOrig', 'HotDest','HotMonth', 'HotDayofMonth', 'HotDayOfWeek', 'HotFlightNum']).fit(df)

In [17]:
df2 = oneHot.transform(df)

In [18]:
df2.select(['HotUniCar', 'HotOrig', 'HotDest','HotMonth', 'HotDayofMonth', 'HotDayOfWeek', 'HotFlightNum']).show(5)

+--------------+---------------+----------------+--------------+---------------+-------------+------------------+
|     HotUniCar|        HotOrig|         HotDest|      HotMonth|  HotDayofMonth| HotDayOfWeek|      HotFlightNum|
+--------------+---------------+----------------+--------------+---------------+-------------+------------------+
|(18,[3],[1.0])|(284,[1],[1.0])|(282,[20],[1.0])|(12,[1],[1.0])|(31,[12],[1.0])|(7,[1],[1.0])|(9912,[462],[1.0])|
|(18,[3],[1.0])|(284,[1],[1.0])|(282,[20],[1.0])|(12,[1],[1.0])|(31,[13],[1.0])|(7,[2],[1.0])|(9912,[462],[1.0])|
|(18,[3],[1.0])|(284,[1],[1.0])|(282,[20],[1.0])|(12,[1],[1.0])|(31,[14],[1.0])|(7,[3],[1.0])|(9912,[462],[1.0])|
|(18,[3],[1.0])|(284,[1],[1.0])|(282,[20],[1.0])|(12,[1],[1.0])|(31,[15],[1.0])|(7,[4],[1.0])|(9912,[462],[1.0])|
|(18,[3],[1.0])|(284,[1],[1.0])|(282,[20],[1.0])|(12,[1],[1.0])|(31,[16],[1.0])|(7,[5],[1.0])|(9912,[462],[1.0])|
+--------------+---------------+----------------+--------------+---------------+--------

In [19]:
df2 = df2.drop(*['IndUniCar', 'IndOrig', 'IndDest','Month', 'DayofMonth', 'DayOfWeek', 'FlightNum', 'UniqueCarrier', 'Origin', 'Dest'])

In [20]:
df3 = df2.drop(*['DepTime', 'CRSDepTime', 'CRSArrTime', 'Cancelled']) # DON't need by now, but maybe we would do something in the future

In [21]:
df3.show(1)

+--------------+--------+--------+--------+-------+--------------+---------------+--------------+----------------+------------------+---------------+-------------+
|CRSElapsedTime|ArrDelay|DepDelay|Distance|TaxiOut|      HotMonth|        HotOrig|     HotUniCar|         HotDest|      HotFlightNum|  HotDayofMonth| HotDayOfWeek|
+--------------+--------+--------+--------+-------+--------------+---------------+--------------+----------------+------------------+---------------+-------------+
|           105|     -14|      -7|     599|     11|(12,[1],[1.0])|(284,[1],[1.0])|(18,[3],[1.0])|(282,[20],[1.0])|(9912,[462],[1.0])|(31,[12],[1.0])|(7,[1],[1.0])|
+--------------+--------+--------+--------+-------+--------------+---------------+--------------+----------------+------------------+---------------+-------------+
only showing top 1 row



In [22]:
df4 = df3.drop("ArrDelay")

In [19]:
assembler = VectorAssembler(inputCols=df4.columns, outputCol="features")
output = assembler.transform(df3).select('features','ArrDelay')

In [20]:
output.show(1)

+--------------------+--------+
|            features|ArrDelay|
+--------------------+--------+
|(10550,[0,1,2,3,5...|     -14|
+--------------------+--------+
only showing top 1 row



In [21]:
train,test = output.randomSplit([0.75, 0.25])

In [244]:
train.show(5)

+--------------------+--------+
|            features|ArrDelay|
+--------------------+--------+
|(10550,[0,1,2,3,5...|       2|
|(10550,[0,1,2,3,5...|     183|
|(10550,[0,1,2,3,5...|     137|
|(10550,[0,1,2,3,5...|     137|
|(10550,[0,1,2,3,5...|      27|
+--------------------+--------+
only showing top 5 rows



In [245]:
lin_reg = LinearRegression(featuresCol = 'features', labelCol='ArrDelay')
linear_model = lin_reg.fit(train)

In [255]:
#print("Coefficients: " + str(linear_model.coefficients))
#print("\nIntercept: " + str(linear_model.intercept))

In [248]:
trainSummary = linear_model.summary
print("RMSE: %f" % trainSummary.rootMeanSquaredError)
print("\nr2: %f" % trainSummary.r2)

RMSE: 11.675834

r2: 0.877188


In [253]:
predictions = linear_model.transform(test)
x =((predictions['ArrDelay']-predictions['prediction'])/predictions['ArrDelay'])*100
predictions = predictions.withColumn('Accuracy',abs(x))
predictions.select("prediction","ArrDelay","Accuracy","features").show(10)

+------------------+--------+------------------+--------------------+
|        prediction|ArrDelay|          Accuracy|            features|
+------------------+--------+------------------+--------------------+
|3.2271949369443877|      90| 96.41422784783957|(10550,[0,1,2,3,5...|
|-4.611955333088233|      -1|361.19553330882326|(10550,[0,1,2,3,5...|
| 39.03444977884545|      57|31.518509159920267|(10550,[0,1,2,3,5...|
|26.618280707367184|      25|6.4731228294687355|(10550,[0,1,2,3,5...|
| 82.58275258324767|     150|44.944831611168226|(10550,[0,1,2,3,5...|
| 7.527100707817499|      12| 37.27416076818751|(10550,[0,1,2,3,5...|
| 6.155790248300671|      19| 67.60110395631224|(10550,[0,1,2,3,5...|
| 4.431111756497154|       0|              null|(10550,[0,1,2,3,5...|
|-6.032202484712428|      -2| 201.6101242356214|(10550,[0,1,2,3,5...|
| -6.07485132962319|       6| 201.2475221603865|(10550,[0,1,2,3,5...|
+------------------+--------+------------------+--------------------+
only showing top 10 

In [254]:
pred_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="ArrDelay",metricName="r2")
print("R Squared (R2) on test data = %g" % pred_evaluator.evaluate(predictions))

R Squared (R2) on test data = 0.871809


In [28]:
from pyspark.ml.regression import FMRegressor
from pyspark.ml.feature import MinMaxScaler

In [29]:
# manually_selected=["CRSElapsedTime", "DepDelay", "Distance", "TaxiOut", "HotOrig",  "HotDest", "HotDayOfWeek"]
manually_selected=["DepDelay", "TaxiOut", "HotOrig", "HotDayOfWeek"]

In [30]:
assembler = VectorAssembler(inputCols=manually_selected, outputCol="features")
output = assembler.transform(df3).select('features','ArrDelay')

In [31]:
train,test = output.randomSplit([0.75, 0.25])

In [32]:
featureScaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures").fit(output)

In [33]:
# Train a FM model.
fm = FMRegressor(featuresCol="scaledFeatures", stepSize=0.001, labelCol='ArrDelay')

# Create a Pipeline.
pipeline = Pipeline(stages=[featureScaler, fm])

In [None]:
# Train model.
model = pipeline.fit(train)

In [None]:
# Make predictions.
predictions = model.transform(test)

# Select example rows to display.
predictions.select("prediction", 'ArrDelay', "features").show(5)

In [None]:
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol='ArrDelay', predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

pred_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="ArrDelay",metricName="r2")
print("R Squared (R2) on test data = %g" % pred_evaluator.evaluate(predictions))

fmModel = model.stages[1]
print("Factors: " + str(fmModel.factors))
print("Linear: " + str(fmModel.linear))
print("Intercept: " + str(fmModel.intercept))

## HERE DRAFT: Under developmentMESS looking to deal with time: Morning, Afternoon, Evening, Night 

In [38]:
df.select(["CRSDepTime", "DepTime", "CRSElapsedTime", ]).show(5) #"CRSArrTime" - local arrival time doesn't show us a thing

+----------+-------+--------------+
|CRSDepTime|DepTime|CRSElapsedTime|
+----------+-------+--------------+
|       630|    623|           105|
|       630|    621|           105|
|       630|    633|           105|
|       630|    627|           105|
|       630|    635|           105|
+----------+-------+--------------+
only showing top 5 rows



In [39]:
from pyspark.sql.functions import unix_timestamp, from_unixtime

In [40]:
df.select("CRSDepTime").show(1)

+----------+
|CRSDepTime|
+----------+
|       630|
+----------+
only showing top 1 row



In [42]:
from pyspark.sql.functions import format_string

In [46]:
df.select(format_string('%04s', df.CRSDepTime))

DataFrame[format_string(%04s, CRSDepTime): string]

In [27]:
df.withColumn("CRSDepTime", f.format_string("%04d", "CRSDepTime"))

DataFrame[Month: int, DayofMonth: int, DayOfWeek: int, DepTime: string, CRSDepTime: string, CRSArrTime: int, UniqueCarrier: string, FlightNum: int, CRSElapsedTime: int, ArrDelay: int, DepDelay: int, Origin: string, Dest: string, Distance: int, TaxiOut: int, Cancelled: int, IndUniCar: double, IndOrig: double, IndDest: double]

In [104]:
df2 = df.withColumn('CRSDepTime',from_unixtime(unix_timestamp(col(('CRSDepTime')), "hmm"), "HH:mm'Z'"))

In [105]:
#df2 = df.withColumn('CRSDepTime', to_timestamp('CRSDepTime', "h:mm"))

In [86]:
df.select('CRSDepTime').take(1)

[Row(CRSDepTime='630')]

In [54]:
df.select([x[0] for x in df.dtypes if 'int' in x]).show(5) #Continuous

+-----+----------+---------+----------+----------+---------+--------------+--------+--------+--------+-------+---------+
|Month|DayofMonth|DayOfWeek|CRSDepTime|CRSArrTime|FlightNum|CRSElapsedTime|ArrDelay|DepDelay|Distance|TaxiOut|Cancelled|
+-----+----------+---------+----------+----------+---------+--------------+--------+--------+--------+-------+---------+
|    1|        12|        1|       630|       915|      462|           105|     -14|      -7|     599|     11|        0|
|    1|        13|        2|       630|       915|      462|           105|      -4|      -9|     599|     16|        0|
|    1|        14|        3|       630|       915|      462|           105|       5|       3|     599|     15|        0|
|    1|        15|        4|       630|       915|      462|           105|     -16|      -3|     599|     10|        0|
|    1|        16|        5|       630|       915|      462|           105|       3|       5|     599|     13|        0|
+-----+----------+---------+----

In [153]:
from pyspark.sql.functions import concat, col, lit

In [171]:
from pyspark.sql import functions as f
#df2 = df.withColumn('CRSDepTime', format_string('%04s', df.CRSDepTime))

In [172]:
df2 = df.withColumn('CRSDepTime', format_string("%04d", col('CRSDepTime').cast('int')))

In [174]:
df2.select("CRSDepTime").show(5)

+----------+
|CRSDepTime|
+----------+
|      0630|
|      0630|
|      0630|
|      0630|
|      0630|
+----------+
only showing top 5 rows



In [190]:
df = df.withColumn("CRSDepTime", df["CRSDepTime"].cast(IntegerType()))

In [None]:
df2 = df.withColumn("DayPeriod", when(col("CRSDepTime") >= 500 & col("CRSDepTime") <= 1200, "Morning"))
                    
                    #.when(col("CRSDepTime") >= 1200,"Afternoon").when(col("CRSDepTime") >= 1800,"Evening").when(col("CRSDepTime") >= 1200,"Afternoon").when(col("CRSDepTime") >= 2200,"Night")
                          #       .otherwise("Night"))

In [None]:
df2.filter(col('CRSDepTime') >=1800).select("DayPeriod").show(100)

In [202]:
df.select(["DepTime", "CRSArrTime", ]).show(10)

+-------+----------+
|DepTime|CRSArrTime|
+-------+----------+
|    623|       915|
|    621|       915|
|    633|       915|
|    627|       915|
|    635|       915|
|    628|       915|
|    650|       915|
|    627|       915|
|    623|       915|
|    626|       915|
+-------+----------+
only showing top 10 rows



In [34]:
df.filter('DepTime > 500 and DepTime <1200').show(5)

+-----+----------+---------+-------+----------+----------+-------------+---------+--------------+--------+--------+------+----+--------+-------+---------+---------+-------+-------+
|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|CRSArrTime|UniqueCarrier|FlightNum|CRSElapsedTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiOut|Cancelled|IndUniCar|IndOrig|IndDest|
+-----+----------+---------+-------+----------+----------+-------------+---------+--------------+--------+--------+------+----+--------+-------+---------+---------+-------+-------+
|    1|        12|        1|    623|       630|       915|           UA|      462|           105|     -14|      -7|   ORD| CLT|     599|     11|        0|      3.0|    1.0|   20.0|
|    1|        13|        2|    621|       630|       915|           UA|      462|           105|      -4|      -9|   ORD| CLT|     599|     16|        0|      3.0|    1.0|   20.0|
|    1|        14|        3|    633|       630|       915|           UA|      462|           10

In [None]:
sc.stop()