In [1]:
import findspark
findspark.init()

import pyspark
import random

In [2]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import isnan, when, count, col

In [3]:
filename = "2004.csv"

In [4]:
def _init_spark():
    spark = SparkSession.builder.appName("Project").getOrCreate()
    sc = spark.sparkContext
    return spark, sc

spark, sc = _init_spark()

sqlContext = SQLContext(sc)

df = sqlContext.read.load(filename, 
                      format='com.databricks.spark.csv', 
                      header='true',
                      delimiter=',',
                      inferSchema='true')
df.cache()

DataFrame[Year: int, Month: int, DayofMonth: int, DayOfWeek: int, DepTime: string, CRSDepTime: int, ArrTime: string, CRSArrTime: int, UniqueCarrier: string, FlightNum: int, TailNum: string, ActualElapsedTime: string, CRSElapsedTime: int, AirTime: string, ArrDelay: string, DepDelay: string, Origin: string, Dest: string, Distance: int, TaxiIn: int, TaxiOut: int, Cancelled: int, CancellationCode: string, Diverted: int, CarrierDelay: int, WeatherDelay: int, NASDelay: int, SecurityDelay: int, LateAircraftDelay: int]

In [5]:
col_to_drop = ['ArrTime', 'ActualElapsedTime', 'AirTime', 'TaxiIn', 'Diverted', 'CancellationCode',
               'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'Year', 'TailNum', ]
df = df.drop(*col_to_drop)

df = df.withColumn("ArrDelay", df["ArrDelay"].cast(IntegerType()))
df = df.withColumn("DepDelay", df["DepDelay"].cast(IntegerType()))
df = df.na.drop("any")

In [6]:
df.printSchema()

root
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: integer (nullable = true)
 |-- CRSElapsedTime: integer (nullable = true)
 |-- ArrDelay: integer (nullable = true)
 |-- DepDelay: integer (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: integer (nullable = true)
 |-- TaxiOut: integer (nullable = true)
 |-- Cancelled: integer (nullable = true)



In [7]:
df.show(5)

+-----+----------+---------+-------+----------+----------+-------------+---------+--------------+--------+--------+------+----+--------+-------+---------+
|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|CRSArrTime|UniqueCarrier|FlightNum|CRSElapsedTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiOut|Cancelled|
+-----+----------+---------+-------+----------+----------+-------------+---------+--------------+--------+--------+------+----+--------+-------+---------+
|    1|        12|        1|    623|       630|       915|           UA|      462|           105|     -14|      -7|   ORD| CLT|     599|     11|        0|
|    1|        13|        2|    621|       630|       915|           UA|      462|           105|      -4|      -9|   ORD| CLT|     599|     16|        0|
|    1|        14|        3|    633|       630|       915|           UA|      462|           105|       5|       3|   ORD| CLT|     599|     15|        0|
|    1|        15|        4|    627|       630|       915|           U

### PCA with Linear regression

In [18]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import PCA
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from  pyspark.sql.functions import abs
from pyspark.ml.evaluation import RegressionEvaluator

In [9]:
for_PCA = df.drop('UniqueCarrier').drop("Origin").drop("Dest").drop("DepTime")

In [10]:
df.columns

['Month',
 'DayofMonth',
 'DayOfWeek',
 'DepTime',
 'CRSDepTime',
 'CRSArrTime',
 'UniqueCarrier',
 'FlightNum',
 'CRSElapsedTime',
 'ArrDelay',
 'DepDelay',
 'Origin',
 'Dest',
 'Distance',
 'TaxiOut',
 'Cancelled']

In [12]:
for_PCA.columns[8:]

['DepDelay', 'Distance', 'TaxiOut', 'Cancelled']

In [13]:
#df_features  = df.drop("ArrDel")
assembler = VectorAssembler(inputCols=for_PCA.columns[:7]+for_PCA.columns[8:], outputCol='features')

In [14]:
data = assembler.transform(for_PCA)

In [15]:
pca = PCA(k=2, inputCol='features', outputCol='PCAfeatures')

In [16]:
pca.extractParamMap()

{Param(parent='PCA_4497475dc9c7', name='outputCol', doc='output column name.'): 'PCAfeatures',
 Param(parent='PCA_4497475dc9c7', name='k', doc='the number of principal components'): 2,
 Param(parent='PCA_4497475dc9c7', name='inputCol', doc='input column name.'): 'features'}

In [17]:
pca_model = pca.fit(data)

In [18]:
pca_data = pca_model.transform(data).select('PCAfeatures', 'ArrDelay')

In [19]:
pca_data.take(5)

[Row(PCAfeatures=DenseVector([381.4072, 1108.3639]), ArrDelay=-14),
 Row(PCAfeatures=DenseVector([381.4053, 1108.3559]), ArrDelay=-4),
 Row(PCAfeatures=DenseVector([381.4088, 1108.4381]), ArrDelay=5),
 Row(PCAfeatures=DenseVector([381.4085, 1108.3906]), ArrDelay=-16),
 Row(PCAfeatures=DenseVector([381.4098, 1108.4497]), ArrDelay=3)]

In [20]:
#output = assembler.transform(for_PCA).select('features', 'ArrDelay')

In [21]:
train,test = pca_data.randomSplit([0.75, 0.25])

In [22]:
train.show(5)

+--------------------+--------+
|         PCAfeatures|ArrDelay|
+--------------------+--------+
|[-577.11914502349...|      38|
|[-573.96073299046...|      57|
|[-573.95997417472...|      19|
|[-573.95986870465...|      96|
|[-573.95932089870...|      -7|
+--------------------+--------+
only showing top 5 rows



In [23]:
lin_reg = LinearRegression(featuresCol = 'PCAfeatures', labelCol='ArrDelay')
linear_model = lin_reg.fit(train)

In [24]:
print("Coefficients: " + str(linear_model.coefficients))
print("\nIntercept: " + str(linear_model.intercept))

Coefficients: [-2.2058648577523622e-05,0.0070673568157513765]

Intercept: -7.841051894839074


In [25]:
trainSummary = linear_model.summary
print("RMSE: %f" % trainSummary.rootMeanSquaredError)
print("\nr2: %f" % trainSummary.r2)

RMSE: 33.115509

r2: 0.017654


In [27]:
predictions = linear_model.transform(test)
x =((predictions['ArrDelay']-predictions['prediction'])/predictions['ArrDelay'])*100
predictions = predictions.withColumn('Accuracy',abs(x))
predictions.select("prediction","ArrDelay","Accuracy","PCAfeatures").show(10)

+-----------------+--------+------------------+--------------------+
|       prediction|ArrDelay|          Accuracy|         PCAfeatures|
+-----------------+--------+------------------+--------------------+
|5.887377655037643|     -41|114.35945769521376|[-573.95422382875...|
|5.887591804983934|      29| 79.69795929315885|[-573.95157573531...|
|5.887926814863498|       4|47.198170371587445|[-573.95138887744...|
| 5.88810054743924|      24| 75.46624771900316|[-573.95123802515...|
| 5.88774821565905|      30| 80.37417261446983|[-573.95096378065...|
| 5.88788571247232|      14| 57.94367348234057|[-573.95010322466...|
|4.386676838124735|       8|45.166539523440804|[-563.65517998286...|
| 4.38642363121876|       2|119.32118156093799|[-563.65433222498...|
|4.386405560539499|       9| 51.26216043845001|[-563.65400686856...|
| 4.38615965540602|      -9| 148.7351072822891|[-563.65344580363...|
+-----------------+--------+------------------+--------------------+
only showing top 10 rows



In [28]:
pred_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="ArrDelay",metricName="r2")
print("R Squared (R2) on test data = %g" % pred_evaluator.evaluate(predictions))

R Squared (R2) on test data = 0.0174602


### Generalized linear regression

In [9]:
features = df.select(['DepDelay', 'TaxiOut'])

gen_assembler = VectorAssembler(
    inputCols=features.columns,
    outputCol='features')

gen_output = gen_assembler.transform(df).select('features','ArrDelay')

In [49]:
gen_output.show(5)

+-----------+--------+
|   features|ArrDelay|
+-----------+--------+
|[-7.0,11.0]|     -14|
|[-9.0,16.0]|      -4|
| [3.0,15.0]|       5|
|[-3.0,10.0]|     -16|
| [5.0,13.0]|       3|
+-----------+--------+
only showing top 5 rows



In [50]:
gen_train,gen_test = gen_output.randomSplit([0.75, 0.25])

In [51]:
from pyspark.ml.regression import GeneralizedLinearRegression

In [81]:
glr = GeneralizedLinearRegression(family="gaussian", link="Identity", maxIter=10, regParam=0.3, labelCol='ArrDelay')

In [82]:
gen_model = glr.fit(gen_train)

In [83]:
print("Coefficients: " + str(gen_model.coefficients))
print("\nIntercept: " + str(gen_model.intercept))

Coefficients: [0.995425321448535,0.430985604572551]

Intercept: -8.207030823746992


In [84]:
trainSummary = gen_model.summary
trainSummary.pValues

[0.0, 0.0, 0.0]

In [85]:
predictions = gen_model.transform(gen_test)
x =((predictions['ArrDelay']-predictions['prediction'])/predictions['ArrDelay'])*100
predictions = predictions.withColumn('Accuracy',abs(x))
predictions.select("prediction","ArrDelay","Accuracy","features").show(10)

+-------------------+--------+------------------+--------------+
|         prediction|ArrDelay|          Accuracy|      features|
+-------------------+--------+------------------+--------------+
| -8.207030823746992|     -23| 64.31725728805657|     (2,[],[])|
| -8.207030823746992|     -23| 64.31725728805657|     (2,[],[])|
| -8.207030823746992|     -15| 45.28646117502005|     (2,[],[])|
| -8.207030823746992|     -10| 17.92969176253008|     (2,[],[])|
| -8.207030823746992|      -8|   2.5878852968374|     (2,[],[])|
| -8.207030823746992|      -3|173.56769412489973|     (2,[],[])|
| -8.207030823746992|       1| 920.7030823746992|     (2,[],[])|
| -96.50687265499955|       5|2030.1374530999913|[-174.0,197.0]|
|-60.174809228353176|      10| 701.7480922835318|  [-60.0,18.0]|
|-27.809304576095172|       3|1026.9768192031725|  [-50.0,70.0]|
+-------------------+--------+------------------+--------------+
only showing top 10 rows



In [86]:
pred_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="ArrDelay",metricName="r2")
print("R Squared (R2) on test data = %g" % pred_evaluator.evaluate(predictions))

R Squared (R2) on test data = 0.857663


### Decision Tree and Random Forest Regressions

In [10]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer, VectorAssembler

In [11]:
features = df.select(['DepDelay', 'TaxiOut', 'ArrDelay'])

gen_assembler = VectorAssembler(
    inputCols=features.columns[:-1],
    outputCol='features')

gen_output = gen_assembler.transform(df).select('features','ArrDelay')

featureIndexer = VectorIndexer(inputCol='features', outputCol='IndexedFeatures').fit(gen_output)

In [None]:
(train, test) = gen_output.randomSplit([0.7, 0.3])

### Decision Tree Regression

In [41]:
dt = DecisionTreeRegressor(featuresCol="IndexedFeatures", labelCol='ArrDelay')

In [42]:
pipeline = Pipeline(stages=[featureIndexer, dt])

In [43]:
# Train model.  This also runs the indexer.
model = pipeline.fit(train)

In [46]:
# Make predictions.
predictions = model.transform(test)

# Select example rows to display.
predictions.select("prediction", 'ArrDelay', "features").show(25)

+-------------------+--------+------------+
|         prediction|ArrDelay|    features|
+-------------------+--------+------------+
|-7.5995709293313665|     -25|   (2,[],[])|
|-7.5995709293313665|     -21|   (2,[],[])|
|-7.5995709293313665|     -18|   (2,[],[])|
|-7.5995709293313665|      -8|   (2,[],[])|
|-7.5995709293313665|      -2|   (2,[],[])|
|-7.5995709293313665|      -2|   (2,[],[])|
|-7.5995709293313665|      -2|   (2,[],[])|
|-7.5995709293313665|       5|   (2,[],[])|
|-7.5995709293313665|      -7|[-230.0,5.0]|
|-7.5995709293313665|     -33| [-34.0,3.0]|
|  32.15366308109039|      -5|[-34.0,62.0]|
|-7.5995709293313665|     -44|[-32.0,10.0]|
|-7.5995709293313665|     -34| [-31.0,4.0]|
| 0.3736192866377671|     -22|[-30.0,25.0]|
|  32.15366308109039|      -5|[-30.0,41.0]|
|-7.5995709293313665|     -35|[-29.0,14.0]|
|-7.5995709293313665|     -31|[-29.0,18.0]|
|-7.5995709293313665|     -29|[-28.0,11.0]|
|-7.5995709293313665|     -20|[-28.0,13.0]|
| 0.3736192866377671|      -2|[-

In [49]:
evaluator = RegressionEvaluator(
    labelCol='ArrDelay', predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

pred_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="ArrDelay",metricName="r2")
print("R Squared (R2) on test data = %g" % pred_evaluator.evaluate(predictions))

treeModel = model.stages[1]
# summary only
print(treeModel)

Root Mean Squared Error (RMSE) on test data = 17.6589
R Squared (R2) on test data = 0.718146
DecisionTreeRegressionModel: uid=DecisionTreeRegressor_6a94b21311e8, depth=5, numNodes=49, numFeatures=2


### Random Forest Regression

In [None]:
from pyspark.ml.regression import RandomForestRegressor

In [None]:
rf = RandomForestRegressor(featuresCol="IndexedFeatures", labelCol='ArrDelay')
pipeline = Pipeline(stages=[featureIndexer, rf])

In [None]:
model = pipeline.fit(train)

In [None]:
predictions = model.transform(test)

In [None]:
predictions.select("prediction", "ArrDelay", "features").show(25)

In [66]:
evaluator = RegressionEvaluator(
    labelCol='ArrDelay', predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

pred_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="ArrDelay",metricName="r2")
print("R Squared (R2) on test data = %g" % pred_evaluator.evaluate(predictions))

rfModel = model.stages[1]
print(rfModel)

Root Mean Squared Error (RMSE) on test data = 17.4934
R Squared (R2) on test data = 0.723404
RandomForestRegressionModel: uid=RandomForestRegressor_89b16ab5a513, numTrees=20, numFeatures=2


### Factorization Machines Regressor

In [33]:
features = df.drop('DepTime').drop('UniqueCarrier').drop('Origin').drop('Dest')

In [34]:
features.columns

['Month',
 'DayofMonth',
 'DayOfWeek',
 'CRSDepTime',
 'CRSArrTime',
 'FlightNum',
 'CRSElapsedTime',
 'ArrDelay',
 'DepDelay',
 'Distance',
 'TaxiOut',
 'Cancelled']

In [36]:
gen_assembler = VectorAssembler(
    inputCols=features.columns[:7]+features.columns[8:],
    outputCol='features')

gen_output = gen_assembler.transform(df).select('features','ArrDelay')

In [37]:
gen_output.select("ArrDelay").show(12)

+--------+
|ArrDelay|
+--------+
|     -14|
|      -4|
|       5|
|     -16|
|       3|
|     -10|
|      29|
|     -11|
|     -12|
|     -14|
|       8|
|     -14|
+--------+
only showing top 12 rows



In [38]:
from pyspark.ml.regression import FMRegressor
from pyspark.ml.feature import MinMaxScaler

In [39]:
# Scale features.
featureScaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures").fit(gen_output)


In [40]:
(train, test) = gen_output.randomSplit([0.7, 0.3])

In [41]:
# Train a FM model.
fm = FMRegressor(featuresCol="scaledFeatures", stepSize=0.001, labelCol='ArrDelay')

# Create a Pipeline.
pipeline = Pipeline(stages=[featureScaler, fm])

# Train model.
model = pipeline.fit(train)

In [42]:
# Make predictions.
predictions = model.transform(test)

# Select example rows to display.
predictions.select("prediction", 'ArrDelay', "features").show(5)

+------------------+--------+--------------------+
|        prediction|ArrDelay|            features|
+------------------+--------+--------------------+
|0.6931460279487999|      -5|[1.0,1.0,4.0,10.0...|
|0.7439296526564556|     -11|[1.0,1.0,4.0,15.0...|
|0.5867121567901064|     -27|[1.0,1.0,4.0,40.0...|
|0.3896635958684287|     -15|[1.0,1.0,4.0,44.0...|
|0.6615812978882051|       1|[1.0,1.0,4.0,44.0...|
+------------------+--------+--------------------+
only showing top 5 rows



In [43]:
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol='ArrDelay', predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

pred_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="ArrDelay",metricName="r2")
print("R Squared (R2) on test data = %g" % pred_evaluator.evaluate(predictions))

fmModel = model.stages[1]
print("Factors: " + str(fmModel.factors))
print("Linear: " + str(fmModel.linear))
print("Intercept: " + str(fmModel.intercept))

Root Mean Squared Error (RMSE) on test data = 33.7432
R Squared (R2) on test data = -0.0202498
Factors: DenseMatrix([[-0.13734176,  0.13575624, -0.1220944 ,  0.15238986,  0.12498728,
               0.14471925,  0.1413182 ,  0.14007785],
             [-0.13192597,  0.13580635, -0.15355515,  0.11318518,  0.13977094,
               0.13316608,  0.1235364 ,  0.13413498],
             [-0.12827936,  0.12645923, -0.14623821,  0.13907225,  0.12067736,
               0.136871  ,  0.13240495,  0.13535771],
             [-0.1247703 ,  0.13503355, -0.14268182,  0.14192383,  0.13903658,
               0.13126824,  0.13824103,  0.12818781],
             [-0.13956542,  0.13233411, -0.13272553,  0.13519752,  0.13791362,
               0.13503472,  0.13518516,  0.14467738],
             [-0.14617786,  0.12842154, -0.13927865,  0.13899924,  0.1418498 ,
               0.14035993,  0.14872769,  0.13289414],
             [-0.12038997,  0.13811026, -0.1231779 ,  0.1251023 ,  0.13239863,
               0.12

### Gradient-boosted tree regression

In [12]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorIndexer, VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

In [13]:
features = df.select(['DepDelay', 'TaxiOut', 'ArrDelay'])

gen_assembler = VectorAssembler(
    inputCols=features.columns[:-1],
    outputCol='features')

gen_output = gen_assembler.transform(df).select('features','ArrDelay')

featureIndexer = VectorIndexer(inputCol='features', outputCol='IndexedFeatures').fit(gen_output)


In [14]:
(train, test) = gen_output.randomSplit([0.7, 0.3])

In [15]:
gbt = GBTRegressor(featuresCol="IndexedFeatures", labelCol="ArrDelay", maxIter=10)

In [16]:
# Train model.  This also runs the indexer.
pipeline = Pipeline(stages=[featureIndexer, gbt])
model = pipeline.fit(train)

In [17]:
predictions = model.transform(test)

In [18]:
# Select example rows to display.
predictions.select("prediction", "ArrDelay", "features").show(20)

+-------------------+--------+------------+
|         prediction|ArrDelay|    features|
+-------------------+--------+------------+
| -9.313398604325107|     -21|   (2,[],[])|
| -9.313398604325107|     -12|   (2,[],[])|
| -9.313398604325107|      -8|   (2,[],[])|
| -9.313398604325107|      -2|   (2,[],[])|
| -9.313398604325107|      12|   (2,[],[])|
|  24.87841228497457|      -6|[-50.0,57.0]|
|   5.76236767039131|       6|[-37.0,38.0]|
|-16.387766928438673|     -38| [-36.0,7.0]|
|-18.432084676605974|     -33| [-34.0,3.0]|
|  24.87841228497457|      -5|[-34.0,62.0]|
|   5.76236767039131|      -3|[-33.0,36.0]|
|-14.091672352242332|     -44|[-32.0,10.0]|
| -17.07887537964493|     -43| [-31.0,6.0]|
|-15.804428891860686|     -33| [-31.0,8.0]|
|-14.091672352242332|     -47|[-31.0,10.0]|
|-16.387766928438673|     -43| [-30.0,7.0]|
|-10.961066056336845|     -26|[-30.0,15.0]|
|  24.87841228497457|      -5|[-30.0,41.0]|
|  -15.0802918377381|     -36| [-29.0,9.0]|
| -11.30213432461033|     -35|[-

In [19]:
evaluator = RegressionEvaluator(
    labelCol="ArrDelay", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

pred_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="ArrDelay",metricName="r2")
print("R Squared (R2) on test data = %g" % pred_evaluator.evaluate(predictions))

gbtModel = model.stages[1]
print(gbtModel)  # summary only

Root Mean Squared Error (RMSE) on test data = 16.7875
R Squared (R2) on test data = 0.746941
GBTRegressionModel: uid=GBTRegressor_9d63956e8de4, numTrees=10, numFeatures=2


In [29]:
sc.stop()