In [1]:
import findspark
findspark.init()

import pyspark
import random

In [2]:
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import isnan, when, count, col

In [3]:
filename = "2004.csv"

In [4]:
def _init_spark():
    spark = SparkSession.builder.appName("Project").getOrCreate()
    sc = spark.sparkContext
    return spark, sc

spark, sc = _init_spark()

sqlContext = SQLContext(sc)

df = sqlContext.read.load(filename, 
                      format='com.databricks.spark.csv', 
                      header='true',
                      delimiter=',',
                      inferSchema='true')
df.cache()

KeyboardInterrupt: 

In [None]:
col_to_drop = ['ArrTime', 'ActualElapsedTime', 'AirTime', 'TaxiIn', 'Diverted', 'CancellationCode',
               'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 'Year', 'TailNum', ]
df = df.drop(*col_to_drop)

df = df.withColumn("ArrDelay", df["ArrDelay"].cast(IntegerType()))
df = df.withColumn("DepDelay", df["DepDelay"].cast(IntegerType()))
df = df.na.drop("any")

In [None]:
df.printSchema()

In [None]:
df.show(5)

### PCA with Linear regression

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import PCA
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from  pyspark.sql.functions import abs
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
for_PCA = df.drop('UniqueCarrier').drop("Origin").drop("Dest").drop("DepTime")

In [None]:
df.columns

In [None]:
for_PCA.columns[8:]

In [None]:
#df_features  = df.drop("ArrDel")
assembler = VectorAssembler(inputCols=for_PCA.columns[:7]+for_PCA.columns[8:], outputCol='features')

In [None]:
data = assembler.transform(for_PCA)

In [None]:
pca = PCA(k=2, inputCol='features', outputCol='PCAfeatures')

In [None]:
pca.extractParamMap()

In [None]:
pca_model = pca.fit(data)

In [None]:
pca_data = pca_model.transform(data).select('PCAfeatures', 'ArrDelay')

In [None]:
pca_data.take(5)

In [None]:
#output = assembler.transform(for_PCA).select('features', 'ArrDelay')

In [None]:
train,test = pca_data.randomSplit([0.75, 0.25])

In [None]:
train.show(5)

In [None]:
lin_reg = LinearRegression(featuresCol = 'PCAfeatures', labelCol='ArrDelay')
linear_model = lin_reg.fit(train)

In [None]:
print("Coefficients: " + str(linear_model.coefficients))
print("\nIntercept: " + str(linear_model.intercept))

In [None]:
trainSummary = linear_model.summary
print("RMSE: %f" % trainSummary.rootMeanSquaredError)
print("\nr2: %f" % trainSummary.r2)

In [None]:
predictions = linear_model.transform(test)
x =((predictions['ArrDelay']-predictions['prediction'])/predictions['ArrDelay'])*100
predictions = predictions.withColumn('Accuracy',abs(x))
predictions.select("prediction","ArrDelay","Accuracy","PCAfeatures").show(10)

In [None]:
pred_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="ArrDelay",metricName="r2")
print("R Squared (R2) on test data = %g" % pred_evaluator.evaluate(predictions))

### Generalized linear regression

In [None]:
features = df.select(['DepDelay', 'TaxiOut'])

gen_assembler = VectorAssembler(
    inputCols=features.columns,
    outputCol='features')

gen_output = gen_assembler.transform(df).select('features','ArrDelay')

In [None]:
gen_output.show(5)

In [None]:
gen_train,gen_test = gen_output.randomSplit([0.75, 0.25])

In [None]:
from pyspark.ml.regression import GeneralizedLinearRegression

In [None]:
glr = GeneralizedLinearRegression(family="gaussian", link="Identity", maxIter=10, regParam=0.3, labelCol='ArrDelay')

In [None]:
gen_model = glr.fit(gen_train)

In [None]:
print("Coefficients: " + str(gen_model.coefficients))
print("\nIntercept: " + str(gen_model.intercept))

In [None]:
trainSummary = gen_model.summary
trainSummary.pValues

In [None]:
predictions = gen_model.transform(gen_test)
x =((predictions['ArrDelay']-predictions['prediction'])/predictions['ArrDelay'])*100
predictions = predictions.withColumn('Accuracy',abs(x))
predictions.select("prediction","ArrDelay","Accuracy","features").show(10)

In [None]:
pred_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="ArrDelay",metricName="r2")
print("R Squared (R2) on test data = %g" % pred_evaluator.evaluate(predictions))

### Decision Tree and Random Forest Regressions

In [None]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer, VectorAssembler

In [None]:
features = df.select(['DepDelay', 'TaxiOut', 'ArrDelay'])

gen_assembler = VectorAssembler(
    inputCols=features.columns[:-1],
    outputCol='features')

gen_output = gen_assembler.transform(df).select('features','ArrDelay')

featureIndexer = VectorIndexer(inputCol='features', outputCol='IndexedFeatures').fit(gen_output)

In [None]:
(train, test) = gen_output.randomSplit([0.7, 0.3])

### Decision Tree Regression

In [None]:
dt = DecisionTreeRegressor(featuresCol="IndexedFeatures", labelCol='ArrDelay')

In [None]:
pipeline = Pipeline(stages=[featureIndexer, dt])

In [None]:
# Train model.  This also runs the indexer.
model = pipeline.fit(train)

In [None]:
# Make predictions.
predictions = model.transform(test)

# Select example rows to display.
predictions.select("prediction", 'ArrDelay', "features").show(25)

In [None]:
    evaluator = RegressionEvaluator(
        labelCol='ArrDelay', predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

    pred_evaluator = RegressionEvaluator(predictionCol="prediction", \
                     labelCol="ArrDelay",metricName="r2")
    print("R Squared (R2) on test data = %g" % pred_evaluator.evaluate(predictions))

    treeModel = model.stages[1]
    # summary only
    print(treeModel)

### Random Forest Regression

In [None]:
from pyspark.ml.regression import RandomForestRegressor

In [None]:
rf = RandomForestRegressor(featuresCol="IndexedFeatures", labelCol='ArrDelay')
pipeline = Pipeline(stages=[featureIndexer, rf])

In [None]:
model = pipeline.fit(train)

In [None]:
predictions = model.transform(test)

In [None]:
predictions.select("prediction", "ArrDelay", "features").show(25)

In [None]:
    evaluator = RegressionEvaluator(
        labelCol='ArrDelay', predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

    pred_evaluator = RegressionEvaluator(predictionCol="prediction", \
                     labelCol="ArrDelay",metricName="r2")
    print("R Squared (R2) on test data = %g" % pred_evaluator.evaluate(predictions))

    rfModel = model.stages[1]
    print(rfModel)

### Factorization Machines Regressor

In [None]:
features = df.drop('DepTime').drop('UniqueCarrier').drop('Origin').drop('Dest')

In [None]:
features.columns

In [None]:
gen_assembler = VectorAssembler(
    inputCols=features.columns[:7]+features.columns[8:],
    outputCol='features')

gen_output = gen_assembler.transform(df).select('features','ArrDelay')

In [None]:
gen_output.select("ArrDelay").show(12)

In [None]:
from pyspark.ml.regression import FMRegressor
from pyspark.ml.feature import MinMaxScaler

In [None]:
# Scale features.
featureScaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures").fit(gen_output)


In [None]:
(train, test) = gen_output.randomSplit([0.7, 0.3])

In [None]:
# Train a FM model.
fm = FMRegressor(featuresCol="scaledFeatures", stepSize=0.001, labelCol='ArrDelay')

# Create a Pipeline.
pipeline = Pipeline(stages=[featureScaler, fm])

# Train model.
model = pipeline.fit(train)

In [None]:
# Make predictions.
predictions = model.transform(test)

# Select example rows to display.
predictions.select("prediction", 'ArrDelay', "features").show(5)

In [None]:
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol='ArrDelay', predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

pred_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="ArrDelay",metricName="r2")
print("R Squared (R2) on test data = %g" % pred_evaluator.evaluate(predictions))

fmModel = model.stages[1]
print("Factors: " + str(fmModel.factors))
print("Linear: " + str(fmModel.linear))
print("Intercept: " + str(fmModel.intercept))

### Gradient-boosted tree regression

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorIndexer, VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
features = df.select(['DepDelay', 'TaxiOut', 'ArrDelay'])

gen_assembler = VectorAssembler(
    inputCols=features.columns[:-1],
    outputCol='features')

gen_output = gen_assembler.transform(df).select('features','ArrDelay')

featureIndexer = VectorIndexer(inputCol='features', outputCol='IndexedFeatures').fit(gen_output)


In [None]:
(train, test) = gen_output.randomSplit([0.7, 0.3])

In [None]:
gbt = GBTRegressor(featuresCol="IndexedFeatures", labelCol="ArrDelay", maxIter=10)

In [None]:
# Train model.  This also runs the indexer.
pipeline = Pipeline(stages=[featureIndexer, gbt])
model = pipeline.fit(train)

In [None]:
predictions = model.transform(test)

In [None]:
# Select example rows to display.
predictions.select("prediction", "ArrDelay", "features").show(20)

In [None]:
evaluator = RegressionEvaluator(
    labelCol="ArrDelay", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

pred_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="ArrDelay",metricName="r2")
print("R Squared (R2) on test data = %g" % pred_evaluator.evaluate(predictions))

gbtModel = model.stages[1]
print(gbtModel)  # summary only

In [None]:
sc.stop()

In [None]:
val = False

(str(val).lower() == 'true')



## New Stage (Ignore info Above)

In [7]:
import random

import findspark
findspark.init()
import pyspark

from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, Bucketizer, Normalizer
from pyspark.ml.regression import LinearRegression
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import isnan, when, count, col, abs
from pyspark.sql import functions as sf 
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator
from pyspark.ml import Pipeline


filename = "2004.csv"

def _init_spark():
    spark = SparkSession.builder.appName("Project").getOrCreate()
    sc = spark.sparkContext
    return spark, sc

spark, sc = _init_spark()
sqlContext = SQLContext(sc)

df = sqlContext.read.load(filename, 
                      format='com.databricks.spark.csv', 
                      header='true',
                      delimiter=',',
                      inferSchema='true')
df.cache()

col_to_drop = ['ArrTime', 'ActualElapsedTime', 'AirTime', 'TaxiIn', 'Diverted',
               'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay', 
               'Year', 'TailNum', 'CancellationCode' ] # Only those 3 I added up to delay, others 
                                                       # are delayed as is stated in the task
df = df.drop(*col_to_drop)
df = df.withColumn("ArrDelay", df["ArrDelay"].cast(IntegerType()))
df = df.withColumn("DepDelay", df["DepDelay"].cast(IntegerType()))
df = df.withColumn("CRSDepTime", df["CRSDepTime"].cast(IntegerType()))
df = df.withColumn("CRSArrTime", df["CRSArrTime"].cast(IntegerType()))
df = df.withColumn("DepTime", df["DepTime"].cast(IntegerType()))

#These are new lines to add
df = df.withColumn("Month", df["Month"].cast(IntegerType()))
df = df.withColumn("DayOfWeek", df["DayOfWeek"].cast(IntegerType()))
df = df.withColumn("CRSElapsedTime", df["CRSElapsedTime"].cast(IntegerType()))
df = df.withColumn("Distance", df["Distance"].cast(IntegerType()))
df = df.withColumn("TaxiOut", df["TaxiOut"].cast(IntegerType()))
#end here


df = df.filter("Cancelled == 0") #select only those flights that happened
df = df.drop("Cancelled")
df = df.drop(*["UniqueCarrier", "DayofMonth", "FlightNum"]) #Droping unimportant categorical variables

df = df.na.drop("any") # Drop columns with null values +- 99% of dataset remains 

df = df.withColumn('OrigDest', 
                    sf.concat(sf.col('Origin'),sf.lit('_'), sf.col('Dest')))
df = df.withColumn("Speed", sf.round(col("Distance") / col("CRSElapsedTime"), 2))
#df = df.withColumnRenamed("ArrDelay", "label")

#These are new lines to add
df = df.drop(*["Origin", "Dest", "Distance", "CRSElapsedTime"])
#end here

In [8]:
train, test = df.randomSplit([.5, 0.1], seed=1234)

In [9]:
train.show(5)
#train = train.limit(1000000)
#test = test.limit(250000)

+-----+---------+-------+----------+----------+--------+--------+-------+--------+-----+
|Month|DayOfWeek|DepTime|CRSDepTime|CRSArrTime|ArrDelay|DepDelay|TaxiOut|OrigDest|Speed|
+-----+---------+-------+----------+----------+--------+--------+-------+--------+-----+
|    1|        1|      2|      2355|       855|       3|       7|     17| HNL_PHX|  8.1|
|    1|        1|      3|      2355|       855|      -1|       8|     11| HNL_PHX|  8.1|
|    1|        1|      4|      2215|      2335|     101|     109|     15| SFO_ONT| 4.54|
|    1|        1|      7|      2000|      2142|     261|     247|     35| ORD_MEM| 4.81|
|    1|        1|      7|      2220|      2341|      94|     107|     11| CLT_ATL|  2.8|
+-----+---------+-------+----------+----------+--------+--------+-------+--------+-----+
only showing top 5 rows



In [10]:
X1 = ['DepDelay', 'TaxiOut']
X2 = ['DepDelay', 'TaxiOut',  'HotDepTime']

In [11]:
from pyspark.ml.feature import StandardScaler

In [16]:
splits = [-float("inf"), 500, 1200, 1700, float("inf")]



bucketizer = Bucketizer(splitsArray= [splits, splits, splits], \
                        inputCols=["CRSDepTime", "CRSArrTime", "DepTime"],\
                        outputCols=["CatCRSDepTime", "CatCRSArrTime", "CatDepTime"])

varIdxer = StringIndexer(inputCol="OrigDest", outputCol="IndOrigDest")


oneHot = OneHotEncoder(inputCols=['Month', 'DayOfWeek', 'CatCRSDepTime', 'CatCRSArrTime',\
                                  'IndOrigDest', 'CatDepTime'],
                       outputCols=['HotMonth', 'HotDayOfWeek', 'HotCRSCatDepTime', 'HotCRSCatArrTime',\
                                   'HotIndOrigDest', 'HotDepTime'])

assembler = VectorAssembler(inputCols=X2, outputCol='features')

#scaler = StandardScaler(inputCol='feat', outputCol="features",
#                        withStd=True, withMean=False)

lin_reg = LinearRegression(featuresCol = 'features', labelCol="ArrDelay")

In [17]:
# df = df.drop(*["CRSDepTime", "CRSArrTime"])
# df = df.drop("OrigDest")
# df = df.drop(*['Month', 'DayOfWeek', 'CatDepTime', 'CatCRSDepTime', 'CatCRSArrTime', 'IndOrigDest'])
# df = df.drop(*["Distance", "CRSElapsedTime"])

In [18]:
pipeline = Pipeline(stages=[bucketizer, varIdxer, oneHot, assembler,  lin_reg])

In [19]:
linParamGrid = ParamGridBuilder()\
    .addGrid(lin_reg.regParam, [0.1, 0.01]) \
    .addGrid(lin_reg.fitIntercept, [False, True])\
    .addGrid(lin_reg.elasticNetParam, [0.0, 1.0])\
    .build()

#.addGrid(lin_reg.maxIter, [1, 3])

tvs = CrossValidator(estimator=pipeline,\
                           estimatorParamMaps = linParamGrid,  
                           evaluator=RegressionEvaluator(labelCol="ArrDelay", metricName="rmse"),\
                           numFolds=3)
                           #trainRatio=0.85)

In [20]:
model = tvs.fit(train)
#model = pipeline.fit(train)

In [21]:
predictions = model.transform(test)
x =((predictions['ArrDelay']-predictions['prediction'])/predictions['ArrDelay'])*100
predictions = predictions.withColumn('Accuracy',abs(x))
#predictions.select("prediction","ArrDelay","Accuracy","features").show(10)

In [22]:
pred_evaluator = RegressionEvaluator(predictionCol="prediction",labelCol="ArrDelay",metricName="r2")
print("R Squared (R2) on test data = %g" % pred_evaluator.evaluate(predictions))

evaluator = RegressionEvaluator(labelCol='ArrDelay', predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

evaluator = RegressionEvaluator(labelCol='ArrDelay', predictionCol="prediction", metricName="mae")
mae = evaluator.evaluate(predictions)
print("Mean Absolute Error (MAE) on test data = %g" % mae)

R Squared (R2) on test data = 0.920564
Root Mean Squared Error (RMSE) on test data = 11.1777
Mean Absolute Error (MAE) on test data = 8.0773


R Squared (R2) on test data = 0.832352
Root Mean Squared Error (RMSE) on test data = 12.9493
Mean Absolute Error (MAE) on test data = 8.24118

Grid + CV
R Squared (R2) on test data = 0.89077
Root Mean Squared Error (RMSE) on test data = 12.4729
Mean Absolute Error (MAE) on test data = 8.31376

In [36]:
from pyspark.ml.regression import GBTRegressor
#from pyspark.ml.regression import FMRegressor

In [42]:
gbt = GBTRegressor(featuresCol="features", labelCol="ArrDelay", maxIter=10)
#fm = FMRegressor(featuresCol="features", stepSize=0.001, labelCol='ArrDelay')

In [43]:
pipeline = Pipeline(stages=[bucketizer, varIdxer, oneHot, assembler, fm])

In [44]:
TreeParamGrid = ParamGridBuilder()\
    .addGrid(gbt.maxDepth, [2, 10])\
    .addGrid(gbt.maxBins, [10, 20])\
    .build()

tvs = CrossValidator(estimator=pipeline,
                           estimatorParamMaps=TreeParamGrid, #remove if don't want to use ParamGridBuilder
                           evaluator=RegressionEvaluator(labelCol="ArrDelay", metricName="rmse"),
                           numFolds=3)
                     #trainRatio=0.85)

In [45]:
train.show(5)

+-----+---------+-------+----------+----------+--------------+--------+--------+--------+-------+--------+-----+
|Month|DayOfWeek|DepTime|CRSDepTime|CRSArrTime|CRSElapsedTime|ArrDelay|DepDelay|Distance|TaxiOut|OrigDest|Speed|
+-----+---------+-------+----------+----------+--------------+--------+--------+--------+-------+--------+-----+
|    1|        1|      2|      2355|       855|           360|       3|       7|    2917|     17| HNL_PHX|  8.1|
|    1|        1|      3|      2355|       855|           360|      -1|       8|    2917|     11| HNL_PHX|  8.1|
|    1|        1|      4|      2215|      2335|            80|     101|     109|     363|     15| SFO_ONT| 4.54|
|    1|        1|      7|      2000|      2142|           102|     261|     247|     491|     35| ORD_MEM| 4.81|
|    1|        1|      7|      2220|      2341|            81|      94|     107|     227|     11| CLT_ATL|  2.8|
+-----+---------+-------+----------+----------+--------------+--------+--------+--------+-------

In [46]:
model = tvs.fit(train)
#model = pipeline.fit(train)

In [47]:
predictions = model.transform(test)

In [48]:
x =((predictions['ArrDelay']-predictions['prediction'])/predictions['ArrDelay'])*100
predictions = predictions.withColumn('Accuracy',abs(x))
#predictions.select("prediction","ArrDelay","Accuracy","features").show(10)

In [49]:
pred_evaluator = RegressionEvaluator(predictionCol="prediction",labelCol="ArrDelay",metricName="r2")
print("R Squared (R2) on test data = %g" % pred_evaluator.evaluate(predictions))

evaluator = RegressionEvaluator(labelCol='ArrDelay', predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

evaluator = RegressionEvaluator(labelCol='ArrDelay', predictionCol="prediction", metricName="mae")
mae = evaluator.evaluate(predictions)
print("Mean Absolute Error (MAE) on test data = %g" % mae)

R Squared (R2) on test data = 0.615811
Root Mean Squared Error (RMSE) on test data = 24.5814
Mean Absolute Error (MAE) on test data = 13.1251


In [8]:
X = []
X.append({ "name": "X1", "variables": ['DepDelay', 'TaxiOut']})
X.append({ "name": "X2", "variables": ['DepDelay', 'TaxiOut',  'HotDepTime']})
X.append({ "name": "X3", "variables": ['DepDelay', 'TaxiOut', 'HotIndOrigDest', 'HotDepTime']})
X.append({ "name": "X4", "variables": ['DepDelay', 'TaxiOut', 'HotDayOfWeek', 'HotMonth', 'Speed']})
X.append({ "name": "X5", "variables": ['DepDelay', 'TaxiOut', 'HotDayOfWeek', 'HotIndOrigDest', 'Speed']})
X.append({ "name": "X6", "variables": ['DepDelay', 'TaxiOut', 'HotIndOrigDest', 'Speed', 'HotCRSCatDepTime', 'HotCRSCatArrTime', 'HotDepTime']})

X[2]['variables']

['DepDelay', 'TaxiOut', 'HotIndOrigDest', 'HotDepTime']

In [None]:
import pandas as pd


df = []
df.append({'name': 'X1', 'variables': ['DepDelay', 'TaxiOut'],'R2LR': 0.860337, 'maeLR': 7.83117, 'rmseLR': 12.4009, 'R2RF': 0.7113226179229843, 'maeRF': 9.632419894894845, 'rmseRF': 17.870397493687378, 'R2DT': 0.833566, 'maeDT': 7.76935, 'rmseDT': 13.5342, 'R2GBR': 0.6779749591610146, 'maeGBR': 10.291720285041185, 'rmseGBR': 18.93692861799665})
df.append({'name': 'X2', 'variables': ['DepDelay', 'TaxiOut', 'HotDepTime'], 'R2LR': 0.860466, 'maeLR': 7.82792, 'rmseLR': 12.3951 , 'R2RF': 0.6618056282411904, 'maeRF': 10.69581629434264, 'rmseRF': 19.317928448876994, 'R2DT': 0.7359741049968364, 'maeDT': 9.276273778053989, 'rmseDT': 17.06870714215145, 'R2GBR': 0.7317420904346483, 'maeGBR': 8.919500016072934, 'rmseGBR': 17.40522038041701})
df.append({'name': 'X3', 'variables': ['DepDelay', 'TaxiOut', 'HotIndOrigDest', 'HotDepTime'], 'R2LR':  0.867357, 'maeLR': 7.39696, 'rmseLR': 12.114, 'R2RF': 0.6893474138730874, 'maeRF': 10.391762216453797, 'rmseRF': 18.48095278940399, 'R2DT': 0.7429083054863235, 'maeDT': 9.244540689810753, 'rmseDT': 16.812445684525386, 'R2GBR': 0.7340066674151544, 'maeGBR': 8.970977217618819, 'rmseGBR': 17.224534616062524})
df.append({'name': 'X4', 'variables': ['DepDelay', 'TaxiOut', 'HotDayOfWeek', 'HotMonth', 'Speed'], 'R2LR': 0.860717, 'maeLR': 7.79462, 'rmseLR': 12.4614, 'R2RF': 0.6473918259213414, 'maeRF': 10.698236247649278, 'rmseRF': 19.77355050663056, 'R2DT': 0.734263104956572, 'maeDT': 9.320667734524053, 'rmseDT': 17.16581381584866, 'R2GBR': 0.7466087502783435, 'maeGBR': 8.799469470636394, 'rmseGBR': 16.83094772729247})
df.append({'name': 'X5', 'variables': ['DepDelay', 'TaxiOut', 'HotDayOfWeek', 'HotIndOrigDest', 'Speed'], 'R2LR': 0.87867, 'maeLR': 7.121, 'rmseLR': 11.6102, 'R2RF': 0.6450061986902308, 'maeRF': 10.972519639797552, 'rmseRF': 19.980872071516536, 'R2DT': 0.7374275663556327, 'maeDT': 9.256949155299973, 'rmseDT': 17.184172320005104, 'R2GBR': 0.7402478789139912, 'maeGBR': 8.916626117462856, 'rmseGBR': 17.048795624738972})
df.append({'name': 'X6', 'variables': ['DepDelay', 'TaxiOut', 'HotIndOrigDest', 'Speed', 'HotCRSCatDepTime', 'HotCRSCatArrTime', 'HotDepTime'], 'R2LR': 0.8775881, 'maeLR': 7.06387, 'rmseLR': 11.7076 , 'R2RF': 0.6932635305087731, 'maeRF': 10.377749609744654, 'rmseRF': 18.532997429411278, 'R2DT': 0.7426814121619514, 'maeDT': 9.235528412342111, 'rmseDT': 16.974560476441024, 'R2GBR': 0.7393962128806398, 'maeGBR': 8.897008426472556, 'rmseGBR': 16.96102270986277})

DF = pd.DataFrame(df)
DF

In [None]:
import pandas as pd


df = []
df.append({'name': 'X1', 'variables': ['DepDelay', 'TaxiOut'],'R2LR': 0.860337, 'maeLR': 7.83117, 'rmseLR': 12.4009 })
df.append({'name': 'X2', 'variables': ['DepDelay', 'TaxiOut', 'HotDepTime'], 'R2LR': 0.860466, 'maeLR': 7.82792, 'rmseLR': 12.3951  })
df.append({'name': 'X3', 'variables': ['DepDelay', 'TaxiOut', 'HotIndOrigDest', 'HotDepTime'], 'R2LR':  0.867357, 'maeLR': 7.39696, 'rmseLR': 12.114 })
df.append({'name': 'X4', 'variables': ['DepDelay', 'TaxiOut', 'HotDayOfWeek', 'HotMonth', 'Speed'], 'R2LR': 0.860717, 'maeLR': 7.79462, 'rmseLR': 12.4614 })
df.append({'name': 'X5', 'variables': ['DepDelay', 'TaxiOut', 'HotDayOfWeek', 'HotIndOrigDest', 'Speed'], 'R2LR': 0.87867, 'maeLR': 7.121, 'rmseLR': 11.6102 })
df.append({'name': 'X6', 'variables': ['DepDelay', 'TaxiOut', 'HotIndOrigDest', 'Speed', 'HotCRSCatDepTime', 'HotCRSCatArrTime', 'HotDepTime'], 'R2LR': 0.8775881, 'maeLR': 7.06387, 'rmseLR': 11.7076 })

DF = pd.DataFrame(df)
DF

R Squared (R2) on test data = 0.678511
Root Mean Squared Error (RMSE) on test data = 17.932
Mean Absolute Error (MAE) on test data = 9.42207

Pipeline:

R Squared (R2) on test data = 0.739797
Root Mean Squared Error (RMSE) on test data = 16.9819
Mean Absolute Error (MAE) on test data = 8.91377


Grid:
R Squared (R2) on test data = 0.700806
Root Mean Squared Error (RMSE) on test data = 18.2102
Mean Absolute Error (MAE) on test data = 9.28254

Grid +StandScale
R Squared (R2) on test data = 0.700806
Root Mean Squared Error (RMSE) on test data = 18.2102
Mean Absolute Error (MAE) on test data = 9.28254