In [1]:
# run this to shorten the data import from the files
path_data = '/home/nero/Documents/Estudos/DataCamp/Python/Machine_Learning_with_PySpark/datasets/'

# start spark session

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[*]').appName('chapter_02').getOrCreate()

23/07/13 07:49:13 WARN Utils: Your hostname, nero resolves to a loopback address: 127.0.1.1; using 192.168.1.14 instead (on interface wlp2s0)
23/07/13 07:49:13 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/13 07:49:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/07/13 07:49:17 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
flights = spark.read.csv(path_data + 'clean_flights.csv',
                         sep=',',
                         header=True,
                         inferSchema=True,
                         nullValue='NA')

from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Specify column names and types
schema = StructType([
    StructField("id", IntegerType()),
    StructField("text", StringType()),
    StructField("label", IntegerType())
])

# Load data from a delimited file
sms = spark.read.csv(path_data+'sms.csv', sep=';', header=False, schema=schema)

sms.createOrReplaceTempView('sms')
flights.createOrReplaceTempView('flights')

spark.catalog.listTables()

                                                                                

[Table(name='flights', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True),
 Table(name='sms', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]

In [3]:
flights.show(5)

+---+---+---+-------+---+----+------+--------+-----+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|
+---+---+---+-------+---+----+------+--------+-----+
|  0| 22|  2|     UA|ORD| 316| 16.33|      82|   30|
|  2| 20|  4|     UA|SFO| 337|  6.17|      82|   -8|
|  9| 13|  1|     AA|ORD|1236| 10.33|     195|   -5|
|  5|  2|  1|     UA|SFO| 550|  7.98|     102|    2|
|  7|  2|  6|     AA|ORD| 733| 10.83|     135|   54|
+---+---+---+-------+---+----+------+--------+-----+
only showing top 5 rows



In [4]:
# create org_index

from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol='org', outputCol='org_idx')

indexer_model = indexer.fit(flights)

flights = indexer_model.transform(flights)

flights.show(5)

                                                                                

+---+---+---+-------+---+----+------+--------+-----+-------+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|org_idx|
+---+---+---+-------+---+----+------+--------+-----+-------+
|  0| 22|  2|     UA|ORD| 316| 16.33|      82|   30|    0.0|
|  2| 20|  4|     UA|SFO| 337|  6.17|      82|   -8|    1.0|
|  9| 13|  1|     AA|ORD|1236| 10.33|     195|   -5|    0.0|
|  5|  2|  1|     UA|SFO| 550|  7.98|     102|    2|    1.0|
|  7|  2|  6|     AA|ORD| 733| 10.83|     135|   54|    0.0|
+---+---+---+-------+---+----+------+--------+-----+-------+
only showing top 5 rows



In [5]:
# exercise 01

"""
Encoding flight origin

The org column in the flights data is a categorical variable giving the airport from which a flight departs.

    ORD — O'Hare International Airport (Chicago)
    SFO — San Francisco International Airport
    JFK — John F Kennedy International Airport (New York)
    LGA — La Guardia Airport (New York)
    SMF — Sacramento
    SJC — San Jose
    OGG — Kahului (Hawaii)

Obviously this is only a small subset of airports. Nevertheless, since this is a categorical variable, it needs to be one-hot encoded before it can be used in a regression model.

The data are in a variable called flights. You have already used a string indexer to create a column of indexed values corresponding to the strings in org.

You might find it useful to revise the slides from the lessons in the Slides panel next to the IPython Shell.
"""

# Instructions

"""

    Import the one-hot encoder class.
    Create a one-hot encoder instance, naming the input column org_idx and the output column org_dummy.
    Apply the one-hot encoder to the flights data.
    Generate a summary of the mapping from categorical values to binary encoded dummy variables. Include only unique values and order by org_idx.

"""

# solution

# Import the one hot encoder class
from pyspark.ml.feature import OneHotEncoder

# Create an instance of the one hot encoder
onehot = OneHotEncoder(inputCols=['org_idx'], outputCols=['org_dummy'])

# Apply the one hot encoder to the flights data
onehot = onehot.fit(flights)
flights_onehot = onehot.transform(flights)

# Check the results
flights_onehot.select('org', 'org_idx', 'org_dummy').distinct().sort('org_idx').show()

#----------------------------------#

# Conclusion

"""
Excellent! Note that one of the category levels, OGG, does not get a dummy variable.
"""

[Stage 10:>                                                         (0 + 1) / 1]

+---+-------+-------------+
|org|org_idx|    org_dummy|
+---+-------+-------------+
|ORD|    0.0|(7,[0],[1.0])|
|SFO|    1.0|(7,[1],[1.0])|
|JFK|    2.0|(7,[2],[1.0])|
|LGA|    3.0|(7,[3],[1.0])|
|SMF|    4.0|(7,[4],[1.0])|
|SJC|    5.0|(7,[5],[1.0])|
|TUS|    6.0|(7,[6],[1.0])|
|OGG|    7.0|    (7,[],[])|
+---+-------+-------------+



                                                                                

'\nExcellent! Note that one of the category levels, OGG, does not get a dummy variable.\n'

In [6]:
flights.show(5)
from pyspark.sql.functions import round
flights = flights.withColumn('km', round(flights.mile * 1.609344, 2) )
flights.show(5)

+---+---+---+-------+---+----+------+--------+-----+-------+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|org_idx|
+---+---+---+-------+---+----+------+--------+-----+-------+
|  0| 22|  2|     UA|ORD| 316| 16.33|      82|   30|    0.0|
|  2| 20|  4|     UA|SFO| 337|  6.17|      82|   -8|    1.0|
|  9| 13|  1|     AA|ORD|1236| 10.33|     195|   -5|    0.0|
|  5|  2|  1|     UA|SFO| 550|  7.98|     102|    2|    1.0|
|  7|  2|  6|     AA|ORD| 733| 10.83|     135|   54|    0.0|
+---+---+---+-------+---+----+------+--------+-----+-------+
only showing top 5 rows

+---+---+---+-------+---+----+------+--------+-----+-------+-------+
|mon|dom|dow|carrier|org|mile|depart|duration|delay|org_idx|     km|
+---+---+---+-------+---+----+------+--------+-----+-------+-------+
|  0| 22|  2|     UA|ORD| 316| 16.33|      82|   30|    0.0| 508.55|
|  2| 20|  4|     UA|SFO| 337|  6.17|      82|   -8|    1.0| 542.35|
|  9| 13|  1|     AA|ORD|1236| 10.33|     195|   -5|    0.0|1989.15|
|  5|  2|  1

In [7]:
flights = flights.drop('mile')

In [8]:
flights.show(2)

+---+---+---+-------+---+------+--------+-----+-------+------+
|mon|dom|dow|carrier|org|depart|duration|delay|org_idx|    km|
+---+---+---+-------+---+------+--------+-----+-------+------+
|  0| 22|  2|     UA|ORD| 16.33|      82|   30|    0.0|508.55|
|  2| 20|  4|     UA|SFO|  6.17|      82|   -8|    1.0|542.35|
+---+---+---+-------+---+------+--------+-----+-------+------+
only showing top 2 rows



In [9]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols = ['km'], outputCol='features')

flights = assembler.transform(flights)

flights.show(5)

+---+---+---+-------+---+------+--------+-----+-------+-------+---------+
|mon|dom|dow|carrier|org|depart|duration|delay|org_idx|     km| features|
+---+---+---+-------+---+------+--------+-----+-------+-------+---------+
|  0| 22|  2|     UA|ORD| 16.33|      82|   30|    0.0| 508.55| [508.55]|
|  2| 20|  4|     UA|SFO|  6.17|      82|   -8|    1.0| 542.35| [542.35]|
|  9| 13|  1|     AA|ORD| 10.33|     195|   -5|    0.0|1989.15|[1989.15]|
|  5|  2|  1|     UA|SFO|  7.98|     102|    2|    1.0| 885.14| [885.14]|
|  7|  2|  6|     AA|ORD| 10.83|     135|   54|    0.0|1179.65|[1179.65]|
+---+---+---+-------+---+------+--------+-----+-------+-------+---------+
only showing top 5 rows



In [10]:
# Create an instance of the one hot encoder
onehot = OneHotEncoder(inputCols=['org_idx'], outputCols=['org_dummy'])

# Apply the one hot encoder to the flights data
onehot = onehot.fit(flights)
flights = onehot.transform(flights)

flights.show(5)

+---+---+---+-------+---+------+--------+-----+-------+-------+---------+-------------+
|mon|dom|dow|carrier|org|depart|duration|delay|org_idx|     km| features|    org_dummy|
+---+---+---+-------+---+------+--------+-----+-------+-------+---------+-------------+
|  0| 22|  2|     UA|ORD| 16.33|      82|   30|    0.0| 508.55| [508.55]|(7,[0],[1.0])|
|  2| 20|  4|     UA|SFO|  6.17|      82|   -8|    1.0| 542.35| [542.35]|(7,[1],[1.0])|
|  9| 13|  1|     AA|ORD| 10.33|     195|   -5|    0.0|1989.15|[1989.15]|(7,[0],[1.0])|
|  5|  2|  1|     UA|SFO|  7.98|     102|    2|    1.0| 885.14| [885.14]|(7,[1],[1.0])|
|  7|  2|  6|     AA|ORD| 10.83|     135|   54|    0.0|1179.65|[1179.65]|(7,[0],[1.0])|
+---+---+---+-------+---+------+--------+-----+-------+-------+---------+-------------+
only showing top 5 rows



In [11]:
flights_train, flights_test = flights.randomSplit([0.8, 0.2], seed=43)

In [12]:
# exercise 02

"""
Flight duration model: Just distance

In this exercise you'll build a regression model to predict flight duration (the duration column).

For the moment you'll keep the model simple, including only the distance of the flight (the km column) as a predictor.

The data are in flights. The first few records are displayed in the terminal. These data have also been split into training and testing sets and are available as flights_train and flights_test.
"""

# Instructions

"""

    Create a linear regression object. Specify the name of the label column. Fit it to the training data.
    Make predictions on the testing data.
    Create a regression evaluator object and use it to evaluate RMSE on the testing data.

"""

# solution

from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Create a regression object and train on training data
regression = LinearRegression(labelCol='duration').fit(flights_train)

# Create predictions for the testing data and take a look at the predictions
predictions = regression.transform(flights_test)
predictions.select('duration', 'prediction').show(5, False)

# Calculate the RMSE
print(RegressionEvaluator(labelCol='duration').evaluate(predictions))

#----------------------------------#

# Conclusion

"""
You've built a simple regression model. Let's make sense of the coefficients!
"""

23/07/13 07:49:49 WARN Instrumentation: [2177ec93] regParam is zero, which might cause numerical instability and overfitting.
                                                                                

+--------+------------------+
|duration|prediction        |
+--------+------------------+
|230     |238.3539368954673 |
|250     |213.06697799844466|
|170     |133.31492829911446|
|251     |245.0409226204015 |
|155     |141.70312187002145|
+--------+------------------+
only showing top 5 rows



[Stage 21:>                                                         (0 + 1) / 1]

16.98634272885328


                                                                                

"\nYou've built a simple regression model. Let's make sense of the coefficients!\n"

In [13]:
# exercise 03

"""
Interpreting the coefficients

The linear regression model for flight duration as a function of distance takes the form:
duration = a + b * distance

where

a   — intercept (component of duration which does not depend on distance) and

b    — coefficient (rate at which duration increases as a function of distance; also called the slope).

By looking at the coefficients of your model you will be able to infer

    how much of the average flight duration is actually spent on the ground and
    what the average speed is during a flight.

The linear regression model is available as regression.
"""

# Instructions

"""

    What's the intercept?
    What are the coefficients? This is a vector.
    Extract the element from the vector which corresponds to the slope for distance.
    Find the average speed in km per hour.

"""

# solution

# Intercept (average minutes on ground)
inter = regression.intercept
print(inter)

# Coefficients
coefs = regression.coefficients
print(coefs)

# Average minutes per km
minutes_per_km = regression.coefficients[0]
print(minutes_per_km)

# Average speed in km per hour
avg_speed = 60 / minutes_per_km
print(avg_speed)

#----------------------------------#

# Conclusion

"""
The average speed of a commercial jet is around 850 km/hour. But you got that already from the data!
"""

44.2017029215891
[0.0755420890751709]
0.0755420890751709
794.2592101245018


'\nThe average speed of a commercial jet is around 850 km/hour. But you got that already from the data!\n'

In [14]:
flights = flights.drop('features')

In [15]:
assembler = VectorAssembler(inputCols = ['km', 'org_dummy'], outputCol='features')

flights = assembler.transform(flights)
flights.show(5)

+---+---+---+-------+---+------+--------+-----+-------+-------+-------------+--------------------+
|mon|dom|dow|carrier|org|depart|duration|delay|org_idx|     km|    org_dummy|            features|
+---+---+---+-------+---+------+--------+-----+-------+-------+-------------+--------------------+
|  0| 22|  2|     UA|ORD| 16.33|      82|   30|    0.0| 508.55|(7,[0],[1.0])|(8,[0,1],[508.55,...|
|  2| 20|  4|     UA|SFO|  6.17|      82|   -8|    1.0| 542.35|(7,[1],[1.0])|(8,[0,2],[542.35,...|
|  9| 13|  1|     AA|ORD| 10.33|     195|   -5|    0.0|1989.15|(7,[0],[1.0])|(8,[0,1],[1989.15...|
|  5|  2|  1|     UA|SFO|  7.98|     102|    2|    1.0| 885.14|(7,[1],[1.0])|(8,[0,2],[885.14,...|
|  7|  2|  6|     AA|ORD| 10.83|     135|   54|    0.0|1179.65|(7,[0],[1.0])|(8,[0,1],[1179.65...|
+---+---+---+-------+---+------+--------+-----+-------+-------+-------------+--------------------+
only showing top 5 rows



In [16]:
flights.select('features').show(5)

+--------------------+
|            features|
+--------------------+
|(8,[0,1],[508.55,...|
|(8,[0,2],[542.35,...|
|(8,[0,1],[1989.15...|
|(8,[0,2],[885.14,...|
|(8,[0,1],[1179.65...|
+--------------------+
only showing top 5 rows



In [17]:
flights_train, flights_test = flights.randomSplit([0.8, 0.2], seed=43)

In [18]:
# exercise 04

"""
Flight duration model: Adding origin airport

Some airports are busier than others. Some airports are bigger than others too. Flights departing from large or busy airports are likely to spend more time taxiing or waiting for their takeoff slot. So it stands to reason that the duration of a flight might depend not only on the distance being covered but also the airport from which the flight departs.

You are going to make the regression model a little more sophisticated by including the departure airport as a predictor.

These data have been split into training and testing sets and are available as flights_train and flights_test. The origin airport, stored in the org column, has been indexed into org_idx, which in turn has been one-hot encoded into org_dummy. The first few records are displayed in the terminal.
"""

# Instructions

"""

    Fit a linear regression model to the training data.
    Make predictions for the testing data.
    Calculate the RMSE for predictions on the testing data.

"""

# solution

from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Create a regression object and train on training data
regression = LinearRegression(labelCol='duration').fit(flights_train)

# Create predictions for the testing data
predictions = regression.transform(flights_test)

# Calculate the RMSE on testing data
print(RegressionEvaluator(labelCol='duration').evaluate(predictions))

#----------------------------------#

# Conclusion

"""
Looking good! Let's try to make sense of the coefficients.
"""

23/07/13 07:49:59 WARN Instrumentation: [6c5f3a25] regParam is zero, which might cause numerical instability and overfitting.
[Stage 26:>                                                         (0 + 1) / 1]

11.138902990617611


                                                                                

"\nLooking good! Let's try to make sense of the coefficients.\n"

In [19]:
# exercise 05

"""
Interpreting coefficients

Remember that origin airport, org, has eight possible values (ORD, SFO, JFK, LGA, SMF, SJC, TUS and OGG) which have been one-hot encoded to seven dummy variables in org_dummy.

The values for km and org_dummy have been assembled into features, which has eight columns with sparse representation. Column indices in features are as follows:

    0 — km
    1 — ORD
    2 — SFO
    3 — JFK
    4 — LGA
    5 — SMF
    6 — SJC and
    7 — TUS.

Note that OGG does not appear in this list because it is the reference level for the origin airport category.

In this exercise you'll be using the intercept and coefficients attributes to interpret the model.

The coefficients attribute is a list, where the first element indicates how flight duration changes with flight distance.
"""

# Instructions

"""

    Find the average speed in km per hour. This will be different to the value that you got earlier because your model is now more sophisticated.
    What's the average time on the ground at OGG?
    What's the average time on the ground at JFK?
    What's the average time on the ground at LGA?

"""

# solution

# Average speed in km per hour
avg_speed_hour = 60 / regression.coefficients[0]
print(avg_speed_hour)

# Average minutes on ground at OGG
inter = regression.intercept
print(inter)

# Average minutes on ground at JFK
avg_ground_jfk = inter + regression.coefficients[3]
print(avg_ground_jfk)

# Average minutes on ground at LGA
avg_ground_lga = inter + regression.coefficients[4]
print(avg_ground_lga)

#----------------------------------#

# Conclusion

"""
You're going to spend over an hour on the ground at JFK or LGA but only around 15 minutes at OGG.
"""

808.2526745460777
15.363012686519335
68.24811794897823
62.586065423434526


"\nYou're going to spend over an hour on the ground at JFK or LGA but only around 15 minutes at OGG.\n"

In [26]:
flights.show()

+---+---+---+-------+---+------+--------+-----+-------+-------+-------------+--------------------+
|mon|dom|dow|carrier|org|depart|duration|delay|org_idx|     km|    org_dummy|            features|
+---+---+---+-------+---+------+--------+-----+-------+-------+-------------+--------------------+
|  0| 22|  2|     UA|ORD| 16.33|      82|   30|    0.0| 508.55|(7,[0],[1.0])|(8,[0,1],[508.55,...|
|  2| 20|  4|     UA|SFO|  6.17|      82|   -8|    1.0| 542.35|(7,[1],[1.0])|(8,[0,2],[542.35,...|
|  9| 13|  1|     AA|ORD| 10.33|     195|   -5|    0.0|1989.15|(7,[0],[1.0])|(8,[0,1],[1989.15...|
|  5|  2|  1|     UA|SFO|  7.98|     102|    2|    1.0| 885.14|(7,[1],[1.0])|(8,[0,2],[885.14,...|
|  7|  2|  6|     AA|ORD| 10.83|     135|   54|    0.0|1179.65|(7,[0],[1.0])|(8,[0,1],[1179.65...|
|  1| 16|  6|     UA|ORD|   8.0|     232|   -7|    0.0|2317.46|(7,[0],[1.0])|(8,[0,1],[2317.46...|
|  1| 22|  5|     UA|SJC|  7.98|     250|  -13|    5.0|2943.49|(7,[5],[1.0])|(8,[0,6],[2943.49...|
| 11|  8| 

In [33]:
# exercise 06

"""
Bucketing departure time

Time of day data are a challenge with regression models. They are also a great candidate for bucketing.

In this lesson you will convert the flight departure times from numeric values between 0 (corresponding to 00:00) and 24 (corresponding to 24:00) to binned values. You'll then take those binned values and one-hot encode them.
"""

# Instructions

"""

    Create a bucketizer object with bin boundaries at 0, 3, 6, …, 24 which correspond to times 0:00, 03:00, 06:00, …, 24:00. Specify input column as depart and output column as depart_bucket.
    Bucket the departure times in the flights data. Show the first five values for depart and depart_bucket.
    Create a one-hot encoder object. Specify output column as depart_dummy.
    Train the encoder on the data and then use it to convert the bucketed departure times to dummy variables. Show the first five values for depart, depart_bucket and depart_dummy.

"""

# solution

from pyspark.ml.feature import Bucketizer, OneHotEncoder

# Create buckets at 3 hour intervals through the day
buckets = Bucketizer(splits=[0,3,6,9,12,15,18,21,24], inputCol='depart', outputCol='depart_bucket')

# Bucket the departure times
bucketed = buckets.transform(flights)
bucketed.select('depart','depart_bucket').show(5)

# Create a one-hot encoder
onehot = OneHotEncoder(inputCols=['depart_bucket'], outputCols=['depart_dummy'])

# One-hot encode the bucketed departure times
flights_onehot = onehot.fit(bucketed).transform(bucketed)
flights_onehot.select('depart','depart_bucket','depart_dummy').show(5)

#----------------------------------#

# Conclusion

"""
Now you can add departure time to your regression model.
"""

+------+-------------+
|depart|depart_bucket|
+------+-------------+
| 16.33|          5.0|
|  6.17|          2.0|
| 10.33|          3.0|
|  7.98|          2.0|
| 10.83|          3.0|
+------+-------------+
only showing top 5 rows

+------+-------------+-------------+
|depart|depart_bucket| depart_dummy|
+------+-------------+-------------+
| 16.33|          5.0|(7,[5],[1.0])|
|  6.17|          2.0|(7,[2],[1.0])|
| 10.33|          3.0|(7,[3],[1.0])|
|  7.98|          2.0|(7,[2],[1.0])|
| 10.83|          3.0|(7,[3],[1.0])|
+------+-------------+-------------+
only showing top 5 rows



'\nNow you can add departure time to your regression model.\n'

In [34]:
flights_onehot.show(5)

+---+---+---+-------+---+------+--------+-----+-------+-------+-------------+--------------------+-------------+-------------+
|mon|dom|dow|carrier|org|depart|duration|delay|org_idx|     km|    org_dummy|            features|depart_bucket| depart_dummy|
+---+---+---+-------+---+------+--------+-----+-------+-------+-------------+--------------------+-------------+-------------+
|  0| 22|  2|     UA|ORD| 16.33|      82|   30|    0.0| 508.55|(7,[0],[1.0])|(8,[0,1],[508.55,...|          5.0|(7,[5],[1.0])|
|  2| 20|  4|     UA|SFO|  6.17|      82|   -8|    1.0| 542.35|(7,[1],[1.0])|(8,[0,2],[542.35,...|          2.0|(7,[2],[1.0])|
|  9| 13|  1|     AA|ORD| 10.33|     195|   -5|    0.0|1989.15|(7,[0],[1.0])|(8,[0,1],[1989.15...|          3.0|(7,[3],[1.0])|
|  5|  2|  1|     UA|SFO|  7.98|     102|    2|    1.0| 885.14|(7,[1],[1.0])|(8,[0,2],[885.14,...|          2.0|(7,[2],[1.0])|
|  7|  2|  6|     AA|ORD| 10.83|     135|   54|    0.0|1179.65|(7,[0],[1.0])|(8,[0,1],[1179.65...|          3.0

In [35]:
flights_onehot = flights_onehot.drop('features')

In [36]:
assembler = VectorAssembler(inputCols=['km','org_dummy','depart_dummy'], outputCol='features')

flights_onehot = assembler.transform(flights_onehot)

In [37]:
flights_train, flights_test = flights_onehot.randomSplit([0.80,0.20])

In [42]:
regression = LinearRegression(labelCol='duration')

regression = regression.fit(flights_train)

23/07/13 08:22:13 WARN Instrumentation: [b26ea8d2] regParam is zero, which might cause numerical instability and overfitting.
                                                                                

In [43]:
predictions = regression.transform(flights_test)

In [44]:
regression.coefficients

DenseVector([0.0744, 27.3947, 20.4508, 51.7473, 45.7651, 15.2181, 17.6185, 17.487, -14.0193, 1.2458, 3.9721, 7.0535, 4.6997, 8.9605, 8.6856])

In [45]:
# exercise 07

"""
Flight duration model: Adding departure time

In the previous exercise the departure time was bucketed and converted to dummy variables. Now you're going to include those dummy variables in a regression model for flight duration.

The data are in flights. The km, org_dummy and depart_dummy columns have been assembled into features, where km is index 0, org_dummy runs from index 1 to 7 and depart_dummy from index 8 to 14.

The data have been split into training and testing sets and a linear regression model, regression, has been built on the training data. Predictions have been made on the testing data and are available as predictions.
"""

# Instructions

"""

    Find the RMSE for predictions on the testing data.
    Find the average time spent on the ground for flights departing from OGG between 21:00 and 24:00.
    Find the average time spent on the ground for flights departing from OGG between 03:00 and 06:00.
    Find the average time spent on the ground for flights departing from JFK between 03:00 and 06:00.

"""

# solution

# Find the RMSE on testing data
from pyspark.ml.evaluation import RegressionEvaluator
rmse = RegressionEvaluator(labelCol='duration').evaluate(predictions)
print("The test RMSE is", rmse)

# Average minutes on ground at OGG for flights departing between 21:00 and 24:00
avg_eve_ogg = regression.coefficients[0]
print(avg_eve_ogg)

# Average minutes on ground at OGG for flights departing between 03:00 and 06:00
avg_night_ogg = regression.coefficients[0] + regression.coefficients[8]
print(avg_night_ogg)

# Average minutes on ground at JFK for flights departing between 03:00 and 06:00
avg_night_jfk = regression.coefficients[0] + regression.coefficients[3] + regression.coefficients[9]
print(avg_night_jfk)

#----------------------------------#

# Conclusion

"""
Adding departure time resulted in a smaller RMSE. Nice!
"""

[Stage 39:>                                                         (0 + 1) / 1]

The test RMSE is 10.638596453806654
0.07435557535744923
-13.94499015852287
53.06749009982143


                                                                                

'\nAdding departure time resulted in a smaller RMSE. Nice!\n'

In [48]:
flights_onehot = flights_onehot.drop('features')

In [49]:
flights_onehot.show(5)

+---+---+---+-------+---+------+--------+-----+-------+-------+-------------+-------------+-------------+
|mon|dom|dow|carrier|org|depart|duration|delay|org_idx|     km|    org_dummy|depart_bucket| depart_dummy|
+---+---+---+-------+---+------+--------+-----+-------+-------+-------------+-------------+-------------+
|  0| 22|  2|     UA|ORD| 16.33|      82|   30|    0.0| 508.55|(7,[0],[1.0])|          5.0|(7,[5],[1.0])|
|  2| 20|  4|     UA|SFO|  6.17|      82|   -8|    1.0| 542.35|(7,[1],[1.0])|          2.0|(7,[2],[1.0])|
|  9| 13|  1|     AA|ORD| 10.33|     195|   -5|    0.0|1989.15|(7,[0],[1.0])|          3.0|(7,[3],[1.0])|
|  5|  2|  1|     UA|SFO|  7.98|     102|    2|    1.0| 885.14|(7,[1],[1.0])|          2.0|(7,[2],[1.0])|
|  7|  2|  6|     AA|ORD| 10.83|     135|   54|    0.0|1179.65|(7,[0],[1.0])|          3.0|(7,[3],[1.0])|
+---+---+---+-------+---+------+--------+-----+-------+-------+-------------+-------------+-------------+
only showing top 5 rows



In [50]:
# Create a one-hot encoder
onehot = OneHotEncoder(inputCols=['dow','mon'], outputCols=['dow_dummy','mon_dummy'])

# One-hot encode the bucketed departure times
flights_onehot = onehot.fit(flights_onehot).transform(flights_onehot)

In [51]:
flights_onehot.show(5)

+---+---+---+-------+---+------+--------+-----+-------+-------+-------------+-------------+-------------+-------------+--------------+
|mon|dom|dow|carrier|org|depart|duration|delay|org_idx|     km|    org_dummy|depart_bucket| depart_dummy|    dow_dummy|     mon_dummy|
+---+---+---+-------+---+------+--------+-----+-------+-------+-------------+-------------+-------------+-------------+--------------+
|  0| 22|  2|     UA|ORD| 16.33|      82|   30|    0.0| 508.55|(7,[0],[1.0])|          5.0|(7,[5],[1.0])|(6,[2],[1.0])|(11,[0],[1.0])|
|  2| 20|  4|     UA|SFO|  6.17|      82|   -8|    1.0| 542.35|(7,[1],[1.0])|          2.0|(7,[2],[1.0])|(6,[4],[1.0])|(11,[2],[1.0])|
|  9| 13|  1|     AA|ORD| 10.33|     195|   -5|    0.0|1989.15|(7,[0],[1.0])|          3.0|(7,[3],[1.0])|(6,[1],[1.0])|(11,[9],[1.0])|
|  5|  2|  1|     UA|SFO|  7.98|     102|    2|    1.0| 885.14|(7,[1],[1.0])|          2.0|(7,[2],[1.0])|(6,[1],[1.0])|(11,[5],[1.0])|
|  7|  2|  6|     AA|ORD| 10.83|     135|   54|    0.0|

In [52]:
assembler = VectorAssembler(inputCols=['km','org_dummy','depart_dummy','dow_dummy','mon_dummy'], outputCol='features')

flights_onehot = assembler.transform(flights_onehot)

In [53]:
flights_onehot.show(5)

+---+---+---+-------+---+------+--------+-----+-------+-------+-------------+-------------+-------------+-------------+--------------+--------------------+
|mon|dom|dow|carrier|org|depart|duration|delay|org_idx|     km|    org_dummy|depart_bucket| depart_dummy|    dow_dummy|     mon_dummy|            features|
+---+---+---+-------+---+------+--------+-----+-------+-------+-------------+-------------+-------------+-------------+--------------+--------------------+
|  0| 22|  2|     UA|ORD| 16.33|      82|   30|    0.0| 508.55|(7,[0],[1.0])|          5.0|(7,[5],[1.0])|(6,[2],[1.0])|(11,[0],[1.0])|(32,[0,1,13,17,21...|
|  2| 20|  4|     UA|SFO|  6.17|      82|   -8|    1.0| 542.35|(7,[1],[1.0])|          2.0|(7,[2],[1.0])|(6,[4],[1.0])|(11,[2],[1.0])|(32,[0,2,10,19,23...|
|  9| 13|  1|     AA|ORD| 10.33|     195|   -5|    0.0|1989.15|(7,[0],[1.0])|          3.0|(7,[3],[1.0])|(6,[1],[1.0])|(11,[9],[1.0])|(32,[0,1,11,16,30...|
|  5|  2|  1|     UA|SFO|  7.98|     102|    2|    1.0| 885.14|(

In [54]:
flights_train, flights_test = flights_onehot.randomSplit([0.80,0.20])

In [55]:
# exercise 08

"""
Flight duration model: More features!

Let's add more features to our model. This will not necessarily result in a better model. Adding some features might improve the model. Adding other features might make it worse.

More features will always make the model more complicated and difficult to interpret.

These are the features you'll include in the next model:

    km
    org (origin airport, one-hot encoded, 8 levels)
    depart (departure time, binned in 3 hour intervals, one-hot encoded, 8 levels)
    dow (departure day of week, one-hot encoded, 7 levels) and
    mon (departure month, one-hot encoded, 12 levels).

These have been assembled into the features column, which is a sparse representation of 32 columns (remember one-hot encoding produces a number of columns which is one fewer than the number of levels).

The data are available as flights, randomly split into flights_train and flights_test.

This exercise is based on a small subset of the flights data.
"""

# Instructions

"""

    Fit a linear regression model to the training data.
    Generate predictions for the testing data.
    Calculate the RMSE on the testing data.
    Look at the model coefficients. Are any of them zero?

"""

# solution

from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Fit linear regression model to training data
regression = LinearRegression(labelCol='duration').fit(flights_train)

# Make predictions on testing data
predictions = regression.transform(flights_test)

# Calculate the RMSE on testing data
rmse = RegressionEvaluator(labelCol='duration').evaluate(predictions)
print("The test RMSE is", rmse)

# Look at the model coefficients
coeffs = regression.coefficients
print(coeffs)

#----------------------------------#

# Conclusion

"""
With all those non-zero coefficients the model is a little hard to interpret!
"""

23/07/13 08:47:38 WARN Instrumentation: [548137fc] regParam is zero, which might cause numerical instability and overfitting.
[Stage 47:>                                                         (0 + 1) / 1]

The test RMSE is 10.531195157372931
[0.07436477656392289,27.487645693808908,20.397230206228524,51.889234255668,45.69223770081051,15.214310615202088,17.787236632574448,17.71180184102375,-13.75337236247439,0.4432479539468115,3.9509532780266183,6.810040136120186,4.479654429457599,8.773909520174302,8.603673311933496,0.598327932613395,0.25441407278610223,-0.06710802173014885,0.26964590860408594,0.34668367949430823,0.26819413632390743,-1.9268450265648842,-2.3515887174090317,-2.0684396127203004,-3.3670319518999987,-4.326856752875204,-4.279151249251972,-4.459499352574625,-4.566245777112856,-4.015301382600413,-2.8936400167693166,-0.9144867119445865]


                                                                                

'\nWith all those non-zero coefficients the model is a little hard to interpret!\n'

In [56]:
# exercise 09

"""
Flight duration model: Regularization!

In the previous exercise you added more predictors to the flight duration model. The model performed well on testing data, but with so many coefficients it was difficult to interpret.

In this exercise you'll use Lasso regression (regularized with a L1 penalty) to create a more parsimonious model. Many of the coefficients in the resulting model will be set to zero. This means that only a subset of the predictors actually contribute to the model. Despite the simpler model, it still produces a good RMSE on the testing data.

You'll use a specific value for the regularization strength. Later you'll learn how to find the best value using cross validation.

The data (same as previous exercise) are available as flights, randomly split into flights_train and flights_test.

There are two parameters for this model, λ (regParam) and α (elasticNetParam), where α determines the type of regularization and λ gives the strength of regularization.
"""

# Instructions

"""

    Fit a linear regression model to the training data. Set the regularization strength to 1.
    Calculate the RMSE on the testing data.
    Look at the model coefficients.
    How many of the coefficients are equal to zero?

"""

# solution

from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Fit Lasso model (λ = 1, α = 1) to training data
regression = LinearRegression(labelCol='duration', regParam=1, elasticNetParam=1).fit(flights_train)

# Calculate the RMSE on testing data
rmse = RegressionEvaluator(labelCol='duration').evaluate(regression.transform(flights_test))
print("The test RMSE is", rmse)

# Look at the model coefficients
coeffs = regression.coefficients
print(coeffs)

# Number of zero coefficients
zero_coeff = sum([1 for beta in regression.coefficients if beta == 0])
print("Number of coefficients equal to 0:", zero_coeff)


#----------------------------------#

# Conclusion

"""
Regularization produced a far simpler model with similar test performance.
"""

[Stage 50:>                                                         (0 + 1) / 1]

The test RMSE is 11.55135177997585
[0.07348072701992543,5.5614246214444645,0.0,28.99534168833446,21.626473969350062,-2.3354953526285382,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.204932714816671,1.152275265236853,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]
Number of coefficients equal to 0: 25


                                                                                

'\nRegularization produced a far simpler model with similar test performance.\n'

In [58]:
data = flights_onehot.toPandas()

data.to_csv(path_data + 'flights_onehot.csv')

                                                                                

In [59]:
spark.stop()