<h1> Experimenting with different models </h1>

In this notebook, we try out different ideas. The first thing we have to do is create a validation set, so that we do not experiment on our independent test dataset.

In [5]:
# Name of the storage bucket; it is substituted into the gs:// paths read below.
BUCKET='cs358-bucket'

import os
# Also publish the bucket name through the process environment.
os.environ['BUCKET'] = BUCKET

In [2]:
from __future__ import print_function
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql.types import StringType, FloatType, StructType, StructField

In [3]:
# Create spark session

from __future__ import print_function
from pyspark.sql import SparkSession
from pyspark import SparkContext

# Obtain the context through getOrCreate so this cell can be re-run in a live
# kernel: calling SparkContext(...) a second time raises ValueError because
# only one SparkContext may be active per process.
from pyspark import SparkConf

sc = SparkContext.getOrCreate(
    SparkConf().setMaster('local').setAppName('experimentation')
)

# The session builder attaches to the context created (or found) above.
spark = (
    SparkSession.builder
    .appName("experimentation w/ Spark ML")
    .getOrCreate()
)

print(spark)
print(sc)

<pyspark.sql.session.SparkSession object at 0x7fce57630e10>
<SparkContext master=local appName=experimentation>


<h2> Read dataset </h2>

In [6]:
# trainday.csv is read with its first line treated as the header row.
traindays = (
    spark.read
    .option("header", "true")
    .csv('gs://{}/flights/trainday.csv'.format(BUCKET))
)
# Register the frame so the SQL queries below can refer to it by name.
traindays.createOrReplaceTempView('traindays')

In [7]:
# Comma-separated column names, in order; split below to build the schema
# that is applied when the flights CSV files are read.
header = 'FL_DATE,UNIQUE_CARRIER,AIRLINE_ID,CARRIER,FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CANCELLED,CANCELLATION_CODE,DIVERTED,DISTANCE,DEP_AIRPORT_LAT,DEP_AIRPORT_LON,DEP_AIRPORT_TZOFFSET,ARR_AIRPORT_LAT,ARR_AIRPORT_LON,ARR_AIRPORT_TZOFFSET,EVENT,NOTIFY_TIME'

def get_structfield(colname):
    """Return the nullable schema field for one flights column.

    The four columns used numerically (ARR_DELAY, DEP_DELAY, DISTANCE,
    TAXI_OUT) are typed as floats; every other column is kept as a string.
    """
    float_columns = ('ARR_DELAY', 'DEP_DELAY', 'DISTANCE', 'TAXI_OUT')
    field_type = FloatType() if colname in float_columns else StringType()
    return StructField(colname, field_type, True)

# One field per header column, in the same order as the header string.
schema = StructType(list(map(get_structfield, header.split(','))))

In [8]:
# Path template for the flights files; the bucket name fills the {} slot.
inputs = 'gs://{}/flights/tzcorr/all_flights-00000-*' # 1/30th
#inputs = 'gs://{}/flights/tzcorr/all_flights-*'  # FULL

# Read with the explicit schema built above.
flights = (
    spark.read
    .schema(schema)
    .csv(inputs.format(BUCKET))
)

# Expose the frame to SQL under the name 'flights'.
flights.createOrReplaceTempView('flights')

<h2> Create separate training and validation data </h2>

In [9]:
from pyspark.sql.functions import rand
SEED = 13  # seed for the random draw that decides the split
traindays = traindays.withColumn(
    "holdout",
    rand(SEED) > 0.8,  # 80% of data is for training; draws above 0.8 are held out
)
# Re-register the view so SQL queries see the new holdout column.
traindays.createOrReplaceTempView('traindays')

In [10]:
# Show the first ten rows to inspect the added holdout column.
traindays.head(10)

[Row(FL_DATE='2015-01-01', is_train_day='True', holdout=False),
 Row(FL_DATE='2015-01-02', is_train_day='False', holdout=True),
 Row(FL_DATE='2015-01-03', is_train_day='False', holdout=False),
 Row(FL_DATE='2015-01-04', is_train_day='True', holdout=False),
 Row(FL_DATE='2015-01-05', is_train_day='True', holdout=True),
 Row(FL_DATE='2015-01-06', is_train_day='False', holdout=False),
 Row(FL_DATE='2015-01-07', is_train_day='True', holdout=False),
 Row(FL_DATE='2015-01-08', is_train_day='True', holdout=False),
 Row(FL_DATE='2015-01-09', is_train_day='True', holdout=False),
 Row(FL_DATE='2015-01-10', is_train_day='True', holdout=False)]

<h2> Logistic regression </h2>

In [11]:
# Training rows: flights joined to their day record, restricted to days with
# is_train_day == 'True' and holdout == False, and to rows whose CANCELLED and
# DIVERTED columns are both '0.00'.
# NOTE: the exact text "t.holdout == False" is replaced later to build the
# evaluation query, so keep that predicate's spelling unchanged.
trainquery = """
SELECT
  *
FROM flights f
JOIN traindays t
ON f.FL_DATE == t.FL_DATE
WHERE
  t.is_train_day == 'True' AND
  t.holdout == False AND
  f.CANCELLED == '0.00' AND 
  f.DIVERTED == '0.00'
"""
traindata = spark.sql(trainquery)

In [12]:
# Show one joined row to check the columns available for features.
traindata.head()

Row(FL_DATE='2015-05-16', UNIQUE_CARRIER='VX', AIRLINE_ID='21171', CARRIER='VX', FL_NUM='108', ORIGIN_AIRPORT_ID='12892', ORIGIN_AIRPORT_SEQ_ID='1289203', ORIGIN_CITY_MARKET_ID='32575', ORIGIN='LAX', DEST_AIRPORT_ID='12264', DEST_AIRPORT_SEQ_ID='1226402', DEST_CITY_MARKET_ID='30852', DEST='IAD', CRS_DEP_TIME='2015-05-16T15:25:00', DEP_TIME='2015-05-16T15:20:00', DEP_DELAY=-5.0, TAXI_OUT=12.0, WHEELS_OFF='2015-05-16T15:32:00', WHEELS_ON='2015-05-16T20:07:00', TAXI_IN='7.00', CRS_ARR_TIME='2015-05-16T20:25:00', ARR_TIME='2015-05-16T20:14:00', ARR_DELAY=-11.0, CANCELLED='0.00', CANCELLATION_CODE=None, DIVERTED='0.00', DISTANCE=2288.0, DEP_AIRPORT_LAT='33.94250000', DEP_AIRPORT_LON='-118.40805556', DEP_AIRPORT_TZOFFSET='-25200.0', ARR_AIRPORT_LAT='38.94750000', ARR_AIRPORT_LON='-77.46000000', ARR_AIRPORT_TZOFFSET='-14400.0', EVENT=None, NOTIFY_TIME=None, FL_DATE='2015-05-16', is_train_day='True', holdout=False)

In [13]:
def to_example(fields):
    """Turn one flight row into a LabeledPoint.

    The label is 1.0 when ARR_DELAY is below 15 (treated as on time) and 0.0
    otherwise. The features are DEP_DELAY, TAXI_OUT and DISTANCE, in that
    order.
    """
    ontime = float(fields['ARR_DELAY'] < 15)
    feature_names = ('DEP_DELAY', 'TAXI_OUT', 'DISTANCE')
    features = [fields[name] for name in feature_names]
    return LabeledPoint(ontime, features)

In [14]:
# Training examples: each row of traindata mapped through to_example.
examples = traindata.rdd.map(to_example)

In [15]:
# Fit a logistic regression (with an intercept term) on the training examples.
lrmodel = LogisticRegressionWithLBFGS.train(examples, intercept=True)
# Weights are listed in feature order: DEP_DELAY, TAXI_OUT, DISTANCE.
print(lrmodel.weights,lrmodel.intercept)

[-0.16588414918675193,-0.1258966534699923,0.0002796516834817356] 5.1096425611240575


In [16]:
# Decision threshold on the predicted on-time probability.
# NOTE(review): with a threshold set, predict() presumably returns a 0/1 class
# rather than a raw probability - confirm against the MLlib documentation.
lrmodel.setThreshold(0.7) # cancel if prob-of-ontime < 0.7

<h2> Evaluate model on the held-out data </h2>


In [17]:
# Build the validation query by flipping the holdout predicate of the
# training query.
evalquery = trainquery.replace("t.holdout == False","t.holdout == True")
# str.replace silently does nothing when the pattern is missing; without this
# guard a reworded training query would make us evaluate on training rows.
assert evalquery != trainquery, "holdout predicate not found in trainquery"
print(evalquery)


SELECT
  *
FROM flights f
JOIN traindays t
ON f.FL_DATE == t.FL_DATE
WHERE
  t.is_train_day == 'True' AND
  t.holdout == True AND
  f.CANCELLED == '0.00' AND 
  f.DIVERTED == '0.00'



In [18]:
# Held-out rows, converted with the same to_example mapping as the training data.
evaldata = spark.sql(evalquery)
# NOTE: this rebinds `examples`; from here on it holds validation examples,
# not the training examples it held earlier.
examples = evaldata.rdd.map(to_example)

In [20]:
def eval(labelpred, threshold=0.7):
    '''Summarise how often cancel / no-cancel decisions match the labels.

    A prediction below ``threshold`` is treated as a "cancel" decision and a
    prediction at or above it as "no cancel". A decision is correct when the
    label equals ``int(pred >= threshold)``.

    Args:
        labelpred: collection of ``(label, pred)`` pairs that supports
            ``filter(fn)`` and ``count()`` (an RDD in this notebook).
        threshold: cut-off applied to ``pred``. Defaults to 0.7, the value
            this function previously hard-coded.

    Returns:
        dict with the keys
            'total_cancel'      - number of cancel decisions
            'correct_cancel'    - fraction of cancel decisions that are correct
            'total_noncancel'   - number of no-cancel decisions
            'correct_noncancel' - fraction of no-cancel decisions that are correct
        A fraction is 0.0 when its group is empty.

    Note: the name shadows the builtin ``eval``; it is kept because other
    cells call it by this name.
    '''
    cancel = labelpred.filter(lambda data: data[1] < threshold)
    nocancel = labelpred.filter(lambda data: data[1] >= threshold)

    # Each count() is a separate pass over the data, so take every count
    # exactly once and reuse the numbers.
    total_cancel = cancel.count()
    total_nocancel = nocancel.count()
    corr_cancel = cancel.filter(
        lambda data: data[0] == int(data[1] >= threshold)).count()
    corr_nocancel = nocancel.filter(
        lambda data: data[0] == int(data[1] >= threshold)).count()

    # max(..., 1) avoids dividing by zero for an empty group.
    return {'total_cancel': total_cancel,
            'correct_cancel': float(corr_cancel) / max(total_cancel, 1),
            'total_noncancel': total_nocancel,
            'correct_noncancel': float(corr_nocancel) / max(total_nocancel, 1)
           }

In [21]:
# Pair each validation example's true label with the model's prediction for it.
labelpred = examples.map(lambda p: (p.label, lrmodel.predict(p.features)))
# Summarise cancel / no-cancel accuracy on the held-out examples.
print(eval(labelpred))

{'total_cancel': 6540, 'correct_cancel': 0.7807339449541284, 'total_noncancel': 27572, 'correct_noncancel': 0.9657986362976934}


Copyright 2019 Google Inc. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.