<h1> Experimenting with different models </h1>

In this notebook, we try out different modeling ideas. The first step is to create a validation set, so that we do not experiment against our independent test dataset.

In [1]:
# Imported here because this cell runs before the notebook's other import
# cells and needs os.environ.
import os

# Name of the Cloud Storage bucket this notebook works against.
BUCKET = 'cloud-training-demos-ml'

# Mirror the bucket name into the process environment
# (presumably so shell cells/commands can read $BUCKET -- confirm).
os.environ['BUCKET'] = BUCKET

In [2]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.regression import LabeledPoint

<h2> Read dataset </h2>

In [3]:
# Load the per-date CSV (it has a header row) from the bucket configured in
# BUCKET, rather than repeating the bucket name as a literal here.
traindays = (
    spark.read
    .option("header", "true")
    .csv('gs://{}/flights/trainday.csv'.format(BUCKET))
)
# Register the DataFrame so SQL queries can refer to it as `traindays`.
traindays.createOrReplaceTempView('traindays')

In [4]:
from pyspark.sql.types import StringType, FloatType, StructType, StructField

# Comma-separated column names, in file order, for the flights CSV files
# (which are read with an explicit schema instead of a header row).
# Adjacent string literals are concatenated into one string.
header = (
    'FL_DATE,UNIQUE_CARRIER,AIRLINE_ID,CARRIER,FL_NUM,'
    'ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,'
    'DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST_CITY_MARKET_ID,DEST,'
    'CRS_DEP_TIME,DEP_TIME,DEP_DELAY,TAXI_OUT,WHEELS_OFF,'
    'WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,'
    'CANCELLED,CANCELLATION_CODE,DIVERTED,DISTANCE,'
    'DEP_AIRPORT_LAT,DEP_AIRPORT_LON,DEP_AIRPORT_TZOFFSET,'
    'ARR_AIRPORT_LAT,ARR_AIRPORT_LON,ARR_AIRPORT_TZOFFSET,'
    'EVENT,NOTIFY_TIME'
)

def get_structfield(colname):
    """Return the nullable StructField describing one flights column.

    ARR_DELAY, DEP_DELAY, DISTANCE and TAXI_OUT are typed as floats;
    every other column name is typed as a string.
    """
    float_columns = ('ARR_DELAY', 'DEP_DELAY', 'DISTANCE', 'TAXI_OUT')
    field_type = FloatType() if colname in float_columns else StringType()
    return StructField(colname, field_type, True)

# One StructField per column name, in the same order as `header`.
schema = StructType(list(map(get_structfield, header.split(','))))

In [5]:
# Build the input glob from BUCKET so the bucket is configured in one place.
# Swap which `inputs` line is active to switch between the sample shard and
# the complete dataset.
inputs = 'gs://{}/flights/tzcorr/all_flights-00000-*'.format(BUCKET)  # 1/30th
#inputs = 'gs://{}/flights/tzcorr/all_flights-*'.format(BUCKET)  # FULL

# The files are read with the explicit schema built above.
flights = (
    spark.read
    .schema(schema)
    .csv(inputs)
)

# this view can now be queried ...
flights.createOrReplaceTempView('flights')

<h2> Create separate training and validation data </h2>

In [6]:
from pyspark.sql.functions import rand

# Mark a day as held out when its random draw exceeds 0.8, i.e. roughly 20% of
# days are held out for validation and roughly 80% remain for training.
# The explicit seed keeps the split repeatable across notebook runs
# (assuming the input is partitioned the same way each time).
traindays = traindays.withColumn("holdout", rand(seed=13) > 0.8)
# Re-register the view so SQL queries see the new `holdout` column.
traindays.createOrReplaceTempView('traindays')

In [7]:
# Peek at the first ten rows to confirm the holdout column is present.
traindays.take(10)

[Row(FL_DATE=u'2015-01-01', is_train_day=u'True', holdout=False),
 Row(FL_DATE=u'2015-01-02', is_train_day=u'False', holdout=False),
 Row(FL_DATE=u'2015-01-03', is_train_day=u'False', holdout=False),
 Row(FL_DATE=u'2015-01-04', is_train_day=u'True', holdout=True),
 Row(FL_DATE=u'2015-01-05', is_train_day=u'True', holdout=True),
 Row(FL_DATE=u'2015-01-06', is_train_day=u'False', holdout=False),
 Row(FL_DATE=u'2015-01-07', is_train_day=u'True', holdout=False),
 Row(FL_DATE=u'2015-01-08', is_train_day=u'True', holdout=False),
 Row(FL_DATE=u'2015-01-09', is_train_day=u'True', holdout=True),
 Row(FL_DATE=u'2015-01-10', is_train_day=u'True', holdout=False)]

<h2> Logistic regression </h2>

In [8]:
# Select flights joined to the `traindays` view on FL_DATE, keeping only days
# flagged as training days that are not held out, and only rows where both
# delay columns are present.
# The validation query is later derived by replacing the exact text
# "t.holdout == False" in this string, so that clause must stay verbatim.
trainquery = """
SELECT
  *
FROM flights f
JOIN traindays t
ON f.FL_DATE == t.FL_DATE
WHERE
  t.is_train_day == 'True' AND
  t.holdout == False AND
  f.dep_delay IS NOT NULL AND 
  f.arr_delay IS NOT NULL
"""
# Build the training DataFrame from the query above.
traindata = spark.sql(trainquery)

In [9]:
# Show a single joined row as a sanity check of the query result.
traindata.first()

Row(FL_DATE=u'2015-02-02', UNIQUE_CARRIER=u'EV', AIRLINE_ID=u'20366', CARRIER=u'EV', FL_NUM=u'4410', ORIGIN_AIRPORT_ID=u'12266', ORIGIN_AIRPORT_SEQ_ID=u'1226603', ORIGIN_CITY_MARKET_ID=u'31453', ORIGIN=u'IAH', DEST_AIRPORT_ID=u'10693', DEST_AIRPORT_SEQ_ID=u'1069302', DEST_CITY_MARKET_ID=u'30693', DEST=u'BNA', CRS_DEP_TIME=u'2015-02-02T21:35:00', DEP_TIME=u'2015-02-02T21:35:00', DEP_DELAY=0.0, TAXI_OUT=19.0, WHEELS_OFF=u'2015-02-02T21:54:00', WHEELS_ON=u'2015-02-02T23:24:00', TAXI_IN=u'6.00', CRS_ARR_TIME=u'2015-02-02T23:28:00', ARR_TIME=u'2015-02-02T23:30:00', ARR_DELAY=2.0, CANCELLED=u'0.00', CANCELLATION_CODE=None, DIVERTED=u'0.00', DISTANCE=657.0, DEP_AIRPORT_LAT=u'29.98444444', DEP_AIRPORT_LON=u'-95.34138889', DEP_AIRPORT_TZOFFSET=u'-21600.0', ARR_AIRPORT_LAT=u'36.12444444', ARR_AIRPORT_LON=u'-86.67805556', ARR_AIRPORT_TZOFFSET=u'-21600.0', EVENT=None, NOTIFY_TIME=None, FL_DATE=u'2015-02-02', is_train_day=u'True', holdout=False)

In [10]:
def to_example(fields):
  """Convert one joined flight row into a LabeledPoint.

  The label is 1.0 when ARR_DELAY is below 15 ("on time") and 0.0
  otherwise. The feature vector is [DEP_DELAY, TAXI_OUT, DISTANCE],
  in that order.
  """
  ontime = float(fields['ARR_DELAY'] < 15)
  features = [fields[name] for name in ('DEP_DELAY', 'TAXI_OUT', 'DISTANCE')]
  return LabeledPoint(ontime, features)

In [11]:
# Map every row of the training DataFrame to a LabeledPoint via to_example.
examples = traindata.rdd.map(to_example)

In [12]:
# Fit a logistic regression with an intercept term on the training examples,
# then show the learned weights followed by the intercept.
lrmodel = LogisticRegressionWithLBFGS.train(examples, intercept=True)
print("%s %s" % (lrmodel.weights, lrmodel.intercept))

[-0.164030282422,-0.132130886713,0.000269267424422] 5.15293848132


In [13]:
# Set the model's decision threshold. Labels are 1.0 for on-time flights
# (see to_example), so a prediction of 0 is read as "cancel".
lrmodel.setThreshold(0.7) # cancel if prob-of-ontime < 0.7

<h2> Evaluate model on the held-out data </h2>


In [14]:
# Derive the validation query from the training query by flipping the holdout
# filter. str.replace returns the input unchanged when the pattern is absent,
# which would silently evaluate on the training rows, so fail loudly instead.
if "t.holdout == False" not in trainquery:
    raise ValueError("trainquery does not contain the clause 't.holdout == False'")
evalquery = trainquery.replace("t.holdout == False","t.holdout == True")
print(evalquery)


SELECT
  *
FROM flights f
JOIN traindays t
ON f.FL_DATE == t.FL_DATE
WHERE
  t.is_train_day == 'True' AND
  t.holdout == True AND
  f.dep_delay IS NOT NULL AND 
  f.arr_delay IS NOT NULL



In [15]:
# Run the validation query and convert its rows with the same to_example
# mapping used for the training data.
# Note: this rebinds `examples`, which previously held the training examples.
evaldata = spark.sql(evalquery)
examples = evaldata.rdd.map(to_example)

In [16]:
def eval(labelpred):
    """Summarise how often the cancel / no-cancel decisions match the labels.

    Note: the name shadows the builtin ``eval``; it is kept so existing
    callers in this notebook continue to work.

    Args:
        labelpred: collection of (label, prediction) pairs supporting
            ``filter(func)`` and ``count()`` (an RDD in this notebook).
            Pairs with prediction 0 are counted as "cancel", pairs with
            prediction 1 as "no cancel".

    Returns:
        dict with keys:
          'total_cancel'      -- number of pairs with prediction 0
          'correct_cancel'    -- fraction of those whose label equals the prediction
          'total_noncancel'   -- number of pairs with prediction 1
          'correct_noncancel' -- fraction of those whose label equals the prediction
        A fraction is NaN when its total is zero (instead of raising
        ZeroDivisionError).
    """
    def fraction(numerator, denominator):
        # Guard against an empty group so a one-sided model still gets a report.
        return float(numerator) / denominator if denominator else float('nan')

    # Index into the pair rather than unpacking in the lambda signature, which
    # is only valid syntax in Python 2.
    cancel = labelpred.filter(lambda pair: pair[1] == 0)
    nocancel = labelpred.filter(lambda pair: pair[1] == 1)

    # Count each group once and reuse the totals below.
    total_cancel = cancel.count()
    total_nocancel = nocancel.count()
    corr_cancel = cancel.filter(lambda pair: pair[0] == pair[1]).count()
    corr_nocancel = nocancel.filter(lambda pair: pair[0] == pair[1]).count()

    return {'total_cancel': total_cancel,
            'correct_cancel': fraction(corr_cancel, total_cancel),
            'total_noncancel': total_nocancel,
            'correct_noncancel': fraction(corr_nocancel, total_nocancel)
           }

In [17]:
# Pair each example's true label with the model's prediction for its
# features, then print the summary computed by eval().
labelpred = examples.map(
    lambda point: (point.label, lrmodel.predict(point.features)))
print(eval(labelpred))

{'correct_cancel': 0.821504397937519, 'total_noncancel': 42595, 'correct_noncancel': 0.9624369057401103, 'total_cancel': 13188}
