## Load data
Get a sample of NYC yellow taxi trip data from Azure Open Datasets.

In [1]:
from azureml.opendatasets import NycTlcYellow
from datetime import datetime
from dateutil import parser

# Request the yellow-taxi trips between 2018-05-01 and 2018-05-07 from
# Azure Open Datasets and materialize them as a pandas DataFrame.
start_date, end_date = (parser.parse(day) for day in ('2018-05-01', '2018-05-07'))
nyc_tlc = NycTlcYellow(start_date=start_date, end_date=end_date)
nyc_tlc_df = nyc_tlc.to_pandas_dataframe()

# Summarize columns, dtypes and row count of the downloaded data.
nyc_tlc_df.info()

StatementMeta(sparkpool, 3, 1, Finished, Available)

[Info] read from /tmp/tmpx7udqyqq/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/yellow/puYear=2018/puMonth=5/part-00000-tid-8898858832658823408-a1de80bd-eed3-4d11-b9d4-fa74bfbd47bc-426339-118.c000.snappy.parquet
[Info] read from /tmp/tmpx7udqyqq/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/yellow/puYear=2018/puMonth=5/part-00001-tid-8898858832658823408-a1de80bd-eed3-4d11-b9d4-fa74bfbd47bc-426336-117.c000.snappy.parquet
[Info] read from /tmp/tmpx7udqyqq/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/yellow/puYear=2018/puMonth=5/part-00002-tid-8898858832658823408-a1de80bd-eed3-4d11-b9d4-fa74bfbd47bc-426334-119.c000.snappy.parquet
[Info] read from /tmp/tmpx7udqyqq/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/yellow/puYear=2018/puMonth=5/part-00003-tid-8898858832658823408-a1de80bd-eed3-4d11-b9d4-fa74bfbd47bc-426340-115.c000.snappy.parquet
[Info] read from /tmp/tmpx7udqyqq/https%3A/%2Fazureopendatastorage.azurefd.net/nyctlc/yellow/puYear=2018/puMonth=5/part-00004-ti

In [2]:
from IPython.display import display

# Down-sample to 10,000 rows with a fixed seed so every re-run picks the
# same trips, then preview the first five rows (the default for head()).
sampled_df = nyc_tlc_df.sample(random_state=123, n=10000)
display(sampled_df.head())

StatementMeta(sparkpool, 3, 2, Finished, Available)

Unnamed: 0,vendorID,tpepPickupDateTime,tpepDropoffDateTime,passengerCount,tripDistance,puLocationId,doLocationId,startLon,startLat,endLon,endLat,rateCodeId,storeAndFwdFlag,paymentType,fareAmount,extra,mtaTax,improvementSurcharge,tipAmount,tollsAmount,totalAmount,puYear,puMonth
87213,2,2018-05-05 19:07:01,2018-05-05 19:28:44,1,3.95,164,112,,,,,1,N,2,17.0,0.0,0.5,0.3,0.0,5.76,23.56,2018,5
145405,2,2018-05-05 22:46:06,2018-05-05 22:59:11,1,1.22,264,264,,,,,1,N,1,9.5,0.5,0.5,0.3,2.16,0.0,12.96,2018,5
457648,1,2018-05-06 18:53:06,2018-05-06 19:06:31,1,2.2,246,162,,,,,1,N,2,11.0,0.0,0.5,0.3,0.0,0.0,11.8,2018,5
369051,2,2018-05-02 09:25:13,2018-05-02 09:36:32,1,0.89,161,162,,,,,1,N,1,8.0,0.0,0.5,0.3,1.76,0.0,10.56,2018,5
38871,2,2018-05-04 02:58:10,2018-05-04 03:01:10,3,0.45,79,4,,,,,1,N,1,4.0,0.5,0.5,0.3,1.32,0.0,6.62,2018,5


## Prepare and featurize data
- The raw data contains extra columns that are not useful for the model, so we copy only the columns we need into the featurized dataframe.
- The data also contains outliers, so we filter them out.

In [3]:
import numpy
import pandas

def get_pickup_time(df):
    """Classify a trip's pickup hour into a coarse time-of-day bin.

    Despite the parameter name, this function is applied once per row
    (``DataFrame.apply(..., axis=1)``), so ``df`` is a single record.

    Parameters
    ----------
    df : mapping-like (for example a pandas Series holding one row)
        Must provide a ``'pickupHour'`` entry with the hour of day.

    Returns
    -------
    str
        ``'AMRush'`` for hours 7-10, ``'Afternoon'`` for 11-15,
        ``'PMRush'`` for 16-19 and ``'Night'`` for anything else.
    """
    pickup_hour = df['pickupHour']
    # Chained comparisons replace the bitwise ``&`` on scalar booleans:
    # same result, but short-circuiting and free of precedence pitfalls.
    if 7 <= pickup_hour <= 10:
        return 'AMRush'
    if 11 <= pickup_hour <= 15:
        return 'Afternoon'
    if 16 <= pickup_hour <= 19:
        return 'PMRush'
    return 'Night'

# Keep only the columns the model needs, cast to explicit numeric dtypes.
# 'tipped' is the binary label (1 when any tip was recorded); 'tripTimeSecs'
# is the drop-off minus pickup timestamp expressed in whole seconds.
featurized_df = pandas.DataFrame({
    'tipped': (sampled_df['tipAmount'] > 0).astype('int'),
    'fareAmount': sampled_df['fareAmount'].astype('float32'),
    'paymentType': sampled_df['paymentType'].astype('int'),
    'passengerCount': sampled_df['passengerCount'].astype('int'),
    'tripDistance': sampled_df['tripDistance'].astype('float32'),
    'pickupHour': sampled_df['tpepPickupDateTime'].dt.hour.astype('int'),
    'tripTimeSecs': (
        (sampled_df['tpepDropoffDateTime'] - sampled_df['tpepPickupDateTime'])
        / numpy.timedelta64(1, 's')
    ).astype('int'),
})

# Replace the raw hour with its time-of-day bin; the hour column is only
# needed as input to the binning and is dropped afterwards.
featurized_df['pickupTimeBin'] = featurized_df.apply(get_pickup_time, axis=1)
featurized_df = featurized_df.drop(columns=['pickupHour'])

display(featurized_df.head())


StatementMeta(sparkpool, 3, 3, Finished, Available)

Unnamed: 0,tipped,fareAmount,paymentType,passengerCount,tripDistance,tripTimeSecs,pickupTimeBin
87213,0,17.0,2,1,3.95,1303,PMRush
145405,1,9.5,1,1,1.22,785,Night
457648,0,11.0,2,1,2.2,805,PMRush
369051,1,8.0,1,1,0.89,679,AMRush
38871,1,4.0,1,3,0.45,180,Night


In [4]:
# Drop outliers by keeping only rows whose values fall in plausible ranges.
# between() is inclusive on both ends; passengerCount uses strict bounds.
filtered_df = featurized_df[
    featurized_df['tipped'].between(0, 1)
    & featurized_df['fareAmount'].between(1, 250)
    & featurized_df['paymentType'].between(1, 2)
    & featurized_df['passengerCount'].gt(0)
    & featurized_df['passengerCount'].lt(8)
    & featurized_df['tripDistance'].between(0, 100)
    & featurized_df['tripTimeSecs'].between(30, 7200)
]

filtered_df.info()

StatementMeta(sparkpool, 3, 4, Finished, Available)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9776 entries, 87213 to 333274
Data columns (total 7 columns):
tipped            9776 non-null int64
fareAmount        9776 non-null float32
paymentType       9776 non-null int64
passengerCount    9776 non-null int64
tripDistance      9776 non-null float32
tripTimeSecs      9776 non-null int64
pickupTimeBin     9776 non-null object
dtypes: float32(2), int64(4), object(1)
memory usage: 534.6+ KB

## Save the data to a Spark table

In [5]:
# Convert the filtered pandas frame to a Spark DataFrame and persist it as
# the table default.NYC_Taxi, replacing any existing table of that name.
spark_df = spark.createDataFrame(filtered_df)
spark_df.write.saveAsTable("default.NYC_Taxi", mode="overwrite")

StatementMeta(sparkpool, 3, 5, Finished, Available)

  'JavaPackage' object is not callable
Attempting non-optimization as 'spark.sql.execution.arrow.fallback.enabled' is set to true.

## Drop the Spark table

In [6]:
%%sql

-- Cleanup is disabled: the statement below is commented out, so running this cell leaves default.NYC_Taxi in place.
/* drop table default.NYC_Taxi */

StatementMeta(sparkpool, 3, 6, Finished, Available)

<Spark SQL result set with 0 rows and 0 fields>