## Demo 5 NYC Taxi Fare Prediction with Tensorflow

### Using pytorch_env kernel

# Frameworks, Tools, Libs used 

### Tensorflow,  Pandas

## Set Up
### In this first cell, we'll load the necessary libraries.

In [1]:
import math
import shutil
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import os
from dateutil.parser import parse
from pytz import timezone
import tensorflow as tf

In [2]:
import logging
logger = tf.get_logger()
logger.setLevel(logging.ERROR)
#tf.logging.set_verbosity(tf.logging.INFO)
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format

## Constants and Hyper-parameters

In [3]:
#Constants

output_dir = "."
OUTDIR = "."
OUTPUT_RESULT="submission.csv"
#Hyper prameters
BUCKETS=20
HIDDEN_UNITS = "128 32 4"
SCALE = 10
BATCH_SIZE=32
ROWS_TO_READ=40000
ROWS_TO_SKIP=10
LEARNING_RATE=0.04
STEPS_TO_PROCESS=40000
STEPS=374000

## Next, we'll load our data set.

In [4]:
time1=datetime.now()
print("================Data Load=====================")
df = type('', (), {})()
print(datetime.now())
df.train = pd.read_csv('./input/trainCleanData.csv')
print(datetime.now())

print(datetime.now())
#df.train.head(10)
df.train.describe()
time2=datetime.now()
data_load_time=time2-time1
print("Data Load Consuming Time:")
print(data_load_time)
print("====================End Data Load==============")

2020-04-02 10:14:56.551929
2020-04-02 10:16:51.282087
2020-04-02 10:16:51.282423
Data Load Consuming Time:
0:02:01.896109


## Test Data Load

In [5]:

print("================Test Data Load=====================")
test_df = type('', (), {})()
print(datetime.now())
test_df.test = pd.read_csv('./input/trainCleanData.csv')
print("====================End Data Load==============")

2020-04-02 10:16:58.456608


## Examine the data¶
####  It's a good idea to get to know your data a little bit before you work with it.

#### We'll print out a quick summary of a few useful statistics on each column.

#### This will include things like mean, standard deviation, max, min, and various quantiles.

## Calculate Time of each ride
#### The calcuating the hour and week day for millions of rows is costly so we pre-calcualte all possible values

In [6]:
time5=datetime.now()
##calculate times
df.train['nyctime'] = df.train.apply(lambda row: row['pickup_datetime'][:14]+'00:00 UTC', axis=1)
##df.test['nyctime'] = df.test.apply(lambda row: row['pickup_datetime'][:14]+'00:00 UTC', axis=1)

nycTimes = []
def findTimes(timeStr, nycDict, field):
    if not(timeStr[:14]+'00:00 UTC' in nycDict):
        nycTime = {}
        nycTime['time'] = parse(timeStr).astimezone(timezone('US/Eastern'))
        nycTime['weekday'] = int(nycTime['time'].weekday())
        nycTime['hour'] = int(nycTime['time'].hour)
        nycTime['hourSince2000'] = int(((nycTime['time'].year-2009)*366+int(nycTime['time'].strftime("%j")))*25+nycTime['time'].hour)
        nycTime['nyctime'] = timeStr[:14]+'00:00 UTC'
        nycTimes.append(nycTime)
    return 

minDate=parse(df.train['pickup_datetime'].min())
maxDate=parse(df.train['pickup_datetime'].max())
while (minDate < maxDate):
    findTimes(minDate.strftime("%Y-%m-%d %H:%M:%S%z"),nycTimes,'time')
    minDate = minDate + timedelta(hours=1)

df.times = pd.DataFrame(nycTimes)

#### Now join the data frames on the hourly time key

In [7]:
df.train=df.train.join(df.times.set_index('nyctime'), on='nyctime')
##df.test=df.test.join(df.times.set_index('nyctime'), on='nyctime')
df.times.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56928 entries, 0 to 56927
Data columns (total 5 columns):
time             56928 non-null datetime64[ns, US/Eastern]
weekday          56928 non-null int64
hour             56928 non-null int64
hourSince2000    56928 non-null int64
nyctime          56928 non-null object
dtypes: datetime64[ns, US/Eastern](1), int64(3), object(1)
memory usage: 2.2+ MB


## Feature Engineering on data set

In [8]:
# Create feature engineering function that will be used in the input and serving input functions
def add_engineered(features):
    # this is how you can do feature engineering in TensorFlow
    lat1 = features['pickup_latitude']
    lat2 = features['dropoff_latitude']
    lon1 = features['pickup_longitude']
    lon2 = features['dropoff_longitude']
    latdiff = (lat1 - lat2)
    londiff = (lon1 - lon2)
    
    # set features for distance with sign that indicates direction
    features['latdiff'] = latdiff
    features['londiff'] = londiff
    dist = (latdiff * latdiff + londiff * londiff)**(0.5)
    features['euclidean'] = dist
    features['cityBlockDist'] = abs(latdiff) + abs(londiff)
    return features

df.train = add_engineered(df.train)

time6=datetime.now()
print("=================End Data Prepare==================")
data_process_time=time6-time5
print("Data Process Consuming Time:")
print(data_process_time)

data_prepare_time=data_load_time+data_process_time
print("Data Prepare Consuming time:")
print(data_prepare_time)

Data Process Consuming Time:
0:09:19.378515
Data Prepare Consuming time:
0:11:21.274624


## Test Data Process

In [9]:
##calculate times
test_df.test['nyctime'] = test_df.test.apply(lambda row: row['pickup_datetime'][:14]+'00:00 UTC', axis=1)
##df.test['nyctime'] = df.test.apply(lambda row: row['pickup_datetime'][:14]+'00:00 UTC', axis=1)

nycTimes = []
def findTimes(timeStr, nycDict, field):
    if not(timeStr[:14]+'00:00 UTC' in nycDict):
        nycTime = {}
        nycTime['time'] = parse(timeStr).astimezone(timezone('US/Eastern'))
        nycTime['weekday'] = int(nycTime['time'].weekday())
        nycTime['hour'] = int(nycTime['time'].hour)
        nycTime['hourSince2000'] = int(((nycTime['time'].year-2009)*366+int(nycTime['time'].strftime("%j")))*25+nycTime['time'].hour)
        nycTime['nyctime'] = timeStr[:14]+'00:00 UTC'
        nycTimes.append(nycTime)
    return 

minDate=parse(test_df.test['pickup_datetime'].min())
maxDate=parse(test_df.test['pickup_datetime'].max())
while (minDate < maxDate):
    findTimes(minDate.strftime("%Y-%m-%d %H:%M:%S%z"),nycTimes,'time')
    minDate = minDate + timedelta(hours=1)

test_df.times = pd.DataFrame(nycTimes)
test_df.test=test_df.test.join(test_df.times.set_index('nyctime'), on='nyctime')
# Create feature engineering function that will be used in the input and serving input functions
def add_engineered(features):
    # this is how you can do feature engineering in TensorFlow
    lat1 = features['pickup_latitude']
    lat2 = features['dropoff_latitude']
    lon1 = features['pickup_longitude']
    lon2 = features['dropoff_longitude']
    latdiff = (lat1 - lat2)
    londiff = (lon1 - lon2)
    
    # set features for distance with sign that indicates direction
    features['latdiff'] = latdiff
    features['londiff'] = londiff
    dist = (latdiff * latdiff + londiff * londiff)**(0.5)
    features['euclidean'] = dist
    features['cityBlockDist'] = abs(latdiff) + abs(londiff)
    return features

test_df.test = add_engineered(test_df.test)


#### This is the measure used to see how close the data is to actual taxi fares

In [10]:
def rmse(labels, predictions):
    pred_values = tf.cast(predictions['predictions'],tf.float64)
    return {'rmse': tf.metrics.root_mean_squared_error(labels*SCALE, pred_values*SCALE)}

## Build an estimator starting from INPUT COLUMNS.
####  These include feature transformations and synthetic features.
#### The model is a wide-and-deep model.

In [11]:
# These are the raw input columns, and will be provided for prediction also
INPUT_COLUMNS = [
    # Define features
    
    # Numeric columns
    tf.feature_column.numeric_column('weekday'),
    tf.feature_column.numeric_column('hour'),
    tf.feature_column.numeric_column('pickup_latitude'),
    tf.feature_column.numeric_column('pickup_longitude'),
    tf.feature_column.numeric_column('dropoff_latitude'),
    tf.feature_column.numeric_column('dropoff_longitude'),
    tf.feature_column.numeric_column('passenger_count'),
    #tf.feature_column.numeric_column('hourSince2000'),
    
    # Engineered features that are created in the input_fn
    tf.feature_column.numeric_column('latdiff'),
    tf.feature_column.numeric_column('londiff'),
    tf.feature_column.numeric_column('euclidean'),
    tf.feature_column.numeric_column('cityBlockDist')
]
# Build the estimator
def build_estimator(model_dir, nbuckets, hidden_units):
    """
     
  """

    # Input columns   hourSince2000,
    (dayofweek, hourofday, plat, plon, dlat, dlon, pcount, latdiff, londiff, euclidean,cityBlockDist) = INPUT_COLUMNS

    # Bucketize the times 
    hourbuckets = np.linspace(0.0, 23.0, 24).tolist()
    b_hourofday = tf.feature_column.bucketized_column(hourofday, hourbuckets)
    weekdaybuckets = np.linspace(0.0, 6.0, 7).tolist()
    b_dayofweek = tf.feature_column.bucketized_column(dayofweek, weekdaybuckets)
    #since2000buckets = np.linspace(0.0, 599999, 60000).tolist()
    #b_hourSince2000 = tf.feature_column.bucketized_column(hourSince2000, since2000buckets)
    
    # Bucketize the lats & lons
    latbuckets = np.linspace(38.0, 42.0, nbuckets).tolist()
    lonbuckets = np.linspace(-76.0, -72.0, nbuckets).tolist()
    b_plat = tf.feature_column.bucketized_column(plat, latbuckets)
    b_dlat = tf.feature_column.bucketized_column(dlat, latbuckets)
    b_plon = tf.feature_column.bucketized_column(plon, lonbuckets)
    b_dlon = tf.feature_column.bucketized_column(dlon, lonbuckets)
   
    # Feature cross
    ploc = tf.feature_column.crossed_column([b_plat, b_plon], nbuckets * nbuckets)
    dloc = tf.feature_column.crossed_column([b_dlat, b_dlon], nbuckets * nbuckets)
    pd_pair = tf.feature_column.crossed_column([ploc, dloc], nbuckets ** 4 )
    day_hr =  tf.feature_column.crossed_column([b_dayofweek, b_hourofday], 24 * 7)

    # Wide columns and deep columns.
    wide_columns = [
        # Feature crosses
        dloc, ploc, pd_pair,
        day_hr,

        # Sparse columns
        b_dayofweek, b_hourofday,
        #b_hourSince2000,

        # Anything with a linear relationship
        pcount 
    ]

    deep_columns = [
        # Embedding_column to "group" together ...
        tf.feature_column.embedding_column(pd_pair, 10),
        tf.feature_column.embedding_column(day_hr, 10),
        #tf.feature_column.embedding_column(b_hourSince2000, 60000),
        # Numeric columns
        plat, plon, dlat, dlon,
        latdiff, londiff, euclidean,cityBlockDist
    ]
    
    estimator = tf.estimator.DNNLinearCombinedRegressor(
        model_dir = model_dir,
        linear_feature_columns = wide_columns,
        dnn_feature_columns = deep_columns,
        dnn_hidden_units = hidden_units)

    # add extra evaluation metric for hyperparameter tuning
      
    estimator = tf.contrib.estimator.add_metrics(estimator, rmse)
    return estimator

## Build a neural network model
#### In this exercise, we'll be trying to predicttaxi fares. Ok get all the features into a dictionary

In [12]:
feature_columns={}
for i in INPUT_COLUMNS:
    feature_columns[i.key]=i
list(feature_columns.keys())

['weekday',
 'hour',
 'pickup_latitude',
 'pickup_longitude',
 'dropoff_latitude',
 'dropoff_longitude',
 'passenger_count',
 'latdiff',
 'londiff',
 'euclidean',
 'cityBlockDist']

In [13]:
evaldf=test_df.test
eval_input_fn = tf.estimator.inputs.pandas_input_fn(x = evaldf[list(feature_columns.keys())],
                                                    y = evaldf["fare_amount"] / SCALE,  # note the scaling
                                                    num_epochs = 1000, 
                                                    batch_size = len(evaldf), 
                                                    shuffle=False)

#### Take the panda data and use the estimator functions to turn it into processed data

In [14]:
traindf=df.train
time7=datetime.now()
# Split into train and eval and create input functions

train_input_fn = tf.estimator.inputs.pandas_input_fn(x = traindf[list(feature_columns.keys())],
                                                    y = traindf["fare_amount"] / SCALE,
                                                    num_epochs = 1000,
                                                    batch_size = BATCH_SIZE,
                                                    shuffle = True)

In [15]:
#tf.logging.set_verbosity(tf.logging.INFO)
myopt = tf.train.FtrlOptimizer(learning_rate = LEARNING_RATE) # note the learning rate
estimator = estimator = build_estimator(OUTDIR, BUCKETS, HIDDEN_UNITS.split(' '))
    
estimator = tf.contrib.estimator.add_metrics(estimator,rmse)
  
train_spec=tf.estimator.TrainSpec(
                    input_fn = train_input_fn,max_steps = STEPS_TO_PROCESS)
eval_spec=tf.estimator.EvalSpec(
                    input_fn = eval_input_fn,
                    steps = None,
                    start_delay_secs = 1, # start evaluating after N seconds
                    throttle_secs = 10,  # evaluate every N seconds
                    )
print("***********Begin train****************")
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
print("***********End train******************")

time8=datetime.now()
model_train_time=time8-time7
print("Model Train Consuming Time:")
print(model_train_time)

***********Begin train****************
***********End train******************
Model Train Consuming Time:
0:00:21.231638


## Calculate RMSE

In [16]:
def RMSE(x, y):
    return np.sqrt(((x - y) ** 2).mean())

In [17]:
evalaution_input_fn = tf.estimator.inputs.pandas_input_fn(x = evaldf[list(feature_columns.keys())],
                                                    y = None,  
                                                    num_epochs = 1, 
                                                    batch_size = len(evaldf), 
                                                    shuffle=False)
evlaution_y = evaldf["fare_amount"]
evlaution_result=estimator.predict(input_fn=evalaution_input_fn)
eval_pred_result = pd.DataFrame({'fare_amount':[i['predictions'][0]*SCALE for i in evlaution_result]})
eval_pred_result2=eval_pred_result["fare_amount"]

print(eval_pred_result2)
print(evlaution_y)
RMSE_value=RMSE(eval_pred_result2,evlaution_y)
print("RMSE Value:")
print(RMSE(eval_pred_result2,evlaution_y))
print("Data Prepare Consuming time:")
print(data_prepare_time)
print("Model Train Consuming Time:")
print(model_train_time)

0          29.5
1          12.5
2           6.9
3           7.5
4           9.1
           ... 
37931777   11.7
37931778    8.5
37931779    7.6
37931780   31.6
37931781    8.2
Name: fare_amount, Length: 37931782, dtype: float64
0           4.5
1           7.7
2           5.3
3           7.5
4          16.5
           ... 
37931777    6.1
37931778   12.0
37931779    4.2
37931780   28.9
37931781    7.5
Name: fare_amount, Length: 37931782, dtype: float64
RMSE Value:
19.302091589003933
Data Prepare Consuming time:
0:11:21.274624
Model Train Consuming Time:
0:00:21.231638


In [18]:
# first time
RMSE Value:
19.302091589003933
Data Prepare Consuming time:
0:09:56.071902
Model Train Consuming Time:
0:00:14.203152

SyntaxError: invalid syntax (<ipython-input-18-95a4457c4c76>, line 2)

In [None]:
#second time
RMSE Value:
19.302091589003933
Data Prepare Consuming time:
0:11:21.274624
Model Train Consuming Time:
0:00:21.231638