# Predicting NYC Taxi Fares with public version of XGboost and Modin

### Install necessary packages 

In [1]:
#!pip3 install pandas
#!pip3 install scikit-learn
#!pip3 install xgboost==0.82
#!pip3 install scipy 

### Check conda list - We are running public version of XGboost, Modin

In [2]:
!conda list | grep 'scikit\|xgboost\|modin\|pandas'

pandas                    1.3.5                    pypi_0    pypi
scikit-learn              1.0.2                    pypi_0    pypi
xgboost                   0.82                     pypi_0    pypi


In [3]:
import glob
import socket, time
import pandas as modin_omni_pd
import xgboost as xgb
import numpy as np
from sklearn import preprocessing

import warnings
warnings.filterwarnings('ignore')
import time

# Read the Data 

We'll use Modin on Omnisci to load and parse all CSV files into a DataFrame.

As usual, the data needs to be massaged a bit before we can start adding features that are useful to an ML model.

For example, in the 2014 taxi CSV files, there are `pickup_datetime` and `dropoff_datetime` columns. 

We'll do a little string manipulation, column renaming, and concatenating of DataFrames to sidestep the problems.

In [4]:
#Dictionary of required columns and their datatypes
must_haves = {
     'pickup_datetime': 'datetime64[s]',
     'dropoff_datetime': 'datetime64[s]',
     'passenger_count': 'int32',
     'trip_distance': 'float32',
     'pickup_longitude': 'float32',
     'pickup_latitude': 'float32',
     'rate_code': 'int32',
     'dropoff_longitude': 'float32',
     'dropoff_latitude': 'float32',
     'fare_amount': 'float32'
    }

In [5]:
def clean(ddf, must_haves):
    # replace the extraneous spaces in column names and lower the font type
    tmp = {col:col.strip().lower() for col in list(ddf.columns)}
    ddf = ddf.rename(columns=tmp)

    ddf = ddf.rename(columns={
        'tpep_pickup_datetime': 'pickup_datetime',
        'tpep_dropoff_datetime': 'dropoff_datetime',
        'ratecodeid': 'rate_code'
    })
    
    for col in ddf.columns:
        if col not in must_haves:
            ddf = ddf.drop(columns=col)
            continue
        if ddf[col].dtype == 'object':
            ddf[col] = ddf[col].fillna('-1')
    
    return ddf

In [6]:
##%%time

start = time.time()
df_2014 = modin_omni_pd.concat([
    clean(modin_omni_pd.read_csv(x, parse_dates=['pickup_datetime', 'dropoff_datetime'], nrows=2000000), must_haves)
    for x in glob.glob('nyc_taxi_data.csv')], ignore_index=True)

end = time.time()
data_reading_time = end - start
print(data_reading_time)

4.700435638427734


In [7]:
df_2014.dtypes

pickup_datetime      datetime64[ns]
dropoff_datetime     datetime64[ns]
passenger_count               int64
trip_distance               float64
pickup_longitude            float64
pickup_latitude             float64
rate_code                     int64
dropoff_longitude           float64
dropoff_latitude            float64
fare_amount                 float64
dtype: object

In [8]:
df_2014.shape

(2000000, 10)

# Data Cleanup

We'll use a Euclidean Distance calculation to find total trip distance, and extract additional useful variables from the datetime fields.

In [9]:
#concatenate multiple DataFrames into one bigger one
taxi_df = modin_omni_pd.concat([df_2014], ignore_index=True)

Taking the NYC map coordinates into consideration, we will only select records where tripdistance < 500 miles.

In [10]:
# apply a list of filter conditions to throw out records with missing or outlier values
taxi_df = taxi_df[
    (taxi_df.fare_amount > 1) &
    (taxi_df.fare_amount < 500) &
    (taxi_df.passenger_count > 0) &
    (taxi_df.passenger_count < 6) &
    (taxi_df.pickup_longitude > -75) &
    (taxi_df.pickup_longitude < -73) &
    (taxi_df.dropoff_longitude > -75) &
    (taxi_df.dropoff_longitude < -73) &
    (taxi_df.pickup_latitude > 40) &
    (taxi_df.pickup_latitude < 42) &
    (taxi_df.dropoff_latitude > 40) &
    (taxi_df.dropoff_latitude < 42) &
    (taxi_df.trip_distance > 0) &
    (taxi_df.trip_distance < 500) &
    ((taxi_df.trip_distance <= 50) | (taxi_df.fare_amount >= 50)) &
    ((taxi_df.trip_distance >= 10) | (taxi_df.fare_amount <= 300)) &
    (taxi_df.dropoff_datetime > taxi_df.pickup_datetime)]

In [11]:
# reset_index and drop index column
taxi_df = taxi_df.reset_index(drop=True)

# Adding Interesting Features

We will add a column as a new feature 

In [12]:
## add features
taxi_df['day'] = taxi_df['pickup_datetime'].dt.day

#calculate the time difference between dropoff and pickup.
taxi_df['diff'] = taxi_df['dropoff_datetime'].astype('int64') - taxi_df['pickup_datetime'].astype('int64')

taxi_df['pickup_latitude_r'] = taxi_df['pickup_latitude']//.01*.01
taxi_df['pickup_longitude_r'] = taxi_df['pickup_longitude']//.01*.01
taxi_df['dropoff_latitude_r'] = taxi_df['dropoff_latitude']//.01*.01
taxi_df['dropoff_longitude_r'] = taxi_df['dropoff_longitude']//.01*.01

taxi_df = taxi_df.drop('pickup_datetime', axis=1)
taxi_df = taxi_df.drop('dropoff_datetime', axis=1)

dlon = taxi_df['dropoff_longitude'] - taxi_df['pickup_longitude']
dlat = taxi_df['dropoff_latitude'] - taxi_df['pickup_latitude']
taxi_df['e_distance'] = dlon * dlon + dlat * dlat

# Pick a Training Set

Let's imagine you're making a trip to New York on the 25th and want to build a model to predict what fare prices will be like the last few days of the month based on the first part of the month. We'll use a query expression to identify the `day` of the month to use to divide the data into train and test sets.

In [13]:
#since we calculated the h_distance let's drop the trip_distance column, and then do model training with XGB.
taxi_df = taxi_df.drop('trip_distance', axis=1)

In [14]:
# this is the original data partition for train and test sets.
X_train = taxi_df[taxi_df.day < 25]

# create a Y_train ddf with just the target variable
Y_train = X_train[['fare_amount']]
# drop the target variable from the training ddf
X_train = X_train[X_train.columns.difference(['fare_amount'])]

# Train the XGBoost Regression Model

The wall time output below indicates how long it took to train an XGBoost model over the training set.

In [15]:
X_train.shape

(1929562, 13)

In [16]:
Y_train.shape

(1929562, 1)

In [17]:
X_train.dtypes

day                      int64
diff                     int64
dropoff_latitude       float64
dropoff_latitude_r     float64
dropoff_longitude      float64
dropoff_longitude_r    float64
e_distance             float64
passenger_count          int64
pickup_latitude        float64
pickup_latitude_r      float64
pickup_longitude       float64
pickup_longitude_r     float64
rate_code                int64
dtype: object

In [18]:
Y_train.dtypes

fare_amount    float64
dtype: object

In [19]:
X_train.head()

Unnamed: 0,day,diff,dropoff_latitude,dropoff_latitude_r,dropoff_longitude,dropoff_longitude_r,e_distance,passenger_count,pickup_latitude,pickup_latitude_r,pickup_longitude,pickup_longitude_r,rate_code
0,9,426000000000,40.73179,40.73,-73.982227,-73.99,0.000183,1,40.736828,40.73,-73.99477,-74.0,1
1,9,540000000000,40.763995,40.76,-73.960449,-73.97,0.00057,1,40.773382,40.77,-73.982392,-73.99,1
2,9,899000000000,40.765217,40.76,-73.986626,-73.99,0.00067,2,40.739406,40.73,-73.98857,-73.99,1
3,9,403000000000,40.77705,40.77,-73.979863,-73.98,0.000429,1,40.770464,40.77,-73.960213,-73.97,1
4,9,383000000000,40.720524,40.72,-73.984367,-73.99,0.000132,1,40.717248,40.71,-73.995371,-74.0,1


In [20]:
#backup y_train for scikit learn 
y_train = Y_train


encoder = preprocessing.LabelEncoder()
Y_train = encoder.fit_transform(Y_train)

In [21]:
dtrain = xgb.DMatrix(X_train, Y_train)

## Time to Train our XGBoost Model!

In [22]:
#%%time

#reg:squarederror
start = time.time()

trained_model = xgb.train({
    'learning_rate': 0.3,
    'max_depth': 8,
    'objective': 'reg:linear',
    'subsample': 0.6,
    'gamma': 1,
    'silent': True,
    'verbose_eval': True,
    'tree_method':'hist'
    },
    dtrain,
    num_boost_round=100, evals=[(dtrain, 'train')])

end = time.time()
data_training_time = end - start
print(data_training_time)

[0]	train-rmse:23.461
[1]	train-rmse:16.8155
[2]	train-rmse:12.292
[3]	train-rmse:9.26366
[4]	train-rmse:7.294
[5]	train-rmse:6.07585
[6]	train-rmse:5.3468
[7]	train-rmse:4.93009
[8]	train-rmse:4.67608
[9]	train-rmse:4.53335
[10]	train-rmse:4.44764
[11]	train-rmse:4.3933
[12]	train-rmse:4.3507
[13]	train-rmse:4.31815
[14]	train-rmse:4.29412
[15]	train-rmse:4.27754
[16]	train-rmse:4.24805
[17]	train-rmse:4.23907
[18]	train-rmse:4.23242
[19]	train-rmse:4.22835
[20]	train-rmse:4.22469
[21]	train-rmse:4.219
[22]	train-rmse:4.19642
[23]	train-rmse:4.17557
[24]	train-rmse:4.16801
[25]	train-rmse:4.15575
[26]	train-rmse:4.14998
[27]	train-rmse:4.14701
[28]	train-rmse:4.14293
[29]	train-rmse:4.14005
[30]	train-rmse:4.13449
[31]	train-rmse:4.12057
[32]	train-rmse:4.11319
[33]	train-rmse:4.10494
[34]	train-rmse:4.09091
[35]	train-rmse:4.07712
[36]	train-rmse:4.06976
[37]	train-rmse:4.06833
[38]	train-rmse:4.05999
[39]	train-rmse:4.05727
[40]	train-rmse:4.03192
[41]	train-rmse:4.02167
[42]	train-

In [23]:
#ax = xgb.plot_importance(trained_model, height=0.8, max_num_features=10, importance_type="gain")
#ax.grid(False, axis="y")
#ax.set_title('Estimated feature importance')
#ax.set_xlabel('importance')
#plt.show()

# How Good is Our Model?

Now that we have a trained model, we need to test it with the 25% of records we held out.

In [24]:
X_test = taxi_df[taxi_df.day >= 25]

# Create Y_test with just the fare amount
Y_test = X_test[['fare_amount']]

# Drop the fare amount from X_test
X_test = X_test[X_test.columns.difference(['fare_amount'])]

# display test set size
X_test.shape

(274, 13)

## Calculate Prediction

In [25]:
# generate predictions on the test set
booster = trained_model
prediction = modin_omni_pd.Series(booster.predict(xgb.DMatrix(X_test)))
prediction.shape

(274,)

In [26]:
# prediction = prediction.map_partitions(lambda part: cudf.Series(part)).reset_index(drop=True)
actual = Y_test['fare_amount'].reset_index(drop=True)

In [27]:
prediction.head()

0    28.713934
1    12.464524
2    13.762292
3    13.779843
4    30.785995
dtype: float32

In [28]:
actual.head()

0    13.0
1     7.5
2     8.0
3     8.0
4    14.5
Name: fare_amount, dtype: float64

In [29]:
# Calculate RMSE
squared_error = ((prediction-actual)**2)

# compute the actual RMSE over the full test set
np.sqrt(squared_error.mean())

15.863060614125194

In [30]:
print(data_reading_time)
print(data_training_time)

4.700435638427734
11.158612251281738


In [31]:
df = modin_omni_pd.read_csv('compare.csv', sep='\t')
data = [data_reading_time, data_training_time]
df['With_Public_version'] = data
df.to_csv('compare.csv', sep='\t', index=False)
df

Unnamed: 0,Type_of_operation,With_intel_optimizations,With_Public_version
0,data_reading_time,3.509441,4.700436
1,XGboost_training_time,1.837119,11.158612
