A simple machine learning algorithm which predicts the cost of a taxi trip in NYC, using data from 2024

In [1]:
# Importing dependencies

import io
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import requests

import keras

2024-09-01 17:35:08.215058: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-01 17:35:08.224260: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-01 17:35:08.275048: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-01 17:35:08.349083: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-01 17:35:08.411284: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been 

In [3]:
# Create data frame containing nyc taxi data
data = pq.ParquetFile("data.parquet")
first_thousand = next(data.iter_batches(batch_size=70000))
df = pa.Table.from_batches([first_thousand]).to_pandas()

training_df = df[['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'trip_distance', 'fare_amount']]
training_df

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,fare_amount
0,2024-01-01 00:57:55,2024-01-01 01:17:43,1.72,17.70
1,2024-01-01 00:03:00,2024-01-01 00:09:36,1.80,10.00
2,2024-01-01 00:17:06,2024-01-01 00:35:01,4.70,23.30
3,2024-01-01 00:36:38,2024-01-01 00:44:56,1.40,10.00
4,2024-01-01 00:46:51,2024-01-01 00:52:57,0.80,7.90
...,...,...,...,...
69995,2024-01-01 23:06:03,2024-01-01 23:28:54,5.10,25.40
69996,2024-01-01 23:51:07,2024-01-01 23:59:24,4.44,19.10
69997,2024-01-01 23:00:00,2024-01-01 23:17:31,7.99,33.10
69998,2024-01-01 23:03:21,2024-01-01 23:18:07,7.37,19.47


In [4]:
# Summary statistics of nyc taxi dataset
training_df.describe(include="all")

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,fare_amount
count,70000,70000,70000.0,70000.0
mean,2024-01-01 04:00:41.879085,2024-01-01 04:17:12.342257,4.196633,21.323614
min,2002-12-31 22:59:39,2002-12-31 23:05:41,0.0,-404.1
25%,2024-01-01 03:08:08,2024-01-01 03:22:32.500000,1.14,8.6
50%,2024-01-01 12:44:45,2024-01-01 12:58:25,2.08,13.5
75%,2024-01-01 17:15:06,2024-01-01 17:32:06.500000,4.52,25.4
max,2024-01-03 22:46:45,2024-01-03 22:53:44,80.85,820.0
std,,,5.277469,22.529081


In [5]:
# Create machine learning model

def build_model(learning_rate, num_features):
    
    model = keras.models.Sequential()
    model.add(keras.layers.Input(shape=(num_features,)))
    model.add(keras.layers.Dense(units=1))
    model.compile(optimizer=keras.optimizers.RMSprop(learning_rate=learning_rate),
                  loss="mean_squared_error",
                  metrics=[keras.metrics.RootMeanSquaredError()])
    
    return model

def train_model(model, df, features, label, epochs, batch_size):
    history = model.fit(x=features,
                        y=label,
                        batch_size=batch_size,
                        epochs=epochs)
    
    trained_weight = model.get_weights()[0]
    trained_bias = model.get_weights()[1]

    epochs = history.epoch

    hist = pd.DataFrame(history.history)

    rmse = hist["root_mean_squared_error"]

    return trained_weight, trained_bias, epochs, rmse

def run(df, feature_names, label_name, learning_rate, epochs, batch_size):
    num_features = len(feature_names)
    features = df.loc[:, feature_names].values
    label = df[label_name].values

    model = build_model(learning_rate, num_features)
    model_output = train_model(model, df, features, label, epochs, batch_size)

    return model


In [6]:
# Generate a correlation matrix to determine fields with greatest correlation to fare amount
print(df.corr(numeric_only=True))

# From this, we see that trip_distance, toll_amount and tip_amount have the greatest influence on fare amount

                       VendorID  passenger_count  trip_distance  RatecodeID  \
VendorID               1.000000         0.075500       0.041811   -0.076264   
passenger_count        0.075500         1.000000       0.046219   -0.015674   
trip_distance          0.041811         0.046219       1.000000    0.081495   
RatecodeID            -0.076264        -0.015674       0.081495    1.000000   
PULocationID          -0.018388         0.001400      -0.119775   -0.023331   
DOLocationID          -0.019159        -0.008502      -0.075902   -0.009678   
payment_type           0.018626         0.019698      -0.040390   -0.019087   
fare_amount            0.031102         0.060569       0.824931    0.111314   
extra                 -0.611593        -0.060294       0.118988   -0.046627   
mta_tax               -0.063564        -0.022567      -0.047848   -0.049243   
tip_amount             0.031927         0.016859       0.549019    0.008681   
tolls_amount           0.020758         0.048000    

In [8]:
# Hence, use these as the features for the model.
training_df = df[['trip_distance', 'tip_amount', 'tolls_amount', 'fare_amount', 'Airport_fee']].copy()

learning_rate = 0.0001
epochs = 100
batch_size = 100

# Specify the feature and the label.
features = ['trip_distance', 'tip_amount', 'tolls_amount', 'Airport_fee']
label = 'fare_amount'

model_1 = run(training_df, features, label, learning_rate, epochs, batch_size)

Epoch 1/100
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 886.1326 - root_mean_squared_error: 29.7642
Epoch 2/100
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 819.7328 - root_mean_squared_error: 28.6295
Epoch 3/100
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 902us/step - loss: 796.3590 - root_mean_squared_error: 28.2149
Epoch 4/100
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 906us/step - loss: 729.4849 - root_mean_squared_error: 27.0082
Epoch 5/100
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 989us/step - loss: 679.8127 - root_mean_squared_error: 26.0716
Epoch 6/100
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 995us/step - loss: 634.3954 - root_mean_squared_error: 25.1828
Epoch 7/100
[1m700/700[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 903us/step - loss: 597.3792 - root_mean_squared_error: 24.4398
Epoch 8/100
[1m7

In [25]:
results = model_1.predict_on_batch(training_df.loc[:,features].values)
print(results)


       VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0             2  2024-01-01 00:57:55   2024-01-01 01:17:43                1   
1             1  2024-01-01 00:03:00   2024-01-01 00:09:36                1   
2             1  2024-01-01 00:17:06   2024-01-01 00:35:01                1   
3             1  2024-01-01 00:36:38   2024-01-01 00:44:56                1   
4             1  2024-01-01 00:46:51   2024-01-01 00:52:57                1   
...         ...                  ...                   ...              ...   
69995         1  2024-01-01 23:06:03   2024-01-01 23:28:54                1   
69996         2  2024-01-01 23:51:07   2024-01-01 23:59:24                1   
69997         2  2024-01-01 23:00:00   2024-01-01 23:17:31                1   
69998         2  2024-01-01 23:03:21   2024-01-01 23:18:07                1   
69999         2  2024-01-01 23:09:36   2024-01-01 23:24:43                1   

       trip_distance  RatecodeID store_and_fwd_flag