# New York City Taxi Fare Prediction

In [1]:
import polars as pl
import numpy as np

df = pl.read_csv("../../datasets/new-york-city-taxi-fare-prediction/train.csv", n_rows=1_000_000)
# 52.5s

In [2]:
test = pl.read_csv("../../datasets/new-york-city-taxi-fare-prediction/test.csv")

In [3]:
df.head()

key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
str,f64,str,f64,f64,f64,f64,i64
"""2009-06-15 17:26:21.0000001""",4.5,"""2009-06-15 17:26:21 UTC""",-73.844311,40.721319,-73.84161,40.712278,1
"""2010-01-05 16:52:16.0000002""",16.9,"""2010-01-05 16:52:16 UTC""",-74.016048,40.711303,-73.979268,40.782004,1
"""2011-08-18 00:35:00.00000049""",5.7,"""2011-08-18 00:35:00 UTC""",-73.982738,40.76127,-73.991242,40.750562,2
"""2012-04-21 04:30:42.0000001""",7.7,"""2012-04-21 04:30:42 UTC""",-73.98713,40.733143,-73.991567,40.758092,1
"""2010-03-09 07:51:00.000000135""",5.3,"""2010-03-09 07:51:00 UTC""",-73.968095,40.768008,-73.956655,40.783762,1


In [4]:
df.describe()

statistic,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
str,str,f64,str,f64,f64,f64,f64,f64
"""count""","""1000000""",1000000.0,"""1000000""",1000000.0,1000000.0,999990.0,999990.0,1000000.0
"""null_count""","""0""",0.0,"""0""",0.0,0.0,10.0,10.0,0.0
"""mean""",,11.348079,,-72.52664,39.929008,-72.52786,39.919954,1.684924
"""std""",,9.82209,,12.057937,7.626154,11.324494,8.201418,1.323911
"""min""","""2009-01-01 00:00:46.0000002""",-44.9,"""2009-01-01 00:00:46 UTC""",-3377.680935,-3116.285383,-3383.296608,-3114.338567,0.0
"""25%""",,6.0,,-73.99206,40.734965,-73.991385,40.734046,1.0
"""50%""",,8.5,,-73.981792,40.752695,-73.980135,40.753166,1.0
"""75%""",,12.5,,-73.967094,40.767154,-73.963654,40.768129,2.0
"""max""","""2015-06-30 23:53:49.0000003""",500.0,"""2015-06-30 23:53:49 UTC""",2522.271325,2621.62843,45.581619,1651.553433,208.0


## Feature Engineering

Drop un-useful features

In [5]:
df.columns

['key',
 'fare_amount',
 'pickup_datetime',
 'pickup_longitude',
 'pickup_latitude',
 'dropoff_longitude',
 'dropoff_latitude',
 'passenger_count']

Extract time columns from "pickup_datetime"

In [6]:
# Extract time columns from "pickup_datetime"
# Day of month, Month of year, hour of day, and convert then to INTs

if "pickup_datetime" in df.columns:
	df = df.with_columns(
		[
			pl.col("pickup_datetime").str.slice(8, 2).cast(pl.Int32).alias("day"),
			pl.col("pickup_datetime").str.slice(5, 2).cast(pl.Int32).alias("month"),
			pl.col("pickup_datetime").str.slice(11, 2).cast(pl.Int32).alias("hour"),
		]
	)
	df = df.drop("pickup_datetime")

if "pickup_datetime" in test.columns:
	test = test.with_columns(
		[
			pl.col("pickup_datetime").str.slice(8, 2).cast(pl.Int32).alias("day"),
			pl.col("pickup_datetime").str.slice(5, 2).cast(pl.Int32).alias("month"),
			pl.col("pickup_datetime").str.slice(11, 2).cast(pl.Int32).alias("hour"),
		]
	)
	test = test.drop("pickup_datetime")


In [7]:
test.head(1)

key,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,day,month,hour
str,f64,f64,f64,f64,i64,i32,i32,i32
"""2015-01-27 13:08:24.0000002""",-73.97332,40.763805,-73.98143,40.743835,1,27,1,13


## Modelling

### 0. Set up

In [8]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.callbacks import EarlyStopping

2024-09-19 14:46:03.343741: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-19 14:46:03.357210: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-19 14:46:03.360830: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-19 14:46:03.371367: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
# handle missing values (Polars)
df = df.drop_nulls()

In [10]:
from sklearn.model_selection import train_test_split

X = df.drop(["key", "fare_amount"]).to_numpy()
y = df["fare_amount"].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [11]:
print(np.isnan(X_train).sum())  # Check for NaNs in features
print(np.isnan(y_train).sum())  # Check for NaNs in target

print(np.isinf(X_train).sum())  # Check for infinities in features
print(np.isinf(y_train).sum())  # Check for infinities in target

0
0
0
0


### 1. Architecture

In [12]:
def get_model():
    model = Sequential()
    
    n_cols = X_train.shape[1]
    
    model.add(Dense(64, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
    
    model.add(Dense(32, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
    
    model.add(Dense(16, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
    
    model.add(Dense(1))
    
    return model

model = get_model()

I0000 00:00:1726749965.689893   16257 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1726749965.714539   16257 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1726749965.714658   16257 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1726749965.717485   16257 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1726749965.717628   16257 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:0

### 2. Compile

In [13]:
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='mean_squared_error')

### 3. Fit

In [39]:
from tensorflow.keras.callbacks import Callback

class EpochLogger(Callback):
    def on_epoch_end(self, epoch, logs=None):
        if epoch % 10 == 0:
            print(f"Epoch: {epoch}, Loss: {logs['loss']}, Val Loss: {logs['val_loss']}")

early_stopping = EarlyStopping(monitor='loss', patience=50, verbose=1, restore_best_weights=True)
epoch_logger = EpochLogger()

model.fit(X_train, y_train,
			epochs=300,
   			batch_size=4096,
      		validation_split=0.2,
			callbacks=[early_stopping, epoch_logger],
			verbose=0)


model.save("model.keras")

Epoch: 0, Loss: 36.76386642456055, Val Loss: 161.05006408691406
Epoch: 10, Loss: 39.82878494262695, Val Loss: 193.03878784179688
Epoch: 20, Loss: 38.46057891845703, Val Loss: 1384.248291015625
Epoch: 30, Loss: 37.15626525878906, Val Loss: 186.79763793945312
Epoch: 40, Loss: 37.25606155395508, Val Loss: 226.6927490234375
Epoch: 50, Loss: 37.330047607421875, Val Loss: 380.3210144042969
Epoch 56: early stopping
Restoring model weights from the end of the best epoch: 6.


### 4. Evaluate predictions

In [40]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

y_pred = model.predict(X_test)

print(f"MSE: {mean_squared_error(y_test, y_pred)}")
print(f"MAE: {mean_absolute_error(y_test, y_pred)}")

[1m6250/6250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 1ms/step
MSE: 597.4277367138501
MAE: 23.42888110469049


In [41]:
predictions_df = pl.DataFrame(
	{
		"key": test["key"],
		"fare_amount": model.predict(test.drop(["key"]).to_numpy()).flatten(),
	}
)

[1m310/310[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step


In [42]:
predictions_df.write_csv("predicted_fare_amounts.csv")