# New York City Taxi Trip Duration - Attempts 3

## Modified the metric from `log1p RMSE` to Raw `RMSE` for both Keras and Keras Tuner

## Keras - 3 Attempts

# Attempt 1

In [None]:

import os
import random
import time
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError

# Reproducibility
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

# Data Loading
# Assuming CSVs are extracted from the provided ZIPs
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Identify columns
id_col = 'id'
target_columns = ['trip_duration']

# Prepare training data
df = df_train.copy()
# Log-transform target for regression
y_values = df[target_columns].astype(float).values
y_enc = np.log1p(y_values)
X = df.drop(columns=target_columns + [id_col], errors='ignore')

# Prepare test features
test_ids = df_test[id_col]
X_test = df_test.drop(columns=target_columns + [id_col], errors='ignore')

# Feature Engineering
# 1. Drop columns with all missing values
X = X.loc[:, X.isnull().mean() < 1.0]
X_test = X_test[X.columns]

# 2. Extract time features
for df_ in [X, X_test]:
    df_['pickup_datetime'] = pd.to_datetime(df_['pickup_datetime'])
    df_['hour'] = df_['pickup_datetime'].dt.hour
    df_['dayofweek'] = df_['pickup_datetime'].dt.dayofweek

# 3. Compute haversine distance
def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371  # Earth radius in km
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    dphi = np.radians(lat2 - lat1)
    dlambda = np.radians(lon2 - lon1)
    a = np.sin(dphi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(dlambda/2)**2
    return R * 2 * np.arcsin(np.sqrt(a))

for df_ in [X, X_test]:
    df_['haversine'] = haversine_distance(
        df_['pickup_latitude'], df_['pickup_longitude'],
        df_['dropoff_latitude'], df_['dropoff_longitude']
    )

# 4. Drop raw datetime & coordinate columns
drop_cols = ['pickup_datetime', 'dropoff_datetime',
             'pickup_latitude', 'pickup_longitude',
             'dropoff_latitude', 'dropoff_longitude']
X = X.drop(columns=[c for c in drop_cols if c in X])
X_test = X_test.drop(columns=[c for c in drop_cols if c in X_test])

# Train/Validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, y_enc, test_size=0.2, random_state=seed
)

# Preprocessing Pipeline
num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
# Drop high-cardinality cats
cat_cols = [c for c in cat_cols if X_train[c].nunique() < 50]

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols)
])

X_train_proc = preprocessor.fit_transform(X_train)
X_val_proc = preprocessor.transform(X_val)
X_test_proc = preprocessor.transform(X_test)

# Model Architecture for small feature set
n_features = X_train_proc.shape[1]
layer1 = min(n_features * 2, 128)
layer2 = min(n_features, 64)

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(n_features,)),
    tf.keras.layers.Dense(layer1, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(layer2, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='linear')
])

model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=[RootMeanSquaredError(name='root_mean_squared_error'), MeanAbsoluteError(name='mean_absolute_error')]
)

# Callbacks
callbacks = [
    EarlyStopping(patience=10, restore_best_weights=True),
    ModelCheckpoint('best_model.h5', save_best_only=True, monitor='val_loss')
]

# Training
t0 = time.time()
history = model.fit(
    X_train_proc, y_train,
    validation_data=(X_val_proc, y_val),
    epochs=100,
    batch_size=64,
    callbacks=callbacks,
    verbose=2
)
train_duration = time.time() - t0

# Evaluation & Logging
results = {
    'training_accuracy': history.history['root_mean_squared_error'][-1],
    'training_loss': history.history['loss'][-1],
    'validation_accuracy': history.history['val_root_mean_squared_error'][-1],
    'validation_loss': history.history['val_loss'][-1]
}
with open('results.json', 'w') as f:
    json.dump(results, f)

# Prediction & Submission
raw_preds = model.predict(X_test_proc)
final_preds = np.expm1(np.clip(raw_preds, a_min=None, a_max=20))
if final_preds.ndim == 1:
    final_preds = final_preds.reshape(-1, 1)

submission = pd.DataFrame(final_preds, columns=target_columns)
submission.insert(0, id_col, test_ids.reset_index(drop=True))
submission.to_csv('submission_result.csv', index=False)


KeyError: "['dropoff_datetime'] not in index"

# Attempt 2 - Follow-up

In [None]:
import os
import random
import time
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError

# Reproducibility
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

# Data Loading
# Assuming CSVs are extracted from the provided ZIPs
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Identify columns
id_col = 'id'
target_columns = ['trip_duration']

# Prepare training data
df = df_train.copy()
# Log-transform target for regression
y_values = df[target_columns].astype(float).values
y_enc = np.log1p(y_values)
X = df.drop(columns=target_columns + [id_col], errors='ignore')

# Prepare test features
test_ids = df_test[id_col]
X_test = df_test.drop(columns=target_columns + [id_col], errors='ignore')

# Feature Engineering
# 1. Drop columns with all missing values
X = X.loc[:, X.isnull().mean() < 1.0]
X_test = X_test.loc[:, X.columns]  # Fixed this line

# 2. Extract time features
for df_ in [X, X_test]:
    df_['pickup_datetime'] = pd.to_datetime(df_['pickup_datetime'])
    df_['hour'] = df_['pickup_datetime'].dt.hour
    df_['dayofweek'] = df_['pickup_datetime'].dt.dayofweek

# 3. Compute haversine distance
def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371  # Earth radius in km
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    dphi = np.radians(lat2 - lat1)
    dlambda = np.radians(lon2 - lon1)
    a = np.sin(dphi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(dlambda/2)**2
    return R * 2 * np.arcsin(np.sqrt(a))

for df_ in [X, X_test]:
    df_['haversine'] = haversine_distance(
        df_['pickup_latitude'], df_['pickup_longitude'],
        df_['dropoff_latitude'], df_['dropoff_longitude']
    )

# 4. Drop raw datetime & coordinate columns
drop_cols = ['pickup_datetime', 'dropoff_datetime',
             'pickup_latitude', 'pickup_longitude',
             'dropoff_latitude', 'dropoff_longitude']
X = X.drop(columns=[c for c in drop_cols if c in X])
X_test = X_test.drop(columns=[c for c in drop_cols if c in X_test])

# Train/Validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, y_enc, test_size=0.2, random_state=seed
)

# Preprocessing Pipeline
num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
# Drop high-cardinality cats
cat_cols = [c for c in cat_cols if X_train[c].nunique() < 50]

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols)
])

X_train_proc = preprocessor.fit_transform(X_train)
X_val_proc = preprocessor.transform(X_val)
X_test_proc = preprocessor.transform(X_test)

# Model Architecture for small feature set
n_features = X_train_proc.shape[1]
layer1 = min(n_features * 2, 128)
layer2 = min(n_features, 64)

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(n_features,)),
    tf.keras.layers.Dense(layer1, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(layer2, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='linear')
])

model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=[RootMeanSquaredError(name='root_mean_squared_error'), MeanAbsoluteError(name='mean_absolute_error')]
)

# Callbacks
callbacks = [
    EarlyStopping(patience=10, restore_best_weights=True),
    ModelCheckpoint('best_model.h5', save_best_only=True, monitor='val_loss')
]

# Training
t0 = time.time()
history = model.fit(
    X_train_proc, y_train,
    validation_data=(X_val_proc, y_val),
    epochs=100,
    batch_size=64,
    callbacks=callbacks,
    verbose=2
)
train_duration = time.time() - t0

# Evaluation & Logging
results = {
    'training_accuracy': history.history['root_mean_squared_error'][-1],
    'training_loss': history.history['loss'][-1],
    'validation_accuracy': history.history['val_root_mean_squared_error'][-1],
    'validation_loss': history.history['val_loss'][-1]
}
with open('results.json', 'w') as f:
    json.dump(results, f)

# Prediction & Submission
raw_preds = model.predict(X_test_proc)
final_preds = np.expm1(np.clip(raw_preds, a_min=None, a_max=20))
if final_preds.ndim == 1:
    final_preds = final_preds.reshape(-1, 1)

submission = pd.DataFrame(final_preds, columns=target_columns)
submission.insert(0, id_col, test_ids.reset_index(drop=True))
submission.to_csv('submission_result.csv', index=False)

KeyError: "['dropoff_datetime'] not in index"

# Attempt 3 - Follow-up with the error and a hint: 

# Error: 

In [None]:
KeyError                                  Traceback (most recent call last)
Cell In[10], line 51
     49 X_train = X_train.dropna(axis=1, how='all')
     50 X_val = X_val[X_train.columns]
---> 51 df_test = df_test[X_train.columns]
     53 # Extract datetime features and drop originals
     54 for ds in [X_train, X_val, df_test]:

File ~/.local/lib/python3.12/site-packages/pandas/core/frame.py:4113, in DataFrame.__getitem__(self, key)
   4111     if is_iterator(key):
   4112         key = list(key)
-> 4113     indexer = self.columns._get_indexer_strict(key, "columns")[1]
   4115 # take() does not accept boolean indexers
   4116 if getattr(indexer, "dtype", None) == bool:

File ~/.local/lib/python3.12/site-packages/pandas/core/indexes/base.py:6212, in Index._get_indexer_strict(self, key, axis_name)
   6209 else:
   6210     keyarr, indexer, new_indexer = self._reindex_non_unique(keyarr)
-> 6212 self._raise_if_missing(keyarr, indexer, axis_name)
   6214 keyarr = self.take(indexer)
   6215 if isinstance(key, Index):
   6216     # GH 42790 - Preserve name from an Index

File ~/.local/lib/python3.12/site-packages/pandas/core/indexes/base.py:6264, in Index._raise_if_missing(self, key, indexer, axis_name)
   6261     raise KeyError(f"None of [{key}] are in the [{axis_name}]")
   6263 not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
-> 6264 raise KeyError(f"{not_found} not in index")

KeyError: "['dropoff_datetime'] not in index"

# Hint:

In [None]:
the column doesnt exist in the testing file drop it from the training then too 

# Results

In [None]:
import random, os, time, json
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline

# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Data loading
# Assumes CSV inside each zip has same base name
df_train = pd.read_csv('nyc-taxi-trip-duration/train.zip', compression='zip')
df_test = pd.read_csv('nyc-taxi-trip-duration/test.zip', compression='zip')
sample_sub = pd.read_csv('nyc-taxi-trip-duration/sample_submission.zip', compression='zip')

# Infer columns
id_col = sample_sub.columns[0]
target_columns = sample_sub.columns.tolist()[1:]

# Prepare train data
df = df_train.copy()

# Target encoding for regression
y_values = df[target_columns].astype(float).values
y_enc = np.log1p(y_values)

# Features and split
X = df.drop(columns=target_columns + [id_col, 'dropoff_datetime'], errors='ignore')  # Drop 'dropoff_datetime'
X_train, X_val, y_train, y_val = train_test_split(
    X, y_enc, test_size=0.2, random_state=SEED
)

# Store IDs for submission
test_ids = df_test[id_col]

# Feature engineering: drop all-missing
X_train = X_train.dropna(axis=1, how='all')
X_val = X_val[X_train.columns]
df_test = df_test[X_train.columns]

# Extract datetime features and drop originals
for ds in [X_train, X_val, df_test]:
    ds['pickup_datetime'] = pd.to_datetime(ds['pickup_datetime'])
    ds['hour'] = ds['pickup_datetime'].dt.hour
    ds['day'] = ds['pickup_datetime'].dt.day
    ds['month'] = ds['pickup_datetime'].dt.month
    ds['weekday'] = ds['pickup_datetime'].dt.weekday
    ds.drop(columns=['pickup_datetime'], inplace=True, errors='ignore')  # Drop only 'pickup_datetime'

# Compute haversine distance
def haversine_np(lat1, lon1, lat2, lon2):
    R = 6371  # km
    lat1, lon1, lat2, lon2 = map(np.radians, (lat1, lon1, lat2, lon2))
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2)**2
    return 2 * R * np.arcsin(np.sqrt(a)) * 1000  # meters

for ds in [X_train, X_val, df_test]:
    ds['distance'] = haversine_np(
        ds['pickup_latitude'], ds['pickup_longitude'],
        ds['dropoff_latitude'], ds['dropoff_longitude']
    )
    ds.drop(columns=['pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude'], inplace=True)

# Identify numeric vs categorical
numeric_cols = X_train.select_dtypes(include=['int64','float64']).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object','category']).columns.tolist()
# Cap categorical cardinality
categorical_cols = [c for c in categorical_cols if X_train[c].nunique() <= 50]

# Preprocessing pipelines
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])
preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_cols),
    ('cat', categorical_pipeline, categorical_cols)
])

# Fit-transform
elems = preprocessor.fit_transform(X_train)
X_train_proc = elems
X_val_proc = preprocessor.transform(X_val)
X_test_proc = preprocessor.transform(df_test)

n_samples, n_features = X_train_proc.shape

# Build model: large dataset -> deeper
units = [min(int(n_features * i), 1024) for i in [2,1,0.5,0.25]]
units = [u for u in units if u >= 16]
model = Sequential()
for u in units:
    model.add(Dense(u, activation='relu', input_shape=(n_features,)))
    model.add(BatchNormalization())
    model.add(Dropout(0.4))
# Output layer
model.add(Dense(len(target_columns), activation='linear'))

CAP = 15.0      

def to_seconds(log_v):
    log_v = tf.clip_by_value(log_v, -1e6, CAP)          
    return tf.math.expm1(log_v)

def mse_real(y_true_log, y_pred_log):
    y_true = to_seconds(y_true_log)
    y_pred = to_seconds(y_pred_log)
    return tf.reduce_mean(tf.square(y_true - y_pred))
mse_real.__name__ = "mse_real"

def rmse_real(y_true_log, y_pred_log):
    return tf.sqrt(mse_real(y_true_log, y_pred_log))
rmse_real.__name__ = "rmse_real"

# Compile
model.compile(
    optimizer=Adam(),
    loss='mean_squared_error',
    metrics=[rmse_real, mse_real]
)

# Callbacks
callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    ModelCheckpoint('best_model.h5', save_best_only=True)
]

# Train
start_time = time.time()
history = model.fit(
    X_train_proc, y_train,
    validation_data=(X_val_proc, y_val),
    epochs=100, batch_size=64,
    callbacks=callbacks, verbose=2
)
duration = time.time() - start_time

# Save results
results = {
    'training_loss': history.history['mse_real'][-1],
    'training_rmse': history.history['rmse_real'][-1],
    'validation_loss': history.history['val_mse_real'][-1],
    'validation_rmse': history.history['val_rmse_real'][-1],
    'duration_sec': duration
}
with open('Keras/results.json','w') as f:
    json.dump(results, f)

# Predict & submit
raw_preds = model.predict(X_test_proc)
final = np.expm1(np.clip(raw_preds, a_min=None, a_max=20))
if final.ndim == 1:
    final = final.reshape(-1,1)
submission = pd.DataFrame(final, columns=target_columns)
submission.insert(0, id_col, test_ids.reset_index(drop=True))
submission.to_csv('Keras/submission_result.csv', index=False)

Epoch 1/100




18234/18234 - 21s - 1ms/step - loss: 2.3396 - mse_real: 199568032.0000 - rmse_real: 2124.6367 - val_loss: 0.4060 - val_mse_real: 379958592.0000 - val_rmse_real: 2660.3555
Epoch 2/100




18234/18234 - 21s - 1ms/step - loss: 0.4297 - mse_real: 248985904.0000 - rmse_real: 2203.8848 - val_loss: 0.4060 - val_mse_real: 379952960.0000 - val_rmse_real: 2660.3311
Epoch 3/100
18234/18234 - 21s - 1ms/step - loss: 0.4297 - mse_real: 248983232.0000 - rmse_real: 2203.8794 - val_loss: 0.4060 - val_mse_real: 379952960.0000 - val_rmse_real: 2660.3311
Epoch 4/100
18234/18234 - 21s - 1ms/step - loss: 0.4297 - mse_real: 248983232.0000 - rmse_real: 2203.8794 - val_loss: 0.4060 - val_mse_real: 379952960.0000 - val_rmse_real: 2660.3311
Epoch 5/100
18234/18234 - 21s - 1ms/step - loss: 0.4297 - mse_real: 248983232.0000 - rmse_real: 2203.8794 - val_loss: 0.4060 - val_mse_real: 379952960.0000 - val_rmse_real: 2660.3311
Epoch 6/100
18234/18234 - 21s - 1ms/step - loss: 0.4297 - mse_real: 248983232.0000 - rmse_real: 2203.8794 - val_loss: 0.4060 - val_mse_real: 379952960.0000 - val_rmse_real: 2660.3308
Epoch 7/100
18234/18234 - 21s - 1ms/step - loss: 0.4297 - mse_real: 248983232.0000 - rmse_real: 2

In [16]:
print(duration)

250.13052558898926


## Keras Tuner - 2 Attempts 

## Attempt 1

In [None]:
import random, os, time, json
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline

# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Data loading
# Assumes CSV inside each zip has same base name
df_train = pd.read_csv('nyc-taxi-trip-duration/train.zip', compression='zip')
df_test = pd.read_csv('nyc-taxi-trip-duration/test.zip', compression='zip')
sample_sub = pd.read_csv('nyc-taxi-trip-duration/sample_submission.zip', compression='zip')

# Infer columns
id_col = sample_sub.columns[0]
target_columns = sample_sub.columns.tolist()[1:]

# Prepare train data
df = df_train.copy()

# Target encoding for regression
y_values = df[target_columns].astype(float).values
y_enc = np.log1p(y_values)

# Features and split
X = df.drop(columns=target_columns + [id_col, 'dropoff_datetime'], errors='ignore')  # Drop 'dropoff_datetime'
X_train, X_val, y_train, y_val = train_test_split(
    X, y_enc, test_size=0.2, random_state=SEED
)

# Store IDs for submission
test_ids = df_test[id_col]

# Feature engineering: drop all-missing
X_train = X_train.dropna(axis=1, how='all')
X_val = X_val[X_train.columns]
df_test = df_test[X_train.columns]

# Extract datetime features and drop originals
for ds in [X_train, X_val, df_test]:
    ds['pickup_datetime'] = pd.to_datetime(ds['pickup_datetime'])
    ds['hour'] = ds['pickup_datetime'].dt.hour
    ds['day'] = ds['pickup_datetime'].dt.day
    ds['month'] = ds['pickup_datetime'].dt.month
    ds['weekday'] = ds['pickup_datetime'].dt.weekday
    ds.drop(columns=['pickup_datetime'], inplace=True, errors='ignore')  # Drop only 'pickup_datetime'

# Compute haversine distance
def haversine_np(lat1, lon1, lat2, lon2):
    R = 6371  # km
    lat1, lon1, lat2, lon2 = map(np.radians, (lat1, lon1, lat2, lon2))
    dlat = lat2 - lat1
    dlon = lat2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2)**2
    return 2 * R * np.arcsin(np.sqrt(a)) * 1000  # meters

for ds in [X_train, X_val, df_test]:
    ds['distance'] = haversine_np(
        ds['pickup_latitude'], ds['pickup_longitude'],
        ds['dropoff_latitude'], ds['dropoff_longitude']
    )
    ds.drop(columns=['pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude'], inplace=True)

# Identify numeric vs categorical
numeric_cols = X_train.select_dtypes(include=['int64','float64']).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object','category']).columns.tolist()
# Cap categorical cardinality
categorical_cols = [c for c in categorical_cols if X_train[c].nunique() <= 50]

# Preprocessing pipelines
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])
preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_cols),
    ('cat', categorical_pipeline, categorical_cols)
])

# Fit-transform
elems = preprocessor.fit_transform(X_train)
X_train_proc = elems
X_val_proc = preprocessor.transform(X_val)
X_test_proc = preprocessor.transform(df_test)

n_samples, n_features = X_train_proc.shape

# Keras-Tuner model definition
import keras_tuner as kt
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)

class MyHyperModel(kt.HyperModel):
    def build(self, hp):
        layers = hp.Int('layers', 2, 8)
        units = hp.Int('units', 64, 1024, 64)
        drop = hp.Float('dropout', 0.0, 0.5)
        opt = hp.Choice('optimizer', ['adam'])
        lr = hp.Float('learning_rate', 1e-5, 0.01, sampling='log')

        model = Sequential()
        for _ in range(layers):
            model.add(Dense(units, activation='relu', input_shape=(n_features,)))
            model.add(BatchNormalization())
            model.add(Dropout(drop))
        model.add(Dense(len(target_columns), activation='linear'))

        model.compile(optimizer=opt, loss='mean_squared_error', metrics=[RootMeanSquaredError(name='rmse'), MeanAbsoluteError(name='mae')])
        return model

bs = 32  # Example batch size
ep = 100  # Example epochs

tuner = kt.BayesianOptimization(
    MyHyperModel(),
    objective='val_loss',
    max_trials=1,
    executions_per_trial=1,
    seed=42,
    overwrite=True,
    project_name='bayesian_tuner'
)

if y_val is not None:
    tuner.search(
        X_train_proc, y_train,
        validation_data=(X_val_proc, y_val),
        batch_size=bs, epochs=1,
        callbacks=[early_stopping, checkpoint]
    )
else:
    tuner.search(
        X_train_proc, y_train,
        validation_split=0.2,
        batch_size=bs, epochs=1,
        callbacks=[early_stopping, checkpoint]
    )

model = tuner.hypermodel.build(
    tuner.get_best_hyperparameters(1)[0]
)

if y_val is not None:
    history = model.fit(
        X_train_proc, y_train,
        validation_data=(X_val_proc, y_val),
        epochs=1, batch_size=bs,
        callbacks=[early_stopping, checkpoint],
        verbose=2
    )
else:
    history = model.fit(
        X_train_proc, y_train,
        validation_split=0.2,
        epochs=1, batch_size=bs,
        callbacks=[early_stopping, checkpoint],
        verbose=2
    )

# Save results
results = {
    'training_loss': history.history['loss'][-1],
    'training_rmse': history.history['rmse'][-1],
    'validation_loss': history.history['val_loss'][-1],
    'validation_rmse': history.history['val_rmse'][-1],
    'duration_sec': duration
}
with open('results.json','w') as f:
    json.dump(results, f)

# Predict & submit
raw_preds = model.predict(X_test_proc)
final = np.expm1(np.clip(raw_preds, a_min=None, a_max=20))
if final.ndim == 1:
    final = final.reshape(-1,1)
submission = pd.DataFrame(final, columns=target_columns)
submission.insert(0, id_col, test_ids.reset_index(drop=True))
submission.to_csv('submission_result.csv', index=False)

Trial 1 Complete [00h 01m 41s]
val_loss: 0.6225534677505493

Best val_loss So Far: 0.6225534677505493
Total elapsed time: 00h 01m 41s




36467/36467 - 95s - 3ms/step - loss: 0.9448 - mae: 0.6957 - rmse: 0.9720 - val_loss: 0.6464 - val_mae: 0.5978 - val_rmse: 0.8040


NameError: name 'duration' is not defined

## Attempt 2

In [None]:
import random, os, time, json
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline

# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Data loading
# Assumes CSV inside each zip has same base name
df_train = pd.read_csv('nyc-taxi-trip-duration/train.zip', compression='zip')
df_test = pd.read_csv('nyc-taxi-trip-duration/test.zip', compression='zip')
sample_sub = pd.read_csv('nyc-taxi-trip-duration/sample_submission.zip', compression='zip')

# Infer columns
id_col = sample_sub.columns[0]
target_columns = sample_sub.columns.tolist()[1:]

# Prepare train data
df = df_train.copy()

# Target encoding for regression
y_values = df[target_columns].astype(float).values
y_enc = np.log1p(y_values)

# Features and split
X = df.drop(columns=target_columns + [id_col, 'dropoff_datetime'], errors='ignore')  # Drop 'dropoff_datetime'
X_train, X_val, y_train, y_val = train_test_split(
    X, y_enc, test_size=0.2, random_state=SEED
)

# Store IDs for submission
test_ids = df_test[id_col]

# Feature engineering: drop all-missing
X_train = X_train.dropna(axis=1, how='all')
X_val = X_val[X_train.columns]
df_test = df_test[X_train.columns]

# Extract datetime features and drop originals
for ds in [X_train, X_val, df_test]:
    ds['pickup_datetime'] = pd.to_datetime(ds['pickup_datetime'])
    ds['hour'] = ds['pickup_datetime'].dt.hour
    ds['day'] = ds['pickup_datetime'].dt.day
    ds['month'] = ds['pickup_datetime'].dt.month
    ds['weekday'] = ds['pickup_datetime'].dt.weekday
    ds.drop(columns=['pickup_datetime'], inplace=True, errors='ignore')  # Drop only 'pickup_datetime'

# Compute haversine distance
def haversine_np(lat1, lon1, lat2, lon2):
    R = 6371  # km
    lat1, lon1, lat2, lon2 = map(np.radians, (lat1, lon1, lat2, lon2))
    dlat = lat2 - lat1
    dlon = lat2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2)**2
    return 2 * R * np.arcsin(np.sqrt(a)) * 1000  # meters

for ds in [X_train, X_val, df_test]:
    ds['distance'] = haversine_np(
        ds['pickup_latitude'], ds['pickup_longitude'],
        ds['dropoff_latitude'], ds['dropoff_longitude']
    )
    ds.drop(columns=['pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude'], inplace=True)

# Identify numeric vs categorical
numeric_cols = X_train.select_dtypes(include=['int64','float64']).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object','category']).columns.tolist()
# Cap categorical cardinality
categorical_cols = [c for c in categorical_cols if X_train[c].nunique() <= 50]

# Preprocessing pipelines
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])
preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numeric_cols),
    ('cat', categorical_pipeline, categorical_cols)
])

# Fit-transform
elems = preprocessor.fit_transform(X_train)
X_train_proc = elems
X_val_proc = preprocessor.transform(X_val)
X_test_proc = preprocessor.transform(df_test)

n_samples, n_features = X_train_proc.shape

# Keras-Tuner model definition
import keras_tuner as kt
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.metrics import RootMeanSquaredError, MeanAbsoluteError

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
checkpoint = ModelCheckpoint('best_model.h5', monitor='val_loss', save_best_only=True)

def mse_real(y_true_log, y_pred_log):
    y_true = tf.math.expm1(y_true_log)
    y_pred = tf.math.expm1(y_pred_log)
    return tf.reduce_mean(tf.square(y_true - y_pred))
mse_real.__name__ = 'mse_real'      

def rmse_real(y_true_log, y_pred_log):
    y_true = tf.math.expm1(y_true_log)
    y_pred = tf.math.expm1(y_pred_log)
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))
rmse_real.__name__ = 'rmse_real'


class MyHyperModel(kt.HyperModel):
    def build(self, hp):
        layers = hp.Int('layers', 2, 8)
        units = hp.Int('units', 64, 1024, 64)
        drop = hp.Float('dropout', 0.0, 0.5)
        opt = hp.Choice('optimizer', ['adam'])
        lr = hp.Float('learning_rate', 1e-5, 0.01, sampling='log')

        model = Sequential()
        for _ in range(layers):
            model.add(Dense(units, activation='relu', input_shape=(n_features,)))
            model.add(BatchNormalization())
            model.add(Dropout(drop))
        model.add(Dense(len(target_columns), activation='linear'))

        model.compile(optimizer=opt, loss='mean_squared_error', metrics=[rmse_real, mse_real])
        return model

bs = 32  # Example batch size
ep = 100  # Example epochs

tuner = kt.BayesianOptimization(
    MyHyperModel(),
    objective='val_loss',
    max_trials=10,
    executions_per_trial=1,
    seed=42,
    overwrite=False,
    project_name='bayesian_tuner'
)

if y_val is not None:
    tuner.search(
        X_train_proc, y_train,
        validation_data=(X_val_proc, y_val),
        batch_size=bs, epochs=ep,
        callbacks=[early_stopping, checkpoint]
    )
else:
    tuner.search(
        X_train_proc, y_train,
        validation_split=0.2,
        batch_size=bs, epochs=ep,
        callbacks=[early_stopping, checkpoint]
    )

model = tuner.hypermodel.build(
    tuner.get_best_hyperparameters(1)[0]
)

start_time = time.time()  # Start timing


if y_val is not None:
    history = model.fit(
        X_train_proc, y_train,
        validation_data=(X_val_proc, y_val),
        epochs=100, batch_size=bs,
        callbacks=[early_stopping, checkpoint],
        verbose=2
    )
else:
    history = model.fit(
        X_train_proc, y_train,
        validation_split=0.2,
        epochs=100, batch_size=bs,
        callbacks=[early_stopping, checkpoint],
        verbose=2
    )

duration = time.time() - start_time  # Calculate duration

# Save results
results = {
    'training_loss': history.history['mse_real'][-1],
    'validation_loss': history.history['val_mse_real'][-1],
    'training_RMSE': history.history['rmse_real'][-1],
    'validation_RMSE': history.history['val_rmse_real'][-1]
}
with open('results.json','w') as f:
    json.dump(results, f)

# Predict & submit
raw_preds = model.predict(X_test_proc)
final = np.expm1(np.clip(raw_preds, a_min=None, a_max=20))
if final.ndim == 1:
    final = final.reshape(-1,1)
submission = pd.DataFrame(final, columns=target_columns)
submission.insert(0, id_col, test_ids.reset_index(drop=True))
submission.to_csv('submission_result.csv', index=False)

Trial 10 Complete [00h 34m 01s]
val_loss: 0.6902359127998352

Best val_loss So Far: 0.6206799149513245
Total elapsed time: 05h 31m 39s
Epoch 1/100




36467/36467 - 130s - 4ms/step - loss: 0.9448 - mse_real: 36912424.0000 - rmse_real: 1405.2650 - val_loss: 0.6464 - val_mse_real: inf - val_rmse_real: inf
Epoch 2/100




36467/36467 - 109s - 3ms/step - loss: 0.6339 - mse_real: 31727296.0000 - rmse_real: 1315.4545 - val_loss: 0.6363 - val_mse_real: 10658688.0000 - val_rmse_real: 1298.0585
Epoch 3/100
36467/36467 - 128s - 4ms/step - loss: 0.6202 - mse_real: 31721886.0000 - rmse_real: 1310.7314 - val_loss: 0.6655 - val_mse_real: 10665403.0000 - val_rmse_real: 1300.5341
Epoch 4/100
36467/36467 - 145s - 4ms/step - loss: 0.6175 - mse_real: 31720854.0000 - rmse_real: 1309.6033 - val_loss: 0.7125 - val_mse_real: 10671490.0000 - val_rmse_real: 1303.9047
Epoch 5/100
36467/36467 - 146s - 4ms/step - loss: 0.6173 - mse_real: 31719858.0000 - rmse_real: 1309.4104 - val_loss: 0.6960 - val_mse_real: 10668701.0000 - val_rmse_real: 1302.5890
Epoch 6/100
36467/36467 - 137s - 4ms/step - loss: 0.6170 - mse_real: 31719784.0000 - rmse_real: 1309.2445 - val_loss: 0.9268 - val_mse_real: 10667623.0000 - val_rmse_real: 1302.5286
Epoch 7/100
36467/36467 - 141s - 4ms/step - loss: 0.6167 - mse_real: 31719604.0000 - rmse_real: 1308.9

In [None]:
print(duration)

1714.245115
