In [1]:
import numpy as np
import pandas as pd
from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_score

# Load the processed data

In [2]:
# Read the preprocessed training split from disk and preview its first five rows.
train = pd.read_csv(
    filepath_or_buffer="/workspaces/NYC_taxi_trip/data/processed/train.csv",
)
train.head(n=5)

Unnamed: 0,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_day,pickup_month,pickup_year,pickup_Hour,pickup_Minute,pickup_Second
0,1,-73.982155,40.767937,-73.96463,40.765602,0,455.0,14,3,2016,17,24,55
1,1,-73.980415,40.738564,-73.999481,40.731152,0,663.0,12,6,2016,0,43,35
2,1,-73.979027,40.763939,-74.005333,40.710087,0,2124.0,19,1,2016,11,35,24
3,1,-74.01004,40.719971,-74.012268,40.706718,0,429.0,6,4,2016,19,32,31
4,1,-73.973053,40.793209,-73.972923,40.78252,0,435.0,26,3,2016,13,30,55


In [3]:
# Dimensions of the training frame as (rows, columns).
train.shape

(112463, 13)

In [4]:
# Read the preprocessed test split from disk and preview its first five rows.
test = pd.read_csv(
    filepath_or_buffer="/workspaces/NYC_taxi_trip/data/processed/test.csv",
)
test.head(n=5)

Unnamed: 0,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,pickup_day,pickup_month,pickup_year,pickup_Hour,pickup_Minute,pickup_Second
0,1.0,-73.988129,40.732029,-73.990173,40.75668,0,30,6,2016,23,59,58
1,1.0,-73.964203,40.679993,-73.959808,40.655403,0,30,6,2016,23,59,53
2,1.0,-73.997437,40.737583,-73.98616,40.729523,0,30,6,2016,23,59,47
3,1.0,-73.95607,40.7719,-73.986427,40.730469,0,30,6,2016,23,59,41
4,1.0,-73.970215,40.761475,-73.96151,40.75589,0,30,6,2016,23,59,33


In [5]:
# Dimensions of the test frame as (rows, columns).
test.shape

(162039, 12)

In [6]:
# Column labels of the training frame (includes the 'trip_duration' target).
train.columns

Index(['passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration', 'pickup_day', 'pickup_month', 'pickup_year',
       'pickup_Hour', 'pickup_Minute', 'pickup_Second'],
      dtype='object')

In [7]:
# Column labels of the test frame, for comparison with the training columns.
test.columns

Index(['passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'pickup_day', 'pickup_month', 'pickup_year', 'pickup_Hour',
       'pickup_Minute', 'pickup_Second'],
      dtype='object')

In [8]:
# Separate the predictors from the target. y_train is kept as a one-column
# DataFrame, which is why later cells flatten it with .values.ravel().
X_train = train.drop(columns=['trip_duration'])
y_train = train.loc[:, ['trip_duration']]

# Hyperparameter Tuning using hyperopt

In [32]:
# Discrete search space: every entry is an hp.choice over a fixed option list,
# labelled with the same name as the RandomForestRegressor keyword it feeds.
# NOTE(review): 'random_state' is a seed rather than a model hyperparameter;
# searching over it selects a favourable seed -- confirm this is intended.
search_space = {
    label: hp.choice(label, options)
    for label, options in [
        ('n_estimators', [1, 5, 10, 15, 20, 25, 50, 100, 150, 200]),
        ('max_depth', [2, 5, 6, 8]),
        ('random_state', [0, 3, 42]),
    ]
}

In [47]:
# Define the objective function
def objective(params):
    """Return the 4-fold cross-validated MSE of a random forest built from ``params``.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded unchanged to ``RandomForestRegressor``.

    Returns
    -------
    float
        Mean squared error averaged over the folds. This is the loss that
        Hyperopt minimizes, so lower is better.

    Notes
    -----
    Reads the notebook-level ``X_train`` and ``y_train``.
    """
    model = RandomForestRegressor(**params)
    folds = KFold(n_splits=4)

    # The scorer reports the *negated* MSE for each fold (higher is better).
    fold_scores = cross_val_score(
        model,
        X_train,
        y_train.values.ravel(),
        cv=folds,
        scoring='neg_mean_squared_error',
    )

    # Flip the sign back so the returned value is a positive MSE.
    return -np.mean(fold_scores)

In [48]:
# Run the TPE search over search_space, minimizing the value returned by
# objective. `trials` records every evaluation. `best` holds, for each
# hp.choice entry, the *index* of the winning option rather than its value.
#
# The search is seeded through `rstate` so that re-running this cell samples
# the same configurations; without it every run can pick a different winner.
# hyperopt >= 0.2.7 expects a numpy Generator here (older releases expected
# np.random.RandomState).
trials = Trials()
best = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials,
    rstate=np.random.default_rng(42),
)

100%|██████████| 10/10 [07:44<00:00, 46.47s/trial, best loss: 8666895.000302486]


In [35]:
# Raw fmin result: each value is a position in the matching hp.choice option
# list, not the hyperparameter value itself.
best

{'max_depth': 1, 'n_estimators': 8, 'random_state': 0}

In [38]:
# Convert the index-based fmin result into actual hyperparameter values.
# space_eval resolves every hp.choice index against search_space itself, so
# the option lists are not copied here and cannot drift out of sync with the
# search space definition.
best_hyperparams = space_eval(search_space, best)


In [39]:
# Resolved hyperparameter values that will be passed to the final model.
best_hyperparams

{'n_estimators': 150, 'max_depth': 5, 'random_state': 0}

In [40]:
# Build the final (still unfitted) forest from the selected hyperparameters.
rf_model = RandomForestRegressor(**best_hyperparams)

In [41]:
# Fit on the full training set. y_train is a one-column DataFrame, so it is
# flattened to a 1-D array before being handed to the estimator.
rf_model.fit(X_train, y_train.values.ravel())

In [42]:
# In-sample predictions: the model is scored on the same rows it was fitted on.
y_pred = rf_model.predict(X_train)

In [43]:
# Preview the predicted trip durations for the training rows.
y_pred

array([ 766.70328734,  770.28616734, 1368.9186792 , ...,  814.22546261,
        781.81268859,  761.46586076])

In [44]:
# Training-set MSE: y_pred was produced from X_train, so this is an in-sample
# error and not an estimate of performance on unseen data.
mean_squared_error(y_train.values.ravel(), y_pred)

8265809.657962441