In [96]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import joblib

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [73]:
pick=pd.read_csv('/content/drive/MyDrive/pickup_data (2).csv')

In [74]:
time_columns = ['accept_time','time_window_start','time_window_end','pickup_time','pickup_gps_time','accept_gps_time','ds']

for col in time_columns:
    pick[col] = pd.to_datetime(pick[col],errors='coerce')

In [51]:
pick.head()

Unnamed: 0,order_id,region_id,city,courier_id,accept_time,time_window_start,time_window_end,lng,lat,aoi_id,...,pickup_time,pickup_gps_time,pickup_gps_lng,pickup_gps_lat,accept_gps_time,accept_gps_lng,accept_gps_lat,ds,task_duration,distance
0,483671,3,Chongqing,1518,2014-01-08 07:57:00,2014-01-08 09:00:00,2014-01-08 11:00:00,106.46877,29.47204,218,...,2014-01-08 09:38:00,2014-01-08 09:38:00,106.52113,29.5981,2014-01-08 07:57:00,106.52199,29.59889,2014-01-08 09:38:00,101.0,0.001168
1,1746131,3,Chongqing,4706,2024-10-09 07:46:00,2024-10-09 09:00:00,2024-10-09 11:00:00,106.46872,29.472,218,...,2024-10-09 09:42:00,2024-10-09 09:42:00,106.52113,29.5981,2024-10-09 07:46:00,106.52199,29.59889,2024-10-09 09:42:00,116.0,0.001168
2,2301722,3,Chongqing,4706,2024-10-09 13:57:00,2024-10-09 13:57:00,2024-10-09 15:57:00,106.46869,29.47191,218,...,2024-10-09 15:53:00,2024-10-09 15:53:00,106.46821,29.46771,2024-10-09 13:57:00,106.46929,29.47231,2024-10-09 15:53:00,116.0,0.004725
3,3788723,3,Chongqing,4706,2019-01-05 08:13:00,2019-01-05 11:00:00,2019-01-05 13:00:00,106.46878,29.47208,218,...,2019-01-05 11:59:00,2019-01-05 11:59:00,106.52113,29.5981,2019-01-05 08:13:00,106.52199,29.59889,2019-01-05 11:59:00,226.0,0.001168
4,713435,3,Chongqing,4706,2022-01-05 08:16:00,2022-01-05 09:00:00,2022-01-05 11:00:00,106.46813,29.47228,218,...,2022-01-05 10:40:00,2022-01-05 10:40:00,106.46827,29.4727,2022-01-05 08:16:00,106.52199,29.59889,2022-01-05 10:40:00,144.0,0.137149


In [75]:
encoder = LabelEncoder()
pick['city'] = encoder.fit_transform(pick['city'])

In [76]:
pick.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3957642 entries, 0 to 3957641
Data columns (total 21 columns):
 #   Column             Dtype         
---  ------             -----         
 0   order_id           int64         
 1   region_id          int64         
 2   city               int64         
 3   courier_id         int64         
 4   accept_time        datetime64[ns]
 5   time_window_start  datetime64[ns]
 6   time_window_end    datetime64[ns]
 7   lng                float64       
 8   lat                float64       
 9   aoi_id             int64         
 10  aoi_type           int64         
 11  pickup_time        datetime64[ns]
 12  pickup_gps_time    datetime64[ns]
 13  pickup_gps_lng     float64       
 14  pickup_gps_lat     float64       
 15  accept_gps_time    datetime64[ns]
 16  accept_gps_lng     float64       
 17  accept_gps_lat     float64       
 18  ds                 datetime64[ns]
 19  task_duration      float64       
 20  distance           float

In [77]:
## log transformation
pick['log_distance'] = np.log1p(pick['distance'])

In [78]:
## standardization
scaler = StandardScaler()
features_to_scale = ['distance', 'log_distance']
pick[features_to_scale] = scaler.fit_transform(pick[features_to_scale])

In [79]:
## scaling
minmax=MinMaxScaler()
features_to_scale = ['distance', 'log_distance']
pick[features_to_scale] = minmax.fit_transform(pick[features_to_scale])

In [71]:
pick.head()

Unnamed: 0,order_id,region_id,city,courier_id,lng,lat,aoi_id,aoi_type,pickup_gps_lng,pickup_gps_lat,...,ds,ETA,distance,log_distance,ETA.1,accept_hour,accept_day,accept_month,accept_weekday,is_weekend
9713,3952510,3,0,4647,106.46738,29.49415,5939,1,106.46706,29.49396,...,1930-01-05 08:32:00,40.0,0.006408,0.013352,40.0,7,5,1,6,1
47211,1408149,22,0,15344,106.48377,29.53724,1105,4,106.52113,29.5981,...,1930-01-05 08:33:00,74.0,0.000415,0.000872,74.0,7,5,1,6,1
7796,3470003,3,0,8904,106.46255,29.47972,4462,1,106.52113,29.5981,...,1930-01-05 08:35:00,34.0,0.000415,0.000872,34.0,8,5,1,6,1
48579,4491492,22,0,9259,106.47586,29.53952,1601,1,106.52113,29.5981,...,1930-01-05 08:36:00,36.0,0.000415,0.000872,36.0,8,5,1,6,1
90367,4403886,22,0,15498,106.49179,29.52801,13958,0,106.49194,29.52895,...,1930-01-05 08:42:00,55.0,0.001965,0.004119,55.0,7,5,1,6,1


In [80]:
pick['ETA']=pick['task_duration']

In [81]:

# Extract time features
pick['accept_hour'] = pick['accept_time'].dt.hour
pick['accept_day'] = pick['accept_time'].dt.day
pick['accept_month'] = pick['accept_time'].dt.month
pick['accept_weekday'] = pick['accept_time'].dt.weekday
pick['is_weekend'] = (pick['accept_weekday'] >= 5).astype(int)

In [82]:
pick.columns

Index(['order_id', 'region_id', 'city', 'courier_id', 'accept_time',
       'time_window_start', 'time_window_end', 'lng', 'lat', 'aoi_id',
       'aoi_type', 'pickup_time', 'pickup_gps_time', 'pickup_gps_lng',
       'pickup_gps_lat', 'accept_gps_time', 'accept_gps_lng', 'accept_gps_lat',
       'ds', 'task_duration', 'distance', 'log_distance', 'ETA', 'accept_hour',
       'accept_day', 'accept_month', 'accept_weekday', 'is_weekend'],
      dtype='object')

In [83]:
pick.drop([ 'accept_time',
       'time_window_start', 'time_window_end',
        'pickup_time', 'pickup_gps_time', 'accept_gps_time'],axis=1,inplace=True)

In [84]:
pick=pick.head(100000)

In [67]:
pick.head()

Unnamed: 0,order_id,region_id,city,courier_id,lng,lat,aoi_id,aoi_type,pickup_gps_lng,pickup_gps_lat,accept_gps_lng,accept_gps_lat,ETA,distance,log_distance,accept_hour,accept_day,accept_month,accept_weekday,is_weekend
0,483671,3,0,1518,106.46877,29.47204,218,14,106.52113,29.5981,106.52199,29.59889,101.0,0.001168,0.001167,7,8,1,2,0
1,1746131,3,0,4706,106.46872,29.472,218,14,106.52113,29.5981,106.52199,29.59889,116.0,0.001168,0.001167,7,9,10,2,0
2,2301722,3,0,4706,106.46869,29.47191,218,14,106.46821,29.46771,106.46929,29.47231,116.0,0.004725,0.004714,13,9,10,2,0
3,3788723,3,0,4706,106.46878,29.47208,218,14,106.52113,29.5981,106.52199,29.59889,226.0,0.001168,0.001167,8,5,1,5,1
4,713435,3,0,4706,106.46813,29.47228,218,14,106.46827,29.4727,106.52199,29.59889,144.0,0.137149,0.128524,8,5,1,2,0


In [85]:
pick = pick.sort_values(by='ds')

In [86]:
X=pick.drop(['ETA','ds'],axis=1)
y=pick['ETA']

splitting the dataset

In [87]:
# Split into 60% train, 20% validation, 20% test
train_size = int(0.6 * len(pick))
valid_size = int(0.2 * len(pick))

X_train, y_train = X[:train_size], y[:train_size]
X_valid, y_valid = X[train_size:train_size+valid_size], y[train_size:train_size+valid_size]
X_test, y_test = X[train_size+valid_size:], y[train_size+valid_size:]

Lasso

In [88]:
# Train Lasso Regression Model
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)

# Predict on Validation Set
y_pred_lasso = lasso.predict(X_valid)

# Evaluate Model
print("Lasso Regression Performance:")
print("MAE:", mean_absolute_error(y_valid, y_pred_lasso))
print("MSE:", mean_squared_error(y_valid, y_pred_lasso))
print("RMSE:", np.sqrt(mean_squared_error(y_valid, y_pred_lasso)))
print("R² Score:", r2_score(y_valid, y_pred_lasso))

Lasso Regression Performance:
MAE: 0.0009353404556467812
MSE: 1.4261482865174025e-06
RMSE: 0.0011942145060739309
R² Score: 0.9999999997705534


In [89]:
# Evaluation Metrics
mae = mean_absolute_error(y_test, y_pred_lasso)
mse = mean_squared_error(y_test, y_pred_lasso)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred_lasso)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R² Score: {r2}")

MAE: 87.66376523567098
MSE: 12974.371981947812
RMSE: 113.90510077229997
R² Score: -0.9248612644341818


Random Forest

In [90]:
# Train Random Forest Model
rf = RandomForestRegressor(n_estimators=50, max_depth=10, min_samples_split=5, min_samples_leaf=3, random_state=42)
rf.fit(X_train, y_train)

# Predict on Validation Set
y_pred_rf = rf.predict(X_valid)

# Evaluate Model
print("Random Forest Regressor Performance:")
print("MAE:", mean_absolute_error(y_valid, y_pred_rf))
print("MSE:", mean_squared_error(y_valid, y_pred_rf))
print("RMSE:", np.sqrt(mean_squared_error(y_valid, y_pred_rf)))
print("R² Score:", r2_score(y_valid, y_pred_rf))

Random Forest Regressor Performance:
MAE: 0.0
MSE: 0.0
RMSE: 0.0
R² Score: 1.0


In [91]:
# Evaluation Metrics
mae = mean_absolute_error(y_test, y_pred_rf)
mse = mean_squared_error(y_test, y_pred_rf)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred_rf)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R² Score: {r2}")

MAE: 87.6644
MSE: 12974.5605
RMSE: 113.90592829172677
R² Score: -0.9248892327317462


Comparison

In [68]:
lasso_rmse = np.sqrt(mean_squared_error(y_valid, y_pred_lasso))
rf_rmse = np.sqrt(mean_squared_error(y_valid, y_pred_rf))

lasso_r2 = r2_score(y_valid, y_pred_lasso)
rf_r2 = r2_score(y_valid, y_pred_rf)

print(f"Lasso RMSE: {lasso_rmse}, R²: {lasso_r2}")
print(f"Random Forest RMSE: {rf_rmse}, R²: {rf_r2}")

# Comparison based on RMSE and R²
if rf_rmse < lasso_rmse:
    print("Random Forest performs better based on RMSE.")
else:
    print("Lasso Regression performs better based on RMSE.")

if rf_r2 > lasso_r2:
    print("Random Forest performs better based on R².")
else:
    print("Lasso Regression performs better based on R².")

Lasso RMSE: 74.9442439710935, R²: 0.09636425123645376
Random Forest RMSE: 71.01635921712905, R²: 0.1886025372772162
Random Forest performs better based on RMSE.
Random Forest performs better based on R².


In [92]:
param_dist = {
    'n_estimators': np.arange(50, 300, 50),
    'max_depth': [5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5],
    'max_features': ['sqrt', 'log2']
}

# Randomized Search
random_search = RandomizedSearchCV(rf, param_distributions=param_dist,
                                   n_iter=20, cv=5, scoring='r2', n_jobs=-1, verbose=2, random_state=42)
random_search.fit(X_train, y_train)

# Best Parameters
print("Best Parameters:", random_search.best_params_)

# Train with Best Parameters
best_rf = random_search.best_estimator_
y_pred_rf = best_rf.predict(X_valid)

# Evaluate Model
print("Random Forest Regressor Performance after Randomized Tuning:")
print("MAE:", mean_absolute_error(y_valid, y_pred_rf))
print("MSE:", mean_squared_error(y_valid, y_pred_rf))
print("RMSE:", np.sqrt(mean_squared_error(y_valid, y_pred_rf)))
print("R² Score:", r2_score(y_valid, y_pred_rf))

Fitting 5 folds for each of 20 candidates, totalling 100 fits




Best Parameters: {'n_estimators': 150, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None}
Random Forest Regressor Performance after Randomized Tuning:
MAE: 5.571384
MSE: 80.01251334765432
RMSE: 8.94497140004675
R² Score: 0.9871271487553784


In [93]:
 # Predict on Test Set
y_pred_test = best_rf.predict(X_test)

# Evaluate Performance on Test Set
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

print("Test Set Performance:")
print("MAE:", mean_absolute_error(y_test, y_pred_test))
print("MSE:", mean_squared_error(y_test, y_pred_test))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_test)))
print("R² Score:", r2_score(y_test, y_pred_test))

Test Set Performance:
MAE: 5.072185805555555
MSE: 72.81412599526234
RMSE: 8.533119359018855
R² Score: 0.989197389220309


In [94]:
# Apply Regularization
best_rf.set_params(
    max_depth=20,  # Reduce max depth to avoid overfitting
    min_samples_split=5,  # Require more samples to split nodes
    min_samples_leaf=3,  # Minimum samples for leaf nodes
    n_estimators=100  # Reduce the number of estimators to prevent overfitting
)

# Retrain the model with regularization
best_rf.fit(X_train, y_train)

# Evaluate on Test Set again
y_pred_test = best_rf.predict(X_test)

print("After Regularization - Test Set Performance:")
print("MAE:", mean_absolute_error(y_test, y_pred_test))
print("MSE:", mean_squared_error(y_test, y_pred_test))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_test)))
print("R² Score:", r2_score(y_test, y_pred_test))

After Regularization - Test Set Performance:
MAE: 5.97366558927588
MSE: 98.32638270002067
RMSE: 9.915966049761398
R² Score: 0.9854124233839958


In [95]:
lasso_rmse = np.sqrt(mean_squared_error(y_valid, y_pred_lasso))
rf_rmse = np.sqrt(mean_squared_error(y_valid, y_pred_rf))

lasso_r2 = r2_score(y_valid, y_pred_lasso)
rf_r2 = r2_score(y_valid, y_pred_rf)

print(f"Lasso RMSE: {lasso_rmse}, R²: {lasso_r2}")
print(f"Random Forest RMSE: {rf_rmse}, R²: {rf_r2}")

# Comparison based on RMSE and R²
if rf_rmse < lasso_rmse:
    print("Random Forest performs better based on RMSE.")
else:
    print("Lasso Regression performs better based on RMSE.")

if rf_r2 > lasso_r2:
    print("Random Forest performs better based on R².")
else:
    print("Lasso Regression performs better based on R².")

Lasso RMSE: 0.0011942145060739309, R²: 0.9999999997705534
Random Forest RMSE: 8.94497140004675, R²: 0.9871271487553784
Lasso Regression performs better based on RMSE.
Lasso Regression performs better based on R².


In [97]:
best_model = rf if rf_rmse < lasso_rmse else lasso
joblib.dump(best_model, "best_model.pkl")

print("Best model saved successfully.")

Best model saved successfully.


Due to overfitting Initially Random Forest performs better than lasso. After hypertuning and regularisation
Lasso model ouperforms Random Forest