In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from datetime import datetime

In [2]:
# Load the dataset
df = pd.read_csv("cleaned_data.csv")

# Display basic info
print(df.info())

print(df.isnull().sum())

# Display first few rows
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4511284 entries, 0 to 4511283
Data columns (total 21 columns):
 #   Column             Dtype  
---  ------             -----  
 0   order_id           float64
 1   region_id          float64
 2   city               int64  
 3   courier_id         float64
 4   lng                float64
 5   lat                float64
 6   aoi_id             float64
 7   aoi_type           int64  
 8   accept_time        object 
 9   accept_gps_time    object 
 10  accept_gps_lng     float64
 11  accept_gps_lat     float64
 12  delivery_time      object 
 13  delivery_gps_time  object 
 14  delivery_gps_lng   float64
 15  delivery_gps_lat   float64
 16  ds                 float64
 17  ETA                float64
 18  hour_of_day        int64  
 19  day_of_week        int64  
 20  is_weekend         int64  
dtypes: float64(12), int64(5), object(4)
memory usage: 722.8+ MB
None
order_id             0
region_id            0
city                 0
courier_id  

Unnamed: 0,order_id,region_id,city,courier_id,lng,lat,aoi_id,aoi_type,accept_time,accept_gps_time,...,accept_gps_lat,delivery_time,delivery_gps_time,delivery_gps_lng,delivery_gps_lat,ds,ETA,hour_of_day,day_of_week,is_weekend
0,0.450041,0.05988,0,0.014971,0.268083,0.368965,0.000831,14,1900-10-22 10:26:00,1900-10-22 10:26:00,...,0.70442,1900-10-22 17:04:00,1900-10-22 17:04:00,0.777523,0.676699,0.983019,398.0,10,0,0
1,0.949146,0.05988,0,0.739336,0.26811,0.368984,0.000831,14,1900-09-07 10:13:00,1900-09-07 10:13:00,...,0.704431,1900-09-09 15:44:00,1900-09-09 15:44:00,0.777901,0.675292,0.766038,3211.0,10,4,0
2,0.898584,0.05988,0,0.739336,0.268113,0.36898,0.000831,14,1900-06-26 09:49:00,1900-06-26 09:49:00,...,0.704431,1900-06-27 16:03:00,1900-06-27 16:03:00,0.777901,0.675289,0.235849,1814.0,9,1,0
3,0.795072,0.05988,0,0.739336,0.268115,0.369021,0.000831,14,1900-09-11 11:01:00,1900-09-11 11:01:00,...,0.704422,1900-09-13 17:14:00,1900-09-13 17:14:00,0.777902,0.675309,0.773585,3253.0,11,1,0
4,0.609643,0.05988,0,0.739336,0.268098,0.369051,0.000831,14,1900-10-01 09:52:00,1900-10-01 09:52:00,...,0.704423,1900-10-01 18:30:00,1900-10-01 18:30:00,0.777885,0.675321,0.943396,518.0,9,0,0


In [3]:
from geopy.distance import geodesic

# Function to calculate the geodesic distance
def calculate_distance(row):
    pickup_coords = (row['accept_gps_lat'], row['accept_gps_lng'])
    delivery_coords = (row['delivery_gps_lat'], row['delivery_gps_lng'])
    return geodesic(pickup_coords, delivery_coords).kilometers

# Apply function to calculate distance
df['distance_km'] = df.apply(calculate_distance, axis=1)

# Show the first few rows to verify the feature
df[['ETA', 'distance_km']].head()

Unnamed: 0,ETA,distance_km
0,398.0,9.52677
1,3211.0,9.538442
2,1814.0,9.5386
3,3253.0,9.537718
4,518.0,9.539031


In [4]:
# Calculate historical average delivery time per courier
df['historical_avg_time'] = df.groupby('courier_id')['ETA'].transform('mean')

# Show the first few rows
df[['courier_id', 'ETA', 'historical_avg_time']].head()

Unnamed: 0,courier_id,ETA,historical_avg_time
0,0.014971,398.0,219.153846
1,0.739336,3211.0,2585.524272
2,0.739336,1814.0,2585.524272
3,0.739336,3253.0,2585.524272
4,0.739336,518.0,2585.524272


In [5]:
# Convert 'accept_time' to datetime if not already done
df['accept_time'] = pd.to_datetime(df['accept_time'])

# Calculate the number of deliveries per hour for each courier
df['hour_of_day'] = df['accept_time'].dt.hour
df['deliveries_per_hour'] = df.groupby(['courier_id', 'hour_of_day'])['order_id'].transform('count')

# Show the first few rows
df[['courier_id', 'hour_of_day', 'deliveries_per_hour']].head()



Unnamed: 0,courier_id,hour_of_day,deliveries_per_hour
0,0.014971,10,101
1,0.739336,10,62
2,0.739336,9,36
3,0.739336,11,4
4,0.739336,9,36


In [6]:
# Convert accept_time to datetime if not already done
df['accept_time'] = pd.to_datetime(df['accept_time'])

# Sort data by accept_time
df = df.sort_values('accept_time')

# Split the data
train_size = int(0.6 * len(df))
val_size = int(0.2 * len(df))

# Train, validation, test split
train_data = df[:train_size]
val_data = df[train_size:train_size+val_size]
test_data = df[train_size+val_size:]

# Show the split sizes
print(f"Train Data: {len(train_data)} rows")
print(f"Validation Data: {len(val_data)} rows")
print(f"Test Data: {len(test_data)} rows")

Train Data: 2706770 rows
Validation Data: 902256 rows
Test Data: 902258 rows


In [7]:
# Select features and target variable
features = ['hour_of_day', 'day_of_week', 'is_weekend', 'distance_km', 'historical_avg_time', 'deliveries_per_hour']
target = 'ETA'

X_train = train_data[features]
y_train = train_data[target]
X_val = val_data[features]
y_val = val_data[target]


In [8]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# Initialize Lasso regression model
lasso = Lasso()

# Define parameter grid for alpha (regularization strength)
param_grid = {'alpha': [0.01, 0.1, 1, 10, 100]}

# Perform GridSearchCV for optimal alpha value
grid_search = GridSearchCV(lasso, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Best model
best_lasso = grid_search.best_estimator_

# Make predictions
y_pred = best_lasso.predict(X_val)

# Evaluate model
mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
rmse = mean_squared_error(y_val, y_pred, squared=False)
r2 = r2_score(y_val, y_pred)

# Display results
print(f"Lasso Regression - MAE: {mae}, MSE: {mse}, RMSE: {rmse}, R^2: {r2}")


Lasso Regression - MAE: 120.32821221046048, MSE: 238476.3224061998, RMSE: 488.3403755642163, R^2: 0.13788863614673752
