In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from datetime import datetime


In [3]:
# Load the dataset
df = pd.read_csv("cleaned_delivery_data.csv")

# Display basic info.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so emits a stray "None" line.
df.info()

# Per-column count of missing values
print(df.isnull().sum())

# Display first few rows (last expression -> rendered as the cell output)
df.head()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4511284 entries, 0 to 4511283
Data columns (total 17 columns):
 #   Column             Dtype  
---  ------             -----  
 0   order_id           float64
 1   region_id          float64
 2   city               int64  
 3   courier_id         float64
 4   lng                float64
 5   lat                float64
 6   aoi_id             float64
 7   aoi_type           int64  
 8   accept_time        object 
 9   accept_gps_time    object 
 10  accept_gps_lng     float64
 11  accept_gps_lat     float64
 12  delivery_time      object 
 13  delivery_gps_time  object 
 14  delivery_gps_lng   float64
 15  delivery_gps_lat   float64
 16  ds                 float64
dtypes: float64(11), int64(2), object(4)
memory usage: 585.1+ MB
None
order_id             0
region_id            0
city                 0
courier_id           0
lng                  0
lat                  0
aoi_id               0
aoi_type             0
accept_time          0
ac

Unnamed: 0,order_id,region_id,city,courier_id,lng,lat,aoi_id,aoi_type,accept_time,accept_gps_time,accept_gps_lng,accept_gps_lat,delivery_time,delivery_gps_time,delivery_gps_lng,delivery_gps_lat,ds
0,0.450041,0.05988,0,0.014971,0.268083,0.368965,0.000831,14,1900-10-22 10:26:00+00:00,1900-10-22 10:26:00,0.858558,0.70442,1900-10-22 17:04:00,1900-10-22 17:04:00,0.777523,0.676699,0.983019
1,0.949146,0.05988,0,0.739336,0.26811,0.368984,0.000831,14,1900-09-07 10:13:00+00:00,1900-09-07 10:13:00,0.858556,0.704431,1900-09-09 15:44:00,1900-09-09 15:44:00,0.777901,0.675292,0.766038
2,0.898584,0.05988,0,0.739336,0.268113,0.36898,0.000831,14,1900-06-26 09:49:00+00:00,1900-06-26 09:49:00,0.858556,0.704431,1900-06-27 16:03:00,1900-06-27 16:03:00,0.777901,0.675289,0.235849
3,0.795072,0.05988,0,0.739336,0.268115,0.369021,0.000831,14,1900-09-11 11:01:00+00:00,1900-09-11 11:01:00,0.858558,0.704422,1900-09-13 17:14:00,1900-09-13 17:14:00,0.777902,0.675309,0.773585
4,0.609643,0.05988,0,0.739336,0.268098,0.369051,0.000831,14,1900-10-01 09:52:00+00:00,1900-10-01 09:52:00,0.858558,0.704423,1900-10-01 18:30:00,1900-10-01 18:30:00,0.777885,0.675321,0.943396


In [4]:
# Convert datetime columns to timezone-naive.
# Both columns are parsed to datetimes and any timezone info is dropped so the
# two timestamps can be subtracted from each other below.
df['accept_time'] = pd.to_datetime(df['accept_time']).dt.tz_localize(None)
df['delivery_time'] = pd.to_datetime(df['delivery_time']).dt.tz_localize(None)

# Compute target variable (Delivery ETA in minutes):
# elapsed time from order acceptance to delivery.
df['ETA'] = (df['delivery_time'] - df['accept_time']).dt.total_seconds() / 60  # in minutes

# Extract time-based features from the acceptance timestamp
df['hour_of_day'] = df['accept_time'].dt.hour
df['day_of_week'] = df['accept_time'].dt.weekday  # Monday=0 ... Sunday=6
# Vectorized weekend flag (Saturday=5, Sunday=6). This replaces a per-row
# Python lambda via .apply, which is needlessly slow on a frame this size;
# the result is the same 0/1 int64 column (missing values compare False -> 0).
df['is_weekend'] = (df['day_of_week'] >= 5).astype('int64')

# Check new columns
print(df[['ETA', 'hour_of_day', 'day_of_week', 'is_weekend']].head())

# Save the dataset, now including the target and the time-based features
df.to_csv("cleaned_data.csv", index=False)




      ETA  hour_of_day  day_of_week  is_weekend
0   398.0           10            0           0
1  3211.0           10            4           0
2  1814.0            9            1           0
3  3253.0           11            1           0
4   518.0            9            0           0
