In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
df = pd.read_csv("logistics_shipments_dataset.csv")

# Step 1A ‚Äî Quick Sanity Check

In [3]:
df.shape

(2000, 11)

In [4]:
df.head()

Unnamed: 0,Shipment_ID,Origin_Warehouse,Destination,Carrier,Shipment_Date,Delivery_Date,Weight_kg,Cost,Status,Distance_miles,Transit_Days
0,SH10000,Warehouse_MIA,San Francisco,UPS,2023-10-02,2023-10-04,25.7,67.46,Delivered,291,2
1,SH10001,Warehouse_MIA,Atlanta,DHL,2023-12-06,2023-12-09,38.9,268.85,Delivered,1225,3
2,SH10002,Warehouse_LA,Houston,DHL,2023-09-18,2023-09-20,37.2,74.35,Delivered,220,2
3,SH10003,Warehouse_BOS,Seattle,OnTrac,2023-01-26,2023-02-04,42.6,187.04,Delivered,1156,9
4,SH10004,Warehouse_SF,Dallas,OnTrac,2023-06-03,2023-06-06,7.9,120.01,Delivered,1017,3


In [5]:
df["Shipment_ID"].nunique(), df.shape[0]

(2000, 2000)

In [6]:
df["Carrier"].value_counts()

Carrier
LaserShip           303
OnTrac              299
FedEx               295
USPS                292
DHL                 281
Amazon Logistics    274
UPS                 256
Name: count, dtype: int64

In [7]:
df["Status"].value_counts()

Status
Delivered     1648
Delayed        199
In Transit      76
Lost            45
Returned        32
Name: count, dtype: int64

In [8]:
df[["Weight_kg", "Cost", "Distance_miles", "Transit_Days"]].describe()

Unnamed: 0,Weight_kg,Cost,Distance_miles,Transit_Days
count,2000.0,1959.0,2000.0,2000.0
mean,30.1848,205.161598,1275.868,4.1825
std,124.967053,222.586082,691.382829,1.837902
min,0.0,17.89,101.0,1.0
25%,12.3,117.71,690.25,3.0
50%,20.7,196.42,1262.5,4.0
75%,33.925,272.115,1867.25,5.0
max,5404.2,6562.21,2499.0,12.0


# Step 1B ‚Äî Fix Data Types

In [9]:
# Convert dates
df["Shipment_Date"] = pd.to_datetime(df["Shipment_Date"], errors="coerce")
df["Delivery_Date"] = pd.to_datetime(df["Delivery_Date"], errors="coerce")

In [10]:
# Convert numeric columns
num_cols = ["Weight_kg", "Cost", "Distance_miles", "Transit_Days"]
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors="coerce")

In [11]:
# Convert categorical columns
cat_cols = ["Carrier", "Origin_Warehouse", "Destination", "Status"]
df[cat_cols] = df[cat_cols].astype("category")

In [12]:
# Delivery date earlier than shipment date
df[df["Delivery_Date"] < df["Shipment_Date"]].head(3)

Unnamed: 0,Shipment_ID,Origin_Warehouse,Destination,Carrier,Shipment_Date,Delivery_Date,Weight_kg,Cost,Status,Distance_miles,Transit_Days
20,SH10020,Warehouse_MIA,Miami,FedEx,2023-08-10,2023-07-11,7.8,216.42,Delivered,1696,5
114,SH10114,Warehouse_SF,Los Angeles,FedEx,2023-04-03,2023-03-04,23.2,270.43,Delivered,1800,4
250,SH10250,Warehouse_NYC,Portland,Amazon Logistics,2023-02-26,2023-01-27,5.5,259.45,Delivered,2132,7


# Step 1C ‚Äî Missing Values

In [13]:
# Missing values report
df.isna().sum()

Shipment_ID          0
Origin_Warehouse     0
Destination          0
Carrier              0
Shipment_Date        0
Delivery_Date       32
Weight_kg            0
Cost                41
Status               0
Distance_miles       0
Transit_Days         0
dtype: int64

In [14]:
# Drop rows missing core variables
core_cols = ["Cost", "Weight_kg", "Distance_miles", "Carrier"]
df = df.dropna(subset=core_cols)

In [15]:
# Delivered shipments missing delivery date (flag)
df[(df["Status"] == "Delivered") & (df["Delivery_Date"].isna())].head(5)

Unnamed: 0,Shipment_ID,Origin_Warehouse,Destination,Carrier,Shipment_Date,Delivery_Date,Weight_kg,Cost,Status,Distance_miles,Transit_Days
30,SH10030,Warehouse_SF,San Francisco,LaserShip,2023-11-04,NaT,28.5,109.33,Delivered,502,2
39,SH10039,Warehouse_MIA,New York,LaserShip,2023-12-08,NaT,65.2,284.81,Delivered,1204,3
97,SH10097,Warehouse_MIA,Atlanta,Amazon Logistics,2023-09-19,NaT,80.4,259.98,Delivered,1554,5
109,SH10109,Warehouse_DEN,Phoenix,LaserShip,2023-12-13,NaT,16.5,73.12,Delivered,371,2
154,SH10154,Warehouse_MIA,Minneapolis,DHL,2023-05-15,NaT,55.3,362.11,Delivered,2192,6


# Step 1D ‚Äî Duplicates & Consistency

In [16]:
# Duplicate Shipment_ID
df.duplicated("Shipment_ID").sum()

np.int64(0)

In [17]:
df = df.drop_duplicates()

In [18]:
df = df[
    (df["Weight_kg"] > 0) &
    (df["Cost"] > 0) &
    (df["Distance_miles"] > 0) &
    (df["Transit_Days"] >= 0)
]

In [19]:
# Delivered shipments must have delivery date
df = df[~((df["Status"] == "Delivered") & (df["Delivery_Date"].isna()))]

# Step 1E ‚Äî Outlier Handling (IQR Winsorization)

In [20]:
def winsorize_iqr(series):
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    return series.clip(lower, upper)


In [21]:
outlier_cols = ["Cost", "Weight_kg", "Distance_miles", "Transit_Days"]

for col in outlier_cols:
    df[col] = winsorize_iqr(df[col])


# Step 1F ‚Äî Feature Engineering

In [22]:
# Cost per mile
df["Cost_per_mile"] = df["Cost"] / df["Distance_miles"]
# Cost per kg
df["Cost_per_kg"] = df["Cost"] / df["Weight_kg"]
# Shipping month
df["Ship_Month"] = df["Shipment_Date"].dt.month
# Optional route feature
df["Route"] = df["Origin_Warehouse"].astype(str) + " ‚Üí " + df["Destination"].astype(str)

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1932 entries, 0 to 1998
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Shipment_ID       1932 non-null   object        
 1   Origin_Warehouse  1932 non-null   category      
 2   Destination       1932 non-null   category      
 3   Carrier           1932 non-null   category      
 4   Shipment_Date     1932 non-null   datetime64[ns]
 5   Delivery_Date     1927 non-null   datetime64[ns]
 6   Weight_kg         1932 non-null   float64       
 7   Cost              1932 non-null   float64       
 8   Status            1932 non-null   category      
 9   Distance_miles    1932 non-null   int64         
 10  Transit_Days      1932 non-null   int64         
 11  Cost_per_mile     1932 non-null   float64       
 12  Cost_per_kg       1932 non-null   float64       
 13  Ship_Month        1932 non-null   int32         
 14  Route             1932 non-nu

In [24]:
df.head()

Unnamed: 0,Shipment_ID,Origin_Warehouse,Destination,Carrier,Shipment_Date,Delivery_Date,Weight_kg,Cost,Status,Distance_miles,Transit_Days,Cost_per_mile,Cost_per_kg,Ship_Month,Route
0,SH10000,Warehouse_MIA,San Francisco,UPS,2023-10-02,2023-10-04,25.7,67.46,Delivered,291,2,0.231821,2.624903,10,Warehouse_MIA ‚Üí San Francisco
1,SH10001,Warehouse_MIA,Atlanta,DHL,2023-12-06,2023-12-09,38.9,268.85,Delivered,1225,3,0.219469,6.911311,12,Warehouse_MIA ‚Üí Atlanta
2,SH10002,Warehouse_LA,Houston,DHL,2023-09-18,2023-09-20,37.2,74.35,Delivered,220,2,0.337955,1.998656,9,Warehouse_LA ‚Üí Houston
3,SH10003,Warehouse_BOS,Seattle,OnTrac,2023-01-26,2023-02-04,42.6,187.04,Delivered,1156,8,0.161799,4.39061,1,Warehouse_BOS ‚Üí Seattle
4,SH10004,Warehouse_SF,Dallas,OnTrac,2023-06-03,2023-06-06,7.9,120.01,Delivered,1017,3,0.118004,15.191139,6,Warehouse_SF ‚Üí Dallas


In [37]:
df.to_csv("logistics_shipments_feature_engineered.csv", index=False)

# Step 1G ‚Äî EDA Questions

### 1. Which carrier has the lowest average cost?

In [25]:
lowest_carrier_cost = df.groupby('Carrier')['Cost'].mean().sort_values(ascending=True).head(1)
print(lowest_carrier_cost.to_string())

Carrier
USPS    181.916398


### 2. Which carrier has the lowest Cost_per_mile?

In [26]:
lowest_mile_cost = df.groupby('Carrier')['Cost_per_mile'].min().sort_values(ascending=True)
print(lowest_mile_cost.head(1).to_string())

Carrier
LaserShip    0.012293


### 3. Which carrier has the best transit time on average?

In [27]:
best_transit_time = df.groupby('Carrier')['Transit_Days'].mean().sort_values(ascending=True)
print(best_transit_time.head(1).to_string())

Carrier
USPS    4.039568


### 4. How does Cost change with Weight_kg?

In [28]:
df[['Cost', 'Weight_kg']].corr()

Unnamed: 0,Cost,Weight_kg
Cost,1.0,0.256295
Weight_kg,0.256295,1.0


In [29]:
df[['Weight_kg', 'Cost']].describe()

Unnamed: 0,Weight_kg,Cost
count,1932.0,1932.0
mean,25.230228,198.787016
std,16.594774,96.635422
min,0.2,17.89
25%,12.4,117.9325
50%,20.65,196.9
75%,33.8,272.175
max,65.9,503.53875


### 5. Is there a cost vs speed tradeoff by carrier?

In [30]:
df.groupby("Carrier").agg(
    avg_cost=("Cost", "mean"),
    avg_transit_days=("Transit_Days", "mean")
)

Unnamed: 0_level_0,avg_cost,avg_transit_days
Carrier,Unnamed: 1_level_1,Unnamed: 2_level_1
Amazon Logistics,193.660865,4.270677
DHL,221.468,4.255556
FedEx,218.420068,4.283276
LaserShip,188.281576,4.075601
OnTrac,185.144793,4.07931
UPS,203.666383,4.192623
USPS,181.916398,4.039568


### 6. How does cost change with shipment weight?

In [31]:
df.groupby(pd.cut(df["Weight_kg"], bins=5))["Cost"].mean()

Weight_kg
(0.134, 13.34]    174.803827
(13.34, 26.48]    192.286119
(26.48, 39.62]    202.536844
(39.62, 52.76]    219.991235
(52.76, 65.9]     265.728770
Name: Cost, dtype: float64

### 7. How does cost change with distance?

In [32]:
df.groupby(pd.cut(df["Distance_miles"], bins=5))["Cost"].mean()

Distance_miles
(98.602, 580.6]      75.456412
(580.6, 1060.2]     145.101584
(1060.2, 1539.8]    202.677758
(1539.8, 2019.4]    265.150789
(2019.4, 2499.0]    320.747229
Name: Cost, dtype: float64

### 8. Is Cost per mile higher for short or long distances?

In [33]:
df.groupby(pd.cut(df["Distance_miles"], bins=5))["Cost_per_mile"].mean()

Distance_miles
(98.602, 580.6]     0.251492
(580.6, 1060.2]     0.173789
(1060.2, 1539.8]    0.155040
(1539.8, 2019.4]    0.148225
(2019.4, 2499.0]    0.141749
Name: Cost_per_mile, dtype: float64

### 9. Do heavier shipments always increase transit days?

In [34]:
df.groupby(pd.cut(df["Weight_kg"], bins=5))["Transit_Days"].mean()

Weight_kg
(0.134, 13.34]    4.178439
(13.34, 26.48]    4.110787
(26.48, 39.62]    4.164804
(39.62, 52.76]    4.283951
(52.76, 65.9]     4.265957
Name: Transit_Days, dtype: float64

### 10. What percentage of shipments are Delivered vs Not Delivered?

In [35]:
df["Status"].value_counts(normalize=True) * 100

Status
Delivered     82.039337
Delayed       10.041408
In Transit     3.933747
Lost           2.329193
Returned       1.656315
Name: proportion, dtype: float64

In [36]:
# Are any ‚ÄúDelivered‚Äù shipments missing delivery date? (data quality)
df[(df['Status'] == 'Delivered') & (df['Delivery_Date'].isna())].shape[0]

0

## üìä EDA Insights Summary

The exploratory analysis reveals clear and realistic logistics patterns. Cost per mile is significantly higher for shorter distances and steadily decreases as distance increases, indicating strong economies of scale in shipping‚Äîlonger routes spread fixed costs more efficiently. Heavier shipments do not consistently increase transit time; average transit days remain relatively stable across weight bins, suggesting that delivery speed is driven more by routing and carrier operations than shipment weight. From a service quality perspective, over 82% of shipments are successfully delivered, with a smaller proportion experiencing delays or exceptions, indicating overall operational reliability. Importantly, no delivered shipments are missing delivery dates, confirming strong data integrity. These findings support using distance, cost efficiency, and carrier behavior as key factors for carrier selection rather than assuming heavier shipments automatically lead to slower delivery.

### üìå Key Business Insights

- USPS has the lowest average shipping cost, making it the most cost-effective option overall for standard shipments.

- LaserShip has the lowest cost per mile, indicating strong efficiency on short-distance or regional deliveries.

- Cost per mile decreases as distance increases, showing economies of scale for long-distance shipments.

- UPS and FedEx tend to be more expensive but offer competitive transit times, making them better suited for time-sensitive deliveries.

- Shipment weight does not significantly impact transit days, suggesting delivery speed depends more on carrier operations than package weight.

- Over 80% of shipments are delivered successfully, indicating reliable overall logistics performance.

- No delivered shipments are missing delivery dates, confirming good data quality and trustworthy performance metrics.