In [5]:
import pandas as pd
import numpy as np

# --- 0. CONFIGURATION ---
# Paths are relative to the notebook's location.
RAW_DATA_PATH = '../01_Raw_Data/logistics_transactions_raw.csv'
CLEAN_DATA_PATH = '../02_Processed_Data/logistics_transactions_clean.csv'

# The raw `order_fulfillment_status` column holds fractional values (e.g. 0.76, 0.97),
# not a 0/1 flag, so an order is treated as fulfilled when its status is at or above
# this cut-off.
FULFILLMENT_THRESHOLD = 0.80

# Divisor that scales a positive delivery_time_deviation into the OCI multiplier.
# NOTE(review): assumes the deviation is recorded in hours (24 -> one day) — confirm
# against the data dictionary.
DEVIATION_SCALE = 24

# --- 1. LOAD DATA ---
print("Loading data...")
df = pd.read_csv(RAW_DATA_PATH)
print(f"Original Row Count: {df.shape[0]}")

# --- 2. DATA CLEANING & STANDARDIZATION ---

# A. Convert timestamp to datetime objects
df['timestamp'] = pd.to_datetime(df['timestamp'])

# B. Fill nulls in the key numeric columns with that column's mean.
#    The means are taken over ALL raw rows, i.e. before any filtering below.
key_impute_cols = ['shipping_costs', 'delivery_time_deviation', 'loading_unloading_time']
df[key_impute_cols] = df[key_impute_cols].fillna(df[key_impute_cols].mean())

# C. ROW FILTERING
# 1. Coerce the status column to numeric; unparseable entries become NaN and
#    therefore fail the threshold comparison (they are dropped).
df['order_fulfillment_status'] = pd.to_numeric(df['order_fulfillment_status'], errors='coerce')

# 2. Keep fulfilled orders (status >= FULFILLMENT_THRESHOLD) that also have a
#    strictly positive shipping cost. Both conditions are applied in one mask and
#    followed by a single .copy(), so df_clean is an independent frame and the
#    column assignments below cannot raise SettingWithCopyWarning.
is_fulfilled = df['order_fulfillment_status'] >= FULFILLMENT_THRESHOLD
has_positive_cost = df['shipping_costs'] > 0
df_clean = df[is_fulfilled & has_positive_cost].copy()


# --- 3. CREATING THE CORE ANALYTICAL METRIC (Operational Cost Impact) ---

# Rationale: link time-waste (deviation) back to cost to find true inefficiency.
# OCI = Shipping Cost * (1 + Deviation Factor)

# A. Deviation factor: only late deliveries (deviation > 0) are penalised;
#    on-time and early deliveries get a factor of 0.
deviation = df_clean['delivery_time_deviation']
df_clean['deviation_factor'] = np.where(deviation > 0, deviation / DEVIATION_SCALE, 0)

# B. Calculate Operational Cost Impact (OCI)
df_clean['Operational_Cost_Impact'] = df_clean['shipping_costs'] * (1 + df_clean['deviation_factor'])


# --- 4. RESULTS & SAVE ---
print("-" * 50)
print(f"Cleaned Row Count (Successful Orders > 0 Cost): {df_clean.shape[0]}")
print(f"Average Operational Cost Impact (OCI): ${df_clean['Operational_Cost_Impact'].mean():.2f}")
print("-" * 50)

# Save the final cleaned dataset (the target directory must already exist)
df_clean.to_csv(CLEAN_DATA_PATH, index=False)
print("\nFile saved successfully to 02_Processed_Data!")

Loading data...
Original Row Count: 32065
--------------------------------------------------
Cleaned Row Count (Successful Orders > 0 Cost): 13061
Average Operational Cost Impact (OCI): $562.64
--------------------------------------------------

File saved successfully to 02_Processed_Data!


In [4]:
# Read the raw CSV afresh so the checks below run on the untouched file contents.
df_raw = pd.read_csv('../01_Raw_Data/logistics_transactions_raw.csv')

# Share of rows per distinct status value (normalize=True gives proportions, not
# counts); only the first five entries are printed. This shows how the status
# is encoded in the raw file.
status_proportions = df_raw['order_fulfillment_status'].value_counts(normalize=True)
print("--- 1. ORDER FULFILLMENT STATUS DISTRIBUTION ---")
print(status_proportions.head())

# Summary statistics for shipping costs; the `min` row shows whether any cost
# is zero or negative.
cost_summary = df_raw['shipping_costs'].describe()
print("\n--- 2. SHIPPING COSTS DISTRIBUTION ---")
print(cost_summary)

--- 1. ORDER FULFILLMENT STATUS DISTRIBUTION ---
order_fulfillment_status
0.761166    0.000031
0.968104    0.000031
0.930171    0.000031
0.010665    0.000031
0.981872    0.000031
Name: proportion, dtype: float64

--- 2. SHIPPING COSTS DISTRIBUTION ---
count    32065.000000
mean       459.374452
std        312.183487
min        100.000000
25%        154.017124
50%        388.996911
75%        753.007203
max        999.999853
Name: shipping_costs, dtype: float64
