In [3]:
import os
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from fontTools.misc.classifyTools import Classifier
from numba.np.random.distributions import random_standard_exponential
from pandas.core.common import random_state
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor 
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error 
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


import warnings
warnings.filterwarnings('ignore')
import pickle 

In [4]:
import kagglehub 
path=kagglehub.dataset_download("olistbr/brazilian-ecommerce")
print('path to dataset file:', path)

path to dataset file: /Users/modupeolafagbenro/.cache/kagglehub/datasets/olistbr/brazilian-ecommerce/versions/2


In [5]:
files= os.listdir(path)
print("files in the directory path and the dataset inside:")
for file in files:
    print(file)

files in the directory path and the dataset inside:
olist_sellers_dataset.csv
product_category_name_translation.csv
olist_orders_dataset.csv
olist_order_items_dataset.csv
olist_customers_dataset.csv
olist_geolocation_dataset.csv
olist_order_payments_dataset.csv
olist_order_reviews_dataset.csv
olist_products_dataset.csv


In [6]:
def load_clean_dataset():
    " Loading clean preprocessed dataset"""
    try:
        with open("../data/processed/main_df_cleaned.pkl", "rb") as f:
            main_df = pickle.load(f)
            print(f"Loading clean data from data preprocessing: {main_df.shape}")
            return main_df  
    except FileNotFoundError:
        print("File is not found: Run data preprocessing file")
        return None
        

In [7]:
main_df = load_clean_dataset()

if main_df is not None:
    print("Combined dataset")
    print(main_df.head())
    

Loading clean data from data preprocessing: (112650, 33)
Combined dataset
                           order_id                       customer_id  \
0  e481f51cbdc54678b7cc49136f2d6af7  9ef432eb6251297304e76186b10a928d   
1  53cdb2fc8bc7dce0b6741e2150273451  b0830fb4747a6c6d20dea0b8c802d7ef   
2  47770eb9100c2d0c44946d9cf07ec65d  41ce2a54c0b03bf3443c3d931a367089   
3  949d5b44dbf5de918fe9c16f97b45f8a  f88197465ea7920adcdbec7375364d82   
4  ad21c59c0840e6cb83a9ceb5573f8159  8ab97904e6daea8866dbdbc4fb7aad2c   

  order_status order_purchase_timestamp   order_approved_at  \
0    delivered      2017-10-02 10:56:33 2017-10-02 11:07:15   
1    delivered      2018-07-24 20:41:37 2018-07-26 03:24:27   
2    delivered      2018-08-08 08:38:49 2018-08-08 08:55:23   
3    delivered      2017-11-18 19:28:06 2017-11-18 19:45:59   
4    delivered      2018-02-13 21:18:39 2018-02-13 22:20:29   

  order_delivered_carrier_date order_delivered_customer_date  \
0          2017-10-04 19:55:00           201

In [8]:
#merge the payment dataset with the clean dataset for modeling purposes
payment_df=pd.read_csv(os.path.join(path, "olist_order_payments_dataset.csv"))

#get primary payment method or order(most common payment method for each order)
primary_payments =payment_df.groupby('order_id').agg({
    'payment_type': lambda x: x.mode()[0], 
    'payment_value': 'sum' 
}).reset_index()

#merge with your current main_df
main_df_with_payments = main_df.merge(primary_payments, on='order_id', how='left')

print(f"Shape before:{main_df.shape}")
print(f"shape after :{main_df.shape}")

Shape before:(112650, 33)
shape after :(112650, 33)


Modeling - Regression, Classification, Ensemble method

Model 2- Delivery Time Prediction

In [9]:
#filtered down to delivered orders only

delivered_orders=main_df_with_payments.dropna(subset=['delivery_days'])
print(f"Delivered orders: {len(delivered_orders)} out of {len(main_df_with_payments)}")

#feature selections -for main features and target features 

delivery_features=[
    'customer_state', 'customer_zip_code_prefix',
    #Product features
    'product_weight_g', 'product_length_cm', 'product_height_cm', 'product_category_name',
    #order timing
    'order_month', 'order_day_of_week', 'is_weekend',
    #order characteristics
    'freight_value', 'price'
    
]

target_delivery= 'delivery_days'
    

Delivered orders: 110196 out of 112650


In [10]:
#modeling data preparation
X_delivery = delivered_orders[delivery_features].copy()
y_delivery= delivered_orders[target_delivery].copy()

#verifying 

print(f"X_delivery shape: {X_delivery.shape}")
print(f"y_delievry shape: {y_delivery.shape}")

X_delivery shape: (110196, 11)
y_delievry shape: (110196,)


Handling Categorical Variable -Encoding

In [11]:
delivery_encoders ={}

categorical_col = ['customer_state', 'product_category_name']

for col in categorical_col:
    le= LabelEncoder()
    X_delivery[col]=le.fit_transform(X_delivery[col])
    delivery_encoders[col]=le
    

Splitting the dataset for modeling

In [12]:
#train_test Split
X_train_del, X_test_del,  y_train_del, y_test_del = train_test_split(X_delivery, y_delivery, test_size=0.2, random_state=42)


print(f"Training set: {X_train_del.shape}")
print(f"Test Set: {X_test_del.shape}")

Training set: (88156, 11)
Test Set: (22040, 11)


Standardization Since the model involves Linear Regression -New scaler

In [13]:
delivery_scaler= StandardScaler()
X_train_del_scaled= delivery_scaler.fit_transform(X_train_del)
X_test_del_scaled= delivery_scaler.transform(X_test_del)

In [14]:
delivery_models={
    'Linear Regression': LinearRegression(),
    'Random Forest Regressor': RandomForestRegressor(n_estimators=100,random_state=42),
    'XGBoost Regressor' : XGBRegressor(n_estimators=100, random_state=42)
}

#train and evaluate 
delivery_result={}
for name, model in delivery_models.items():
    print(f"n\====={name} ====")
    if name == 'Linear Regression':
        model.fit(X_train_del_scaled, y_train_del)
        y_pred = model.predict(X_test_del_scaled)

    else:

        model.fit(X_train_del, y_train_del)
        y_pred = model.predict(X_test_del)

    #metrics
    mse = mean_squared_error(y_test_del, y_pred)
    rmse = mse ** 0.5
    mae = mean_absolute_error(y_test_del, y_pred)
    r2= r2_score(y_test_del, y_pred)

    delivery_result[name] = {'RMSE': rmse, 'MAE': mae, 'R2': r2}
    
    print(f'RMSE: {rmse:.2f}')
    print(f'MAE: {mae:.2f}')
    print(f'R2: {r2:.2f}')
    
    #compareb results
    delivery_df = pd.DataFrame(delivery_result).T
    print("\n==== Delivery Time Model Comparison ===\n")
    print(delivery_df)

n\=====Linear Regression ====
RMSE: 8.61
MAE: 5.67
R2: 0.14

==== Delivery Time Model Comparison ===

                       RMSE       MAE        R2
Linear Regression  8.612944  5.671724  0.138417
n\=====Random Forest Regressor ====
RMSE: 7.42
MAE: 4.47
R2: 0.36

==== Delivery Time Model Comparison ===

                             RMSE       MAE        R2
Linear Regression        8.612944  5.671724  0.138417
Random Forest Regressor  7.424661  4.472330  0.359753
n\=====XGBoost Regressor ====
RMSE: 7.48
MAE: 4.70
R2: 0.35

==== Delivery Time Model Comparison ===

                             RMSE       MAE        R2
Linear Regression        8.612944  5.671724  0.138417
Random Forest Regressor  7.424661  4.472330  0.359753
XGBoost Regressor        7.482660  4.696887  0.349711


Business Interpretation 

In [15]:
print("Delivery Time Prediction:")
print("Predict Delivery within +-4.5 days in average")
print("This explains 36% of delivery time variation")
print("Typical Delivery prediction accuracy is 7-10days for 14days delivery ")

Delivery Time Prediction:
Predict Delivery within +-4.5 days in average
This explains 36% of delivery time variation
Typical Delivery prediction accuracy is 7-10days for 14days delivery 


In real world:
Customer asked : when will my order arrive?
Actually delivery will be in 12 days

Our Random forest predicts 12days +-4.5days , which could be 7.5-16.5 days range

BUSINESS IMPACT: can give customer realistic expectation in regards to order good delivery time 


Linear Regression performance was worst with R2 score of 0.14 because Delivery time is highly non-linear with the features 

while Randon Forest Regressor capture non-linearity and complexity in the model such as Geographical location, Product complexity e.g heavy/bukly item processing days 
also capture seasonal effect which could be holidays 
shipping tiers , express vs standard shipping 
Warehouse logistics , stock availability that may affect processing of goods



In [57]:
#Features importance checking the features that drives the delivery time 
delivery_importance =pd.DataFrame({
    'features': delivery_features,
    'importance': delivery_models['Random Forest Regressor'].feature_importances_,
}).sort_values(by='importance', ascending=False)

print("Top 5 delivery Time Drivers:")
print(delivery_importance.head())

Top 5 delivery Time Drivers:
                    features  importance
1   customer_zip_code_prefix    0.276462
9              freight_value    0.148156
10                     price    0.104447
6                order_month    0.086501
2           product_weight_g    0.086159


Top 5 Delivery Time Drivers - Business Insights:

customer_zip_code_prefix (26.6%): 🏠 Location is KING

Remote areas = longer delivery
Urban vs rural logistics differences
Distance from distribution centers


freight_value (14.2%): 🚚 Shipping complexity

Express shipping = faster delivery
Heavy/bulky items = processing delays
Premium shipping tiers


price (9.6%): 💰 Product value affects handling

Expensive items = extra security checks
Premium products = careful packaging
Insurance processing time


order_month (8.5%): 📅 Seasonal logistics

Holiday seasons = warehouse congestion
Weather impacts (rainy season)
Peak shopping periods


product_weight_g (7.9%): ⚖️ Physical logistics

Heavy items = slower processing
Special handling requirements
Carrier capacity constraints