In [7]:
import os
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from fontTools.misc.classifyTools import Classifier
from numba.np.random.distributions import random_standard_exponential
from pandas.core.common import random_state
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor 
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error 
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


import warnings
warnings.filterwarnings('ignore')
import pickle 

In [8]:
import kagglehub 
path=kagglehub.dataset_download("olistbr/brazilian-ecommerce")
print('path to dataset file:', path)

path to dataset file: /Users/modupeolafagbenro/.cache/kagglehub/datasets/olistbr/brazilian-ecommerce/versions/2


In [9]:
files= os.listdir(path)
print("files in the directory path and the dataset inside:")
for file in files:
    print(file)

files in the directory path and the dataset inside:
olist_sellers_dataset.csv
product_category_name_translation.csv
olist_orders_dataset.csv
olist_order_items_dataset.csv
olist_customers_dataset.csv
olist_geolocation_dataset.csv
olist_order_payments_dataset.csv
olist_order_reviews_dataset.csv
olist_products_dataset.csv


In [10]:
#load the dataset from preporocesing 
def load_clean_dataset():
    " Loading clean preprocessed dataset"""
    try:
        with open("../data/processed/main_df_cleaned.pkl", "rb") as f:
            main_df = pickle.load(f)
            print(f"Loading clean data from data preprocessing: {main_df.shape}")
            return main_df  
    except FileNotFoundError:
        print("File is not found: Run data preprocessing file")
        return None
        
    

In [11]:
main_df = load_clean_dataset()

if main_df is not None:
    print("data loading")
    print(main_df.head(5))

Loading clean data from data preprocessing: (112650, 33)
data loading
                           order_id                       customer_id  \
0  e481f51cbdc54678b7cc49136f2d6af7  9ef432eb6251297304e76186b10a928d   
1  53cdb2fc8bc7dce0b6741e2150273451  b0830fb4747a6c6d20dea0b8c802d7ef   
2  47770eb9100c2d0c44946d9cf07ec65d  41ce2a54c0b03bf3443c3d931a367089   
3  949d5b44dbf5de918fe9c16f97b45f8a  f88197465ea7920adcdbec7375364d82   
4  ad21c59c0840e6cb83a9ceb5573f8159  8ab97904e6daea8866dbdbc4fb7aad2c   

  order_status order_purchase_timestamp   order_approved_at  \
0    delivered      2017-10-02 10:56:33 2017-10-02 11:07:15   
1    delivered      2018-07-24 20:41:37 2018-07-26 03:24:27   
2    delivered      2018-08-08 08:38:49 2018-08-08 08:55:23   
3    delivered      2017-11-18 19:28:06 2017-11-18 19:45:59   
4    delivered      2018-02-13 21:18:39 2018-02-13 22:20:29   

  order_delivered_carrier_date order_delivered_customer_date  \
0          2017-10-04 19:55:00           2017-10

Modeling-Regression, Classification and Ensemble Method model

In [12]:
#merge the payment dataset with the clean dataset for modeling purposes
payment_df=pd.read_csv(os.path.join(path, "olist_order_payments_dataset.csv"))

#get primary payment method or order(most common payment method for each order)
primary_payments =payment_df.groupby('order_id').agg({
    'payment_type': lambda x: x.mode()[0], 
    'payment_value': 'sum' 
}).reset_index()

#merge with your current main_df
main_df_with_payments = main_df.merge(primary_payments, on='order_id', how='left')

print(f"Shape before:{main_df.shape}")
print(f"shape after :{main_df.shape}")

Shape before:(112650, 33)
shape after :(112650, 33)


In [13]:
main_df_with_payments.columns

Index(['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp',
       'order_approved_at', 'order_delivered_carrier_date',
       'order_delivered_customer_date', 'order_estimated_delivery_date',
       'order_item_id', 'product_id', 'seller_id', 'shipping_limit_date',
       'price', 'freight_value', 'customer_unique_id',
       'customer_zip_code_prefix', 'customer_city', 'customer_state',
       'product_category_name', 'product_name_lenght',
       'product_description_lenght', 'product_photos_qty', 'product_weight_g',
       'product_length_cm', 'product_height_cm', 'product_width_cm',
       'order_year', 'order_month', 'order_day_of_week', 'order_hour',
       'delivery_days', 'total_order_value', 'is_weekend', 'payment_type',
       'payment_value'],
      dtype='object')

In [14]:
#checking if the dataset has null values 
print("Null values in the modeling dataset")
print(main_df_with_payments.isnull().sum())

Null values in the modeling dataset
order_id                            0
customer_id                         0
order_status                        0
order_purchase_timestamp            0
order_approved_at                   0
order_delivered_carrier_date     1194
order_delivered_customer_date    2454
order_estimated_delivery_date       0
order_item_id                       0
product_id                          0
seller_id                           0
shipping_limit_date                 0
price                               0
freight_value                       0
customer_unique_id                  0
customer_zip_code_prefix            0
customer_city                       0
customer_state                      0
product_category_name               0
product_name_lenght                 0
product_description_lenght          0
product_photos_qty                  0
product_weight_g                    0
product_length_cm                   0
product_height_cm                   0
product_width_

The above null values are business logic - those null values are meaningfuk , business wise # These are NULL because orders haven't been delivered yet!
order_delivered_carrier_date: 1194 nulls    # Orders not shipped yet
order_delivered_customer_date: 2454 nulls   # Orders not delivered yet  
total_order_value: 2454 nulls               # Same orders (can't calculate without delivery

In [15]:
#dropping other irrelevant columns -all the ids are irrelevant columns
columns_to_drop=[
    #id is not predictive)
    'order_id', 'customer_id', 'customer_unique_id', 'product_id', 'seller_id',
    #date (we already extract relevant data and cols using this columns so we do not need it again
    'order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date','order_delivered_customer_date', 'order_estimated_delivery_date', 'shipping_limit_date',
    #target leakage(these reveal the answer) its same thing has price 
    'payment_value', 
    #redundant
    'order_item_id', 'customer_city' #customer state is enough
]

Order Value Prediction

In [18]:
#regression and ensemble modeling -order value opredictions 
R_Features =['product_category_name', 'product_weight_g', 'product_length_cm','product_height_cm', 'product_width_cm','product_photos_qty','product_name_lenght', 'product_description_lenght','customer_state', 'customer_zip_code_prefix','freight_value', 'order_month', 'order_day_of_week', 'order_hour', 'is_weekend','payment_type']

target_regression = 'price'

Partitioning Datasety X and y set for the model training

In [19]:
X = main_df_with_payments[R_Features].copy()
y=main_df_with_payments[target_regression]

print(f"model features shape: {X.shape}")
print(f"Target shape:{y.shape}")

#handling categorical variables 
label_encoder= {}
categorical_col=['product_category_name', 'customer_state', 'payment_type']

for col in categorical_col:
    le=LabelEncoder()
    X[col]=le.fit_transform(X[col])
    label_encoder[col]=le

#train_test split
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set: {X_train.shape}")
print(f"Test set: {y_test.shape}")

model features shape: (112650, 16)
Target shape:(112650,)
Training set: (90120, 16)
Test set: (22530,)


Feature Scaling

In [20]:
#using standardization -important for linear Regression 
scaler = StandardScaler()
X_train_scaled=scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Data successfully scaled")

Data successfully scaled


Modelling Multiple Regression Model

In [21]:
#intialiazed the models 
models= {
    'Linear Regression' :LinearRegression(),
    'Random Forest Regressor' : RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost Regressor': XGBRegressor(n_estimators=100, random_state=42)
    
}

result = {}
for name, model in models.items():
    print(f"\n====={name}=====")

    #use scaled data for linear regression, original for the ensemble tree-based model
    if name == 'Linear Regression':
        model.fit(X_train_scaled, y_train)
        y_pred=model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_pred= model.predict(X_test)

    #calculate the metrics 
    mse= mean_squared_error(y_test, y_pred)
    rmse= mse **0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2= r2_score(y_test, y_pred)

    result[name] ={'RMSE': rmse, 'MAE': mae, 'R2':r2}
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"R2 : {r2:.2f}")


#compare result 
result_df = pd.DataFrame(result).T
print("\n======Model Comparison =====")
print(result_df)

        
    


=====Linear Regression=====
RMSE: 154.71
MAE: 76.86
R2 : 0.21

=====Random Forest Regressor=====
RMSE: 107.19
MAE: 36.24
R2 : 0.62

=====XGBoost Regressor=====
RMSE: 113.73
MAE: 48.84
R2 : 0.57

                               RMSE        MAE        R2
Linear Regression        154.707894  76.863631  0.205336
Random Forest Regressor  107.185968  36.235794  0.618553
XGBoost Regressor        113.731727  48.842760  0.570541


In [22]:
# Random Forest Performance:
print("Random Forest for Price Prediction:")
print("✅ Can predict order values within ±$36 on average")
print("✅ Explains 62% of pricing patterns") 
print("✅ Good enough for business decisions!")

# Use cases:
print("\nBusiness Applications:")
print("• Dynamic pricing strategies")
print("• Revenue forecasting") 
print("• Inventory planning")
print("• Customer segment analysis")

Random Forest for Price Prediction:
✅ Can predict order values within ±$36 on average
✅ Explains 62% of pricing patterns
✅ Good enough for business decisions!

Business Applications:
• Dynamic pricing strategies
• Revenue forecasting
• Inventory planning
• Customer segment analysis


why random forest has the best prediction, Random forest has a non-linear relationship, meaning prices vs features is not linear, it captures feature interaction(e.g heavy products in distant states cost more, and extreme products do not skew the model, while Linear regression fail- is that prices increase linearly with weight , but in reality e-commerce pricing is complex and non-linear, cannot captures interactions between features 

In [23]:
#fetaure importance seeing what drives pricees
feature_importance=pd.DataFrame({
    'feature':R_Features,
    'importance':models['Random Forest Regressor'].feature_importances_,
}).sort_values(by='importance', ascending=False)

print("Top 5 Prices Drivers:")
print(feature_importance.head())

Top 5 Prices Drivers:
                       feature  importance
10               freight_value    0.243255
7   product_description_lenght    0.156378
1             product_weight_g    0.103747
0        product_category_name    0.100402
2            product_length_cm    0.067468


Business Insight:


In [92]:
print("Key Price Drivers Analysis:")
print("🚚 Shipping costs heavily influence final order value")
print("📝 Product complexity (description length) indicates premium items") 
print("⚖️ Product weight correlates with value/quality")
print("🏷️ Product category sets price expectations")

Key Price Drivers Analysis:
🚚 Shipping costs heavily influence final order value
📝 Product complexity (description length) indicates premium items
⚖️ Product weight correlates with value/quality
🏷️ Product category sets price expectations
