In [2]:
import pandas as pd
import sqlite3

# Load CSV
df = pd.read_csv("ecommerce.shipping.data.csv")

# Inspect
print(df.shape)
print(df.columns)
df.head()


(10999, 12)
Index(['ID', 'Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls',
       'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases',
       'Product_importance', 'Gender', 'Discount_offered', 'Weight_in_gms',
       'Reached.on.Time_Y.N'],
      dtype='object')


Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,1,D,Flight,4,2,177,3,low,F,44,1233,1
1,2,F,Flight,4,5,216,2,low,M,59,3088,1
2,3,A,Flight,2,2,183,4,low,M,48,3374,1
3,4,B,Flight,3,3,176,4,medium,M,10,1177,1
4,5,C,Flight,2,2,184,3,medium,F,46,2484,1


In [3]:
# Clean column names for SQL use
df.columns = [c.strip().replace('.', '_').replace(' ', '_') for c in df.columns]

# Confirm rename worked
print(df.columns)

# Create SQLite connection
conn = sqlite3.connect("supply_chain.db")

# Export to SQL
df.to_sql("supply_chain", conn, if_exists="replace", index=False)

print("Data successfully loaded into SQLite.")

pd.read_sql_query("SELECT * FROM supply_chain LIMIT 5;", conn)


Index(['ID', 'Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls',
       'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases',
       'Product_importance', 'Gender', 'Discount_offered', 'Weight_in_gms',
       'Reached_on_Time_Y_N'],
      dtype='object')
Data successfully loaded into SQLite.


Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached_on_Time_Y_N
0,1,D,Flight,4,2,177,3,low,F,44,1233,1
1,2,F,Flight,4,5,216,2,low,M,59,3088,1
2,3,A,Flight,2,2,183,4,low,M,48,3374,1
3,4,B,Flight,3,3,176,4,medium,M,10,1177,1
4,5,C,Flight,2,2,184,3,medium,F,46,2484,1


In [4]:
##### Single Row Summary Stats #####
pd.read_sql_query("""
SELECT 
    COUNT(*) AS total_orders,
    SUM(CASE WHEN Reached_on_Time_Y_N = 1 THEN 1 ELSE 0 END) AS late_deliveries,
    SUM(CASE WHEN Reached_on_Time_Y_N = 0 THEN 1 ELSE 0 END) AS ontime_deliveries,
    ROUND(SUM(CASE WHEN Reached_on_Time_Y_N = 1 THEN 1 ELSE 0 END)*100.0/COUNT(*), 2) AS late_rate_percent,
    ROUND(SUM(CASE WHEN Reached_on_Time_Y_N = 0 THEN 1 ELSE 0 END)*100.0/COUNT(*), 2) AS ontime_rate_percent
FROM supply_chain;
""", conn)


Unnamed: 0,total_orders,late_deliveries,ontime_deliveries,late_rate_percent,ontime_rate_percent
0,10999,6563,4436,59.67,40.33


In [5]:
##### Worst Performing Warehouses #####
pd.read_sql_query("""
SELECT 
    Warehouse_block,
    COUNT(*) AS total_orders,
    SUM(CASE WHEN Reached_on_Time_Y_N = 1 THEN 1 ELSE 0 END) AS late_deliveries,
    ROUND(SUM(CASE WHEN Reached_on_Time_Y_N = 1 THEN 1 ELSE 0 END)*100.0/COUNT(*), 2) AS late_rate_percent
FROM supply_chain
GROUP BY Warehouse_block
ORDER BY late_rate_percent DESC;
""", conn)

Unnamed: 0,Warehouse_block,total_orders,late_deliveries,late_rate_percent
0,B,1833,1104,60.23
1,F,3666,2194,59.85
2,D,1834,1096,59.76
3,C,1833,1094,59.68
4,A,1833,1075,58.65


In [6]:
###### Late Deliveries by Shipment Mode #####
pd.read_sql_query("""
SELECT 
    Mode_of_Shipment,
    COUNT(*) AS total_orders,
    SUM(CASE WHEN Reached_on_Time_Y_N = 1 THEN 1 ELSE 0 END) AS late_deliveries,
    ROUND(SUM(CASE WHEN Reached_on_Time_Y_N = 1 THEN 1 ELSE 0 END)*100.0/COUNT(*), 2) AS late_rate_percent
FROM supply_chain
GROUP BY Mode_of_Shipment
ORDER BY late_rate_percent DESC;
""", conn)

Unnamed: 0,Mode_of_Shipment,total_orders,late_deliveries,late_rate_percent
0,Flight,1777,1069,60.16
1,Ship,7462,4459,59.76
2,Road,1760,1035,58.81


In [7]:
##### Cost and Discount Patterns for Latle Deliveries #####
pd.read_sql_query("""
SELECT 
    Reached_on_Time_Y_N,
    ROUND(AVG(Cost_of_the_Product), 2) AS avg_cost,
    ROUND(AVG(Discount_offered), 2) AS avg_discount,
    ROUND(AVG(Weight_in_gms), 2) AS avg_weight,
    ROUND(AVG(Customer_rating), 2) AS avg_rating
FROM supply_chain
GROUP BY Reached_on_Time_Y_N;
""", conn)

Unnamed: 0,Reached_on_Time_Y_N,avg_cost,avg_discount,avg_weight,avg_rating
0,0,214.5,5.55,4168.67,2.97
1,1,207.29,18.66,3272.64,3.01


In [8]:
##### Summary Comparison Between Late and On-Time Deliveries #####
pd.read_sql_query("""
SELECT 
    ROUND(AVG(Customer_care_calls), 2) AS avg_calls,
    ROUND(AVG(Customer_rating), 2) AS avg_rating,
    ROUND(AVG(Cost_of_the_Product), 2) AS avg_cost,
    ROUND(AVG(Discount_offered), 2) AS avg_discount,
    ROUND(AVG(Weight_in_gms), 2) AS avg_weight
FROM supply_chain
GROUP BY Reached_on_Time_Y_N;
""", conn)


Unnamed: 0,avg_calls,avg_rating,avg_cost,avg_discount,avg_weight
0,4.15,2.97,214.5,5.55,4168.67
1,3.99,3.01,207.29,18.66,3272.64


In [9]:
#### Association Tests (Chi-Squared) #####
import scipy.stats as stats
import pandas as pd

# Load the data from SQL
df = pd.read_sql_query("SELECT * FROM supply_chain;", conn)

# Function to test associations
def chi_square_test(col):
    contingency = pd.crosstab(df[col], df['Reached_on_Time_Y_N'])
    chi2, p, dof, ex = stats.chi2_contingency(contingency)
    return pd.Series({'Chi2': chi2, 'p-value': p})

tests = pd.DataFrame({
    'Variable': ['Warehouse_block', 'Mode_of_Shipment', 'Product_importance', 'Gender'],
})
tests[['Chi2', 'p-value']] = tests['Variable'].apply(lambda x: chi_square_test(x))
tests


Unnamed: 0,Variable,Chi2,p-value
0,Warehouse_block,1.089387,0.895952
1,Mode_of_Shipment,0.743436,0.689549
2,Product_importance,12.211164,0.00223
3,Gender,0.22308,0.636703


In [10]:
#### Correlation with Numeric Variables #####
numeric_cols = ['Cost_of_the_Product', 'Discount_offered', 'Weight_in_gms', 'Customer_care_calls', 'Customer_rating']
df[numeric_cols + ['Reached_on_Time_Y_N']].corr()['Reached_on_Time_Y_N'].sort_values(ascending=False)


Reached_on_Time_Y_N    1.000000
Discount_offered       0.397108
Customer_rating        0.013119
Customer_care_calls   -0.067126
Cost_of_the_Product   -0.073587
Weight_in_gms         -0.268793
Name: Reached_on_Time_Y_N, dtype: float64

In [11]:
###### Logistic Regression Model #####
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
import numpy as np

# Reload from SQL (to ensure we have clean data)
df = pd.read_sql_query("SELECT * FROM supply_chain;", conn)

# Select numeric predictor variables
features = ['Discount_offered', 'Weight_in_gms', 'Customer_care_calls', 'Customer_rating', 'Cost_of_the_Product']
X = df[features]
y = df['Reached_on_Time_Y_N']

# Standardize numeric variables for interpretability
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=features)

# Add intercept
X_scaled = sm.add_constant(X_scaled)

# Fit logistic regression
logit_model = sm.Logit(y, X_scaled).fit()

# Summary
print(logit_model.summary())

# Compute Odds Ratios (more intuitive than coefficients)
odds_ratios = pd.DataFrame({
    "Variable": X_scaled.columns,
    "Odds_Ratio": np.exp(logit_model.params),
    "p-value": logit_model.pvalues
}).round(3)

print("\nOdds Ratios and p-values:")
print(odds_ratios)


Optimization terminated successfully.
         Current function value: 0.547818
         Iterations 8
                            Logit Regression Results                           
Dep. Variable:     Reached_on_Time_Y_N   No. Observations:                10999
Model:                           Logit   Df Residuals:                    10993
Method:                            MLE   Df Model:                            5
Date:                 Tue, 11 Nov 2025   Pseudo R-squ.:                  0.1876
Time:                         16:05:44   Log-Likelihood:                -6025.4
converged:                        True   LL-Null:                       -7417.0
Covariance Type:             nonrobust   LLR p-value:                     0.000
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   0.9382      0.037     25.029      0.000       0.865       

In [13]:
##### Warehouse Summary ######
warehouse_summary = pd.read_sql_query("""
SELECT 
    Warehouse_block,
    COUNT(*) AS total_orders,
    SUM(CASE WHEN Reached_on_Time_Y_N = 1 THEN 1 ELSE 0 END) AS late_deliveries,
    ROUND(SUM(CASE WHEN Reached_on_Time_Y_N = 1 THEN 1 ELSE 0 END)*100.0/COUNT(*), 2) AS late_rate_percent
FROM supply_chain
GROUP BY Warehouse_block
ORDER BY Warehouse_block;
""", conn)
warehouse_summary.to_csv("warehouse_summary.csv", index=False)


In [14]:
##### Shipment Mode Summary #####
mode_summary = pd.read_sql_query("""
SELECT 
    Mode_of_Shipment,
    COUNT(*) AS total_orders,
    SUM(CASE WHEN Reached_on_Time_Y_N = 1 THEN 1 ELSE 0 END) AS late_deliveries,
    ROUND(SUM(CASE WHEN Reached_on_Time_Y_N = 1 THEN 1 ELSE 0 END)*100.0/COUNT(*), 2) AS late_rate_percent
FROM supply_chain
GROUP BY Mode_of_Shipment
ORDER BY Mode_of_Shipment;
""", conn)
mode_summary.to_csv("mode_summary.csv", index=False)


In [16]:
##### Delivery Type Summary #####
delivery_summary = pd.read_sql_query("""
SELECT 
    Reached_on_Time_Y_N,
    ROUND(AVG(Cost_of_the_Product), 2) AS avg_cost,
    ROUND(AVG(Discount_offered), 2) AS avg_discount,
    ROUND(AVG(Weight_in_gms), 2) AS avg_weight,
    ROUND(AVG(Customer_rating), 2) AS avg_rating,
    ROUND(AVG(Customer_care_calls), 2) AS avg_calls
FROM supply_chain
GROUP BY Reached_on_Time_Y_N
ORDER BY Reached_on_Time_Y_N;
""", conn)
delivery_summary.to_csv("delivery_summary.csv", index=False)


In [18]:
###### Predictive Model Summary ######
import numpy as np

odds_ratios = pd.DataFrame({
    "Variable": logit_model.params.index,
    "Coefficient": logit_model.params.values,
    "Odds_Ratio": np.exp(logit_model.params.values).round(3),
    "p_value": logit_model.pvalues.round(4)
})

# Filter out constant
odds_ratios = odds_ratios[odds_ratios["Variable"] != "const"]

odds_ratios.to_csv("logistic_odds_ratios.csv", index=False)
odds_ratios


Unnamed: 0,Variable,Coefficient,Odds_Ratio,p_value
Discount_offered,Discount_offered,1.830754,6.239,0.0
Weight_in_gms,Weight_in_gms,-0.355062,0.701,0.0
Customer_care_calls,Customer_care_calls,-0.133305,0.875,0.0
Customer_rating,Customer_rating,0.035744,1.036,0.1011
Cost_of_the_Product,Cost_of_the_Product,-0.097995,0.907,0.0
