# 1. Calculate Probability Distribution

Using the _DataCo Supply Chain Analytics_ dataset, estimate the conditional probability distributions of a Bayesian network that models the risk of a shipment being delayed.

Reference
- Constante, Fabian; Silva, Fernando; Pereira, António (2019), “DataCo SMART SUPPLY CHAIN FOR BIG DATA ANALYSIS”, Mendeley Data, V5, doi: 10.17632/8gx2fvg2k6.5

In [1]:
import pandas as pd

from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.factors.discrete import TabularCPD
from pgmpy.models import BayesianNetwork


## 1.1 Data Preparation & Cleaning

### 1.1.1 Load Source Data

In [2]:
# Load the raw DataCo export and drop rows that are exact duplicates.
# The file is decoded as 'latin-1' rather than 'unicode_escape': both map the
# bytes through Latin-1, but 'unicode_escape' additionally interprets backslash
# sequences found inside field values, which can silently alter text (or raise
# on a malformed escape).
df_source = (
    pd.read_csv('data/raw/DataCoSupplyChainDataset.csv', encoding='latin-1')
    .drop_duplicates()
)

# Report the frame dimensions, then preview the first rows with every column visible.
print(df_source.shape)
with pd.option_context('display.max_columns', None):
    display(df_source.head())

(180519, 53)


Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Category Name,Customer City,Customer Country,Customer Email,Customer Fname,Customer Id,Customer Lname,Customer Password,Customer Segment,Customer State,Customer Street,Customer Zipcode,Department Id,Department Name,Latitude,Longitude,Market,Order City,Order Country,Order Customer Id,order date (DateOrders),Order Id,Order Item Cardprod Id,Order Item Discount,Order Item Discount Rate,Order Item Id,Order Item Product Price,Order Item Profit Ratio,Order Item Quantity,Sales,Order Item Total,Order Profit Per Order,Order Region,Order State,Order Status,Order Zipcode,Product Card Id,Product Category Id,Product Description,Product Image,Product Name,Product Price,Product Status,shipping date (DateOrders),Shipping Mode
0,DEBIT,3,4,91.25,314.640015,Advance shipping,0,73,Sporting Goods,Caguas,Puerto Rico,XXXXXXXXX,Cally,20755,Holloway,XXXXXXXXX,Consumer,PR,5365 Noble Nectar Island,725.0,2,Fitness,18.251453,-66.037056,Pacific Asia,Bekasi,Indonesia,20755,1/31/2018 22:56,77202,1360,13.11,0.04,180517,327.75,0.29,1,327.75,314.640015,91.25,Southeast Asia,Java Occidental,COMPLETE,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,2/3/2018 22:56,Standard Class
1,TRANSFER,5,4,-249.089996,311.359985,Late delivery,1,73,Sporting Goods,Caguas,Puerto Rico,XXXXXXXXX,Irene,19492,Luna,XXXXXXXXX,Consumer,PR,2679 Rustic Loop,725.0,2,Fitness,18.279451,-66.037064,Pacific Asia,Bikaner,India,19492,1/13/2018 12:27,75939,1360,16.389999,0.05,179254,327.75,-0.8,1,327.75,311.359985,-249.089996,South Asia,Rajastán,PENDING,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/18/2018 12:27,Standard Class
2,CASH,4,4,-247.779999,309.720001,Shipping on time,0,73,Sporting Goods,San Jose,EE. UU.,XXXXXXXXX,Gillian,19491,Maldonado,XXXXXXXXX,Consumer,CA,8510 Round Bear Gate,95125.0,2,Fitness,37.292233,-121.881279,Pacific Asia,Bikaner,India,19491,1/13/2018 12:06,75938,1360,18.030001,0.06,179253,327.75,-0.8,1,327.75,309.720001,-247.779999,South Asia,Rajastán,CLOSED,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/17/2018 12:06,Standard Class
3,DEBIT,3,4,22.860001,304.809998,Advance shipping,0,73,Sporting Goods,Los Angeles,EE. UU.,XXXXXXXXX,Tana,19490,Tate,XXXXXXXXX,Home Office,CA,3200 Amber Bend,90027.0,2,Fitness,34.125946,-118.291016,Pacific Asia,Townsville,Australia,19490,1/13/2018 11:45,75937,1360,22.940001,0.07,179252,327.75,0.08,1,327.75,304.809998,22.860001,Oceania,Queensland,COMPLETE,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/16/2018 11:45,Standard Class
4,PAYMENT,2,4,134.210007,298.25,Advance shipping,0,73,Sporting Goods,Caguas,Puerto Rico,XXXXXXXXX,Orli,19489,Hendricks,XXXXXXXXX,Corporate,PR,8671 Iron Anchor Corners,725.0,2,Fitness,18.253769,-66.037048,Pacific Asia,Townsville,Australia,19489,1/13/2018 11:24,75936,1360,29.5,0.09,179251,327.75,0.45,1,327.75,298.25,134.210007,Oceania,Queensland,PENDING_PAYMENT,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/15/2018 11:24,Standard Class


### 1.1.2 Data Selection

Select the columns that will be used as network nodes (or from which nodes will be derived, such as the order and shipping timestamps) and remove duplicate records.

In [3]:
# Columns pulled from the raw data: the delivery variables plus the two
# timestamp columns. 'Order Status' is currently left out.
nodes = [
    'Days for shipping (real)',
    'Days for shipment (scheduled)',
    'Late_delivery_risk',
    'Delivery Status',
    'order date (DateOrders)',
    'shipping date (DateOrders)',
    #'Order Status',
]

# Capitalise the first letter of the two timestamp column names so that they
# follow the same naming pattern as the other selected columns.
date_column_names = {
    'order date (DateOrders)': 'Order date (DateOrders)',
    'shipping date (DateOrders)': 'Shipping date (DateOrders)',
}

# Keep only the selected columns, drop rows that became identical after the
# projection, and renumber the index from zero.
df_data = (
    df_source[nodes]
    .rename(columns=date_column_names)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Report the frame dimensions, then preview the first rows.
print(df_data.shape)
with pd.option_context('display.max_columns', None):
    display(df_data.head())

(65752, 6)


Unnamed: 0,Days for shipping (real),Days for shipment (scheduled),Late_delivery_risk,Delivery Status,Order date (DateOrders),Shipping date (DateOrders)
0,3,4,0,Advance shipping,1/31/2018 22:56,2/3/2018 22:56
1,5,4,1,Late delivery,1/13/2018 12:27,1/18/2018 12:27
2,4,4,0,Shipping on time,1/13/2018 12:06,1/17/2018 12:06
3,3,4,0,Advance shipping,1/13/2018 11:45,1/16/2018 11:45
4,2,4,0,Advance shipping,1/13/2018 11:24,1/15/2018 11:24


## 1.2 Feature Engineering

Convert the order and shipping timestamps to ISO week numbers.

In [4]:
# Timestamps in the source look like '1/31/2018 22:56' (month/day/year, 24-hour
# clock). Passing the format explicitly keeps the month/day order from depending
# on pandas' format inference and avoids slow per-value parsing.
DATE_FORMAT = '%m/%d/%Y %H:%M'

# Derive an ISO week-number column for both the order and the shipping timestamp.
# Only the week number is kept, so the same week of different years maps to the
# same value.
for stage in ('Order', 'Shipping'):
    timestamps = pd.to_datetime(df_data[f'{stage} date (DateOrders)'], format=DATE_FORMAT)
    df_data[f'{stage} date (Week)'] = timestamps.dt.isocalendar().week

# Report the frame dimensions, then preview the first rows including the new columns.
print(df_data.shape)
with pd.option_context('display.max_columns', None):
    display(df_data.head())

(65752, 8)


Unnamed: 0,Days for shipping (real),Days for shipment (scheduled),Late_delivery_risk,Delivery Status,Order date (DateOrders),Shipping date (DateOrders),Order date (Week),Shipping date (Week)
0,3,4,0,Advance shipping,1/31/2018 22:56,2/3/2018 22:56,5,5
1,5,4,1,Late delivery,1/13/2018 12:27,1/18/2018 12:27,2,3
2,4,4,0,Shipping on time,1/13/2018 12:06,1/17/2018 12:06,2,3
3,3,4,0,Advance shipping,1/13/2018 11:45,1/16/2018 11:45,2,3
4,2,4,0,Advance shipping,1/13/2018 11:24,1/15/2018 11:24,2,3


## 1.3 Model Training

### 1.3.1 Model Definition

Define the model nodes and edges.

In [5]:
# Variables of the network, listed in the order they are registered with the model.
# 'Order Status' is currently not part of the network.
network_nodes = (
    'Days for shipping (real)',
    'Days for shipment (scheduled)',
    'Late_delivery_risk',
    'Delivery Status',
    'Order date (Week)',
    'Shipping date (Week)',
)

# Directed (parent, child) edges: the two shipping durations feed
# 'Late_delivery_risk'; that risk and the two week numbers feed 'Delivery Status'.
network_edges = (
    ('Days for shipping (real)', 'Late_delivery_risk'),
    ('Days for shipment (scheduled)', 'Late_delivery_risk'),
    ('Late_delivery_risk', 'Delivery Status'),
    ('Order date (Week)', 'Delivery Status'),
    ('Shipping date (Week)', 'Delivery Status'),
)

# Build the empty network, then register nodes first and edges second so the
# insertion order is the same as listed above.
model = BayesianNetwork()

for node_name in network_nodes:
    model.add_node(node_name)

for parent_name, child_name in network_edges:
    model.add_edge(parent_name, child_name)

### 1.3.2 Compute the Probability Distribution Table (PDT)

In [6]:
# Estimate the conditional probability distribution of every node in the
# network from the prepared data, using maximum likelihood estimation.
model.fit(df_data, estimator=MaximumLikelihoodEstimator)

In [7]:
# Print each fitted CPD as a header line naming its variable, followed by the
# raw probability array and a blank separator line.
for cpd in model.get_cpds():
    print(f'--- {cpd.variable} ---', cpd.values, sep='\n', end='\n\n')

--- Days for shipping (real) ---
[0.02804477 0.02626536 0.31259886 0.15829176 0.15851989 0.15719674
 0.15908261]

--- Days for shipment (scheduled) ---
[0.05431014 0.15328811 0.19433629 0.59806546]

--- Late_delivery_risk ---
[[[1.         0.04574406 0.5        0.5        0.5        0.5
   0.5       ]
  [0.5        0.5        0.04732612 0.5        0.5        0.5
   0.5       ]
  [0.5        0.5        1.         0.04306598 0.03586745 0.0395613
   0.04212168]
  [0.5        0.5        1.         1.         1.         0.04175768
   0.04546606]]

 [[0.         0.95425594 0.5        0.5        0.5        0.5
   0.5       ]
  [0.5        0.5        0.95267388 0.5        0.5        0.5
   0.5       ]
  [0.5        0.5        0.         0.95693402 0.96413255 0.9604387
   0.95787832]
  [0.5        0.5        0.         0.         0.         0.95824232
   0.95453394]]]

--- Delivery Status ---
[[[[0.58923513 0.47402597 0.25       ... 0.25       0.25
    0.25      ]
   [0.25       0.52888889 0.44