## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder

## Import Dataset

In [2]:
# Path to the raw DataCo supply-chain CSV, relative to this notebook.
dataset = "../data/raw/DataCoSupplyChainDataset.csv"

# Read the file as latin1 text, taking the first row as the header,
# then preview the first rows.
df = pd.read_csv(
    dataset,
    header=0,
    encoding='latin1',
)
df.head()

Unnamed: 0,Type,Days for shipping (real),Days for shipment (scheduled),Benefit per order,Sales per customer,Delivery Status,Late_delivery_risk,Category Id,Category Name,Customer City,...,Order Zipcode,Product Card Id,Product Category Id,Product Description,Product Image,Product Name,Product Price,Product Status,shipping date (DateOrders),Shipping Mode
0,DEBIT,3,4,91.25,314.640015,Advance shipping,0,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,2/3/2018 22:56,Standard Class
1,TRANSFER,5,4,-249.089996,311.359985,Late delivery,1,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/18/2018 12:27,Standard Class
2,CASH,4,4,-247.779999,309.720001,Shipping on time,0,73,Sporting Goods,San Jose,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/17/2018 12:06,Standard Class
3,DEBIT,3,4,22.860001,304.809998,Advance shipping,0,73,Sporting Goods,Los Angeles,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/16/2018 11:45,Standard Class
4,PAYMENT,2,4,134.210007,298.25,Advance shipping,0,73,Sporting Goods,Caguas,...,,1360,73,,http://images.acmesports.sports/Smart+watch,Smart watch,327.75,0,1/15/2018 11:24,Standard Class


## Data Preparation

In [3]:
# Size of the raw frame as (rows, columns).
df.shape

(180519, 53)

In [4]:
# Number of rows per value of 'Late_delivery_risk' (the column later used as the label).
df['Late_delivery_risk'].value_counts()

Late_delivery_risk
1    98977
0    81542
Name: count, dtype: int64

In [5]:
# Columns left out of the working frame.
# NOTE(review): by their names these look like identifiers, personal customer
# details, free-text/product descriptors, and post-delivery outcome fields
# ('Days for shipping (real)', 'Delivery Status', "Order Status") -- confirm
# the rationale for each with the dataset documentation.
rm_cols = [
    "Category Name",
    "Customer Id",
    "Customer City",
    "Customer Email",
    "Customer Fname",
    "Customer Lname",
    "Customer Password",
    "Customer Street",
    "Customer State",
    "Department Name",
    "Order Id",
    "Order Customer Id",
    "Order Item Cardprod Id",
    "Order Item Discount Rate",
    "Order Item Id",
    "Product Card Id",
    "Product Description",
    "Product Image",
    "Product Name",
    "Order Zipcode",
    'Days for shipping (real)',
    'Delivery Status',
    "Order Status",
]

# Build a new frame without those columns; the raw `df` is left untouched.
df_filtered = df.drop(columns=rm_cols)

In [6]:
# Count of missing values in each remaining column.
df_filtered.isnull().sum()

Type                             0
Days for shipment (scheduled)    0
Benefit per order                0
Sales per customer               0
Late_delivery_risk               0
Category Id                      0
Customer Country                 0
Customer Segment                 0
Customer Zipcode                 3
Department Id                    0
Latitude                         0
Longitude                        0
Market                           0
Order City                       0
Order Country                    0
order date (DateOrders)          0
Order Item Discount              0
Order Item Product Price         0
Order Item Profit Ratio          0
Order Item Quantity              0
Sales                            0
Order Item Total                 0
Order Profit Per Order           0
Order Region                     0
Order State                      0
Product Category Id              0
Product Price                    0
Product Status                   0
shipping date (DateO

In [7]:
# Preview the first rows after the column drop.
df_filtered.head()

Unnamed: 0,Type,Days for shipment (scheduled),Benefit per order,Sales per customer,Late_delivery_risk,Category Id,Customer Country,Customer Segment,Customer Zipcode,Department Id,...,Sales,Order Item Total,Order Profit Per Order,Order Region,Order State,Product Category Id,Product Price,Product Status,shipping date (DateOrders),Shipping Mode
0,DEBIT,4,91.25,314.640015,0,73,Puerto Rico,Consumer,725.0,2,...,327.75,314.640015,91.25,Southeast Asia,Java Occidental,73,327.75,0,2/3/2018 22:56,Standard Class
1,TRANSFER,4,-249.089996,311.359985,1,73,Puerto Rico,Consumer,725.0,2,...,327.75,311.359985,-249.089996,South Asia,Rajastán,73,327.75,0,1/18/2018 12:27,Standard Class
2,CASH,4,-247.779999,309.720001,0,73,EE. UU.,Consumer,95125.0,2,...,327.75,309.720001,-247.779999,South Asia,Rajastán,73,327.75,0,1/17/2018 12:06,Standard Class
3,DEBIT,4,22.860001,304.809998,0,73,EE. UU.,Home Office,90027.0,2,...,327.75,304.809998,22.860001,Oceania,Queensland,73,327.75,0,1/16/2018 11:45,Standard Class
4,PAYMENT,4,134.210007,298.25,0,73,Puerto Rico,Corporate,725.0,2,...,327.75,298.25,134.210007,Oceania,Queensland,73,327.75,0,1/15/2018 11:24,Standard Class


In [8]:
# Convert both timestamp columns from text to datetime, using one explicit
# "month/day/year hour:minute" format for each.
for date_col in ('order date (DateOrders)', 'shipping date (DateOrders)'):
    df_filtered[date_col] = pd.to_datetime(
        df_filtered[date_col],
        format="%m/%d/%Y %H:%M",
    )

In [9]:
# Mapping from the original CSV headers to snake_case column names.
rename_cols = {
    "Type": "type",
    "Days for shipment (scheduled)": "days_for_shipment_scheduled",
    "Benefit per order": "benefit_per_order",
    "Sales per customer": "sales_per_customer",
    "Late_delivery_risk": "late_delivery_risk",
    "Category Id": "category_id",
    "Customer Country": "customer_country",
    "Customer Segment": "customer_segment",
    "Customer Zipcode": "customer_zipcode",
    "Department Id": "department_id",
    "Latitude": "latitude",
    "Longitude": "longitude",
    "Market": "market",
    "Order City": "order_city",
    "Order Country": "order_country",
    "order date (DateOrders)": "order_date",
    "Order Item Discount": "order_item_discount",
    "Order Item Product Price": "order_item_product_price",
    "Order Item Profit Ratio": "order_item_profit_ratio",
    "Order Item Quantity": "order_item_quantity",
    "Sales": "sales",
    "Order Item Total": "order_item_total",
    "Order Profit Per Order": "order_profit_per_order",
    "Order Region": "order_region",
    "Order State": "order_state",
    "Product Category Id": "product_category_id",
    "Product Price": "product_price",
    "Product Status": "product_status",
    "shipping date (DateOrders)": "shipping_date",
    "Shipping Mode": "shipping_mode",
}

# Apply the mapping; only `df_filtered` is renamed, the raw `df` keeps its headers.
df_filtered = df_filtered.rename(columns=rename_cols)

In [10]:
# Derive calendar features (month number, weekday name) from both timestamps,
# then discard the raw timestamp columns.
# NOTE(review): 'Days for shipping (real)' was dropped earlier, but
# shipping_day / shipping_month come from the shipping timestamp. If that
# timestamp is the actual ship date, order_day vs. shipping_day may re-encode
# the realised shipping duration and leak the target -- confirm before training.
df_filtered = (
    df_filtered
    .assign(
        order_month=lambda frame: frame['order_date'].dt.month,
        shipping_month=lambda frame: frame['shipping_date'].dt.month,
        order_day=lambda frame: frame['order_date'].dt.day_name(),
        shipping_day=lambda frame: frame['shipping_date'].dt.day_name(),
    )
    .drop(columns=['order_date', 'shipping_date'])
)

In [11]:
# List the column names after renaming and date-feature derivation.
df_filtered.columns

Index(['type', 'days_for_shipment_scheduled', 'benefit_per_order',
       'sales_per_customer', 'late_delivery_risk', 'category_id',
       'customer_country', 'customer_segment', 'customer_zipcode',
       'department_id', 'latitude', 'longitude', 'market', 'order_city',
       'order_country', 'order_item_discount', 'order_item_product_price',
       'order_item_profit_ratio', 'order_item_quantity', 'sales',
       'order_item_total', 'order_profit_per_order', 'order_region',
       'order_state', 'product_category_id', 'product_price', 'product_status',
       'shipping_mode', 'order_month', 'shipping_month', 'order_day',
       'shipping_day'],
      dtype='object')

In [16]:
# Remove every row that has a missing value in any column.
df_filtered = df_filtered.dropna(axis=0, how='any')

In [17]:
# Size of the cleaned frame as (rows, columns) after dropping incomplete rows.
df_filtered.shape

(180516, 32)

In [None]:
# Row count and mean of late_delivery_risk for each shipping mode.
# This must read from df_filtered: the snake_case names exist only there.
# The raw `df` still carries the original headers ('Shipping Mode',
# 'Late_delivery_risk'), so indexing it with these names raises KeyError.
print(df_filtered['shipping_mode'].value_counts())
print(df_filtered.groupby('shipping_mode')['late_delivery_risk'].mean())

## Preparation for Model Training

In [12]:
# Separate the label from the features: y is the late_delivery_risk column,
# X is every other column of the cleaned frame.
y = df_filtered['late_delivery_risk']
X = df_filtered.drop(columns=['late_delivery_risk'])

In [13]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1111,
)

In [14]:
# Pick the text-like feature columns (object / string / category dtypes);
# these are the ones handed to the target encoder.
categorical_columns = (
    X_train
    .select_dtypes(include=['object', 'string', 'category'])
    .columns
    .tolist()
)

target_encoder = TargetEncoder(smoothing=10.0, cols=categorical_columns)

# Fit the encoder on the training split only, then apply the already-fitted
# encoder to the test split so test labels never influence the encoding.
X_train_encoded = target_encoder.fit_transform(X_train, y_train)
X_test_encoded = target_encoder.transform(X_test)