In [1]:
# https://pypi.org/project/PuLP/
# https://coin-or.github.io/pulp/guides/index.html

In [2]:
from IPython.display import display, HTML
# Inject a CSS rule that makes the notebook's `.container` element span 100% of the page width.
full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(full_width_css))

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
import os
import json
import pandas as pd
from sqlite3 import connect
from ydata_profiling import ProfileReport
import seaborn as sns

In [5]:
# Row-display limits: show at most 500 rows, and 50 rows when a frame is truncated.
pd.set_option(
    'display.max_rows', 500,
    'display.min_rows', 50,
)

In [6]:
# Render floats with two decimal places in displayed frames.
pd.set_option('display.float_format', "{:.2f}".format)

In [7]:
# Locate the input CSV: <current working directory>/INPUTS/case_study_data.csv
input_location = ('INPUTS', 'case_study_data.csv')
file_path = os.path.join(os.getcwd(), *input_location)

# Echo the resolved path so the reader can see which file is loaded
print(file_path)

C:\Users\KonuTech\PycharmProjects\supply-chain-case-study\INPUTS\case_study_data.csv


In [8]:
# Locate the JSON config: <current working directory>/CONFIGS/config.json
config_location = ('CONFIGS', 'config.json')
config_path = os.path.join(os.getcwd(), *config_location)

# Echo the resolved config path
print(config_path)

C:\Users\KonuTech\PycharmProjects\supply-chain-case-study\CONFIGS\config.json


In [9]:
# Read the UTF-8 encoded JSON configuration file and parse it into CONFIG.
with open(config_path, encoding='utf-8') as config_file:
    CONFIG = json.loads(config_file.read())

In [10]:
# Every CSV-loading setting is read from the "INPUTS" section of CONFIG.
inputs_cfg = CONFIG["INPUTS"]

INDEX_COL = inputs_cfg["INDEX_COLUMNS"]
SEP = inputs_cfg["SEPARATOR"]
DECIMAL = inputs_cfg["DECIMAL"]
ENCODING = inputs_cfg["ENCODING"]
DATE_COLUMNS = inputs_cfg["DATE_COLUMNS"]
FLOAT_PRECISION = inputs_cfg["FLOAT_PRECISION"]
DTYPE = inputs_cfg["DTYPE"]
COLUMNS_WITH_NAN_VALUES = inputs_cfg["COLUMNS_WITH_NAN_VALUES"]

In [11]:
# Parser options for the input CSV. Separator, encoding and dtypes come from the config.
# index_col is not passed, so the frame keeps pandas' default RangeIndex.
# NOTE(review): DECIMAL, DATE_COLUMNS and FLOAT_PRECISION are read from the config
# but are not passed to the parser here — confirm that is intended.
csv_options = {
    "sep": SEP,
    "encoding": ENCODING,
    "dtype": DTYPE,
    "engine": "c",
    "low_memory": False,
}

df = pd.read_csv(file_path, **csv_options)

In [12]:
# Print the frame's index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9993 entries, 0 to 9992
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Order ID      9993 non-null   object 
 1   Order Date    9993 non-null   object 
 2   Ship Mode     9993 non-null   object 
 3   Region        9993 non-null   object 
 4   Product ID    9993 non-null   object 
 5   Category      9993 non-null   object 
 6   Sub-Category  9993 non-null   object 
 7   Product Name  9993 non-null   object 
 8   Sales         9993 non-null   float64
 9   Quantity      9993 non-null   int64  
 10  Discount      9993 non-null   float64
 11  Profit        8000 non-null   float64
dtypes: float64(3), int64(1), object(8)
memory usage: 937.0+ KB


In [13]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Sales,Quantity,Discount,Profit
count,9993.0,9993.0,9993.0,8000.0
mean,229.85,3.79,0.16,28.13
std,623.28,2.23,0.21,227.33
min,0.44,1.0,0.0,-6599.98
25%,17.28,2.0,0.0,1.81
50%,54.48,3.0,0.2,8.77
75%,209.94,5.0,0.2,29.95
max,22638.48,14.0,0.8,8399.98


In [14]:
# Number of missing values in each column.
df.isna().sum()

Order ID           0
Order Date         0
Ship Mode          0
Region             0
Product ID         0
Category           0
Sub-Category       0
Product Name       0
Sales              0
Quantity           0
Discount           0
Profit          1993
dtype: int64

In [15]:
# Repeat the per-column missing-value count separately for each distinct Category.
for position, category in enumerate(df["Category"].unique()):
    in_category = df["Category"] == category
    print(position, category)
    print(df.loc[in_category].isna().sum())

0 Technology
Order ID          0
Order Date        0
Ship Mode         0
Region            0
Product ID        0
Category          0
Sub-Category      0
Product Name      0
Sales             0
Quantity          0
Discount          0
Profit          377
dtype: int64
1 Office Supplies
Order ID           0
Order Date         0
Ship Mode          0
Region             0
Product ID         0
Category           0
Sub-Category       0
Product Name       0
Sales              0
Quantity           0
Discount           0
Profit          1186
dtype: int64
2 Furniture
Order ID          0
Order Date        0
Ship Mode         0
Region            0
Product ID        0
Category          0
Sub-Category      0
Product Name      0
Sales             0
Quantity          0
Discount          0
Profit          430
dtype: int64


# PREPROCESSING

In [16]:
# Split "Order ID" into its three dash-separated parts (word prefix, digits, digits),
# one new column per capture group.
order_id_parts = df['Order ID'].str.extract(r'(\w+)-(\d+)-(\d+)')
df[['Order ID prefix', 'Order ID year', 'Order ID number']] = order_id_parts

# Parse "Order Date", re-render it as YYYY-MM-DD text, then capture year, month and day
# as separate string columns.
# NOTE(review): pd.to_datetime is called without an explicit format — confirm the
# day/month order is inferred correctly for this file.
order_date_iso = pd.to_datetime(df['Order Date']).dt.strftime('%Y-%m-%d')
df[['Order Year', 'Order Month', 'Order Day']] = order_date_iso.str.extract(r'(\d{4})-(\d{2})-(\d{2})')

# PIVOTS

In [17]:
# Total Sales for each (Category, Order Year) row and Sub-Category column.
# Combinations with no matching rows are filled with 0.
# The aggregation is named as the string 'sum': passing the builtin `sum` is
# deprecated in pandas >= 2.1, and the string form keeps the same grouped-sum result.
pivot = pd.pivot_table(
    df,
    values='Sales',
    index=['Category', 'Order Year'],
    columns=['Sub-Category'],
    aggfunc='sum',
    fill_value=0,
)

In [18]:
# Display the pivot table currently bound to `pivot`.
pivot

Unnamed: 0_level_0,Sub-Category,Accessories,Appliances,Art,Binders,Bookcases,Chairs,Copiers,Envelopes,Fasteners,Furnishings,Labels,Machines,Paper,Phones,Storage,Supplies,Tables
Category,Order Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Furniture,2014,0.0,0.0,0.0,0.0,20036.68,77241.58,0.0,0.0,0.0,13826.23,0.0,0.0,0.0,0.0,0.0,0.0,46088.37
Furniture,2015,0.0,0.0,0.0,0.0,38543.57,71734.53,0.0,0.0,0.0,21089.71,0.0,0.0,0.0,0.0,0.0,0.0,39150.42
Furniture,2016,0.0,0.0,0.0,0.0,26275.47,83918.65,0.0,0.0,0.0,27874.12,0.0,0.0,0.0,0.0,0.0,0.0,60833.2
Furniture,2017,0.0,0.0,0.0,0.0,30024.28,95554.35,0.0,0.0,0.0,28915.09,0.0,0.0,0.0,0.0,0.0,0.0,60893.54
Office Supplies,2014,0.0,15313.62,6057.98,43488.26,0.0,0.0,0.0,3855.75,661.33,0.0,2841.39,0.0,14834.96,0.0,50329.04,14394.07,0.0
Office Supplies,2015,0.0,23241.29,6236.83,37453.1,0.0,0.0,0.0,4512.19,545.22,0.0,2956.46,0.0,15287.64,0.0,45048.25,1952.48,0.0
Office Supplies,2016,0.0,26050.31,5960.91,49683.33,0.0,0.0,0.0,4729.89,960.13,0.0,2827.24,0.0,20661.89,0.0,58788.7,14277.58,0.0
Office Supplies,2017,0.0,42926.93,8863.07,72788.04,0.0,0.0,0.0,3378.57,857.59,0.0,3861.22,0.0,27694.72,0.0,69677.62,16049.41,0.0
Technology,2014,25014.27,0.0,0.0,0.0,0.0,0.0,10849.78,0.0,0.0,0.0,0.0,62023.37,0.0,77390.81,0.0,0.0,0.0
Technology,2015,40523.96,0.0,0.0,0.0,0.0,0.0,26179.45,0.0,0.0,0.0,0.0,27763.7,0.0,68049.74,0.0,0.0,0.0


In [19]:
# Total Profit for each (Category, Order Year) row and Sub-Category column.
# Combinations with no matching rows are filled with 0.
# Profit contains missing values (see the null counts earlier in the notebook); the
# grouped sum skips them, so each total covers only rows with a recorded Profit.
# The aggregation is named as the string 'sum': passing the builtin `sum` is
# deprecated in pandas >= 2.1, and the string form keeps the same grouped-sum result.
# Note that this rebinds `pivot`, replacing the table built before it.
pivot = pd.pivot_table(
    df,
    values='Profit',
    index=['Category', 'Order Year'],
    columns=['Sub-Category'],
    aggfunc='sum',
    fill_value=0,
)

In [20]:
# Display the pivot table currently bound to `pivot`.
pivot

Unnamed: 0_level_0,Sub-Category,Accessories,Appliances,Art,Binders,Bookcases,Chairs,Copiers,Envelopes,Fasteners,Furnishings,Labels,Machines,Paper,Phones,Storage,Supplies,Tables
Category,Order Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Furniture,2014,0.0,0.0,0.0,0.0,-285.69,5866.55,0.0,0.0,0.0,1673.9,0.0,0.0,0.0,0.0,0.0,0.0,-3005.3
Furniture,2015,0.0,0.0,0.0,0.0,-2888.0,4994.83,0.0,0.0,0.0,2858.02,0.0,0.0,0.0,0.0,0.0,0.0,-1277.68
Furniture,2016,0.0,0.0,0.0,0.0,-36.33,4947.41,0.0,0.0,0.0,3390.35,0.0,0.0,0.0,0.0,0.0,0.0,-2787.26
Furniture,2017,0.0,0.0,0.0,0.0,-329.47,7147.45,0.0,0.0,0.0,2929.15,0.0,0.0,0.0,0.0,0.0,0.0,-6322.7
Office Supplies,2014,0.0,2083.27,1032.36,7543.36,0.0,0.0,0.0,1382.7,166.55,0.0,929.91,0.0,5052.01,0.0,3299.51,416.17,0.0
Office Supplies,2015,0.0,3042.59,1252.51,6339.22,0.0,0.0,0.0,1736.18,150.83,0.0,1070.82,0.0,5199.78,0.0,2862.31,-81.28,0.0
Office Supplies,2016,0.0,3701.18,1045.05,4174.35,0.0,0.0,0.0,1752.17,242.42,0.0,1088.53,0.0,7463.0,0.0,4724.68,-572.53,0.0
Office Supplies,2017,0.0,5992.71,1913.88,2718.41,0.0,0.0,0.0,1248.89,258.8,0.0,1596.28,0.0,10202.18,0.0,6736.08,-1002.04,0.0
Technology,2014,4655.66,0.0,0.0,0.0,0.0,0.0,2411.95,0.0,0.0,0.0,0.0,484.29,0.0,10100.25,0.0,0.0,0.0
Technology,2015,9269.96,0.0,0.0,0.0,0.0,0.0,7465.34,0.0,0.0,0.0,0.0,53.71,0.0,8956.12,0.0,0.0,0.0


# PROFILING

In [21]:
# Build the profiling report in time-series mode, ordered by "Order Date".
# "Order Date" is loaded as an object (string) column, so sorting it directly would
# order rows lexicographically rather than chronologically. The report is therefore
# given a copy of the frame with that column converted to datetime; `df` itself is
# left unchanged.
profile_source = df.assign(**{"Order Date": pd.to_datetime(df["Order Date"])})
profile = ProfileReport(
    profile_source,
    tsmode=True,
    sortby="Order Date",
    title="Pandas Profiling Report",
)

In [22]:
# Render the profiling report through its widget interface.
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

In [23]:
# Render the same report embedded in the notebook output as an iframe.
profile.to_notebook_iframe()

In [24]:
# Export the report to an HTML file; the path is relative, so it is resolved against
# the current working directory.
profile.to_file("report_timeseries.html")

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]