## Importing the data

In [1]:
import pandas as pd
import pm4py
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load both modelling tables in one step: the purchase-order-level file first,
# then the line-item-level file (paths are relative to the notebook's working dir).
PO_model_data, li_model_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "throughput_dataset/PO_modelling_dataset.csv",
        "throughput_dataset/line_item_modelling_dataset.csv",
    )
)

In [3]:
# Purchase-order rows for the two vendors that get their own regression
# further down (vendorID_0106 and vendorID_0171).
package_v = PO_model_data.loc[PO_model_data["case_vendor"].eq("vendorID_0106")]
variety_v = PO_model_data.loc[PO_model_data["case_vendor"].eq("vendorID_0171")]

## Linear regression model on the full data

All variables, line item level:

In [4]:
# OLS of throughput on every line-item-level predictor; the C(...) terms are
# categorical and use patsy's default treatment coding.
# The formula is built from adjacent string literals inside parentheses instead
# of a backslash continuation within the literal, so source indentation is no
# longer embedded in the formula text. The model terms themselves are unchanged.
model_li = smf.ols(
    formula=(
        "throughput ~ event_value_EUR + rework_activities + payment_block"
        " + automation + avg_vendor_workload"
        " + C(case_PR_NPR) + C(case_product_type)"
        " + C(case_product_specific) + C(case_vendor)"
    ),
    data=li_model_data,
).fit()
model_li.summary()

0,1,2,3
Dep. Variable:,throughput,R-squared:,0.099
Model:,OLS,Adj. R-squared:,0.098
Method:,Least Squares,F-statistic:,155.5
Date:,"Thu, 02 Feb 2023",Prob (F-statistic):,0.0
Time:,16:10:38,Log-Likelihood:,-140460.0
No. Observations:,25636,AIC:,281000.0
Df Residuals:,25617,BIC:,281100.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,133.6127,3.451,38.720,0.000,126.849,140.376
C(case_PR_NPR)[T._NPR],-21.0129,1.969,-10.671,0.000,-24.873,-17.153
C(case_product_type)[T.Packaging],-8.8104,3.204,-2.750,0.006,-15.090,-2.531
C(case_product_type)[T.Sales],30.9630,4.273,7.247,0.000,22.588,39.338
C(case_product_type)[T._other_product_type],5.4770,2.146,2.553,0.011,1.272,9.682
C(case_product_specific)[T.Extenders],-33.6335,4.091,-8.222,0.000,-41.651,-25.616
C(case_product_specific)[T.Labels],3.1125,1.701,1.830,0.067,-0.222,6.447
C(case_product_specific)[T.Products for Resale],-32.8994,4.443,-7.404,0.000,-41.608,-24.191
C(case_product_specific)[T._other_specific_type],-10.1567,2.865,-3.545,0.000,-15.773,-4.541

0,1,2,3
Omnibus:,3446.991,Durbin-Watson:,0.476
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5094.922
Skew:,1.0,Prob(JB):,0.0
Kurtosis:,3.878,Cond. No.,187000.0


---
All variables, purchase order level:

In [5]:
# OLS of throughput on every purchase-order-level predictor.
# Treatment("...") makes the "_other_*" catch-all levels the reference
# categories for product type and product specific; case_vendor keeps the
# default reference level.
# The formula is built from adjacent string literals inside parentheses instead
# of a backslash continuation within the literal, so source indentation is no
# longer embedded in the formula text. The model terms themselves are unchanged.
model_PO = smf.ols(
    formula=(
        'throughput ~ avg_value_EUR + rework_activities + payment_block'
        ' + automation + avg_vendor_workload + num_items'
        ' + C(case_product_type, Treatment("_other_product_type"))'
        ' + C(case_product_specific, Treatment("_other_specific_type"))'
        ' + C(case_vendor)'
    ),
    data=PO_model_data,
).fit()
model_PO.summary()

0,1,2,3
Dep. Variable:,throughput,R-squared:,0.075
Model:,OLS,Adj. R-squared:,0.072
Method:,Least Squares,F-statistic:,32.79
Date:,"Thu, 02 Feb 2023",Prob (F-statistic):,4.45e-109
Time:,16:10:38,Log-Likelihood:,-39995.0
No. Observations:,7323,AIC:,80030.0
Df Residuals:,7304,BIC:,80160.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,110.4174,1.910,57.808,0.000,106.673,114.162
"C(case_product_type, Treatment(""_other_product_type""))[T.Additives]",6.7220,2.567,2.619,0.009,1.690,11.754
"C(case_product_type, Treatment(""_other_product_type""))[T.Packaging]",-6.1567,3.633,-1.695,0.090,-13.279,0.966
"C(case_product_type, Treatment(""_other_product_type""))[T.Sales]",15.2249,6.151,2.475,0.013,3.167,27.282
"C(case_product_specific, Treatment(""_other_specific_type""))[T.Containers]",23.4936,3.693,6.361,0.000,16.254,30.734
"C(case_product_specific, Treatment(""_other_specific_type""))[T.Extenders]",-18.3602,3.363,-5.460,0.000,-24.953,-11.768
"C(case_product_specific, Treatment(""_other_specific_type""))[T.Labels]",15.1390,4.397,3.443,0.001,6.519,23.759
"C(case_product_specific, Treatment(""_other_specific_type""))[T.Products for Resale]",-24.1504,6.138,-3.935,0.000,-36.182,-12.119
C(case_vendor)[T.vendorID_0104],-0.1407,6.297,-0.022,0.982,-12.484,12.203

0,1,2,3
Omnibus:,828.329,Durbin-Watson:,1.49
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1138.078
Skew:,0.913,Prob(JB):,7.4e-248
Kurtosis:,3.627,Cond. No.,224000.0


----
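The fit is extremely poor in both cases: R² is 0.099 for the line-item model and 0.075 for the purchase-order model, so each explains less than 10% of the variance in throughput.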

Looking at the p-values and coefficients of all features in the purchase order model:

In [6]:
# One row per term of the purchase-order model: its p-value and coefficient,
# both rounded to three decimals, with the term name as a regular column.
pd.concat(
    [
        model_PO.pvalues.round(3).rename("p-value"),
        model_PO.params.round(3).rename("coefficients"),
    ],
    axis=1,
).rename_axis("feature name").reset_index()

Unnamed: 0,feature name,p-value,coefficients
0,Intercept,0.0,110.417
1,"C(case_product_type, Treatment(""_other_product...",0.009,6.722
2,"C(case_product_type, Treatment(""_other_product...",0.09,-6.157
3,"C(case_product_type, Treatment(""_other_product...",0.013,15.225
4,"C(case_product_specific, Treatment(""_other_spe...",0.0,23.494
5,"C(case_product_specific, Treatment(""_other_spe...",0.0,-18.36
6,"C(case_product_specific, Treatment(""_other_spe...",0.001,15.139
7,"C(case_product_specific, Treatment(""_other_spe...",0.0,-24.15
8,C(case_vendor)[T.vendorID_0104],0.982,-0.141
9,C(case_vendor)[T.vendorID_0106],0.0,-30.371


## Linear regression model for two specific vendors

Modelling vendor 0171:

In [7]:
# Purchase-order-level OLS restricted to vendor 0171 (the `variety_v` subset);
# case_vendor is left out of the formula because the subset has one vendor.
# The formula is built from adjacent string literals inside parentheses instead
# of a backslash continuation within the literal, so source indentation is no
# longer embedded in the formula text. The model terms themselves are unchanged.
model = smf.ols(
    formula=(
        "throughput ~ avg_value_EUR + rework_activities + payment_block"
        " + automation + avg_vendor_workload + num_items"
        " + C(case_product_type) + C(case_product_specific)"
    ),
    data=variety_v,
).fit()
model.summary()

0,1,2,3
Dep. Variable:,throughput,R-squared:,0.139
Model:,OLS,Adj. R-squared:,0.112
Method:,Least Squares,F-statistic:,5.103
Date:,"Thu, 02 Feb 2023",Prob (F-statistic):,2.07e-06
Time:,16:10:38,Log-Likelihood:,-1709.4
No. Observations:,295,AIC:,3439.0
Df Residuals:,285,BIC:,3476.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,120.3025,92.249,1.304,0.193,-61.274,301.879
C(case_product_type)[T.Packaging],89.9939,36.906,2.438,0.015,17.352,162.636
C(case_product_type)[T._other_product_type],2.9093,13.182,0.221,0.825,-23.037,28.855
C(case_product_specific)[T._other_specific_type],103.5202,81.955,1.263,0.208,-57.794,264.834
avg_value_EUR,0.0027,0.002,1.702,0.090,-0.000,0.006
rework_activities,62.3396,19.880,3.136,0.002,23.209,101.470
payment_block,-23.1492,16.527,-1.401,0.162,-55.680,9.382
automation,-1.9240,0.643,-2.990,0.003,-3.190,-0.658
avg_vendor_workload,-6.0403,2.990,-2.020,0.044,-11.926,-0.154

0,1,2,3
Omnibus:,36.614,Durbin-Watson:,0.617
Prob(Omnibus):,0.0,Jarque-Bera (JB):,13.381
Skew:,0.27,Prob(JB):,0.00124
Kurtosis:,2.108,Cond. No.,90600.0


Modelling vendor 0106:

In [8]:
# Purchase-order-level OLS restricted to vendor 0106 (the `package_v` subset);
# case_vendor is left out of the formula because the subset has one vendor.
# NOTE: this rebinds `model`, replacing any result previously held under that name.
# The formula is built from adjacent string literals inside parentheses instead
# of a backslash continuation within the literal, so source indentation is no
# longer embedded in the formula text. The model terms themselves are unchanged.
model = smf.ols(
    formula=(
        "throughput ~ avg_value_EUR + rework_activities + payment_block"
        " + automation + avg_vendor_workload + num_items"
        " + C(case_product_type) + C(case_product_specific)"
    ),
    data=package_v,
).fit()
model.summary()

0,1,2,3
Dep. Variable:,throughput,R-squared:,0.168
Model:,OLS,Adj. R-squared:,0.143
Method:,Least Squares,F-statistic:,6.753
Date:,"Thu, 02 Feb 2023",Prob (F-statistic):,1.56e-06
Time:,16:10:38,Log-Likelihood:,-1102.9
No. Observations:,208,AIC:,2220.0
Df Residuals:,201,BIC:,2243.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,146.1466,23.081,6.332,0.000,100.636,191.658
avg_value_EUR,0.0014,0.001,1.542,0.125,-0.000,0.003
rework_activities,15.3060,7.672,1.995,0.047,0.177,30.435
payment_block,10.0180,7.863,1.274,0.204,-5.487,25.523
automation,-1.3550,0.325,-4.175,0.000,-1.995,-0.715
avg_vendor_workload,-1.8104,0.676,-2.679,0.008,-3.143,-0.478
num_items,-1.3180,0.787,-1.674,0.096,-2.870,0.234

0,1,2,3
Omnibus:,19.109,Durbin-Watson:,1.201
Prob(Omnibus):,0.0,Jarque-Bera (JB):,22.31
Skew:,0.802,Prob(JB):,1.43e-05
Kurtosis:,3.034,Cond. No.,34400.0
