# Part 3: Unbiased Evaluation using a New Test Set


In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import os, sys
import seaborn as sns
import itertools
import numpy as np
import pandas as pd
import random
import pickle
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import  mutual_info_classif,SelectPercentile
from sklearn.pipeline import Pipeline
from time import time
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
from sklearn.preprocessing import StandardScaler
# Seed the stdlib and NumPy global RNGs so that unseeded random operations
# (e.g. DataFrame.sample without random_state) repeat when the notebook is run top to bottom.
random.seed(42)
np.random.seed(42)

---

## Load the balanced sample, the best pipeline, and the anomaly detector

In [2]:
# Smart sample: load the balanced sample and shuffle its rows
# (frac=1 keeps every row; the index is rebuilt after the shuffle).
dataset = pd.read_csv('newdf.csv').sample(frac=1).reset_index(drop=True)

# Load the persisted pipeline and anomaly detector.
# Context managers guarantee the file handles are closed once unpickling finishes.
# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted source.
with open("best_pipeline.pkl", "rb") as pipeline_file:
    pipe = pickle.load(pipeline_file)
with open("envelope_AD_model.pkl", "rb") as envelope_file:
    envelope = pickle.load(envelope_file)

---

##  Retrain pipeline using the full balanced sample 


In [None]:
# ----------------------------------
# Target vector first, then every remaining column as the feature matrix.
y = dataset['went_on_backorder']
X = dataset.drop(columns=['went_on_backorder'])

In [4]:
# Feature scaling: only the numerical columns are standardised; the categorical
# columns are appended unchanged.
numerical_cols = [
    'national_inv', 'lead_time', 'in_transit_qty',
    'forecast_6_month', 'sales_6_month', 'min_bank',
    'pieces_past_due', 'perf_6_month_avg', 'local_bo_qty',
]
categorical_cols = [
    'potential_issue', 'deck_risk', 'oe_constraint',
    'ppap_risk', 'stop_auto_buy', 'rev_stop',
]
scaler = StandardScaler()

# Fit the scaler on the training sample and transform it in one step.
num_scale_train = scaler.fit_transform(X[numerical_cols])

# Column order of the final matrix: scaled numerical block, then the categorical block.
X_ = np.concatenate([num_scale_train, X[categorical_cols].values], axis=1)



In [5]:
# Anomaly detection: rows the envelope predicts as -1 are treated as outliers.
out_envelope = envelope.predict(X_) == -1

# Keep only the inlier rows of both the feature matrix and the target.
X_clean = X_[~out_envelope]
y_clean = y[~out_envelope]

print(f"No. of Outliers = {out_envelope.sum()}")

No. of Outliers = 459


In [6]:
# Refit the loaded pipeline on the scaled training sample with outliers removed.
pipe.fit(X_clean, y_clean)

Pipeline(steps=[('select',
                 SelectPercentile(percentile=50,
                                  score_func=<function mutual_info_classif at 0x7f0efeef2048>)),
                ('gb',
                 GradientBoostingClassifier(learning_rate=0.2, max_depth=5,
                                            min_samples_split=10,
                                            random_state=42, subsample=0.6))])

## Pickle and save the trained model and the anomaly detector 

In [7]:
# Pipeline - model and parameters.
# Context managers close (and therefore flush) each file as soon as the dump finishes,
# instead of leaving that to garbage collection.
with open("Final_trained_pipeline.pkl", "wb") as pipeline_file:
    pickle.dump(pipe, pipeline_file)

# Anomaly detection model
with open("Final_trained_envelope_AD_model.pkl", "wb") as envelope_file:
    pickle.dump(envelope, envelope_file)

# NOTE(review): the StandardScaler fitted on the training sample is not persisted here,
# yet it is required to preprocess new data the same way - consider saving it too.


---

## Load the test data and evaluate the model


In [8]:
# Load the given test set
# ----------------------------------
# NOTE(review): absolute path specific to this environment; adjust when running elsewhere.
DATASET = '/dsa/data/all_datasets/back_order/Kaggle_Test_Dataset_v2.csv'

# low_memory=False makes pandas parse each column in a single pass, so it infers one
# dtype per column instead of emitting a mixed-type DtypeWarning from chunked inference.
# The rows are then shuffled (frac=1 keeps all of them) and the index is rebuilt.
data = pd.read_csv(DATASET, low_memory=False).sample(frac=1).reset_index(drop=True)

data.head()


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,sku,national_inv,lead_time,in_transit_qty,forecast_3_month,forecast_6_month,forecast_9_month,sales_1_month,sales_3_month,sales_6_month,...,pieces_past_due,perf_6_month_avg,perf_12_month_avg,local_bo_qty,deck_risk,oe_constraint,ppap_risk,stop_auto_buy,rev_stop,went_on_backorder
0,3510135,19.0,8.0,0.0,0.0,22.0,44.0,1.0,18.0,45.0,...,0.0,0.91,0.95,0.0,No,No,No,Yes,No,No
1,3309787,102.0,52.0,0.0,0.0,0.0,0.0,3.0,8.0,19.0,...,0.0,1.0,0.99,0.0,Yes,No,No,Yes,No,No
2,3451379,101.0,8.0,76.0,60.0,60.0,120.0,27.0,58.0,99.0,...,0.0,0.9,0.86,0.0,No,No,No,Yes,No,No
3,3433288,16.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.7,0.46,0.0,No,No,No,Yes,No,No
4,3453698,4.0,8.0,4.0,0.0,0.0,3.0,3.0,6.0,9.0,...,0.0,0.99,0.99,0.0,No,No,No,Yes,No,No


In [9]:
# Dimensions (rows, columns) of the raw test set
data.shape

(242076, 23)

In [10]:
# Preprocess the given test set
# ----------------------------------
# Drop the columns that are not needed, which also helps prevent multicollinearity.
remove = [
    'sku',
    'forecast_3_month', 'forecast_9_month',
    'sales_1_month', 'sales_3_month', 'sales_9_month',
    'perf_12_month_avg',
]
data = data.drop(columns=remove)

In [11]:
# Confirm which columns remain after the drop
print(data.columns)

Index(['national_inv', 'lead_time', 'in_transit_qty', 'forecast_6_month',
       'sales_6_month', 'min_bank', 'potential_issue', 'pieces_past_due',
       'perf_6_month_avg', 'local_bo_qty', 'deck_risk', 'oe_constraint',
       'ppap_risk', 'stop_auto_buy', 'rev_stop', 'went_on_backorder'],
      dtype='object')


In [12]:
# Count the null values in each column (ascending)
data.isnull().sum().sort_values(ascending = True)

national_inv             1
in_transit_qty           1
forecast_6_month         1
sales_6_month            1
min_bank                 1
potential_issue          1
pieces_past_due          1
perf_6_month_avg         1
local_bo_qty             1
deck_risk                1
oe_constraint            1
ppap_risk                1
stop_auto_buy            1
rev_stop                 1
went_on_backorder        1
lead_time            14725
dtype: int64

In [13]:
# About 6% of rows have a null 'lead_time'; keep only the rows where it is present.
data = data[data['lead_time'].notna()]

In [14]:
# Re-check the per-column null counts after dropping rows without 'lead_time'
data.isnull().sum().sort_values(ascending = True)

national_inv         0
lead_time            0
in_transit_qty       0
forecast_6_month     0
sales_6_month        0
min_bank             0
potential_issue      0
pieces_past_due      0
perf_6_month_avg     0
local_bo_qty         0
deck_risk            0
oe_constraint        0
ppap_risk            0
stop_auto_buy        0
rev_stop             0
went_on_backorder    0
dtype: int64

In [15]:
# Check the unique-value counts (7 smallest) to ensure the object-typed columns
# contain no additional classes.
data.nunique().sort_values().head(7)

potential_issue      2
deck_risk            2
oe_constraint        2
ppap_risk            2
stop_auto_buy        2
rev_stop             2
went_on_backorder    2
dtype: int64

In [16]:
# All the column names of the yes/no columns: every column that is not numeric.
# Selecting on "non-numeric" (rather than "not float64") keeps this cell idempotent:
# once the columns are encoded as integers, a re-run selects nothing instead of
# mapping the existing 1/0 values to NaN.
yes_no_columns = [col for col in data.columns if not pd.api.types.is_numeric_dtype(data[col])]
print(yes_no_columns)

# Fill missing values if any
for column_name in yes_no_columns:
    # Series.mode() ignores NaN, so the fill value is always a real category
    # (never the string 'nan').
    mode = data[column_name].mode()[0]
    print('Filling missing values of {} with {}'.format(column_name, mode))
    # Assign the result back: fillna(inplace=True) on a column selection is chained
    # assignment and may operate on a temporary copy, leaving `data` unchanged.
    data[column_name] = data[column_name].fillna(mode)

# Convert 'Yes'/'No' to 1/0
yes_no_map = {'Yes': 1, 'No': 0}
for column_name in yes_no_columns:
    encoded = data[column_name].map(yes_no_map)
    # Any value other than 'Yes'/'No' would silently become NaN; fail loudly instead.
    assert encoded.notna().all(), 'Unexpected value in {}'.format(column_name)
    data[column_name] = encoded

['potential_issue', 'deck_risk', 'oe_constraint', 'ppap_risk', 'stop_auto_buy', 'rev_stop', 'went_on_backorder']
Filling missing values of potential_issue with No
Filling missing values of deck_risk with No
Filling missing values of oe_constraint with No
Filling missing values of ppap_risk with No
Filling missing values of stop_auto_buy with Yes
Filling missing values of rev_stop with No
Filling missing values of went_on_backorder with No


In [17]:
# Summary statistics, one row per column, to spot implausible ranges
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
national_inv,227351.0,494.771917,30146.679689,-25414.0,4.0,15.0,79.0,12145792.0
lead_time,227351.0,7.923018,7.04141,0.0,4.0,8.0,9.0,52.0
in_transit_qty,227351.0,36.922037,741.337,0.0,0.0,0.0,0.0,186624.0
forecast_6_month,227351.0,369.731842,10401.02628,0.0,0.0,0.0,16.0,2157024.0
sales_6_month,227351.0,349.273339,9227.229225,0.0,0.0,3.0,33.0,2103389.0
min_bank,227351.0,53.315732,1152.838833,0.0,0.0,0.0,4.0,196869.0
potential_issue,227351.0,0.000361,0.018988,0.0,0.0,0.0,0.0,1.0
pieces_past_due,227351.0,1.941249,184.373362,0.0,0.0,0.0,0.0,79964.0
perf_6_month_avg,227351.0,-1.141629,13.712759,-99.0,0.7,0.84,0.97,1.0
local_bo_qty,227351.0,0.886326,47.004905,0.0,0.0,0.0,0.0,6232.0


In [18]:
# Shape of the subset of rows whose perf_6_month_avg equals -99
data[(data['perf_6_month_avg'] == -99)].shape

(4377, 16)

In [19]:
# Some rows carry a performance score of -99, outside the expected 0-to-1 range.
# This is most likely a placeholder or invalid entry, so those rows are dropped
# to avoid distorting the results.
data = data.loc[data['perf_6_month_avg'].ne(-99)]

In [20]:
# Dimensions (rows, columns) after removing the -99 rows
data.shape

(222974, 16)

In [21]:
# Split the data into the target (y_test) and the remaining feature columns (X_test)
y_test = data['went_on_backorder']
X_test = data.drop(columns=['went_on_backorder'])

In [22]:
# Dimensions of the test feature matrix (target column removed)
X_test.shape

(222974, 15)

In [23]:
# Feature scaling: reuse the scaler fitted on the training sample (transform only, no refit),
# then append the categorical columns in the same order used for training.
num_scale_test = scaler.transform(X_test[numerical_cols])
X_test_scaled = np.concatenate([num_scale_test, X_test[categorical_cols].values], axis=1)

In [24]:
# Inspect the scaled test matrix (NumPy abbreviates the printed array)
X_test_scaled

array([[-0.08432486,  0.17465271, -0.05550266, ...,  0.        ,
         1.        ,  0.        ],
       [-0.03940284,  8.13891765, -0.05550266, ...,  0.        ,
         1.        ,  0.        ],
       [-0.03994406,  0.17465271,  0.0820374 , ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-0.09136083,  0.17465271, -0.05550266, ...,  0.        ,
         1.        ,  0.        ],
       [-0.08486608,  0.8986768 , -0.05550266, ...,  0.        ,
         1.        ,  0.        ],
       [-0.09190206, -0.91138342, -0.05550266, ...,  0.        ,
         1.        ,  0.        ]])

We can now predict and evaluate with the preprocessed test set. It would be interesting to see the performance with and without outlier removal from the test set. 

Report confusion matrix, precision, recall, f1-score, accuracy, and other measures (if any). 

With Outliers:
--------------

In [25]:
# Predict on the full preprocessed test set (outliers retained)
# ----------------------------------
y_pred_w = pipe.predict(X_test_scaled)

In [26]:
# Confusion matrix for the test set with outliers (rows = true class, columns = predicted class)
print(confusion_matrix(y_test, y_pred_w))

[[193710  26700]
 [   527   2037]]


In [27]:
# Classification report (per-class precision, recall, f1-score, support) with outliers
print(classification_report(y_test, y_pred_w))

              precision    recall  f1-score   support

           0       1.00      0.88      0.93    220410
           1       0.07      0.79      0.13      2564

    accuracy                           0.88    222974
   macro avg       0.53      0.84      0.53    222974
weighted avg       0.99      0.88      0.93    222974



In [28]:
# Overall accuracy on the test set with outliers retained
print("Accuracy:", accuracy_score(y_test, y_pred_w))

Accuracy: 0.8778915927417547


-------------------------------------------------------------------------------------------------------------------------------

Without Outliers (Comparison purposes)
----------------

In [29]:
# Comparison purposes: drop the test rows the envelope predicts as -1 (outliers),
# then predict on what remains.
outliers = envelope.predict(X_test_scaled) == -1
print(f"No. of Outliers = {outliers.sum()}")

X_test_clean = X_test_scaled[~outliers]
y_test_clean = y_test[~outliers]

y_pred_woo = pipe.predict(X_test_clean)

No. of Outliers = 10850


In [30]:
# Confusion matrix for the test set with outliers removed (rows = true class, columns = predicted class)
print(confusion_matrix(y_test_clean, y_pred_woo))

[[183726  25937]
 [   498   1963]]


In [31]:
# Classification report (per-class precision, recall, f1-score, support) with outliers removed
print(classification_report(y_test_clean, y_pred_woo))

              precision    recall  f1-score   support

           0       1.00      0.88      0.93    209663
           1       0.07      0.80      0.13      2461

    accuracy                           0.88    212124
   macro avg       0.53      0.84      0.53    212124
weighted avg       0.99      0.88      0.92    212124



In [32]:
# Overall accuracy on the test set with outliers removed
print("Accuracy:", accuracy_score(y_test_clean, y_pred_woo))

Accuracy: 0.8753794950123512


---


Summary of Processing:
----------------------------------
Missing/Invalid Data: Handled in Part 1. Rows containing missing values or invalid entries (e.g.,-99) in any feature were removed.

Data Transformation: Categorical variables with classes "Yes" and "No" were transformed into a binary format (1/0). 

Smart Sample: A random sample of 10,000 instances was drawn from the balanced dataset created in Part 1.

Feature Scaling:  Only non-binary numeric features were scaled using StandardScaler.

Anomaly Detection: EllipticEnvelope was applied to remove noise and extreme values, helping the model improve its generalization capability by training on clean data.

Feature Selection:  SelectPercentile was used to retain the top 50% most relevant features. 

Model Tuning: Hyperparameter tuning and cross-validation were performed in Part 2 to enhance performance and reduce overfitting.

Model Training: A GradientBoostingClassifier was used, as it formed part of the pipeline that provided the best performance in Part 2. 
The model was trained on the cleaned, balanced training dataset to learn and identify patterns associated with the target variable (went_on_backorder).

Performance Evaluation: The final evaluation was conducted on entirely unseen data. Preprocessing was done on the test set, but outliers were retained to test the model's robustness on real-world-like data.

Evaluation Metrics: A Confusion Matrix and Classification Report were used to evaluate performance on the test set.


Model performance:
-----------------------
The model, as mentioned before, was evaluated on test data with outliers included, which reflects the real-world data that is often noisy and imperfect. 

Key metrics:
------
Accuracy: 87.8%

Precision (Class 1 - Backorder): 7%

Recall (Class 1 - Backorder): 79%

F1-Score (Class 1 - Backorder): 13%


Analysis:
-----------
The model accuracy is inflated and, as such, unreliable. This is due to class imbalance, where the model achieves high accuracy by correctly predicting the majority class (class 0), which accounts for almost 99% of the data.

The precision score for predicting class 1 is very low (7%), indicating that most items predicted to go on backorder did not (high false positives).

The recall score for class 1 was high (~80%), which suggests that the model correctly identified most of the actual backorder cases.

The F1-score for class 1 was low (13%) due to the significant imbalance between precision and recall. 

Model performance remained nearly the same before and after the removal of test set outliers, suggesting that the model is relatively robust to noise and extreme values.

Q: After seeing the performance in Part 3, how would you change your choices in Part 1?

A: In Part 1, the method I used to handle unbalanced data was "Undersampling". This removed a large portion of the majority class data,  likely discarding valuable information and contributing to the high false positive value. 
So, instead of doing a 50/50 split, keeping a slightly imbalanced ratio of 60/40 or 70/30 would have been ideal. This way, more useful information might have been preserved, potentially improving precision and recall.  
Alternatively, I could have explored other techniques like SMOTE (Synthetic Minority Over-sampling Technique).


Q: Is your model robust to outliers, or is their number just too small to make a difference?

A: When checking for outliers in the test set, I found 10,850 instances - about 4.9% of the total data. While this is a small portion, in my opinion, it is not insignificant.
Surprisingly, the evaluation scores remained nearly the same with or without outliers, suggesting that the model used is fairly robust to outliers, or at the very least, the detected outliers didn't meaningfully affect the model's predictions.

Q: How do you decide what "best" performance is? Accuracy, F1 score, recall, or precision? Keep in mind that this is an imbalanced problem; which is more important: false negatives or false positives?

A: From a business point of view, I would say that recall is the best metric we can use to evaluate the model's performance. 
High recall indicates that the model is good at identifying actual positive cases. 
In this situation, false negatives can lead to serious consequences, such as dissatisfied customers or loss of sales. On the other hand, false positives more or less result in over-preparation, which is not that harmful to the supplier. Therefore, it might be ideal to maximize recall at the cost of some precision to avoid missing critical backorders.

---

## Reflection


Thanks to its high recall performance, the developed model reliably detects products/parts likely to go on backorder, enabling early intervention to prevent loss of sales or customer dissatisfaction. 
While the model occasionally flags parts/products unnecessarily due to its low precision, this trade-off is reasonable because missing an actual shortage is far more costly than investigating a false alarm. 
Based on the test results, I am confident in the model's ability to generalize well to new, unseen data. However, it is not meant to replace human oversight. The best approach is to integrate it into the inventory management process as an alert system, where flagged parts/products are prioritized for review by the supply chain team. Combining automation and expert human judgment can help reduce risk and improve resource planning.