# Part 2: Model Development

In this part, I develop unique pipelines for predicting backorder. I also **use the smart sample from Part 1** to fit and evaluate these pipelines. 


In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import os, sys
import itertools
import numpy as np
import pandas as pd
import random


In [2]:
random.seed(42)
np.random.seed(42)

## Reload the smart sample here

In [3]:
# Reload your smart sample from local file 
# ----------------------------------
dataset = pd.read_csv('newdf.csv').sample(frac = 1).reset_index(drop=True)
dataset.head()

Unnamed: 0,national_inv,lead_time,in_transit_qty,forecast_6_month,sales_6_month,min_bank,potential_issue,pieces_past_due,perf_6_month_avg,local_bo_qty,deck_risk,oe_constraint,ppap_risk,stop_auto_buy,rev_stop,went_on_backorder
0,153.0,4.0,0.0,0.0,54.0,0.0,0,0.0,0.73,0.0,0,0,0,1,0,0
1,34.0,8.0,0.0,0.0,0.0,0.0,0,0.0,0.9,0.0,0,0,1,1,0,0
2,0.0,8.0,0.0,6.0,0.0,1.0,0,0.0,0.46,0.0,0,0,0,1,0,1
3,17.0,2.0,0.0,0.0,6.0,0.0,0,0.0,0.76,0.0,0,0,0,1,0,0
4,0.0,2.0,0.0,46.0,0.0,0.0,0,0.0,1.0,0.0,0,0,0,1,0,1


In [4]:
dataset.shape

(10000, 16)

In [5]:
dataset.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
national_inv,10000.0,174.8025,1847.738607,-1194.0,1.0,5.0,24.0,116712.0
lead_time,10000.0,7.0351,5.524954,0.0,2.0,8.0,8.0,52.0
in_transit_qty,10000.0,30.6689,552.593921,0.0,0.0,0.0,0.0,29937.0
forecast_6_month,10000.0,301.9374,4059.444328,0.0,0.0,7.0,44.0,267300.0
sales_6_month,10000.0,271.6356,4801.258211,0.0,0.0,8.0,39.0,373777.0
min_bank,10000.0,38.6283,463.948425,0.0,0.0,0.0,4.0,29376.0
potential_issue,10000.0,0.0028,0.052844,0.0,0.0,0.0,0.0,1.0
pieces_past_due,10000.0,2.1267,24.34135,0.0,0.0,0.0,0.0,816.0
perf_6_month_avg,10000.0,0.745943,0.268487,0.0,0.66,0.82,0.96,1.0
local_bo_qty,10000.0,1.8423,28.579645,0.0,0.0,0.0,0.0,1980.0


## Split the data into Train/Test


In [6]:
X = dataset.drop(columns = 'went_on_backorder')
X.head()

Unnamed: 0,national_inv,lead_time,in_transit_qty,forecast_6_month,sales_6_month,min_bank,potential_issue,pieces_past_due,perf_6_month_avg,local_bo_qty,deck_risk,oe_constraint,ppap_risk,stop_auto_buy,rev_stop
0,153.0,4.0,0.0,0.0,54.0,0.0,0,0.0,0.73,0.0,0,0,0,1,0
1,34.0,8.0,0.0,0.0,0.0,0.0,0,0.0,0.9,0.0,0,0,1,1,0
2,0.0,8.0,0.0,6.0,0.0,1.0,0,0.0,0.46,0.0,0,0,0,1,0
3,17.0,2.0,0.0,0.0,6.0,0.0,0,0.0,0.76,0.0,0,0,0,1,0
4,0.0,2.0,0.0,46.0,0.0,0.0,0,0.0,1.0,0.0,0,0,0,1,0


In [7]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   national_inv      10000 non-null  float64
 1   lead_time         10000 non-null  float64
 2   in_transit_qty    10000 non-null  float64
 3   forecast_6_month  10000 non-null  float64
 4   sales_6_month     10000 non-null  float64
 5   min_bank          10000 non-null  float64
 6   potential_issue   10000 non-null  int64  
 7   pieces_past_due   10000 non-null  float64
 8   perf_6_month_avg  10000 non-null  float64
 9   local_bo_qty      10000 non-null  float64
 10  deck_risk         10000 non-null  int64  
 11  oe_constraint     10000 non-null  int64  
 12  ppap_risk         10000 non-null  int64  
 13  stop_auto_buy     10000 non-null  int64  
 14  rev_stop          10000 non-null  int64  
dtypes: float64(9), int64(6)
memory usage: 1.1 MB


In [8]:
y = dataset['went_on_backorder']
y.head()

0    0
1    0
2    1
3    0
4    1
Name: went_on_backorder, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2,random_state = 20)

## Developing Pipeline

In [44]:
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest

from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_classif

from sklearn.pipeline import Pipeline
from time import time
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier 

---


### 1st pipeline 


In [11]:
#Feature Scaling
#-----------------------
from sklearn.preprocessing import StandardScaler
numerical_cols = ['national_inv', 'lead_time', 'in_transit_qty', 'forecast_6_month', 'sales_6_month', 
                  'min_bank', 'pieces_past_due', 'perf_6_month_avg', 'local_bo_qty']
categorical_cols = ['potential_issue', 'deck_risk', 'oe_constraint', 'ppap_risk', 'stop_auto_buy', 'rev_stop']
scaler = StandardScaler()

num_scale_train = scaler.fit_transform(X_train[numerical_cols])
num_scale_test = scaler.transform(X_test[numerical_cols])

X_train= np.hstack((num_scale_train, X_train[categorical_cols].values))
X_test = np.hstack((num_scale_test, X_test[categorical_cols].values))



In [12]:
X_train

array([[-0.05611873,  8.26558552, -0.05416027, ...,  1.        ,
         0.        ,  0.        ],
       [-0.08516439,  0.92015313, -0.05416027, ...,  0.        ,
         1.        ,  0.        ],
       [-0.09167463, -1.28347658, -0.05416027, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.08266045,  0.18560989, -0.05416027, ...,  0.        ,
         1.        ,  0.        ],
       [-0.09117384,  0.3692457 , -0.05416027, ...,  0.        ,
         1.        ,  0.        ],
       [-0.09117384, -0.91620496, -0.05416027, ...,  0.        ,
         1.        ,  0.        ]])

In [13]:
# Anomaly detection code  
# ----------------------------------
iso = IsolationForest(contamination=0.05).fit(X_train)
out = iso.predict(X_train) == -1

print(f"No. of Outliers = {np.sum(out)}")
X_train1 = X_train[~out]
y_train1 = y_train[~out]

#Just to identify the outliers
outtest = iso.predict(X_test) == -1
print(f"No. of Outliers in test = {np.sum(outtest)}")

No. of Outliers = 400
No. of Outliers in test = 98


Outliers from test data will not be removed so as to mimic real-world unpredictability. 
In such a situation, we can not control what data is fed into the model after its deployment, and as such, will contain noisy inputs and extreme values.  

It can also provide me with an honest model evaluation, testing the model's robustness and generalization capabilities.

In [14]:
# Code for pipeline with feature selection and classification and hyperparameter tuning  
# ----------------------------------
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

param_grid1 = {
    'select__k': [4, 7, 10, 12, 15],
    'rf__max_depth': [6,10,15],
    'rf__max_features': ['sqrt', 'log2'],
    'rf__n_estimators': [50, 100, 200],
    'rf__min_samples_leaf':[2,5,7]
}

pipe1 = Pipeline([
    ('select',SelectKBest(score_func=f_classif)),
    ('rf',RandomForestClassifier(random_state=42))
])

In [15]:
# ----------------------------------

start = time()

model_grid1 = RandomizedSearchCV(pipe1,  param_grid1, n_jobs=5, cv=10,random_state=42)
model_grid1.fit(X_train1, y_train1)

stop = time()
print(f"Training time: {stop - start}s")

  f = msb / msw


Training time: 12.516682624816895s


In [16]:
#Best parameters
#-----------------------
model_grid1.best_params_

{'select__k': 15,
 'rf__n_estimators': 200,
 'rf__min_samples_leaf': 5,
 'rf__max_features': 'sqrt',
 'rf__max_depth': 15}

In [17]:
#Mean and Std scores
#-----------------------
results = pd.DataFrame(model_grid1.cv_results_)
best_data  = results[results['rank_test_score']==1]
mean_s = best_data['mean_test_score']
std_s = best_data['std_test_score']

print("Best Model Mean test (CV) score:",round(float(mean_s),4))
print("Best Model Std test (CV) score: ",round(float(std_s),4))

Best Model Mean test (CV) score: 0.868
Best Model Std test (CV) score:  0.0093


Maybe I can look at max_depth being greater than 15, to see if there is an improvement in the model. I can also reduce the number of features to use for the prediction.

-------------------------------------------------------------------------------------------------------------------------------

In [18]:
#Second iteration:
#-----------------------
param_grid1 = {
    'select__k': [4,5,8,9],
    'rf__max_depth': [15,17,20,24],
    'rf__max_features': ['sqrt','log2',None],
    'rf__n_estimators': [200,250,300],
    'rf__min_samples_leaf':[2,5,7]
}

pipe1 = Pipeline([
    ('select',SelectKBest(score_func=f_classif)),
    ('rf',RandomForestClassifier(random_state=42))
])

start = time()

model_grid1 = RandomizedSearchCV(pipe1,  param_grid1, n_jobs=5, cv=10, random_state=42)
model_grid1.fit(X_train1, y_train1)

stop = time()
print(f"Training time: {stop - start}s")

  f = msb / msw


Training time: 44.851775884628296s


In [19]:
#Best parameters
#-----------------------
model_grid1.best_params_

{'select__k': 8,
 'rf__n_estimators': 300,
 'rf__min_samples_leaf': 2,
 'rf__max_features': 'log2',
 'rf__max_depth': 17}

In [20]:
#Mean and Std scores
#-----------------------
results = pd.DataFrame(model_grid1.cv_results_)
best_data  = results[results['rank_test_score']==1]
mean_s = best_data['mean_test_score']
std_s = best_data['std_test_score']

print("Best Model Mean test (CV) score:",round(float(mean_s),4))
print("Best Model Std test (CV) score: ",round(float(std_s),4))

Best Model Mean test (CV) score: 0.857
Best Model Std test (CV) score:  0.0066


The first iteration did perform better than the second in terms of mean scores, but that could be because it was using all the features. 

Achieving a score of 0.857 while using only eight features and a max depth of 17 is still really good.

In [21]:
#confusion matrix 
#-----------------------
train_pred = model_grid1.predict(X_train1)
pd.DataFrame(confusion_matrix(y_train1, train_pred))

Unnamed: 0,0,1
0,3462,297
1,204,3637


In [22]:
#classification report 
#-----------------------
print(classification_report(y_train1, train_pred))

              precision    recall  f1-score   support

           0       0.94      0.92      0.93      3759
           1       0.92      0.95      0.94      3841

    accuracy                           0.93      7600
   macro avg       0.93      0.93      0.93      7600
weighted avg       0.93      0.93      0.93      7600



-------------------------------------------------------------------------------------------------------------------------------

In [23]:
#Third iteration:
#-----------------------
param_grid1 = {
    'select__k': [4,5,8,9],
    'rf__max_depth': [15,17,20,24],
    'rf__max_features': ['sqrt','log2',None],
    'rf__n_estimators': [300,350,400],
    'rf__min_samples_leaf':[2,5,7]
}

pipe1 = Pipeline([
    ('select',SelectKBest(score_func=f_classif)),
    ('rf',RandomForestClassifier(random_state=42))
])

start = time()

model_grid1 = RandomizedSearchCV(pipe1,  param_grid1, n_jobs=5, cv=10, random_state=42)
model_grid1.fit(X_train1, y_train1)

stop = time()
print(f"Training time: {stop - start}s")

  f = msb / msw


Training time: 63.33665060997009s


In [24]:
#Best parameters
#-----------------------
model_grid1.best_params_

{'select__k': 8,
 'rf__n_estimators': 400,
 'rf__min_samples_leaf': 2,
 'rf__max_features': 'log2',
 'rf__max_depth': 17}

In [25]:
#Mean and Std scores
#-----------------------
results = pd.DataFrame(model_grid1.cv_results_)
best_data  = results[results['rank_test_score']==1]
mean_s = best_data['mean_test_score']
std_s = best_data['std_test_score']

print("Best Model Mean test (CV) score:",round(float(mean_s),4))
print("Best Model Std test (CV) score: ",round(float(std_s),4))

Best Model Mean test (CV) score: 0.8576
Best Model Std test (CV) score:  0.0066


The model score does see a slight improvement with the increase in "n_estimators". 

In [26]:
#confusion matrix 
#-----------------------
from sklearn.metrics import classification_report, confusion_matrix
train_pred = model_grid1.predict(X_train1)
pd.DataFrame(confusion_matrix(y_train1, train_pred))

Unnamed: 0,0,1
0,3461,298
1,201,3640


In [27]:
#classification report
#-----------------------
print(classification_report(y_train1, train_pred))

              precision    recall  f1-score   support

           0       0.95      0.92      0.93      3759
           1       0.92      0.95      0.94      3841

    accuracy                           0.93      7600
   macro avg       0.93      0.93      0.93      7600
weighted avg       0.93      0.93      0.93      7600



The third iteration has very few minor improvements, as seen in the precision score for class (0), but that seems about it.

There is still some improvement, so I will consider the third iteration to be the best.

-------------------------------------------------------------------------------------------------------------------------------

In [28]:
model_grid1.best_params_

{'select__k': 8,
 'rf__n_estimators': 400,
 'rf__min_samples_leaf': 2,
 'rf__max_features': 'log2',
 'rf__max_depth': 17}

In [29]:
#Model Prediction on test data
#-----------------------
y_pred = model_grid1.predict(X_test)

In [30]:
#confusion matrix
#-----------------------
pd.DataFrame(confusion_matrix(y_test, y_pred))

Unnamed: 0,0,1
0,881,155
1,118,846


In [31]:
#classification report
#-----------------------
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.85      0.87      1036
           1       0.85      0.88      0.86       964

    accuracy                           0.86      2000
   macro avg       0.86      0.86      0.86      2000
weighted avg       0.86      0.86      0.86      2000



#### <center>Record the best hyperparameters and performance resulting from this pipeline.</center>

---


### 2nd pipeline


In [32]:
# Anomaly detection code  (Question #E205)
# ----------------------------------
oc_svm =  OneClassSVM(kernel='rbf', gamma='auto', nu=0.05)
oc_svm =  oc_svm.fit(X_train)

out_svm = oc_svm.predict(X_train) == -1

print(f"No. of Outliers = {np.sum(out_svm)}")
X_train2 = X_train[~out_svm]
y_train2 = y_train[~out_svm]

#Just to identify the outliers
outtest_svm = oc_svm.predict(X_test) == -1
print(f"No. of Outliers in test = {np.sum(outtest_svm)}")


No. of Outliers = 410
No. of Outliers in test = 111


Outliers from test data will not be removed so as to mimic real-world unpredictability. 
In such a situation, we can not control what data is fed into the model after its deployment, and as such, will contain noisy inputs and extreme values.  

It can also provide me with an honest model evaluation, testing the model's robustness and generalization capabilities.

In [33]:
# Code for pipeline with feature selection and classification and hyperparameter tuning
# ----------------------------------
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

#First iteration
#-----------------------
param_grid2 = {   
    'rfe__n_features_to_select': [5, 7,8,9,10],
    'svc__C': [0.1, 1, 10, 100],
    'svc__gamma' : [0.01,1,'scale'],
    'svc__kernel' : ['linear', 'rbf', 'poly'],
}

pipe2 = Pipeline([
    ('rfe' , RFE(LogisticRegression())),
    ('svc', SVC(probability=True, random_state=42))
    
])

start = time()

model_grid2 = RandomizedSearchCV(pipe2,  param_grid2, n_jobs=7, cv=10, random_state=42,verbose=2)
model_grid2.fit(X_train2, y_train2)

stop = time()
print(f"Training time: {stop - start}s")

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Training time: 209.41144180297852s


In [34]:
#Best parameters
#-----------------------
model_grid2.best_params_

{'svc__kernel': 'linear',
 'svc__gamma': 0.01,
 'svc__C': 1,
 'rfe__n_features_to_select': 10}

In [35]:
#Mean and Std scores
#-----------------------
results = pd.DataFrame(model_grid2.cv_results_)
best_data  = results[results['rank_test_score']==1]
mean_s = best_data['mean_test_score']
std_s = best_data['std_test_score']

print("Best Model Mean test (CV) score:",round(float(mean_s),4))
print("Best Model Std test (CV) score: ",round(float(std_s),4))

Best Model Mean test (CV) score: 0.7295
Best Model Std test (CV) score:  0.0202


In [36]:
#confusion matrix 
#-----------------------
train_pred2= model_grid2.predict(X_train2)
pd.DataFrame(confusion_matrix(y_train2, train_pred2))

Unnamed: 0,0,1
0,2439,1323
1,744,3084


In [37]:
#classification report
#-----------------------
print(classification_report(y_train2, train_pred2))

              precision    recall  f1-score   support

           0       0.77      0.65      0.70      3762
           1       0.70      0.81      0.75      3828

    accuracy                           0.73      7590
   macro avg       0.73      0.73      0.73      7590
weighted avg       0.73      0.73      0.73      7590



The results are not that great, lets try changing the parameters abit more..

I tried a different "estimator" parameter for the RFE model.

-------------------------------------------------------------------------------------------------------------------------------

In [38]:
#Second Iteration
#-----------------------
param_grid2 = {   
    'rfe__n_features_to_select': [4,5,6,7,8],
    'svc__C': [0.1, 1, 10, 100],
    'svc__gamma' : [0.001 ,0.01,1],
    'svc__kernel' : ['linear', 'rbf', 'poly'],
}

pipe2 = Pipeline([
    ('rfe' , RFE(estimator= SVC(kernel="linear"))),
    ('svc', SVC(probability=True, random_state=42))
    
])

start = time()

model_grid2 = RandomizedSearchCV(pipe2,  param_grid2, n_jobs=7, cv=10, random_state=42,verbose=2)
model_grid2.fit(X_train2, y_train2)

stop = time()
print(f"Training time: {stop - start}s")

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Training time: 523.6520590782166s


In [39]:
#Best parameters
#-----------------------
model_grid2.best_params_

{'svc__kernel': 'linear',
 'svc__gamma': 1,
 'svc__C': 10,
 'rfe__n_features_to_select': 4}

In [40]:
#Mean and Std scores
#-----------------------
results = pd.DataFrame(model_grid2.cv_results_)
best_data  = results[results['rank_test_score']==1]
mean_s = best_data['mean_test_score']
std_s = best_data['std_test_score']

print("Best Model Mean test (CV) score:",round(float(mean_s),4))
print("Best Model Std test (CV) score: ",round(float(std_s),4))

Best Model Mean test (CV) score: 0.6736
Best Model Std test (CV) score:  0.0127


In [41]:
#confusion matrix 
#-----------------------
train_pred2= model_grid2.predict(X_train2)
pd.DataFrame(confusion_matrix(y_train2, train_pred2))

Unnamed: 0,0,1
0,1401,2361
1,107,3721


In [42]:
#classification report
#-----------------------
print(classification_report(y_train2, train_pred2))

              precision    recall  f1-score   support

           0       0.93      0.37      0.53      3762
           1       0.61      0.97      0.75      3828

    accuracy                           0.67      7590
   macro avg       0.77      0.67      0.64      7590
weighted avg       0.77      0.67      0.64      7590



The performance is much worse than that of the first iteration. 

I will add a new parameter called "step" and change the estimator again.

-------------------------------------------------------------------------------------------------------------------------------

In [45]:
#Third Iteration
#-----------------------
param_grid2 = {
    'rfe__step' : [0.1,0.2,0.5],
    
    'rfe__n_features_to_select': [3,5, 7,8],
    'svc__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'svc__gamma' : [0.001, 0.01, 0.1, 1,'scale'],
    'svc__kernel' : ['linear', 'rbf'],
}

pipe2 = Pipeline([
    ('rfe' , RFE(DecisionTreeClassifier())),
    ('svc', SVC(probability=True, random_state=42))
    
])

start = time()

model_grid2 = RandomizedSearchCV(pipe2,  param_grid2, n_jobs=7, cv=10, random_state=42,verbose=2)
model_grid2.fit(X_train2, y_train2)

stop = time()
print(f"Training time: {stop - start}s")

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Training time: 254.47905850410461s


In [46]:
#Best parameters
#-----------------------
model_grid2.best_params_

{'svc__kernel': 'linear',
 'svc__gamma': 0.001,
 'svc__C': 100,
 'rfe__step': 0.2,
 'rfe__n_features_to_select': 5}

In [47]:
#Mean and Std scores
#-----------------------
results = pd.DataFrame(model_grid2.cv_results_)
best_data  = results[results['rank_test_score']==1]
mean_s = best_data['mean_test_score']
std_s = best_data['std_test_score']

print("Best Model Mean test (CV) score:", round(float(mean_s.values[0]), 4))
print("Best Model Std test (CV) score: ", round(float(std_s.values[0]), 4))

Best Model Mean test (CV) score: 0.7404
Best Model Std test (CV) score:  0.0136


In [48]:
#confusion matrix 
#-----------------------
train_pred2= model_grid2.predict(X_train2)
pd.DataFrame(confusion_matrix(y_train2, train_pred2))

Unnamed: 0,0,1
0,2007,1755
1,210,3618


In [49]:
#classification report
#-----------------------
print(classification_report(y_train2, train_pred2))

              precision    recall  f1-score   support

           0       0.91      0.53      0.67      3762
           1       0.67      0.95      0.79      3828

    accuracy                           0.74      7590
   macro avg       0.79      0.74      0.73      7590
weighted avg       0.79      0.74      0.73      7590



Ok, there seems to be a good improvement when using the 'step' parameter. 
For the fourth iteration, I can add a few more parameters to see if there is any further improvement. 

I can increase the value of the C parameter. While this can make the model fit the data more closely, it does increase the risk of overfitting and can greatly increase the computation time. As such, it's best to keep it as is. 

I want to see if using "SVC(kernel='linear')" as the estimator would be better with "step". This method keeps both parts of the pipeline within the SVM family. 

-------------------------------------------------------------------------------------------------------------------------------

In [87]:
#Fourth iteration:
#-----------------------
param_grid2 = {
    'rfe__step' : [0.15,0.2,0.5],
    'rfe__n_features_to_select': [3,5, 7,8],
    'svc__C': [1,10, 100],
    'svc__gamma' : [0.001, 0.01, 0.1, 1,'scale'],
    'svc__tol': [1e-3, 1e-2,1e-1],
    'svc__kernel' : ['linear', 'rbf'],
}

pipe2 = Pipeline([
    ('rfe' , RFE(SVC(kernel='linear'))),
    ('svc', SVC(probability=True, random_state=42))
    
])

start = time()

model_grid2 = RandomizedSearchCV(pipe2,  param_grid2, n_jobs=7, cv=10, random_state=42,verbose=2)
model_grid2.fit(X_train2, y_train2)

stop = time()
print(f"Training time: {stop - start}s")

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Training time: 306.4144217967987s


In [88]:
#Best parameters
#-----------------------
model_grid2.best_params_

{'svc__tol': 0.001,
 'svc__kernel': 'rbf',
 'svc__gamma': 'scale',
 'svc__C': 100,
 'rfe__step': 0.15,
 'rfe__n_features_to_select': 3}

In [89]:
#Mean and Std scores
#-----------------------
results = pd.DataFrame(model_grid2.cv_results_)
best_data  = results[results['rank_test_score']==1]
mean_s = best_data['mean_test_score']
std_s = best_data['std_test_score']

print("Best Model Mean test (CV) score:", round(float(mean_s.values[0]), 4))
print("Best Model Std test (CV) score: ", round(float(std_s.values[0]), 4))

Best Model Mean test (CV) score: 0.8105
Best Model Std test (CV) score:  0.0092


In [90]:
#confusion matrix 
#-----------------------
train_pred2= model_grid2.predict(X_train2)
pd.DataFrame(confusion_matrix(y_train2, train_pred2))

Unnamed: 0,0,1
0,2854,908
1,487,3341


In [91]:
#classification report
#-----------------------
print(classification_report(y_train2, train_pred2))

              precision    recall  f1-score   support

           0       0.85      0.76      0.80      3762
           1       0.79      0.87      0.83      3828

    accuracy                           0.82      7590
   macro avg       0.82      0.82      0.82      7590
weighted avg       0.82      0.82      0.82      7590



Ok, there is a substantial improvement with these parameters. So, for this pipeline, I will consider these parameters the best. The only potential concern is underfitting, as the model uses only 3 out of 15 features, which might limit its ability to capture the complexity of the data.

-------------------------------------------------------------------------------------------------------------------------------

In [92]:
model_grid2.best_params_

{'svc__tol': 0.001,
 'svc__kernel': 'rbf',
 'svc__gamma': 'scale',
 'svc__C': 100,
 'rfe__step': 0.15,
 'rfe__n_features_to_select': 3}

In [93]:
#Model Prediction on test data
#-----------------------
y_pred2 = model_grid2.predict(X_test)
#confusion matrix 
#----------------------
pd.DataFrame(confusion_matrix(y_test, y_pred2))

Unnamed: 0,0,1
0,778,258
1,123,841


In [94]:
#classification report
#-----------------------
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.86      0.75      0.80      1036
           1       0.77      0.87      0.82       964

    accuracy                           0.81      2000
   macro avg       0.81      0.81      0.81      2000
weighted avg       0.82      0.81      0.81      2000



#### <center>Record the best hyperparameters and performance resulting from this pipeline.</center>

---

### 3rd pipeline

In [None]:
# Anomaly detection code  
# ----------------------------------
envelope = EllipticEnvelope(support_fraction=1, contamination=0.05)
envelope = envelope.fit(X_train)

out_envelope = envelope.predict(X_train) == -1

print(f"No. of Outliers = {np.sum(out_envelope)}")
X_train3 = X_train[~out_envelope]
y_train3 = y_train[~out_envelope]

#Just to identify the outliers
outtest_envelope = envelope.predict(X_test) == -1
print(f"No. of Outliers in test = {np.sum(outtest_envelope)}")


Outliers from test data will not be removed so as to mimic real-world unpredictability. 
In such a situation, we can not control what data is fed into the model after its deployment, and as such, will contain noisy inputs and extreme values.  

It can also provide me with an honest model evaluation, testing the model's robustness and generalization capabilities.

In [197]:
# Code for pipeline with feature selection and classification and hyperparameter tuning
# ----------------------------------

#First iteration
#-----------------------
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectPercentile

param_grid3 = {
    'select__percentile': [20, 30, 40,50,60],
    'gb__max_depth': [3, 5, 7],
    'gb__learning_rate': [0.01,0.05, 0.2, 0.25],
    'gb__min_samples_split': [3, 5, 10],
    'gb__subsample': [0.4,0.5,0.6, 0.8, 1.0]
}


pipe3 = Pipeline([
    ('select', SelectPercentile(score_func=mutual_info_classif)),
    ('gb', GradientBoostingClassifier(random_state=42))
])

start = time()

model_grid3 = RandomizedSearchCV(pipe3,  param_grid3, n_jobs=7, cv=10, random_state=42,verbose=2)
model_grid3.fit(X_train3, y_train3)

stop = time()
print(f"Training time: {stop - start}s")

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Training time: 25.83633804321289s


In [198]:
#Best parameters
#-----------------------
model_grid3.best_params_

{'select__percentile': 50,
 'gb__subsample': 0.6,
 'gb__min_samples_split': 10,
 'gb__max_depth': 5,
 'gb__learning_rate': 0.2}

In [199]:
#Mean and Std scores
#-----------------------
results = pd.DataFrame(model_grid3.cv_results_)
best_data  = results[results['rank_test_score']==1]
mean_s = best_data['mean_test_score']
std_s = best_data['std_test_score']

print("Best Model Mean test (CV) score:", round(float(mean_s.values), 4))
print("Best Model Std test (CV) score: ", round(float(std_s.values), 4))

Best Model Mean test (CV) score: 0.872
Best Model Std test (CV) score:  0.0138


In [200]:
#confusion matrix 
#-----------------------
train_pred3= model_grid3.predict(X_train3)
pd.DataFrame(confusion_matrix(y_train3, train_pred3))

Unnamed: 0,0,1
0,3481,298
1,224,3597


In [201]:
#classification report
#-----------------------
print(classification_report(y_train3, train_pred3))

              precision    recall  f1-score   support

           0       0.94      0.92      0.93      3779
           1       0.92      0.94      0.93      3821

    accuracy                           0.93      7600
   macro avg       0.93      0.93      0.93      7600
weighted avg       0.93      0.93      0.93      7600



The model does perform well on the training set, but there may be some room for improvement by exploring another parameter.

-------------------------------------------------------------------------------------------------------------------------------

In [68]:
#Second iteration
#-----------------------
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectPercentile

param_grid3 = {
    'select__percentile': [10, 30,50,70,80],
    'gb__max_depth': [3, 5, 7],
    'gb__learning_rate': [0.01,0.05, 0.2, 0.25],
    'gb__n_estimators': [50, 100, 200, 300],
    'gb__subsample': [0.4,0.5,0.6, 0.8, 1.0]
}


pipe3 = Pipeline([
    ('select', SelectPercentile(score_func=mutual_info_classif)),
    ('gb', GradientBoostingClassifier(random_state=42))
])

start = time()

model_grid3 = RandomizedSearchCV(pipe3,  param_grid3, n_jobs=7, cv=10, random_state=42,verbose=2)
model_grid3.fit(X_train3, y_train3)

stop = time()
print(f"Training time: {stop - start}s")

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Training time: 34.85167169570923s


In [69]:
#Best parameters
#-----------------------
model_grid3.best_params_

{'select__percentile': 70,
 'gb__subsample': 1.0,
 'gb__n_estimators': 50,
 'gb__max_depth': 7,
 'gb__learning_rate': 0.25}

In [70]:
#Mean and Std scores
#-----------------------
results = pd.DataFrame(model_grid3.cv_results_)
best_data  = results[results['rank_test_score']==1]
mean_s = best_data['mean_test_score']
std_s = best_data['std_test_score']

print("Best Model Mean test (CV) score:", round(float(mean_s.values), 4))
print("Best Model Std test (CV) score: ", round(float(std_s.values), 4))

Best Model Mean test (CV) score: 0.8768
Best Model Std test (CV) score:  0.0089


In [71]:
#confusion matrix 
#-----------------------
train_pred3= model_grid3.predict(X_train3)
pd.DataFrame(confusion_matrix(y_train3, train_pred3)) 

Unnamed: 0,0,1
0,3673,106
1,72,3749


In [72]:
#classification report
#-----------------------
print(classification_report(y_train3, train_pred3))

              precision    recall  f1-score   support

           0       0.98      0.97      0.98      3779
           1       0.97      0.98      0.98      3821

    accuracy                           0.98      7600
   macro avg       0.98      0.98      0.98      7600
weighted avg       0.98      0.98      0.98      7600



Ok, the performance of the model has improved much more when using "n_estimators" over "min_samples_split". My only concern is that the model is overfitting.
Maybe tweaking the range and using the "min_samples_split" parameter could provide reasonable results. I will also increase the no. of combinations to search.

-------------------------------------------------------------------------------------------------------------------------------

In [150]:
#Third Iteration
#-----------------------
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectPercentile

param_grid3 = {
    'select__percentile': [10,30,50],
    'gb__max_depth': [7,12,15,17,20,25],
    'gb__learning_rate': [0.01, 0.2, 0.25,0.3],
    'gb__n_estimators': [50,100, 200, 300],
    'gb__subsample': [0.6, 0.8,1],
    'gb__min_samples_split': [5,10,15]
    
}


pipe3 = Pipeline([
    ('select', SelectPercentile(score_func=mutual_info_classif)),
    ('gb', GradientBoostingClassifier(random_state=42))
])

start = time()

model_grid3 = RandomizedSearchCV(pipe3,  param_grid3, n_jobs=7, cv=10, random_state=42,verbose=2,n_iter=50)
model_grid3.fit(X_train3, y_train3)

stop = time()
print(f"Training time: {stop - start}s")

Fitting 10 folds for each of 50 candidates, totalling 500 fits
Training time: 444.5483889579773s


In [151]:
#Best parameters
#-----------------------
model_grid3.best_params_

{'select__percentile': 50,
 'gb__subsample': 0.6,
 'gb__n_estimators': 50,
 'gb__min_samples_split': 5,
 'gb__max_depth': 12,
 'gb__learning_rate': 0.25}

In [152]:
#Mean and Std scores
#-----------------------
results = pd.DataFrame(model_grid3.cv_results_)
best_data  = results[results['rank_test_score']==1]
mean_s = best_data['mean_test_score']
std_s = best_data['std_test_score']

print("Best Model Mean test (CV) score:", round(float(mean_s.values), 4))
print("Best Model Std test (CV) score: ", round(float(std_s.values), 4))

Best Model Mean test (CV) score: 0.8758
Best Model Std test (CV) score:  0.0115


In [153]:
#confusion matrix 
#-----------------------
train_pred3= model_grid3.predict(X_train3)
pd.DataFrame(confusion_matrix(y_train3, train_pred3)) 

Unnamed: 0,0,1
0,3728,51
1,40,3781


In [154]:
#classification report
#-----------------------
print(classification_report(y_train3, train_pred3))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3779
           1       0.99      0.99      0.99      3821

    accuracy                           0.99      7600
   macro avg       0.99      0.99      0.99      7600
weighted avg       0.99      0.99      0.99      7600



These parameters give results that appear as overfitting as well. I have tried other combinations and parameter ranges, each of which gives near-perfect predictions over training data, which is likely overfitting. 

-------------------------------------------------------------------------------------------------------------------------------

In [160]:
#Fourth Iteration
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectPercentile

param_grid3 = {
    'select__percentile': [20, 30, 40,50,60],
    'gb__max_depth': [7,12,15,17,20,25],
    'gb__learning_rate': [0.01,0.05, 0.2, 0.25],
    'gb__min_samples_split': [ 5, 10,15],
    'gb__subsample': [0.6, 0.8, 1.0]
}


pipe3 = Pipeline([
    ('select', SelectPercentile(score_func=mutual_info_classif)),
    ('gb', GradientBoostingClassifier(random_state=42))
])

start = time()

model_grid3 = RandomizedSearchCV(pipe3,  param_grid3, n_jobs=7, cv=10, random_state=42,verbose=2)
model_grid3.fit(X_train3, y_train3)

stop = time()
print(f"Training time: {stop - start}s")

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Training time: 94.01213574409485s


In [161]:
#Best parameters
#-----------------------
model_grid3.best_params_

{'select__percentile': 60,
 'gb__subsample': 0.8,
 'gb__min_samples_split': 5,
 'gb__max_depth': 25,
 'gb__learning_rate': 0.25}

In [162]:
#Mean and Std scores
#-----------------------
results = pd.DataFrame(model_grid3.cv_results_)
best_data  = results[results['rank_test_score']==1]
mean_s = best_data['mean_test_score']
std_s = best_data['std_test_score']

print("Best Model Mean test (CV) score:", round(float(mean_s.values), 4))
print("Best Model Std test (CV) score: ", round(float(std_s.values), 4))

Best Model Mean test (CV) score: 0.8797
Best Model Std test (CV) score:  0.0099


In [163]:
#confusion matrix 
#-----------------------
train_pred3= model_grid3.predict(X_train3)
pd.DataFrame(confusion_matrix(y_train3, train_pred3)) 

Unnamed: 0,0,1
0,3765,14
1,20,3801


In [164]:
#classification report
#-----------------------
print(classification_report(y_train3, train_pred3))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      3779
           1       1.00      0.99      1.00      3821

    accuracy                           1.00      7600
   macro avg       1.00      1.00      1.00      7600
weighted avg       1.00      1.00      1.00      7600



The first iteration provided a more reasonable result and,  as such, will be considered the best for this pipeline.

-------------------------------------------------------------------------------------------------------------------------------

In [202]:
model_grid3.best_params_

{'select__percentile': 50,
 'gb__subsample': 0.6,
 'gb__min_samples_split': 10,
 'gb__max_depth': 5,
 'gb__learning_rate': 0.2}

In [203]:
#Model Prediction on test data
#-----------------------
y_pred3 = model_grid3.predict(X_test)
#confusion matrix 
#----------------------
pd.DataFrame(confusion_matrix(y_test, y_pred3))

Unnamed: 0,0,1
0,890,146
1,108,856


In [204]:
#classification report
#-----------------------
print(classification_report(y_test, y_pred3))

              precision    recall  f1-score   support

           0       0.89      0.86      0.88      1036
           1       0.85      0.89      0.87       964

    accuracy                           0.87      2000
   macro avg       0.87      0.87      0.87      2000
weighted avg       0.87      0.87      0.87      2000



#### <center>Record the best hyperparameters and performance resulting from this pipeline.</center>

---

## Comparing the pipelines and documenting the findings


Pipeline 1:
-------

In [172]:
#Parameters:
model_grid1.best_params_

{'select__k': 8,
 'rf__n_estimators': 400,
 'rf__min_samples_leaf': 2,
 'rf__max_features': 'log2',
 'rf__max_depth': 17}

In [173]:
#Model Prediction on test data
#-----------------------
y_pred = model_grid1.predict(X_test)
#confusion matrix
#-----------------------
pd.DataFrame(confusion_matrix(y_test, y_pred))

Unnamed: 0,0,1
0,881,155
1,118,846


In [174]:
#classification report
#-----------------------
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.85      0.87      1036
           1       0.85      0.88      0.86       964

    accuracy                           0.86      2000
   macro avg       0.86      0.86      0.86      2000
weighted avg       0.86      0.86      0.86      2000



-------------------------------------------------------------------------------------------------------------------------------

PipeLine 2:
------

In [175]:
#Parameters:
model_grid2.best_params_

{'svc__tol': 0.001,
 'svc__kernel': 'rbf',
 'svc__gamma': 'scale',
 'svc__C': 100,
 'rfe__step': 0.15,
 'rfe__n_features_to_select': 3}

In [176]:
#Model Prediction on test data
#-----------------------
y_pred2 = model_grid2.predict(X_test)
#confusion matrix 
#----------------------
pd.DataFrame(confusion_matrix(y_test, y_pred2))

Unnamed: 0,0,1
0,778,258
1,123,841


In [177]:
#classification report
#-----------------------
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.86      0.75      0.80      1036
           1       0.77      0.87      0.82       964

    accuracy                           0.81      2000
   macro avg       0.81      0.81      0.81      2000
weighted avg       0.82      0.81      0.81      2000



-------------------------------------------------------------------------------------------------------------------------------

PipeLine 3:
---------

In [205]:
#Parameters
#-----------------------
model_grid3.best_params_

{'select__percentile': 50,
 'gb__subsample': 0.6,
 'gb__min_samples_split': 10,
 'gb__max_depth': 5,
 'gb__learning_rate': 0.2}

In [206]:
#Model Prediction on test data
#-----------------------
y_pred3 = model_grid3.predict(X_test)
#confusion matrix 
#----------------------
pd.DataFrame(confusion_matrix(y_test, y_pred3))

Unnamed: 0,0,1
0,890,146
1,108,856


In [207]:
#classification report
#-----------------------
print(classification_report(y_test, y_pred3))

              precision    recall  f1-score   support

           0       0.89      0.86      0.88      1036
           1       0.85      0.89      0.87       964

    accuracy                           0.87      2000
   macro avg       0.87      0.87      0.87      2000
weighted avg       0.87      0.87      0.87      2000



-------------------------------------------------------------------------------------------------------------------------------

---

### Pickling the required pipeline/models to be used in Part 3.


In [208]:
model_grid3.best_estimator_

Pipeline(steps=[('select',
                 SelectPercentile(percentile=50,
                                  score_func=<function mutual_info_classif at 0x7fc98b88b9d8>)),
                ('gb',
                 GradientBoostingClassifier(learning_rate=0.2, max_depth=5,
                                            min_samples_split=10,
                                            random_state=42, subsample=0.6))])

In [209]:
import pickle

#Pipeline - Model and Parameters
pickle.dump(model_grid3.best_estimator_, open("best_pipeline.pkl", "wb"))

#Anomaly Detection model
pickle.dump(envelope, open("envelope_AD_model.pkl", "wb"))