# Preparing the Dataset to Implement Pipelines

In [1]:
# import packages
import pandas as pd

In [2]:
# Location of the raw crx.data file (credit-approval records) hosted on GitHub
url_path = (
    'https://raw.githubusercontent.com/PacktWorkshops/'
    'The-Data-Science-Workshop/master/Chapter15/Dataset/crx.data'
)

In [3]:
# Read the CSV: the file has no header row, and '?' entries are parsed as missing values
read_options = {'header': None, 'na_values': '?'}
credData = pd.read_csv(url_path, **read_options)
credData.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [4]:
# Encode the target (column 15) numerically: '+' (approved) -> 1, '-' (not approved) -> 0
approved_rows = credData[15] == '+'
rejected_rows = credData[15] == '-'
credData.loc[approved_rows, 15] = 1
credData.loc[rejected_rows, 15] = 0
credData.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [5]:
# Count the missing values in each column
credData.isna().sum()

0     12
1     12
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13    13
14     0
15     0
dtype: int64

In [6]:
# Drop every row that contains at least one missing value.
# The result is rebound to the same name instead of using inplace=True, so the
# frame is not mutated behind the back of earlier cells and the step stays
# chainable.
credData = credData.dropna(axis=0)

In [7]:
# Features are columns 0 through 14 (label-based slice, so the end is inclusive);
# the target is column 15 cast to an integer dtype
feature_columns = slice(0, 14)
X = credData.loc[:, feature_columns]
y = credData[15].astype(int)

In [8]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

# Applying Pipelines for Feature Extraction to the Dataset

In [9]:
# import packages
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [10]:
# Categorical sub-pipeline: one-hot encode, ignoring categories not seen during fit
onehot_step = ('onehot', OneHotEncoder(handle_unknown='ignore'))
catTransformer = Pipeline(steps=[onehot_step])

In [11]:
# Numerical sub-pipeline: standardize the features with StandardScaler
scaler_step = ('scaler', StandardScaler())
numTransformer = Pipeline(steps=[scaler_step])

In [12]:
# Inspect the dtype of each feature column to tell numerical from categorical ones
X.dtypes

0      object
1     float64
2     float64
3      object
4      object
5      object
6      object
7     float64
8      object
9      object
10      int64
11     object
12     object
13    float64
14      int64
dtype: object

In [13]:
# Column labels of the numerical (int64 / float64) features
numeric_dtypes = ['int64', 'float64']
numFeatures = X.select_dtypes(include=numeric_dtypes).columns
numFeatures

Int64Index([1, 2, 7, 10, 13, 14], dtype='int64')

In [14]:
# Column labels of the categorical (object-dtype) features
categorical_dtypes = ['object']
catFeatures = X.select_dtypes(include=categorical_dtypes).columns
catFeatures

Int64Index([0, 3, 4, 5, 6, 8, 9, 11, 12], dtype='int64')

In [15]:
# Build the preprocessing engine: each sub-pipeline is applied only to its own columns
from sklearn.compose import ColumnTransformer

# sparse_threshold=0 forces a dense array as output. With the default (0.3) the
# result switches to a scipy sparse matrix whenever the stacked output's density
# drops below the threshold (the one-hot block is mostly zeros), and a sparse
# result is not handled by the plain pd.DataFrame(...) wrapping used below and
# is rejected by PCA in older scikit-learn releases.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numTransformer, numFeatures),
        ('categoric', catTransformer, catFeatures),
    ],
    sparse_threshold=0,
)

In [16]:
# Fit the preprocessor on the training features and preview the transformed result
train_features = preprocessor.fit_transform(X_train)
X_train_trans = pd.DataFrame(train_features)
X_train_trans.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,36,37,38,39,40,41,42,43,44,45
0,0.105658,-0.4449,1.377002,-0.553206,0.570065,-0.174241,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
1,-1.084238,1.115032,-0.528306,-0.553206,-0.60247,-0.167337,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,-0.416675,-0.080916,0.592889,-0.327276,-0.367963,-0.174241,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
3,-0.795428,1.418699,-0.189778,-0.553206,-0.485217,0.024974,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
4,-1.125497,0.439061,-0.636809,-0.553206,-0.25071,-0.174241,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


In [17]:
# Apply the already-fitted preprocessor to the test features (transform only, no refit)
test_features = preprocessor.transform(X_test)
X_test_trans = pd.DataFrame(test_features)
X_test_trans.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,36,37,38,39,40,41,42,43,44,45
0,-0.059376,-0.531217,-0.623789,-0.553206,0.687319,-0.174241,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1,-1.063609,-0.878562,-0.600642,-0.327276,0.101051,-0.174076,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
2,0.64862,1.929316,1.847181,0.802371,-0.661097,-0.174241,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
3,2.203242,3.402933,2.245025,2.383877,-1.071485,0.927028,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
4,-0.451332,-0.644572,-0.612215,-0.553206,-0.485217,-0.174241,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


# Adding Dimensionality Reduction to the Feature Extraction Pipeline

In [18]:
# import package
from sklearn.decomposition import PCA

In [19]:
# Chain the preprocessor with PCA, keeping 10 principal components
estimator = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('dimred', PCA(n_components=10)),
])

In [20]:
# Fit the preprocessing + PCA pipeline on the training features and preview the components
train_components = estimator.fit_transform(X_train)
X_train_trans = pd.DataFrame(train_components)
X_train_trans.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.456911,0.857577,-1.231989,0.902396,1.604191,-0.284921,-0.595444,0.206836,0.027712,0.742267
1,-0.758102,-1.279315,1.162158,0.397572,0.031973,1.236864,0.353098,-0.020558,0.561482,0.613476
2,0.387754,-0.022255,-0.082482,-0.524931,0.0893,0.300113,-1.25766,-0.191124,-0.376516,-0.367365
3,-0.332061,-0.636192,0.825248,0.798001,0.435375,1.377995,-0.578766,0.030524,-0.900729,0.620234
4,-1.41278,-0.707406,0.607928,0.54958,1.582078,-0.11971,0.496112,0.597986,-0.133551,0.032972


In [21]:
# Project the test features with the fitted preprocessing + PCA pipeline
test_components = estimator.transform(X_test)
X_test_trans = pd.DataFrame(test_components)
X_test_trans.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-1.299051,0.187772,-0.23137,0.112879,-0.484604,0.369499,0.28216,1.09115,-0.062456,0.077569
1,-1.494398,-0.200785,0.231369,-0.60963,1.235941,-1.063417,0.259277,0.779575,0.086378,0.07871
2,2.829701,-0.298786,-0.099139,0.24561,0.638466,0.991274,-0.769735,0.040185,-0.614251,0.164817
3,5.259748,-0.456795,0.789554,1.150056,-0.033996,0.487041,1.095085,-0.113758,0.515659,0.520806
4,-1.31073,-0.695854,0.14146,0.215672,-0.506067,0.058389,-0.324188,0.963671,0.032933,0.043535


# Modeling and Predictions Using ML Pipelines

In [22]:
# import packages
from sklearn.linear_model import LogisticRegression

In [23]:
# Full modeling pipeline: preprocessing -> PCA (10 components) -> logistic regression
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('dimred', PCA(n_components=10)),
    ('clf', LogisticRegression(random_state=123)),
]
estimator = Pipeline(steps=pipeline_steps)

In [24]:
# Fit every step of the pipeline (preprocessor, PCA, classifier) on the training data
estimator.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  Int64Index([1, 2, 7, 10, 13, 14], dtype='int64')),
                                                 ('categoric',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  Int64Index([0, 3, 4, 5, 6, 8, 9, 11, 12], dtype='int64'))])),
                ('dimred', PCA(n_components=10)),
                ('clf', LogisticRegression(random_state=123))])

In [25]:
# Accuracy of the fitted pipeline on the held-out test set
estimator.score(X_test, y_test)

0.8877551020408163

In [26]:
# Predict class labels for the test set with the fitted pipeline
pred = estimator.predict(X_test)

In [27]:
# Per-class precision, recall and F1-score on the test set
from sklearn.metrics import classification_report

report = classification_report(y_test, pred)
print(report)

              precision    recall  f1-score   support

           0       0.90      0.90      0.90       107
           1       0.88      0.88      0.88        89

    accuracy                           0.89       196
   macro avg       0.89      0.89      0.89       196
weighted avg       0.89      0.89      0.89       196



In [28]:
# Confusion matrix on the test set (rows: true labels, columns: predicted labels)
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, pred)
print(cm)

[[96 11]
 [11 78]]


The model achieves an accuracy of 89%, meaning that 89% of the customers in the test set were correctly classified as creditworthy or not (see the classification report). Let's also look closely at the recall values for each class. Class 0 represents the unworthy customers, and its recall is 90%. This means that roughly 10% (100% - 90%) of unworthy customers were wrongly classified as worthy, which is the risk the business will have to bear. On the other hand, the recall for worthy customers (class 1) is only 88%, which means that the business missed the opportunity to serve about 12% (100% - 88%) of worthy customers.

# Spot-Checking Models Using ML Pipelines

In [29]:
# import packages
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [30]:
# Candidate classifiers to spot-check; a fixed seed is set wherever random_state is passed
classifiers = [
    KNeighborsClassifier(n_neighbors=5),
    RandomForestClassifier(random_state=123),
    AdaBoostClassifier(random_state=123),
    LogisticRegression(random_state=123),
]

In [31]:
# Fit the same preprocessing + PCA pipeline with each candidate classifier
# and report its accuracy on the test set
for classifier in classifiers:
    pipeline_steps = [
        ('preprocessor', preprocessor),
        ('dimred', PCA(n_components=10)),
        ('classifier', classifier),
    ]
    estimator = Pipeline(steps=pipeline_steps)
    estimator.fit(X_train, y_train)

    print(classifier)
    print(f'Accuracy on test set: {estimator.score(X_test, y_test)}\n')

KNeighborsClassifier()
Accuracy on test set: 0.826530612244898

RandomForestClassifier(random_state=123)
Accuracy on test set: 0.8622448979591837

AdaBoostClassifier(random_state=123)
Accuracy on test set: 0.8571428571428571

LogisticRegression(random_state=123)
Accuracy on test set: 0.8877551020408163



# Grid Search and Cross-Validation with ML Pipelines

In [32]:
# Pipeline to tune: preprocessing -> PCA (component count left at its default here) -> AdaBoost
tuning_steps = [
    ('preprocessor', preprocessor),
    ('dimred', PCA()),
    ('classifier', AdaBoostClassifier(random_state=123)),
]
pipe = Pipeline(steps=tuning_steps)

In [33]:
# Hyperparameter grid; each key is '<pipeline step name>__<parameter name>'
grid_params = {
    'dimred__n_components': [10, 12, 15],
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.7, 0.6, 1.0],
}

In [34]:
# Grid search over grid_params for the pipeline, scored with 10-fold cross-validation
from sklearn.model_selection import GridSearchCV

estimator = GridSearchCV(estimator=pipe, param_grid=grid_params, cv=10)

In [35]:
# Run the grid search on the training data
estimator.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('numeric',
                                                                         Pipeline(steps=[('scaler',
                                                                                          StandardScaler())]),
                                                                         Int64Index([1, 2, 7, 10, 13, 14], dtype='int64')),
                                                                        ('categoric',
                                                                         Pipeline(steps=[('onehot',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         Int64Index([0, 3, 4, 5, 6, 8, 9, 11, 12], dtype='int64'))])),
                                       ('dimred', P

In [36]:
# Report the best cross-validated score and the parameter combination that produced it
best_summary = f'Best: {estimator.best_score_} using {estimator.best_params_}'
print(best_summary)

Best: 0.8424637681159421 using {'classifier__learning_rate': 0.7, 'classifier__n_estimators': 50, 'dimred__n_components': 15}


In [37]:
# Predict test-set labels with the fitted grid-search estimator
pred = estimator.predict(X_test)

In [38]:
# Per-class precision, recall and F1-score of the tuned model on the test set
tuned_report = classification_report(y_test, pred)
print(tuned_report)

              precision    recall  f1-score   support

           0       0.87      0.84      0.85       107
           1       0.82      0.84      0.83        89

    accuracy                           0.84       196
   macro avg       0.84      0.84      0.84       196
weighted avg       0.84      0.84      0.84       196

