# Complete ML Workflow in a Pipeline

<b> Load the dataset </b>

In [1]:
# import packages
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [2]:
# url path
url_path = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter16/Dataset/processed.cleveland.data'

<b> Read the data using pandas and then impute NA values where there are missing values or special characters such as ? </b>

In [3]:
# load the data
heartData = pd.read_csv(url_path, header=None, na_values='?')
heartData.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


<b> Define the names of the columns. Assign the names as given in the following list:<br> 
['age','sex', 'cp', 'trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal','label'] </b>

In [4]:
heartData.columns = ['age','sex', 'cp', 'trestbps','chol','fbs','restecg',
                     'thalach','exang','oldpeak','slope','ca','thal','label']

In [5]:
heartData.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


<b> Change the classes of all values other than 0 in the label column to 1 </b>

In [6]:
heartData.loc[heartData['label'] > 0, 'label'] = 1
heartData.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


<b> Drop all NA values </b>

In [7]:
heartData.shape

(303, 14)

In [8]:
heartData.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
label       0
dtype: int64

There are few na values, some let's drop them.

In [9]:
heartData.dropna(inplace=True, axis=0)

In [10]:
heartData.shape

(297, 14)

<b> Create the Y variable </b>

In [11]:
y = heartData['label']

<b> Create the X variable from the remaining DataFrame </b>

In [12]:
X = heartData.iloc[:, 0:13]
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0


<b> Split the dataset into training and testing sets </b>

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

<b> Create the processing engine </b>

In [14]:
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

In [15]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns

In [16]:
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features)])

<b> Create a list of classifiers </b>

In [17]:
classifiers = [KNeighborsClassifier(),
              RandomForestClassifier(random_state=123),
              AdaBoostClassifier(random_state=123),
              LogisticRegression(random_state=123)]

<b> Create the estimator function with a preprocessor and a classifier </b>

In [18]:
for classifier in classifiers:
    estimator = Pipeline(steps=[('preprocessor', preprocessor),
                               ('dimred', PCA(10)),
                               ('classifier', classifier)])
    
    estimator.fit(X_train, y_train)
    
    print(classifier)
    print(f'Model score: {estimator.score(X_test, y_test)}\n')

KNeighborsClassifier()
Model score: 0.7777777777777778

RandomForestClassifier(random_state=123)
Model score: 0.8333333333333334

AdaBoostClassifier(random_state=123)
Model score: 0.7222222222222222

LogisticRegression(random_state=123)
Model score: 0.7888888888888889



<b> Select the model that generates the highest accuracy score </b>

The RandomForestClassifier model has the best accuracy score.

<b> Create a new pipeline with all the parameters with a preprocessor, PCA(), and the classifier that gave the highest accuracy score </b>

In [19]:
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('dimred', PCA()),
                      ('classifier', RandomForestClassifier(random_state=123))])

<b> Define the parameters of the selected model </b>

In [20]:
param_grid = {'dimred__n_components': [10, 11, 12, 13],
             'classifier__criterion':['gini', 'entropy'],
             'classifier__n_estimators':[2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13],
             'classifier__max_depth':[2, 4, 6, 8, 10]}

<b> Define the estimator with GridSearchCV with 10 fold </b>

In [21]:
estimator = GridSearchCV(pipe, cv=10, param_grid=param_grid)

<b> Fit the estimator with the training set </b>

In [22]:
estimator.fit(X_train, y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('scaler',
                                                                                          StandardScaler())]),
                                                                         Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal'],
      dtype='object'))])),
                                       ('dimred', PCA()),
                                       ('classifier',
                                        RandomForestClassifier(random_state=123))]),
             param_grid={'classifier__criterion': ['gini', 'entropy'],
                         'classifier__max_depth': [2, 4, 6, 8, 10],
                         'classifier__n_estimators': [2, 3, 4, 5, 6, 7

<b> Print the best score and best parameters </b>

In [23]:
print(f'Accuracy score on test set: {estimator.score(X_test, y_test)}')

Accuracy score on test set: 0.8444444444444444


<b> Generate predictions </b>

In [24]:
pred = estimator.predict(X_test)

<b> print the classification report </b>

In [25]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.86      0.86      0.86        49
           1       0.83      0.83      0.83        41

    accuracy                           0.84        90
   macro avg       0.84      0.84      0.84        90
weighted avg       0.84      0.84      0.84        90



<b> print the confusion matrix </b>

In [26]:
print(confusion_matrix(y_test, pred))

[[42  7]
 [ 7 34]]
