In [1]:
# basic
import pandas as pd
import numpy as np
import warnings

# for plot
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn
from sklearn.datasets import load_breast_cancer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report

# machine learning algorithm
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Objective
- Data Pre-processing
- Multiple Model Pipe-lining
- Hyper-parameter Tuning
- Choosing the best model
- Confusion Matrix (for classification)

In [2]:
# Load the data
breast_cancer = load_breast_cancer()
data = breast_cancer.data # data
print("Shape of breast cancer data", data.shape)
labels = breast_cancer.target
print("Shape of breast cancer labels", labels.shape)

# concatenate the data and labels along the second axis
labels = np.reshape(labels,(569,1))
data_and_label = np.concatenate([data, labels], axis=1)

# read in pandas DataFrame
df = pd.DataFrame(data_and_label)

# Append feature/column names
features_names = np.append(breast_cancer.feature_names, 'label')
df.columns = features_names

# Replace label with taget names
print("Classification Targets", breast_cancer.target_names)
df['label'].replace(0, 'Benign', inplace=True)
df['label'].replace(1, 'Malignant', inplace=True)

# show the first 6 rows of data
df.head(6)

Shape of breast cancer data (569, 30)
Shape of breast cancer labels (569,)
Classification Targets ['malignant' 'benign']


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,label
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,Benign
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,Benign
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,Benign
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,Benign
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,Benign
5,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,0.2087,0.07613,...,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244,Benign


In [3]:
# Specify the name of the target
target = 'label'

# Get the target vector
y = df[target]

# Specify the name of the features
features = list(df.drop(target, axis=1).columns)

# Get the feature vector
X = df[features]

In [4]:
# Declare the LabelEncoder
class_encoder = LabelEncoder()

# Enclode the target
y = class_encoder.fit_transform(y)

In [5]:
# Randomly choose 30% of the data for testing (set randome_state as 666 and stratify as y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=666, stratify=y)

In [6]:
# Over sampling for imbalanced data
# from imblearn.over_sampling import RandomOverSampler

# ros = RandomOverSampler(random_state=0)
# X_train, y_train = ros.fit_sample(X_train, y_train)

# print([np.where(y_train == 0)[0].shape[0], np.where(y_train == 1)[0].shape[0]])

# Multiple Model Pipe-lining
In this section, we will first use the combination of pipeline and GridSearchCV to tune the hyperparameters of five classifiers:
- logistic regression
- multi-layer perceptron
- decision tree
- random forest
- support vector machine

Next we will select the best model across the five classifiers.

## Create the dictionary of classifiers
In the dictionary:
- the key is the acronym of the classifier
- the value is the classifier (with random_state=0)

In [7]:
clfs = {'lr': LogisticRegression(random_state=0),
        'mlp': MLPClassifier(random_state=0),
        'dt': DecisionTreeClassifier(random_state=0),
        'rf': RandomForestClassifier(random_state=0),
        'svc': SVC(random_state=0, probability=True)}

# for imbalanced data: replicating the smaller class until you have as many samples as in the larger one.
# clfs = {'lr': LogisticRegression(random_state=0, class_weight='balanced'),
#         'mlp': MLPClassifier(random_state=0),
#         'dt': DecisionTreeClassifier(random_state=0, class_weight='balanced'),
#         'rf': RandomForestClassifier(random_state=0, class_weight='balanced'),
#         'svc': SVC(random_state=0, probability=True, class_weight='balanced')}

## Create the dictionary of pipeline
In the dictionary:
- the key is the acronym of the classifier
- the value is the pipeline (with StandardScaler and the classifier)

In [8]:
pipe_clfs = {}

for name, clf in clfs.items():
    pipe_clfs[name] = Pipeline([('StandardScaler', StandardScaler()), ('clf', clf)])

## Create the dictionary of parameter grids
In the dictionary:
- the key is the acronym of the classifier
- the value is the parameter grid of the classifier

In [9]:
param_grids = {}

### The parameter grid for logistic regression
The hyperparameters we want to tune are:
- multi_class
- solver
- C

Here we need to use two dictionaries in the parameter grid since 'multinomial' (multi_class) does not support 'liblinear' (solver). See details of the meaning of the hyperparametes in [sklearn logistic regression documentation](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

In [10]:
C_range = [10 ** i for i in range(-4, 5)]

param_grid = [{'clf__multi_class': ['ovr'], 
               'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
               'clf__C': C_range},
              {'clf__multi_class': ['multinomial'],
               'clf__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
               'clf__C': C_range}]

param_grids['lr'] = param_grid

### The parameter grid for multi-layer perceptron
The hyperparameters we want to tune are:
- hidden_layer_sizes
- activation

See details of the meaning of the hyperparametes in [sklearn multi-layer perceptron documentation](http://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier)

In [11]:
param_grid = [{'clf__hidden_layer_sizes': [10, 100, 200],
               'clf__activation': ['identity', 'logistic', 'tanh', 'relu']}]

param_grids['mlp'] = param_grid

### The parameter grid for decision tree
The hyperparameters we want to tune are:
- min_samples_split
- min_samples_leaf

See details of the meaning of the hyperparametes in [sklearn decision tree documentation](http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier)

In [12]:
param_grid = [{'clf__min_samples_split': [2, 10, 30],
               'clf__min_samples_leaf': [1, 10, 30]}]

param_grids['dt'] = param_grid

### The parameter grid for random forest
The hyperparameters we want to tune are:
- n_estimators
- min_samples_split
- min_samples_leaf

See details of the meaning of the hyperparametes in [sklearn random forest documentation](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)

In [13]:
param_grid = [{'clf__n_estimators': [2, 10, 30],
               'clf__min_samples_split': [2, 10, 30],
               'clf__min_samples_leaf': [1, 10, 30]}]

param_grids['rf'] = param_grid

### The parameter grid for support vector machine
The hyperparameters we want to tune are:
- C
- gamma
- kernel

See details of the meaning of the hyperparametes in [sklearn support vector machine documentation](http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)

In [14]:
param_grid = [{'clf__C': [0.01, 0.1, 1, 10, 100],
               'clf__gamma': [0.01, 0.1, 1, 10, 100],
               'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid']}]

param_grids['svc'] = param_grid

# Hyperparameter tuning
Here we use two functions for hyperparameter tuning:
- [GridSearchCV](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html): Exhaustive search over specified parameter values for an estimator
- [StratifiedKFold](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html): Stratified K-Folds cross-validator

In [15]:
# Suppress warnings
warnings.filterwarnings("ignore")

# The list of [best_score_, best_params_, best_estimator_]
best_score_param_estimators = []

# For each classifier
for name in pipe_clfs.keys():
    # GridSearchCV
    gs = GridSearchCV(estimator=pipe_clfs[name],
                      param_grid=param_grids[name],
                      scoring='precision',
                      n_jobs=1,
                      cv=StratifiedKFold(n_splits=10,
                                         shuffle=True,
                                         random_state=666))
    # Fit the pipeline
    gs = gs.fit(X_train, y_train)
    
    # Update best_score_param_estimators
    best_score_param_estimators.append([gs.best_score_, gs.best_params_, gs.best_estimator_])

# Model Selection

In [16]:
# Sort best_score_param_estimators in descending order of the best_score_
best_score_param_estimators = sorted(best_score_param_estimators, key=lambda x : x[0], reverse=True)

# For each [best_score_, best_params_, best_estimator_]
for best_score_param_estimator in best_score_param_estimators:
    # Print out [best_score_, best_params_, best_estimator_], where best_estimator_ is a pipeline
    # Since we only print out the type of classifier of the pipeline
    print([best_score_param_estimator[0], best_score_param_estimator[1], 
           type(best_score_param_estimator[2].named_steps['clf'])], end='\n\n')

[0.9811190476190476, {'clf__C': 10, 'clf__gamma': 0.1, 'clf__kernel': 'rbf'}, <class 'sklearn.svm._classes.SVC'>]

[0.9807464387464387, {'clf__C': 1, 'clf__multi_class': 'ovr', 'clf__solver': 'saga'}, <class 'sklearn.linear_model._logistic.LogisticRegression'>]

[0.9807336182336183, {'clf__activation': 'tanh', 'clf__hidden_layer_sizes': 100}, <class 'sklearn.neural_network._multilayer_perceptron.MLPClassifier'>]

[0.9705356237964935, {'clf__min_samples_leaf': 30, 'clf__min_samples_split': 2}, <class 'sklearn.tree._classes.DecisionTreeClassifier'>]

[0.9628333333333334, {'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2, 'clf__n_estimators': 30}, <class 'sklearn.ensemble._forest.RandomForestClassifier'>]



## Print the accuracy of the best model on the testing data

In [17]:
print(best_score_param_estimators[0][2].score(X_test, y_test))

0.9649122807017544


# Confusion Matrix

In [18]:
y_test_pred = best_score_param_estimators[0][2].predict(X_test)

confusion_matrix(y_test, y_test_pred, labels=[0, 1])

array([[ 61,   3],
       [  3, 104]])