# Breast Cancer using Xgboost and pipeline

In [1]:
#import libraries
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor #for the model
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import mean_absolute_error, mean_squared_error, roc_curve, auc #for model evaluation
from sklearn.metrics import classification_report #for model evaluation
from sklearn.metrics import confusion_matrix #for model evaluation
from sklearn.inspection import permutation_importance

from xgboost import XGBClassifier, XGBRegressor

from tqdm import tqdm,trange
import scipy.stats as st

In [2]:
#load data set
data = pd.read_csv("breast_cancer.csv",index_col = 'id')

In [3]:
data.head()

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 569 entries, 842302 to 92751
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                569 non-null    object 
 1   radius_mean              569 non-null    float64
 2   texture_mean             569 non-null    float64
 3   perimeter_mean           569 non-null    float64
 4   area_mean                569 non-null    float64
 5   smoothness_mean          569 non-null    float64
 6   compactness_mean         569 non-null    float64
 7   concavity_mean           569 non-null    float64
 8   concave points_mean      569 non-null    float64
 9   symmetry_mean            569 non-null    float64
 10  fractal_dimension_mean   569 non-null    float64
 11  radius_se                569 non-null    float64
 12  texture_se               569 non-null    float64
 13  perimeter_se             569 non-null    float64
 14  area_se            

In [5]:
data.dropna(axis=0, subset=['diagnosis'], inplace=True)

y = data.diagnosis
data.drop(['diagnosis','Unnamed: 32'], axis=1, inplace=True)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2, random_state=0)

In [25]:
# Select categorical columns with relatively low cardinality (convenient but arbitrary)

categorical_cols = [cname for cname in X_train.columns if X_train[cname].nunique()<60 and X_train[cname].dtype=="object"]
categorical_cols

[]

In [26]:
# Select numerical columns
numerical_cols = [cname for cname in X_train.columns if 
                X_train[cname].dtype in ['int64', 'float64']]
numerical_cols

['radius_mean',
 'texture_mean',
 'perimeter_mean',
 'area_mean',
 'smoothness_mean',
 'compactness_mean',
 'concavity_mean',
 'concave points_mean',
 'symmetry_mean',
 'fractal_dimension_mean',
 'radius_se',
 'texture_se',
 'perimeter_se',
 'area_se',
 'smoothness_se',
 'compactness_se',
 'concavity_se',
 'concave points_se',
 'symmetry_se',
 'fractal_dimension_se',
 'radius_worst',
 'texture_worst',
 'perimeter_worst',
 'area_worst',
 'smoothness_worst',
 'compactness_worst',
 'concavity_worst',
 'concave points_worst',
 'symmetry_worst',
 'fractal_dimension_worst']

In [27]:
# Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train_selected = X_train[my_cols].copy()
X_test_selected = X_test[my_cols].copy()

In [28]:
# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()) ])

In [29]:
# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [30]:
# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

## 1. RandomForestRegressor in pipeline

In [31]:
model = RandomForestClassifier(n_estimators=100, random_state=0)

my_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model) ])

X_train_rf = X_train_selected.copy()
X_test_rf = X_test_selected.copy()

# Fit model into pipeline
my_pipeline.fit(X_train_rf, y_train)

# get predicted prices on validation data
y_pred = my_pipeline.predict(X_test_rf)
# print(mean_squared_error(y_valid, val_predictions))
print("Confusion matrix")
print(confusion_matrix(y_test,y_pred))
print("Classification Report")
print(classification_report(y_test,y_pred))

Confusion matrix
[[64  3]
 [ 1 46]]
Classification Report
              precision    recall  f1-score   support

           B       0.98      0.96      0.97        67
           M       0.94      0.98      0.96        47

    accuracy                           0.96       114
   macro avg       0.96      0.97      0.96       114
weighted avg       0.97      0.96      0.97       114



## 2. RandomForestRegressor with RandomizedSearchCV

In [32]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1100, num = 6)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]


# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

print(random_grid)

{'n_estimators': [100, 300, 500, 700, 900, 1100], 'max_features': ['auto', 'sqrt'], 'max_depth': [5, 10, 15, 20, 25, 30], 'min_samples_split': [2, 5, 10, 15, 100], 'min_samples_leaf': [1, 2, 5, 10]}


In [33]:
regr = RandomForestClassifier()

# Data preprocessing pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# # Transform the data
X_train_rf_r = X_train.copy()
X_test_rf_r = X_test.copy()

X_train_rf_r = my_pipeline.fit_transform(X_train_rf_r)
X_test_rf_r = my_pipeline.transform(X_test_rf_r)



# rf_random = RandomizedSearchCV(estimator = regr, param_distributions = random_grid, scoring='neg_mean_squared_error',
#                               n_iter = 10, cv = 4, verbose= 1, random_state= 0, n_jobs = 1)
rf_random = RandomizedSearchCV(estimator = regr, param_distributions = random_grid, 
                              n_iter = 10, cv = 4, verbose= 1, random_state= 0, n_jobs = 1)


rf_random.fit(X_train_rf_r,y_train)

Fitting 4 folds for each of 10 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:   29.6s finished


RandomizedSearchCV(cv=4, estimator=RandomForestClassifier(), n_jobs=1,
                   param_distributions={'max_depth': [5, 10, 15, 20, 25, 30],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 5, 10],
                                        'min_samples_split': [2, 5, 10, 15,
                                                              100],
                                        'n_estimators': [100, 300, 500, 700,
                                                         900, 1100]},
                   random_state=0, verbose=1)

In [34]:
rf_random.best_params_

{'n_estimators': 900,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 30}

In [35]:
rs_y_pred = rf_random.predict(X_test_rf_r)

In [36]:
print(confusion_matrix(y_test,rs_y_pred))

[[65  2]
 [ 2 45]]


In [37]:
print(classification_report(y_test,rs_y_pred))

              precision    recall  f1-score   support

           B       0.97      0.97      0.97        67
           M       0.96      0.96      0.96        47

    accuracy                           0.96       114
   macro avg       0.96      0.96      0.96       114
weighted avg       0.96      0.96      0.96       114



In [38]:
rf_random.best_score_

0.951715572116131

## 3. Xgboost in pipeline

In [39]:
xgb_model = XGBClassifier(n_estimators=1000, learning_rate=0.05) # Typical values range from 100-1000

# Bundle preprocessing and modeling code in a pipeline
xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', xgb_model)
                             ])


# Fit model
X_test_eval = X_test.copy()
# Remove the model from pipeline
eval_set_pipe = Pipeline(steps = [('preprocessor', preprocessor)])
# fit transform X_valid.copy()
X_test_eval = eval_set_pipe.fit(X_train, y_train).transform (X_test_eval)


X_train_xg = X_train.copy()
X_test_xg = X_test.copy()

xgb_pipeline.fit(X_train_xg, y_train, model__early_stopping_rounds=5, model__eval_metric = "mae", 
                 model__eval_set=[(X_test_eval, y_test)],model__verbose=False)

# get predicted prices on validation data
xgb_y_predictions = xgb_pipeline.predict(X_test_xg)
#print(mean_squared_error(y_test, xgb_y_predictions))
print(classification_report(y_test,xgb_y_predictions))
print(confusion_matrix(y_test,xgb_y_predictions))

              precision    recall  f1-score   support

           B       0.96      0.99      0.97        67
           M       0.98      0.94      0.96        47

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

[[66  1]
 [ 3 44]]


## 3. Xgboost in pipeline using default parameters

In [40]:
xgb_pipeline_raw = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', xgb_model)
                             ])

X_train_xg = X_train.copy()
X_test_xg = X_test.copy()

xgb_pipeline_raw.fit(X_train_xg, y_train)

# get predicted prices on validation data
xgb_y_predictions = xgb_pipeline_raw.predict(X_test_xg)
# print(mean_squared_error(y_valid, xgb_val_predictions))
print(classification_report(y_test,xgb_y_predictions))
print(confusion_matrix(y_test,xgb_y_predictions))

              precision    recall  f1-score   support

           B       0.96      0.99      0.97        67
           M       0.98      0.94      0.96        47

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

[[66  1]
 [ 3 44]]


## 4. Xgboost with RandomizedSearchCV

In [41]:
one_to_left = st.beta(10, 1)  
from_zero_positive = st.expon(0, 50)

params = {
    "n_estimators": range(100,1000,200),
    "max_depth": st.randint(3, 40),
    "learning_rate": st.uniform(0.05, 0.4),
    "colsample_bytree": one_to_left,
    "subsample": one_to_left,
    "gamma": st.uniform(0, 10),
    'reg_alpha': from_zero_positive,
    "min_child_weight": from_zero_positive,
}

# xgbreg = XGBRegressor(nthreads=-1)  
# xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
#                               ('model', xgbreg)
#                              ])

# gs = RandomizedSearchCV(xgb_pipeline, params, n_jobs=1)
# gs.fit(X_train, y_train,early_stopping_rounds=5, eval_set=[(X_valid, y_valid)], verbose=False) # cyin: has error
# gs.best_model_

#  cyin: xgboost using RandomizedSearchCV needs to seperate with pipeline otherwise error: 
#     use pipeline only for preprocessing 

# Define model
xgbreg = XGBClassifier(n_estimators=500, learning_rate=0.05)

# Data preprocessing pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Transform the data
X_train_xg_r = X_train.copy()
X_test_xg_r = X_test.copy()

X_train_xg_r = my_pipeline.fit_transform(X_train_xg_r)
X_test_xg_r = my_pipeline.transform(X_test_xg_r)


# xgb_model = RandomizedSearchCV(xgbreg, params, scoring='neg_mean_squared_error', n_jobs=1)  
xgb_model = RandomizedSearchCV(xgbreg, params, n_jobs=1)  

xgb_model.fit(X_train_xg_r, y_train)  
# rs_model.best_estimator_


# get predicted prices on validation data
xgb_y_predictions = xgb_model.predict(X_test_xg_r)
# print(mean_squared_error(y_valid, xgb_val_predictions))
print(classification_report(y_test,xgb_y_predictions))
print(confusion_matrix(y_test,xgb_y_predictions))

              precision    recall  f1-score   support

           B       0.93      0.96      0.94        67
           M       0.93      0.89      0.91        47

    accuracy                           0.93       114
   macro avg       0.93      0.92      0.93       114
weighted avg       0.93      0.93      0.93       114

[[64  3]
 [ 5 42]]


 ## Summarry -  'Random Forest has best accuracy'