# Load the data

In [1]:
# Load the libraries
import pandas as pd

In [2]:
# Load the dataset
df = pd.read_csv("data-breast-cancer.csv")

In [3]:
# Show some data samples
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883


# Analyze the data

In [4]:
# Show data information
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   diagnosis               569 non-null    object 
 1   radius_mean             569 non-null    float64
 2   texture_mean            569 non-null    float64
 3   perimeter_mean          569 non-null    float64
 4   area_mean               569 non-null    float64
 5   smoothness_mean         569 non-null    float64
 6   compactness_mean        569 non-null    float64
 7   concavity_mean          569 non-null    float64
 8   concave points_mean     569 non-null    float64
 9   symmetry_mean           569 non-null    float64
 10  fractal_dimension_mean  569 non-null    float64
dtypes: float64(10), object(1)
memory usage: 49.0+ KB


(569, 11)

In [5]:
# Summary statistics for numerical features
df.describe()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744


In [6]:
# Check for missing values
df.isnull().sum()

diagnosis                 0
radius_mean               0
texture_mean              0
perimeter_mean            0
area_mean                 0
smoothness_mean           0
compactness_mean          0
concavity_mean            0
concave points_mean       0
symmetry_mean             0
fractal_dimension_mean    0
dtype: int64

# Remove outliers and clean the data

### Remove outliers

In [7]:
# Take the whisker as the 2nd percentile and the 98th percentile of the data to remove outliers

# Remove outliers of 'radius_mean' feature
q = df['radius_mean'].quantile(0.98)
df_clean = df[df['radius_mean'] < q]

# Remove outliers of 'texture_mean' feature
q = df['texture_mean'].quantile(0.98)
df_clean = df[df['texture_mean'] < q]

# Remove outliers of 'perimeter_mean' feature
q = df['perimeter_mean'].quantile(0.98)
df_clean = df[df['perimeter_mean'] < q]

# Remove outliers of 'area_mean' feature
q = df['area_mean'].quantile(0.98)
df_clean = df[df['area_mean'] < q]

# Remove outliers of 'smoothness_mean' feature
q = df['smoothness_mean'].quantile(0.98)
df_clean = df[df['smoothness_mean'] < q]

# Remove outliers of 'compactness_mean' feature
q = df['compactness_mean'].quantile(0.98)
df_clean = df[df['compactness_mean'] < q]

# Remove outliers of 'concavity_mean' feature
q = df['concavity_mean'].quantile(0.98)
df_clean = df[df['concavity_mean'] < q]

# Remove outliers of 'concave points_mean' feature
q = df['concave points_mean'].quantile(0.98)
df_clean = df[df['concave points_mean'] < q]

# Remove outliers of 'symmetry_mean' feature
q = df['symmetry_mean'].quantile(0.98)
df_clean = df[df['symmetry_mean'] < q]

# Remove outliers of 'fractal_dimension_mean' feature
q = df['fractal_dimension_mean'].quantile(0.98)
df_clean = df[df['fractal_dimension_mean'] < q]

df_clean.shape

(557, 11)

In [8]:
# Drop duplicate samples
df_clean.drop_duplicates(ignore_index=True)

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean
0,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871
1,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667
2,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999
3,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883
4,M,12.45,15.70,82.57,477.1,0.12780,0.17000,0.15780,0.08089,0.2087,0.07613
...,...,...,...,...,...,...,...,...,...,...,...
552,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623
553,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533
554,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648
555,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016


In [9]:
# Summary statistics
df_clean.describe()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean
count,557.0,557.0,557.0,557.0,557.0,557.0,557.0,557.0,557.0,557.0
mean,14.189248,19.300449,92.313573,659.958528,0.095966,0.101992,0.086111,0.048382,0.18016,0.062263
std,3.502633,4.309433,24.165358,351.410589,0.013603,0.049869,0.076916,0.038578,0.026407,0.006045
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996
25%,11.75,16.18,75.49,426.0,0.08597,0.06374,0.02891,0.02017,0.1618,0.05766
50%,13.43,18.83,86.87,555.1,0.09566,0.09097,0.05929,0.03323,0.1784,0.0614
75%,16.02,21.8,104.3,793.2,0.1049,0.1289,0.1226,0.0734,0.195,0.06581
max,28.11,39.28,188.5,2501.0,0.1447,0.2867,0.4268,0.2012,0.304,0.08046


### Prepare data

In [10]:
# Determine feature x and label y
feature_names = df_clean.columns.tolist()     
feature_names.remove("diagnosis")       
feature_data = df_clean[feature_names]         
target_data = df_clean.diagnosis

In [11]:
# Label encode columns
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
target_data = label_encoder.fit_transform(target_data)

In [12]:
feature_data

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean
0,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,0.2419,0.07871
1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,0.1812,0.05667
2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,0.2069,0.05999
4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,0.1809,0.05883
5,12.45,15.70,82.57,477.1,0.12780,0.17000,0.15780,0.08089,0.2087,0.07613
...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,0.1726,0.05623
565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,0.1752,0.05533
566,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,0.1590,0.05648
567,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,0.2397,0.07016


In [13]:
target_data

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1,
       0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0,

In [14]:
# Train/test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(feature_data.values, target_data, random_state=1, test_size=0.3)

In [15]:
# Initialize and use StandardScaler to normalize the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_normalized_train = scaler.fit_transform(X_train)
X_normalized_test = scaler.transform(X_test)

### Build the SVM model

In [16]:
# Fit the model
from sklearn.svm import SVC
model = SVC()
model.fit(X_normalized_train, y_train)

In [17]:
# Show evaluation metrics on the test set
from sklearn.metrics import classification_report
print(classification_report(y_test, model.predict(X_normalized_test)))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96       114
           1       0.92      0.91      0.92        54

    accuracy                           0.95       168
   macro avg       0.94      0.94      0.94       168
weighted avg       0.95      0.95      0.95       168



# Use GridSearchCV to find the best set of SVM hyperparameters

In [18]:
# Import GridSearchCV for finding the best SVM hyper-parameter set
from sklearn.model_selection import GridSearchCV
param_grid = {"C": [0.01, 0.1, 1, 10, 100, 1000],
              "gamma": ["scale", 0.001, 0.005, 0.1]}

gridsearch = GridSearchCV(SVC(), param_grid, cv=10, scoring="f1", verbose=1)

In [19]:
# Run grid search to find the best set of hyper-parameters
gridsearch.fit(X_normalized_train, y_train)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


In [20]:
# Best set of hyper-parameters found after searching
gridsearch.best_params_

{'C': 1000, 'gamma': 0.005}

# Build, train and evaluate the SVM model

In [21]:
# Re-run SVM with the best set of hyper-parameters.
model = SVC(C=gridsearch.best_params_['C'], gamma=gridsearch.best_params_['gamma'])
model.fit(X_normalized_train, y_train)

In [22]:
# Show evaluation metrics on the test set
print(classification_report(y_test, model.predict(X_normalized_test)))

              precision    recall  f1-score   support

           0       0.96      0.93      0.95       114
           1       0.86      0.93      0.89        54

    accuracy                           0.93       168
   macro avg       0.91      0.93      0.92       168
weighted avg       0.93      0.93      0.93       168



- Accuracy: The overall accuracy of the model is 0.93, meaning it correctly predicts the class for 93% of all cases in the dataset.
- The SVM model exhibits strong performance metrics, with particularly high accuracy and balanced F1-scores.
- However, slight room for improvement remains in the precision for class 1 (label M), where targeted model tuning might enhance performance further.

# Separately build, train and evaluate the other four classifiers (Logistic regression, Naive Bayes, Decision Tree, Random Forest) on the same dataset, then compare their performance with the SVM model's

### Logistic Regression

In [23]:
# Train a Logistic Regression model
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(solver='lbfgs', max_iter=1000)
log_reg.fit(X_normalized_train, y_train)

### Naive Bayes

In [24]:
# Train a Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB
naive_model = GaussianNB()
naive_model.fit(X_normalized_train, y_train)

### Decision Tree

In [25]:
# Train a Decision Tress classifier
from sklearn.tree import DecisionTreeClassifier
model_dt = DecisionTreeClassifier(random_state=1)
params = {"criterion": ["gini", "entropy"],             # Criterion to evaluate the purity.
         "max_depth": [3, 5],                           # Maximum depth of the tree
         "min_samples_split": [4, 8]}  
gridsearch_dt = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=params, cv=5)
model_dt.fit(X_normalized_train, y_train)

### Random Forest

In [26]:
# Train a Random Forest classifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()                        # Initialize a Random Forest Classifier.
params_rf = {'n_estimators': [50, 100, 200]}         # n_estimator in RandomForestClassifier(...) indicates the number of Trees in the Forest.
rf_gs = GridSearchCV(rf, params_rf, cv=5)            # Initialize GridSearchCV to find an optimal number of Trees.
rf_gs.fit(X_normalized_train, y_train)

In [27]:
# Best number of Trees
rf_best = rf_gs.best_estimator_
print(rf_gs.best_params_)

{'n_estimators': 200}


### Compare

In [28]:
# Print accuracy of single models on the test set
print('Logistic Regression: {}'.format(log_reg.score(X_normalized_test, y_test)))      # Logistic Regression accuracy
print('Naive Bayes: {}'.format(naive_model.score(X_normalized_test, y_test)))          # Naive Bayes accuracy
print('Decision Tree: {}'.format(model_dt.score(X_normalized_test, y_test)))           # Decision Tree accuracy
print('Random Forest: {}'.format(rf_best.score(X_normalized_test, y_test)))            # Random Forest accuracy
print('SVM: {}'.format(model.score(X_normalized_test, y_test)))

Logistic Regression: 0.9226190476190477
Naive Bayes: 0.9464285714285714
Decision Tree: 0.9226190476190477
Random Forest: 0.9464285714285714
SVM: 0.9285714285714286


- Naive Bayes stands out with the highest accuracy, indicating its effectiveness given the dataset's features and their distribution.
- Random Forest also shows strong performance, slightly better than SVM, highlighting the advantages of ensemble learning in handling overfitting and improving generalization.
- SVM’s performance is very commendable, placed in the middle among these models, and is particularly versatile for complex datasets with its ability to model both linear and non-linear boundaries.
- Logistic Regression and Decision Tree both have the lowest accuracy but offer benefits in terms of simplicity and interpretability, respectively.

In [29]:
# Classification reports for all models
print('Logistic Regression: ' + classification_report(y_test, log_reg.predict(X_normalized_test)))
print('Naive Bayes: ' + classification_report(y_test, naive_model.predict(X_normalized_test)))
print('Decision Tree: ' + classification_report(y_test, model_dt.predict(X_normalized_test)))
print('Random Forest: ' + classification_report(y_test, rf_best.predict(X_normalized_test)))
print('SVM: ' + classification_report(y_test, model.predict(X_normalized_test)))

Logistic Regression:               precision    recall  f1-score   support

           0       0.95      0.94      0.94       114
           1       0.87      0.89      0.88        54

    accuracy                           0.92       168
   macro avg       0.91      0.91      0.91       168
weighted avg       0.92      0.92      0.92       168

Naive Bayes:               precision    recall  f1-score   support

           0       0.96      0.96      0.96       114
           1       0.91      0.93      0.92        54

    accuracy                           0.95       168
   macro avg       0.94      0.94      0.94       168
weighted avg       0.95      0.95      0.95       168

Decision Tree:               precision    recall  f1-score   support

           0       0.96      0.92      0.94       114
           1       0.85      0.93      0.88        54

    accuracy                           0.92       168
   macro avg       0.91      0.92      0.91       168
weighted avg       0.93  

# Apply three model ensemble technics, i.e., Bagging, Boosting and Stacking, to solve the problem, then compare their performance with each other and with the use of individual models. Draw conclusion from what has been observed

### Bagging

In [30]:
# Create a Support Vector Machine Classifier as the base estimator
base_svm = SVC(kernel='linear', C=1.0)

In [31]:
# Create a Bagging Classifier with SVM as the base model
from sklearn.ensemble import BaggingClassifier
bagging_clf = BaggingClassifier(estimator=base_svm, n_estimators=10, max_samples=0.5)

In [32]:
# Train the Bagging Classifier
bagging_clf.fit(X_normalized_train, y_train)

In [33]:
# Making predictions on the test set
y_pred = bagging_clf.predict(X_normalized_test)

# Evaluating the accuracy of the model
from sklearn.metrics import accuracy_score
accuracy_bg = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy_bg)

Accuracy: 0.9345238095238095


### Boosting

##### AdaBoost

In [34]:
# Create an AdaBoost Classifier with Decision Tree as the base model
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
ada_clf = AdaBoostClassifier(estimator = DecisionTreeClassifier(), n_estimators=10)

# Train the AdaBoost Classifier
ada_clf.fit(X_normalized_train, y_train)

# Making predictions on the test set
y_pred_ada = ada_clf.predict(X_normalized_test)

# Evaluating the accuracy of the model
accuracy_ada = accuracy_score(y_test, y_pred_ada)
print("AdaBoost Classifier Accuracy:", accuracy_ada)

AdaBoost Classifier Accuracy: 0.8869047619047619


##### Gradient Boosting

In [35]:
# Create a Gradient Boosting Classifier which uses Decision Tree as boosting model by default
gb_clf = GradientBoostingClassifier(n_estimators=10, learning_rate=0.1)

# Train the Gradient Boosting Classifier
gb_clf.fit(X_normalized_train, y_train)

# Making predictions on the test set
y_pred_gb = gb_clf.predict(X_normalized_test)

# Evaluating the accuracy of the model
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print("Gradient Boosting Classifier Accuracy:", accuracy_gb)

Gradient Boosting Classifier Accuracy: 0.9404761904761905


##### XGBoost

In [36]:
# Create an XGBoost Classifier
from xgboost import XGBClassifier
xgb_clf = XGBClassifier(n_estimators=100, learning_rate=0.1)

# Train the XGBoost Classifier
xgb_clf.fit(X_normalized_train, y_train)

# Making predictions on the test set
y_pred_xgb = xgb_clf.predict(X_normalized_test)

# Evaluating the accuracy of the model
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print("XGBoost Classifier Accuracy:", accuracy_xgb)

XGBoost Classifier Accuracy: 0.9285714285714286


### Stacking

In [37]:
# Ensemble the four models using hard (majority) voting
from sklearn.ensemble import VotingClassifier
estimators=[('log_reg', log_reg), ('naive_model', naive_model), ('dt', model_dt), ('rf', rf_best)]    # Initialize base models in the ensemble
ensemble = VotingClassifier(estimators, voting='hard')                                     # Define how to ensemble them, i.e., hard voting

In [38]:
# Train the model ensemble on the training set
ensemble.fit(X_normalized_train, y_train)          # Train the ensemble on the training set
ensemble.score(X_normalized_test, y_test)          # Test the ensemble on the test set

0.9523809523809523

### Compare

In [39]:
print("Bagging Classifier Accuracy:", accuracy_bg)
print("AdaBoost Classifier Accuracy:", accuracy_ada)
print("Gradient Boosting Classifier Accuracy:", accuracy_gb)
print("XGBoost Classifier Accuracy:", accuracy_xgb)
print("XGBoost Classifier Accuracy:", accuracy_xgb)
print("Stacking Classifier Accuracy:", ensemble.score(X_normalized_test, y_test))

Bagging Classifier Accuracy: 0.9345238095238095
AdaBoost Classifier Accuracy: 0.8869047619047619
Gradient Boosting Classifier Accuracy: 0.9404761904761905
XGBoost Classifier Accuracy: 0.9285714285714286
XGBoost Classifier Accuracy: 0.9285714285714286
Stacking Classifier Accuracy: 0.9523809523809523


### Conclusion

- Ensemble methods overall tend to show strong performance, particularly Gradient Boosting and Stacking, highlighting their ability to reduce both bias and variance.
- The relatively lower performance of AdaBoost may indicate a mismatch between the model complexity and the data characteristics, or possibly suboptimal parameter settings.
- Stacking confirms its theoretical advantage in achieving high performance by effectively combining different learning algorithms.

=> While ensemble methods generally provide robust results and can outperform individual models, the choice between them should consider both the nature of the data and the specific strengths of each classifier. For critical applications, a combination of different types of models might be prudent to achieve both high accuracy and model robustness.