In [1]:
pip install scikit-learn pandas scipy


Note: you may need to restart the kernel to use updated packages.


In [None]:
// https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data //

In [2]:
// https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import classification_report, confusion_matrix

import warnings
warnings.filterwarnings("ignore")

data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [16]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=2, random_state=42, n_init=10, algorithm='lloyd')
kmeans_labels = kmeans.fit_predict(X_scaled)

print("K-Means Clustering Labels (first 10):", kmeans_labels[:10])


K-Means Clustering Labels (first 10): [1 1 1 1 1 1 1 1 1 1]


In [4]:
from sklearn.cluster import AgglomerativeClustering
agg = AgglomerativeClustering(n_clusters=2)
agg_labels = agg.fit_predict(X_scaled)
print("Hierarchical Clustering Labels (first 10):", agg_labels[:10])


Hierarchical Clustering Labels (first 10): [0 0 0 0 0 0 0 0 0 0]


In [5]:
from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps=2.0, min_samples=5)
dbscan_labels = dbscan.fit_predict(X_scaled)
print("DBSCAN Labels (first 10):", dbscan_labels[:10])


DBSCAN Labels (first 10): [-1 -1 -1 -1 -1 -1  0 -1 -1 -1]


In [6]:
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=2, random_state=42)
gmm_labels = gmm.fit_predict(X_scaled)
print("GMM Labels (first 10):", gmm_labels[:10])


GMM Labels (first 10): [0 0 0 0 0 0 0 0 0 0]


In [7]:
from sklearn.linear_model import LogisticRegression
l1_model = LogisticRegression(penalty='l1', solver='liblinear')
l1_model.fit(X_train, y_train)
l1_pred = l1_model.predict(X_test)
print("L1 Regularization (Logistic Regression)\n", classification_report(y_test, l1_pred))


L1 Regularization (Logistic Regression)
               precision    recall  f1-score   support

           0       0.95      0.98      0.97        43
           1       0.99      0.97      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



In [8]:
l2_model = LogisticRegression(penalty='l2', solver='liblinear')
l2_model.fit(X_train, y_train)
l2_pred = l2_model.predict(X_test)
print("L2 Regularization (Logistic Regression)\n", classification_report(y_test, l2_pred))


L2 Regularization (Logistic Regression)
               precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



In [9]:
from sklearn.linear_model import ElasticNetCV
elastic_model = ElasticNetCV(cv=5)
elastic_model.fit(X_train, y_train)
elastic_preds = np.round(elastic_model.predict(X_test))
print("ElasticNet Regression\n", classification_report(y_test, elastic_preds))


ElasticNet Regression
               precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00         0
         0.0       0.97      0.91      0.94        43
         1.0       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.64      0.63      0.64       114
weighted avg       0.96      0.96      0.96       114



In [10]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
print("Random Forest\n", classification_report(y_test, rf_pred))


Random Forest
               precision    recall  f1-score   support

           0       0.95      0.93      0.94        43
           1       0.96      0.97      0.97        71

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



In [11]:
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)
gb_pred = gb.predict(X_test)
print("Gradient Boosting\n", classification_report(y_test, gb_pred))


Gradient Boosting
               precision    recall  f1-score   support

           0       0.95      0.93      0.94        43
           1       0.96      0.97      0.97        71

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



In [15]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10)
bag.fit(X_train, y_train)
bag_pred = bag.predict(X_test)
print("Bagging\n", classification_report(y_test, bag_pred))



Bagging
               precision    recall  f1-score   support

           0       0.95      0.93      0.94        43
           1       0.96      0.97      0.97        71

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



In [13]:
from sklearn.ensemble import AdaBoostClassifier
boost = AdaBoostClassifier(n_estimators=50)
boost.fit(X_train, y_train)
boost_pred = boost.predict(X_test)
print("AdaBoost\n", classification_report(y_test, boost_pred))


AdaBoost
               precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114



In [14]:
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC

base_learners = [
    ('lr', LogisticRegression()),
    ('svc', SVC(probability=True)),
    ('tree', DecisionTreeClassifier())
]
stack = StackingClassifier(estimators=base_learners, final_estimator=LogisticRegression())
stack.fit(X_train, y_train)
stack_pred = stack.predict(X_test)
print("Stacking Classifier\n", classification_report(y_test, stack_pred))


Stacking Classifier
               precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



In [17]:
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression()
log_model.fit(X_train, y_train)
log_pred = log_model.predict(X_test)
print("Logistic Regression:\n", classification_report(y_test, log_pred))


Logistic Regression:
               precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



In [18]:
from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)
tree_pred = tree_model.predict(X_test)
print("Decision Tree:\n", classification_report(y_test, tree_pred))


Decision Tree:
               precision    recall  f1-score   support

           0       0.93      0.91      0.92        43
           1       0.94      0.96      0.95        71

    accuracy                           0.94       114
   macro avg       0.94      0.93      0.93       114
weighted avg       0.94      0.94      0.94       114



In [19]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
print("Random Forest:\n", classification_report(y_test, rf_pred))


Random Forest:
               precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114



In [20]:
from sklearn.svm import SVC

svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)
print("SVM:\n", classification_report(y_test, svm_pred))


SVM:
               precision    recall  f1-score   support

           0       0.98      0.95      0.96        43
           1       0.97      0.99      0.98        71

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114



In [21]:
from sklearn.model_selection import GridSearchCV

param_grid_lr = {'C': [0.1, 1, 10], 'solver': ['liblinear', 'lbfgs']}
grid_lr = GridSearchCV(LogisticRegression(), param_grid_lr, cv=5, scoring='f1')
grid_lr.fit(X_train, y_train)
grid_lr_pred = grid_lr.predict(X_test)
print("Tuned Logistic Regression:\n", classification_report(y_test, grid_lr_pred))
print("Best Params:", grid_lr.best_params_)


Tuned Logistic Regression:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99        43
           1       0.99      1.00      0.99        71

    accuracy                           0.99       114
   macro avg       0.99      0.99      0.99       114
weighted avg       0.99      0.99      0.99       114

Best Params: {'C': 0.1, 'solver': 'liblinear'}


In [22]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

param_dist_rf = {'n_estimators': randint(50, 200), 'max_depth': [None, 10, 20], 'min_samples_split': randint(2, 10)}
rand_rf = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_dist_rf, n_iter=10, cv=5, scoring='f1', random_state=42)
rand_rf.fit(X_train, y_train)
rand_rf_pred = rand_rf.predict(X_test)
print("Tuned Random Forest:\n", classification_report(y_test, rand_rf_pred))
print("Best Params:", rand_rf.best_params_)


Tuned Random Forest:
               precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

Best Params: {'max_depth': None, 'min_samples_split': 6, 'n_estimators': 152}


In [23]:
from sklearn.metrics import f1_score

tuned_models = {
    "Tuned Logistic Regression": grid_lr.best_estimator_,
    "Tuned Random Forest": rand_rf.best_estimator_,
}

best_model = None
best_f1 = 0

for name, model in tuned_models.items():
    pred = model.predict(X_test)
    score = f1_score(y_test, pred)
    print(f"{name} F1 Score: {score:.4f}")
    if score > best_f1:
        best_model = name
        best_f1 = score

print(f"\nBest Performing Model: {best_model} with F1 Score: {best_f1:.4f}")


Tuned Logistic Regression F1 Score: 0.9930
Tuned Random Forest F1 Score: 0.9722

Best Performing Model: Tuned Logistic Regression with F1 Score: 0.9930
