In [2]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer

# Load dataset
cancer = load_breast_cancer()

# Convert to DataFrame
data = pd.DataFrame(cancer.data, columns=cancer.feature_names)

# Add target column
data['target'] = cancer.target

print("Dataset Shape:", data.shape)
data.head()

Dataset Shape: (569, 31)


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [3]:
print(data['target'].value_counts())

target
1    357
0    212
Name: count, dtype: int64


In [4]:
from sklearn.model_selection import train_test_split

X = data.drop('target', axis=1)
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

Training set shape: (455, 30)
Test set shape: (114, 30)


In [5]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef,
    confusion_matrix
)

In [7]:
# Initialize model
log_model = LogisticRegression(max_iter=1000, random_state=42)

# Train
log_model.fit(X_train_scaled, y_train)

# Predictions
y_pred_log = log_model.predict(X_test_scaled)
y_prob_log = log_model.predict_proba(X_test_scaled)[:, 1]

In [8]:
log_accuracy = accuracy_score(y_test, y_pred_log)
log_auc = roc_auc_score(y_test, y_prob_log)
log_precision = precision_score(y_test, y_pred_log)
log_recall = recall_score(y_test, y_pred_log)
log_f1 = f1_score(y_test, y_pred_log)
log_mcc = matthews_corrcoef(y_test, y_pred_log)

print("Logistic Regression Results:")
print("Accuracy:", log_accuracy)
print("AUC:", log_auc)
print("Precision:", log_precision)
print("Recall:", log_recall)
print("F1 Score:", log_f1)
print("MCC:", log_mcc)

Logistic Regression Results:
Accuracy: 0.9824561403508771
AUC: 0.9953703703703703
Precision: 0.9861111111111112
Recall: 0.9861111111111112
F1 Score: 0.9861111111111112
MCC: 0.9623015873015873


In [9]:
results = []

results.append([
    "Logistic Regression",
    log_accuracy,
    log_auc,
    log_precision,
    log_recall,
    log_f1,
    log_mcc
])

In [10]:
from sklearn.tree import DecisionTreeClassifier

# Initialize model
dt_model = DecisionTreeClassifier(random_state=42)

# Train
dt_model.fit(X_train, y_train)

# Predict
y_pred_dt = dt_model.predict(X_test)
y_prob_dt = dt_model.predict_proba(X_test)[:, 1]

# Metrics
dt_accuracy = accuracy_score(y_test, y_pred_dt)
dt_auc = roc_auc_score(y_test, y_prob_dt)
dt_precision = precision_score(y_test, y_pred_dt)
dt_recall = recall_score(y_test, y_pred_dt)
dt_f1 = f1_score(y_test, y_pred_dt)
dt_mcc = matthews_corrcoef(y_test, y_pred_dt)

print("Decision Tree Results:")
print("Accuracy:", dt_accuracy)
print("AUC:", dt_auc)
print("Precision:", dt_precision)
print("Recall:", dt_recall)
print("F1 Score:", dt_f1)
print("MCC:", dt_mcc)

# Store results
results.append([
    "Decision Tree",
    dt_accuracy,
    dt_auc,
    dt_precision,
    dt_recall,
    dt_f1,
    dt_mcc
])

Decision Tree Results:
Accuracy: 0.9122807017543859
AUC: 0.9156746031746031
Precision: 0.9558823529411765
Recall: 0.9027777777777778
F1 Score: 0.9285714285714286
MCC: 0.8174119974927639


In [11]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize model
knn_model = KNeighborsClassifier(n_neighbors=5)

# Train
knn_model.fit(X_train_scaled, y_train)

# Predict
y_pred_knn = knn_model.predict(X_test_scaled)
y_prob_knn = knn_model.predict_proba(X_test_scaled)[:, 1]

# Metrics
knn_accuracy = accuracy_score(y_test, y_pred_knn)
knn_auc = roc_auc_score(y_test, y_prob_knn)
knn_precision = precision_score(y_test, y_pred_knn)
knn_recall = recall_score(y_test, y_pred_knn)
knn_f1 = f1_score(y_test, y_pred_knn)
knn_mcc = matthews_corrcoef(y_test, y_pred_knn)

print("KNN Results:")
print("Accuracy:", knn_accuracy)
print("AUC:", knn_auc)
print("Precision:", knn_precision)
print("Recall:", knn_recall)
print("F1 Score:", knn_f1)
print("MCC:", knn_mcc)

# Store results
results.append([
    "KNN",
    knn_accuracy,
    knn_auc,
    knn_precision,
    knn_recall,
    knn_f1,
    knn_mcc
])

KNN Results:
Accuracy: 0.956140350877193
AUC: 0.9788359788359788
Precision: 0.958904109589041
Recall: 0.9722222222222222
F1 Score: 0.9655172413793104
MCC: 0.9054466190452621


In [12]:
from sklearn.naive_bayes import GaussianNB

# Initialize model
nb_model = GaussianNB()

# Train
nb_model.fit(X_train_scaled, y_train)

# Predict
y_pred_nb = nb_model.predict(X_test_scaled)
y_prob_nb = nb_model.predict_proba(X_test_scaled)[:, 1]

# Metrics
nb_accuracy = accuracy_score(y_test, y_pred_nb)
nb_auc = roc_auc_score(y_test, y_prob_nb)
nb_precision = precision_score(y_test, y_pred_nb)
nb_recall = recall_score(y_test, y_pred_nb)
nb_f1 = f1_score(y_test, y_pred_nb)
nb_mcc = matthews_corrcoef(y_test, y_pred_nb)

print("Naive Bayes Results:")
print("Accuracy:", nb_accuracy)
print("AUC:", nb_auc)
print("Precision:", nb_precision)
print("Recall:", nb_recall)
print("F1 Score:", nb_f1)
print("MCC:", nb_mcc)

# Store results
results.append([
    "Naive Bayes",
    nb_accuracy,
    nb_auc,
    nb_precision,
    nb_recall,
    nb_f1,
    nb_mcc
])

Naive Bayes Results:
Accuracy: 0.9298245614035088
AUC: 0.9867724867724867
Precision: 0.9444444444444444
Recall: 0.9444444444444444
F1 Score: 0.9444444444444444
MCC: 0.8492063492063492


In [13]:
from sklearn.ensemble import RandomForestClassifier

# Initialize model
rf_model = RandomForestClassifier(
    n_estimators=200,
    random_state=42
)

# Train
rf_model.fit(X_train, y_train)

# Predict
y_pred_rf = rf_model.predict(X_test)
y_prob_rf = rf_model.predict_proba(X_test)[:, 1]

# Metrics
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_auc = roc_auc_score(y_test, y_prob_rf)
rf_precision = precision_score(y_test, y_pred_rf)
rf_recall = recall_score(y_test, y_pred_rf)
rf_f1 = f1_score(y_test, y_pred_rf)
rf_mcc = matthews_corrcoef(y_test, y_pred_rf)

print("Random Forest Results:")
print("Accuracy:", rf_accuracy)
print("AUC:", rf_auc)
print("Precision:", rf_precision)
print("Recall:", rf_recall)
print("F1 Score:", rf_f1)
print("MCC:", rf_mcc)

# Store results
results.append([
    "Random Forest",
    rf_accuracy,
    rf_auc,
    rf_precision,
    rf_recall,
    rf_f1,
    rf_mcc
])

Random Forest Results:
Accuracy: 0.956140350877193
AUC: 0.9930555555555556
Precision: 0.958904109589041
Recall: 0.9722222222222222
F1 Score: 0.9655172413793104
MCC: 0.9054466190452621


In [14]:
from xgboost import XGBClassifier

# Initialize model
xgb_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

# Train
xgb_model.fit(X_train, y_train)

# Predict
y_pred_xgb = xgb_model.predict(X_test)
y_prob_xgb = xgb_model.predict_proba(X_test)[:, 1]

# Metrics
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
xgb_auc = roc_auc_score(y_test, y_prob_xgb)
xgb_precision = precision_score(y_test, y_pred_xgb)
xgb_recall = recall_score(y_test, y_pred_xgb)
xgb_f1 = f1_score(y_test, y_pred_xgb)
xgb_mcc = matthews_corrcoef(y_test, y_pred_xgb)

print("XGBoost Results:")
print("Accuracy:", xgb_accuracy)
print("AUC:", xgb_auc)
print("Precision:", xgb_precision)
print("Recall:", xgb_recall)
print("F1 Score:", xgb_f1)
print("MCC:", xgb_mcc)

# Store results
results.append([
    "XGBoost",
    xgb_accuracy,
    xgb_auc,
    xgb_precision,
    xgb_recall,
    xgb_f1,
    xgb_mcc
])

XGBoost Results:
Accuracy: 0.956140350877193
AUC: 0.9900793650793651
Precision: 0.9466666666666667
Recall: 0.9861111111111112
F1 Score: 0.9659863945578231
MCC: 0.9058238738943076


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [15]:
import pandas as pd

comparison_df = pd.DataFrame(results, columns=[
    "Model",
    "Accuracy",
    "AUC",
    "Precision",
    "Recall",
    "F1 Score",
    "MCC"
])

comparison_df

Unnamed: 0,Model,Accuracy,AUC,Precision,Recall,F1 Score,MCC
0,Logistic Regression,0.982456,0.99537,0.986111,0.986111,0.986111,0.962302
1,Decision Tree,0.912281,0.915675,0.955882,0.902778,0.928571,0.817412
2,KNN,0.95614,0.978836,0.958904,0.972222,0.965517,0.905447
3,Naive Bayes,0.929825,0.986772,0.944444,0.944444,0.944444,0.849206
4,Random Forest,0.95614,0.993056,0.958904,0.972222,0.965517,0.905447
5,XGBoost,0.95614,0.990079,0.946667,0.986111,0.965986,0.905824


In [16]:
import joblib
import os

# Create model folder if not exists
os.makedirs("model", exist_ok=True)

# Save models
joblib.dump(log_model, "model/logistic_regression.pkl")
joblib.dump(dt_model, "model/decision_tree.pkl")
joblib.dump(knn_model, "model/knn.pkl")
joblib.dump(nb_model, "model/naive_bayes.pkl")
joblib.dump(rf_model, "model/random_forest.pkl")
joblib.dump(xgb_model, "model/xgboost.pkl")

# Save scaler
joblib.dump(scaler, "model/scaler.pkl")

print("All models and scaler saved successfully!")

All models and scaler saved successfully!


In [17]:
import os
os.listdir("model")

['decision_tree.pkl',
 'knn.pkl',
 'logistic_regression.pkl',
 'naive_bayes.pkl',
 'random_forest.pkl',
 'scaler.pkl',
 'xgboost.pkl']