# Breast Cancer Classification - Data preprocessing

In [3]:
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import sys
sys.path.append('../functions')
from functions import ModelResults
import pickle

In [4]:
# Load scaled train/test data
with open("../data/02_train-test-data-scaled.pkl", "rb") as f:
    X_train_scaled, X_test_scaled, y_train, y_test = pickle.load(f)

# Load raw (unscaled) train/test data
with open("../data/02_train-test-data-raw.pkl", "rb") as f:
    X_train, X_test, y_train, y_test = pickle.load(f)

In [5]:
X_train_scaled

array([[-1.44075296, -0.43531947, -1.36208497, ...,  0.9320124 ,
         2.09724217,  1.88645014],
       [ 1.97409619,  1.73302577,  2.09167167, ...,  2.6989469 ,
         1.89116053,  2.49783848],
       [-1.39998202, -1.24962228, -1.34520926, ..., -0.97023893,
         0.59760192,  0.0578942 ],
       ...,
       [ 0.04880192, -0.55500086, -0.06512547, ..., -1.23903365,
        -0.70863864, -1.27145475],
       [-0.03896885,  0.10207345, -0.03137406, ...,  1.05001236,
         0.43432185,  1.21336207],
       [-0.54860557,  0.31327591, -0.60350155, ..., -0.61102866,
        -0.3345212 , -0.84628745]])

## Initial models

In [7]:
results = ModelResults()

### Logistic regression

In [9]:
lg = LogisticRegression(max_iter=5000)
lg.fit(X_train_scaled, y_train)

In [10]:
results.add(lg, "Logistic Regression", X_train_scaled, y_train, X_test_scaled, y_test)
results.get_results()


= CLASSIFICATION METRICS COMPARISON: Logistic Regression =
Metric          |    Train |     Test
----------------------------------------
Accuracy        | 0.9868 | 0.9737
Precision       | 0.9880 | 0.9762
Recall          | 0.9763 | 0.9535
F1              | 0.9821 | 0.9647


Unnamed: 0,Model,Train Accuracy,Test Accuracy,Train Precision,Test Precision,Train Recall,Test Recall,Train F1,Test F1
0,Logistic Regression,0.986813,0.973684,0.988024,0.97619,0.976331,0.953488,0.982143,0.964706


### K-nearest neighbors

In [12]:
knn = KNeighborsClassifier(n_neighbors=21)
knn.fit(X_train_scaled, y_train)

In [13]:
results.add(knn, "KNN (k=21)", X_train_scaled, y_train, X_test_scaled, y_test, average='macro')
results.get_results()


= CLASSIFICATION METRICS COMPARISON: KNN (k=21) =
Metric          |    Train |     Test
----------------------------------------
Accuracy        | 0.9582 | 0.9474
Precision       | 0.9689 | 0.9482
Recall          | 0.9438 | 0.9394
F1              | 0.9541 | 0.9435


Unnamed: 0,Model,Train Accuracy,Test Accuracy,Train Precision,Test Precision,Train Recall,Test Recall,Train F1,Test F1
0,Logistic Regression,0.986813,0.973684,0.988024,0.97619,0.976331,0.953488,0.982143,0.964706
1,KNN (k=21),0.958242,0.947368,0.968852,0.948212,0.943787,0.939404,0.954145,0.943452


### Decision tree

In [15]:
dt = DecisionTreeClassifier(
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=1,
    random_state=4)

dt.fit(X_train_scaled, y_train)

In [16]:
results.add(dt, "Decision Tree", X_train_scaled, y_train, X_test_scaled, y_test, average='macro')
results.get_results()


= CLASSIFICATION METRICS COMPARISON: Decision Tree =
Metric          |    Train |     Test
----------------------------------------
Accuracy        | 0.9934 | 0.9474
Precision       | 0.9935 | 0.9440
Recall          | 0.9923 | 0.9440
F1              | 0.9929 | 0.9440


Unnamed: 0,Model,Train Accuracy,Test Accuracy,Train Precision,Test Precision,Train Recall,Test Recall,Train F1,Test F1
0,Logistic Regression,0.986813,0.973684,0.988024,0.97619,0.976331,0.953488,0.982143,0.964706
1,KNN (k=21),0.958242,0.947368,0.968852,0.948212,0.943787,0.939404,0.954145,0.943452
2,Decision Tree,0.993407,0.947368,0.993539,0.94399,0.992335,0.94399,0.992931,0.94399


### Random forest

In [18]:
rf = RandomForestClassifier(max_depth=5, random_state=4)
rf.fit(X_train_scaled, y_train)

In [19]:
results.add(rf, "Random Forest", X_train_scaled, y_train, X_test_scaled, y_test, average='macro')
results.get_results()


= CLASSIFICATION METRICS COMPARISON: Random Forest =
Metric          |    Train |     Test
----------------------------------------
Accuracy        | 0.9934 | 0.9561
Precision       | 0.9948 | 0.9554
Recall          | 0.9911 | 0.9510
F1              | 0.9929 | 0.9531


Unnamed: 0,Model,Train Accuracy,Test Accuracy,Train Precision,Test Precision,Train Recall,Test Recall,Train F1,Test F1
0,Logistic Regression,0.986813,0.973684,0.988024,0.97619,0.976331,0.953488,0.982143,0.964706
1,KNN (k=21),0.958242,0.947368,0.968852,0.948212,0.943787,0.939404,0.954145,0.943452
2,Decision Tree,0.993407,0.947368,0.993539,0.94399,0.992335,0.94399,0.992931,0.94399
3,Random Forest,0.993407,0.95614,0.99481,0.955357,0.991124,0.951032,0.992914,0.953106


### XGBoost

In [21]:
xgb = XGBClassifier(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    eval_metric='logloss',
    random_state=42
)

In [22]:
xgb.fit(X_train_scaled, y_train)

In [23]:
results.add(xgb, "XGBoost", X_train_scaled, y_train, X_test_scaled, y_test, average='macro')
results.get_results()


= CLASSIFICATION METRICS COMPARISON: XGBoost =
Metric          |    Train |     Test
----------------------------------------
Accuracy        | 1.0000 | 0.9561
Precision       | 1.0000 | 0.9554
Recall          | 1.0000 | 0.9510
F1              | 1.0000 | 0.9531


Unnamed: 0,Model,Train Accuracy,Test Accuracy,Train Precision,Test Precision,Train Recall,Test Recall,Train F1,Test F1
0,Logistic Regression,0.986813,0.973684,0.988024,0.97619,0.976331,0.953488,0.982143,0.964706
1,KNN (k=21),0.958242,0.947368,0.968852,0.948212,0.943787,0.939404,0.954145,0.943452
2,Decision Tree,0.993407,0.947368,0.993539,0.94399,0.992335,0.94399,0.992931,0.94399
3,Random Forest,0.993407,0.95614,0.99481,0.955357,0.991124,0.951032,0.992914,0.953106
4,XGBoost,1.0,0.95614,1.0,0.955357,1.0,0.951032,1.0,0.953106


<b><i>
Given the medical context of the problem — predicting whether a breast tumor is malignant — **recall** is used as the primary evaluation metric. This is because the cost of a false negative (failing to identify a malignant tumor) is significantly higher than that of a false positive (incorrectly flagging a benign case as malignant). In other words, it is better to unnecessarily alarm some patients than to overlook cancer in others.

Initial results suggest that **Logistic Regression** may be the best-performing model in terms of recall. However, both the training and test sets are relatively small, and results based on a single data split may not be fully reliable. Therefore, it is more appropriate to rely on **cross-validation-based performance** to draw conclusions about model effectiveness
</b></i>