# Breast Cancer Classification - Data preprocessing

In [2]:
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import sys
sys.path.append('../functions')
from functions import evaluate_classifier
import pickle

In [3]:
# Load scaled train/test data
with open("../data/02_train-test-data-scaled.pkl", "rb") as f:
    X_train_scaled, X_test_scaled, y_train, y_test = pickle.load(f)

# Load raw (unscaled) train/test data
with open("../data/02_train-test-data-raw.pkl", "rb") as f:
    X_train, X_test, y_train, y_test = pickle.load(f)

In [4]:
X_train_scaled

array([[-0.87737439, -0.99632936, -0.84836506, ..., -0.11026997,
         0.63086499,  0.39596053],
       [ 0.23276443, -0.39373657,  0.19955174, ...,  0.79850777,
         0.81341746, -0.68247993],
       [ 0.20707998, -0.53919   ,  0.11856132, ..., -0.53687358,
        -0.62276969, -0.8610112 ],
       ...,
       [-0.03264151, -0.83471443, -0.10044337, ..., -0.49926369,
        -1.25120254, -0.92628669],
       [-0.99152748, -0.95246245, -1.01365162, ..., -1.76049983,
        -0.3239006 , -1.23090567],
       [ 2.8868238 ,  0.21347378,  3.06520748, ...,  1.69039945,
         0.50323981, -0.20156131]])

## Initial models

In [6]:
model_results = pd.DataFrame({
    "Model": pd.Series(dtype='str'),
    "Train Accuracy": pd.Series(dtype='float'),
    "Test Accuracy": pd.Series(dtype='float'),
    "Train Precision": pd.Series(dtype='float'),
    "Test Precision": pd.Series(dtype='float'),
    "Train Recall": pd.Series(dtype='float'),
    "Test Recall": pd.Series(dtype='float'),
    "Train F1": pd.Series(dtype='float'),
    "Test F1": pd.Series(dtype='float'),
})

### Logistic regression

In [8]:
lg = LogisticRegression(max_iter=5000)
lg.fit(X_train_scaled, y_train)

In [9]:
evaluate_classifier(lg, "Logistic Regression", X_train_scaled, y_train, X_test_scaled, y_test)


= CLASSIFICATION METRICS COMPARISON: Logistic Regression =
Metric          |    Train |     Test
----------------------------------------
Accuracy        | 0.9890 | 0.9649
Precision       | 0.9943 | 0.8947
Recall          | 0.9775 | 1.0000
F1              | 0.9858 | 0.9444


NameError: name 'model_results' is not defined

### K-nearest neighbors

In [None]:
knn = KNeighborsClassifier(n_neighbors=21)
knn.fit(X_train_scaled, y_train)

In [None]:
evaluate_classifier(knn, "KNN (k=21)", X_train_scaled, y_train, X_test_scaled, y_test, average='macro')

### Decision tree

In [None]:
dt = DecisionTreeClassifier(
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=1,
    random_state=4)

dt.fit(X_train_scaled, y_train)

In [None]:
evaluate_classifier(dt, "Decision Tree", X_train_scaled, y_train, X_test_scaled, y_test, average='macro')

### Random forest

In [None]:
rf = RandomForestClassifier(max_depth=5, random_state=4)
rf.fit(X_train_scaled, y_train)

In [None]:
evaluate_classifier(rf, "Random Forest", X_train_scaled, y_train, X_test_scaled, y_test, average='macro')

### XGBoost

In [None]:
xgb = XGBClassifier(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    eval_metric='logloss',
    random_state=42
)

In [None]:
xgb.fit(X_train_scaled, y_train)

In [None]:
evaluate_classifier(xgb, "XGBoost", X_train_scaled, y_train, X_test_scaled, y_test, average='macro')

In [None]:
model_results