### Breast Cancer 

In [569]:
from sklearn.datasets import load_breast_cancer


bc = load_breast_cancer()
print(bc.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

:Number of Instances: 569

:Number of Attributes: 30 numeric, predictive attributes and the class

:Attribute Information:
    - radius (mean of distances from center to points on the perimeter)
    - texture (standard deviation of gray-scale values)
    - perimeter
    - area
    - smoothness (local variation in radius lengths)
    - compactness (perimeter^2 / area - 1.0)
    - concavity (severity of concave portions of the contour)
    - concave points (number of concave portions of the contour)
    - symmetry
    - fractal dimension ("coastline approximation" - 1)

    The mean, standard error, and "worst" or largest (mean of the three
    worst/largest values) of these features were computed for each image,
    resulting in 30 features.  For instance, field 0 is Mean Radius, field
    10 is Radius SE, field 20 is Worst Radius.

    - 

### Dataset Info

In [570]:
bc.target[20]

np.int64(1)

In [571]:
bc.target.shape

(569,)

In [572]:
bc.data

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]], shape=(569, 30))

In [573]:
bc.data.shape

(569, 30)

### Preprocessing

In [574]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(bc.data, bc.target, test_size=0.2)

In [575]:
print(f'Feature=> train: {x_train.shape} - test: {x_test.shape}')
print(f'Label=> train: {y_train.shape} - test: {y_test.shape}')

Feature=> train: (455, 30) - test: (114, 30)
Label=> train: (455,) - test: (114,)


In [576]:
x_train[0]

array([1.486e+01, 1.694e+01, 9.489e+01, 6.737e+02, 8.924e-02, 7.074e-02,
       3.346e-02, 2.877e-02, 1.573e-01, 5.703e-02, 3.028e-01, 6.683e-01,
       1.612e+00, 2.392e+01, 5.756e-03, 1.665e-02, 1.461e-02, 8.281e-03,
       1.551e-02, 2.168e-03, 1.631e+01, 2.054e+01, 1.023e+02, 7.775e+02,
       1.218e-01, 1.550e-01, 1.220e-01, 7.971e-02, 2.525e-01, 6.827e-02])

In [577]:
from sklearn.preprocessing import MinMaxScaler


scaler = MinMaxScaler(feature_range=(0, 1))
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [578]:
x_train[0]

array([0.38548853, 0.24450457, 0.35706799, 0.22489926, 0.33050465,
       0.15753635, 0.08145083, 0.15319489, 0.25909091, 0.1489048 ,
       0.0785465 , 0.06809141, 0.04778405, 0.03120163, 0.13743754,
       0.10813531, 0.03689394, 0.15686683, 0.10733382, 0.04398657,
       0.29811455, 0.22707889, 0.2584292 , 0.14557118, 0.34388372,
       0.14024665, 0.09744409, 0.27457802, 0.18923714, 0.11215666])

### Classification

In [579]:
from sklearn.metrics import accuracy_score, precision_score, recall_score


def calculate_metrics(y_train, y_test, y_pred_train, y_pred_test):
    acc_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)
    acc_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)

    p = precision_score(y_true=y_test, y_pred=y_pred_test)
    r = recall_score(y_true=y_test, y_pred=y_pred_test)

    print(f'acc_train: {acc_train} - acc test: {acc_test} - precision: {p} - recall: {r}')

    return acc_train, acc_test, p, r

#### 1. Naive bayes

In [580]:
from sklearn.naive_bayes import GaussianNB


gnb = GaussianNB()
gnb.fit(x_train, y_train)

In [581]:
y_pred_train = gnb.predict(x_train)
y_pred_test = gnb.predict(x_test)

acc_train_gnb, acc_test_gnb, p_gnb, r_gnb = calculate_metrics(y_train, y_test, y_pred_train, y_pred_test)

acc_train: 0.9494505494505494 - acc test: 0.9035087719298246 - precision: 0.921875 - recall: 0.9076923076923077


### 2. KNN

In [582]:
from sklearn.neighbors import KNeighborsClassifier


knn = KNeighborsClassifier(n_neighbors=8, algorithm='kd_tree', leaf_size=28)
knn.fit(x_train, y_train)

In [583]:
y_pred_train = knn.predict(x_train)
y_pred_test = knn.predict(x_test)

acc_train_knn, acc_test_knn, p_knn, r_knn = calculate_metrics(y_train, y_test, y_pred_train, y_pred_test)

acc_train: 0.978021978021978 - acc test: 0.9736842105263158 - precision: 0.9696969696969697 - recall: 0.9846153846153847


### 3. Decision Tree

In [584]:
from sklearn.tree import DecisionTreeClassifier


dt = DecisionTreeClassifier(max_depth=32, min_samples_split=4, criterion='gini')
dt.fit(x_train, y_train)

In [585]:
y_pred_train = dt.predict(x_train)
y_pred_test = dt.predict(x_test)

acc_train_dt, acc_test_dt, p_dt, r_dt = calculate_metrics(y_train, y_test, y_pred_train, y_pred_test)

acc_train: 0.9956043956043956 - acc test: 0.9385964912280702 - precision: 0.953125 - recall: 0.9384615384615385


### 4. Random Forest

In [586]:
from sklearn.ensemble import RandomForestClassifier


rf = RandomForestClassifier(n_estimators=500, max_depth=64, min_samples_split=8)
rf.fit(x_train, y_train)

In [587]:
y_pred_train = rf.predict(x_train)
y_pred_test = rf.predict(x_test)

acc_train_rf, acc_test_rf, p_dt, r_rf = calculate_metrics(y_train, y_test, y_pred_train, y_pred_test)

acc_train: 0.9956043956043956 - acc test: 0.9385964912280702 - precision: 0.953125 - recall: 0.9384615384615385


### 5. SVM

In [588]:
from sklearn.svm import SVC


svm = SVC(kernel='poly')
svm.fit(x_train, y_train)

In [589]:
y_pred_train = svm.predict(x_train)
y_pred_test = svm.predict(x_test)

acc_train_svm, acc_test_svm, p_svm, r_svm = calculate_metrics(y_train, y_test, y_pred_train, y_pred_test)

acc_train: 0.9934065934065934 - acc test: 0.956140350877193 - precision: 0.9285714285714286 - recall: 1.0


### 6. Logistic Regression

In [590]:
# from sklearn.linear_model import LogisticRegression


# lr = LogisticRegression()
# lr.fit(x_train, y_train)

In [591]:
# y_pred_train = lr.predict(x_train)
# y_pred_test = lr.predict(x_test)

# acc_train_lr, acc_test_lr, p_lr, r_lr = calculate_metrics(y_train, y_test, y_pred_train, y_pred_test)

### 7. ANN

In [592]:
# from sklearn.neural_network import MLPClassifier


# ann = MLPClassifier(hidden_layer_sizes=512, activation='relu', solver='adam')
# ann.fit(x_train, y_train)

In [593]:
# y_pred_train = ann.predict(x_train)
# y_pred_test = ann.predict(x_test)

# acc_train_ann, acc_test_ann, p_ann, r_ann = calculate_metrics(y_train, y_test, y_pred_train, y_pred_test)