### Breast Cancer 

In [258]:
from sklearn.datasets import load_breast_cancer


bc = load_breast_cancer()
print(bc.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

:Number of Instances: 569

:Number of Attributes: 30 numeric, predictive attributes and the class

:Attribute Information:
    - radius (mean of distances from center to points on the perimeter)
    - texture (standard deviation of gray-scale values)
    - perimeter
    - area
    - smoothness (local variation in radius lengths)
    - compactness (perimeter^2 / area - 1.0)
    - concavity (severity of concave portions of the contour)
    - concave points (number of concave portions of the contour)
    - symmetry
    - fractal dimension ("coastline approximation" - 1)

    The mean, standard error, and "worst" or largest (mean of the three
    worst/largest values) of these features were computed for each image,
    resulting in 30 features.  For instance, field 0 is Mean Radius, field
    10 is Radius SE, field 20 is Worst Radius.

    - 

### Dataset Info

In [259]:
bc.target[20]

np.int64(1)

In [260]:
bc.target.shape

(569,)

In [261]:
bc.data

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]], shape=(569, 30))

In [262]:
bc.data.shape

(569, 30)

### Preprocessing

In [263]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(bc.data, bc.target, test_size=0.2)

In [264]:
print(f'Feature=> train: {x_train.shape} - test: {x_test.shape}')
print(f'Label=> train: {y_train.shape} - test: {y_test.shape}')

Feature=> train: (455, 30) - test: (114, 30)
Label=> train: (455,) - test: (114,)


In [265]:
x_train[0]

array([1.154e+01, 1.072e+01, 7.373e+01, 4.091e+02, 8.597e-02, 5.969e-02,
       1.367e-02, 8.907e-03, 1.833e-01, 6.100e-02, 1.312e-01, 3.602e-01,
       1.107e+00, 9.438e+00, 4.124e-03, 1.340e-02, 1.003e-02, 4.667e-03,
       2.032e-02, 1.952e-03, 1.234e+01, 1.287e+01, 8.123e+01, 4.678e+02,
       1.092e-01, 1.626e-01, 8.324e-02, 4.715e-02, 3.390e-01, 7.434e-02])

In [266]:
from sklearn.preprocessing import MinMaxScaler


scaler = MinMaxScaler(feature_range=(0, 1))
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [267]:
x_train[0]

array([0.18698788, 0.03415624, 0.18359653, 0.0991732 , 0.30098402,
       0.12364272, 0.03202905, 0.04426938, 0.39040404, 0.23251896,
       0.00576567, 0.        , 0.01649154, 0.00408266, 0.08195941,
       0.08372638, 0.03301514, 0.1188439 , 0.17501548, 0.04813068,
       0.12507383, 0.02265458, 0.12634284, 0.05672787, 0.25113914,
       0.13127844, 0.06648562, 0.16202749, 0.35974768, 0.12659058])

### Classification

In [268]:
from sklearn.metrics import accuracy_score, precision_score, recall_score


def calculate_metrics(y_train, y_test, y_pred_train, y_pred_test):
    acc_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)
    acc_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)

    p = precision_score(y_true=y_test, y_pred=y_pred_test)
    r = recall_score(y_true=y_test, y_pred=y_pred_test)

    print(f'acc_train: {acc_train} - acc test: {acc_test} - precision: {p} - recall: {r}')

    return acc_train, acc_test, p, r

#### 1. Naive bayes

In [269]:
from sklearn.naive_bayes import GaussianNB


gnb = GaussianNB()
gnb.fit(x_train, y_train)

In [270]:
y_pred_train = gnb.predict(x_train)
y_pred_test = gnb.predict(x_test)

acc_train_gnb, acc_test_gnb, p_gnb, r_gnb = calculate_metrics(y_train, y_test, y_pred_train, y_pred_test)

acc_train: 0.9428571428571428 - acc test: 0.8859649122807017 - precision: 0.9066666666666666 - recall: 0.918918918918919


### 2. KNN

In [271]:
from sklearn.neighbors import KNeighborsClassifier


knn = KNeighborsClassifier(n_neighbors=8, algorithm='kd_tree', leaf_size=28)
knn.fit(x_train, y_train)

In [272]:
y_pred_train = knn.predict(x_train)
y_pred_test = knn.predict(x_test)

acc_train_knn, acc_test_knn, p_knn, r_knn = calculate_metrics(y_train, y_test, y_pred_train, y_pred_test)

acc_train: 0.9802197802197802 - acc test: 0.9473684210526315 - precision: 0.9358974358974359 - recall: 0.9864864864864865


### 3. Decision Tree

In [273]:
from sklearn.tree import DecisionTreeClassifier


dt = DecisionTreeClassifier(max_depth=32, min_samples_split=4, criterion='gini')
dt.fit(x_train, y_train)

In [274]:
y_pred_train = dt.predict(x_train)
y_pred_test = dt.predict(x_test)

acc_train_dt, acc_test_dt, p_dt, r_dt = calculate_metrics(y_train, y_test, y_pred_train, y_pred_test)

acc_train: 1.0 - acc test: 0.8947368421052632 - precision: 0.918918918918919 - recall: 0.918918918918919


### 4. Random Forest

In [275]:
from sklearn.ensemble import RandomForestClassifier


rf = RandomForestClassifier(n_estimators=500, max_depth=64, min_samples_split=8)
rf.fit(x_train, y_train)

In [276]:
y_pred_train = rf.predict(x_train)
y_pred_test = rf.predict(x_test)

acc_train_rf, acc_test_rf, p_dt, r_rf = calculate_metrics(y_train, y_test, y_pred_train, y_pred_test)

acc_train: 0.9956043956043956 - acc test: 0.9385964912280702 - precision: 0.935064935064935 - recall: 0.972972972972973


### 5. SVM

In [None]:
from sklearn.svm import SVC


svm = SVC(kernel='poly')
svm.fit(x_train, y_train)

In [278]:
y_pred_train = svm.predict(x_train)
y_pred_test = svm.predict(x_test)

acc_train_svm, acc_test_svm, p_svm, r_svm = calculate_metrics(y_train, y_test, y_pred_train, y_pred_test)

acc_train: 0.9846153846153847 - acc test: 0.9473684210526315 - precision: 0.9473684210526315 - recall: 0.972972972972973
