### Breast Cancer 

In [39]:
from sklearn.datasets import load_breast_cancer


bc = load_breast_cancer()
print(bc.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

:Number of Instances: 569

:Number of Attributes: 30 numeric, predictive attributes and the class

:Attribute Information:
    - radius (mean of distances from center to points on the perimeter)
    - texture (standard deviation of gray-scale values)
    - perimeter
    - area
    - smoothness (local variation in radius lengths)
    - compactness (perimeter^2 / area - 1.0)
    - concavity (severity of concave portions of the contour)
    - concave points (number of concave portions of the contour)
    - symmetry
    - fractal dimension ("coastline approximation" - 1)

    The mean, standard error, and "worst" or largest (mean of the three
    worst/largest values) of these features were computed for each image,
    resulting in 30 features.  For instance, field 0 is Mean Radius, field
    10 is Radius SE, field 20 is Worst Radius.

    - 

### Dataset Info

In [40]:
bc.target[20]

np.int64(1)

In [41]:
bc.target.shape

(569,)

In [42]:
bc.data

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]], shape=(569, 30))

In [43]:
bc.data.shape

(569, 30)

### Preprocessing

In [44]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(bc.data, bc.target, test_size=0.2)

In [45]:
print(f'Feature=> train: {x_train.shape} - test: {x_test.shape}')
print(f'Label=> train: {y_train.shape} - test: {y_test.shape}')

Feature=> train: (455, 30) - test: (114, 30)
Label=> train: (455,) - test: (114,)


In [46]:
x_train[0]

array([2.013e+01, 2.825e+01, 1.312e+02, 1.261e+03, 9.780e-02, 1.034e-01,
       1.440e-01, 9.791e-02, 1.752e-01, 5.533e-02, 7.655e-01, 2.463e+00,
       5.203e+00, 9.904e+01, 5.769e-03, 2.423e-02, 3.950e-02, 1.678e-02,
       1.898e-02, 2.498e-03, 2.369e+01, 3.825e+01, 1.550e+02, 1.731e+03,
       1.166e-01, 1.922e-01, 3.215e-01, 1.628e-01, 2.572e-01, 6.637e-02])

In [47]:
from sklearn.preprocessing import MinMaxScaler


scaler = MinMaxScaler(feature_range=(0, 1))
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [48]:
x_train[0]

array([0.62232003, 0.62698681, 0.60403566, 0.47401909, 0.40778189,
       0.25771425, 0.33739456, 0.48663022, 0.34949495, 0.11727451,
       0.23682781, 0.46472772, 0.2094897 , 0.17227931, 0.13787946,
       0.16506444, 0.09974747, 0.31786323, 0.13601591, 0.05538742,
       0.56065457, 0.7464428 , 0.52089247, 0.37991545, 0.3000066 ,
       0.15999651, 0.29095023, 0.55945017, 0.23906844, 0.07328124])

### Classification

In [49]:
from sklearn.metrics import accuracy_score, precision_score, recall_score


def calculate_metrics(y_train, y_test, y_pred_train, y_pred_test):
    acc_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)
    acc_test = accuracy_score(y_true=y_test, y_pred=y_pred_test)

    p = precision_score(y_true=y_test, y_pred=y_pred_test)
    r = recall_score(y_true=y_test, y_pred=y_pred_test)

    print(f'acc_train: {acc_train} - acc test: {acc_test} - precision: {p} - recall: {r}')

    return acc_train, acc_test, p, r

#### 1. Naive bayes

In [50]:
from sklearn.naive_bayes import GaussianNB


gnb = GaussianNB()
gnb.fit(x_train, y_train)

In [None]:
y_pred_train = gnb.predict(x_train)
y_pred_test = gnb.predict(x_test)

acc_train_gnb, acc_test_gnb, p, r = calculate_metrics(y_train, y_test, y_pred_train, y_pred_test)

acc_train: 0.9362637362637363 - acc test: 0.9473684210526315 - precision: 0.9594594594594594 - recall: 0.9594594594594594
