In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer

In [3]:
b_cancer = load_breast_cancer()
print(b_cancer.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [4]:
b_cancer_df = pd.DataFrame(b_cancer.data, columns = b_cancer.feature_names)
b_cancer_df['diagnosis'] = b_cancer.target
print('데이터셋 크기: ', b_cancer_df.shape)
b_cancer_df.head()


데이터셋 크기:  (569, 31)


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,diagnosis
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [5]:
b_cancer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
b_cancer_scaled = scaler.fit_transform(b_cancer.data)

In [7]:
print(b_cancer_scaled[0])

[ 1.09706398 -2.07333501  1.26993369  0.9843749   1.56846633  3.28351467
  2.65287398  2.53247522  2.21751501  2.25574689  2.48973393 -0.56526506
  2.83303087  2.48757756 -0.21400165  1.31686157  0.72402616  0.66081994
  1.14875667  0.90708308  1.88668963 -1.35929347  2.30360062  2.00123749
  1.30768627  2.61666502  2.10952635  2.29607613  2.75062224  1.93701461]


In [8]:
print(b_cancer_scaled[9])

[-0.47353452  1.10543868 -0.32948179 -0.50906338  1.58269942  2.56335845
  1.73887209  0.94176033  0.79729802  2.78309559 -0.38825014  0.6933453
 -0.40941963 -0.36076377  0.03600849  2.60958662  1.5098476   0.40939496
 -0.32113637  2.37734605 -0.24418961  2.44310906 -0.28627803 -0.29740917
  2.32029536  5.11287727  3.99543285  1.6200152   2.3704438   6.84685604]


In [9]:
print(b_cancer_scaled[17])

[ 5.68797577e-01  3.23544126e-01  6.64437745e-01  4.09297356e-01
  1.46883471e+00  1.85457312e+00  1.04709318e+00  1.38980180e+00
  1.28652444e+00  1.52568080e+00  5.92011298e-01 -2.60999587e-01
  4.89060924e-01  3.04567927e-01 -4.99317257e-03 -2.61640642e-02
 -4.54793148e-04  1.90411911e-01 -4.42214870e-01  1.31289578e-01
  9.71384843e-01  9.44946175e-01  8.79837547e-01  7.63666941e-01
  2.03974566e+00  1.07529849e+00  9.89304603e-01  1.41141082e+00
  1.30270860e+00  1.67656027e+00]


In [10]:
print(b_cancer_scaled[29])

[ 0.97777802 -0.98659467  0.94865013  0.85383059  0.15013907  0.21527015
  0.12493055  0.78957583 -0.2651265  -0.1853673   0.70425773 -0.71549273
  0.88558044  0.45681976 -0.47134541  0.27116829  0.07215909  0.28287142
 -0.1564696  -0.02001096  0.77465644 -1.00266557  0.82324445  0.60897062
 -0.30109094  0.17134395 -0.11172705  0.47192976 -0.23418298 -0.2635475 ]


In [11]:
# 로지스틱 회귀 모형 구축

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [12]:
# X, Y 설정
Y = b_cancer_df['diagnosis']
X = b_cancer_scaled

In [13]:
print(Y)

0      0
1      0
2      0
3      0
4      0
      ..
564    0
565    0
566    0
567    0
568    1
Name: diagnosis, Length: 569, dtype: int32


In [14]:
print(round(sum(Y)/len(Y)*100, 2), '%')

62.74 %


In [15]:
# 훈련용, 테스트용 데이터 분리
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = 0, stratify=Y)

In [16]:
print(round(sum(Y_train)/len(Y_train)*100, 2), '%')

62.81 %


In [17]:
print(round(sum(Y_test)/len(Y_test)*100, 2), '%')

62.57 %


In [18]:
lr_b_cancer = LogisticRegression() # 로지스틱 회귀 모형 생성
lr_b_cancer.fit(X_train, Y_train) # 로지스틱 회귀 모형 훈련

In [19]:
Y_predict = lr_b_cancer.predict(X_test)

In [20]:
lr_b_cancer.coef_

array([[-0.54406091, -0.41605507, -0.51991133, -0.59308816,  0.0027904 ,
         0.41939012, -0.78884789, -1.02290774, -0.15221315,  0.37699245,
        -1.07237296, -0.06165012, -0.54319278, -0.69191037, -0.21537603,
         0.61125449,  0.11034357, -0.26876198,  0.49779553,  0.42281321,
        -0.97636344, -1.08977767, -0.82614726, -0.86970513, -0.55575019,
        -0.15928048, -0.62816926, -0.7691139 , -0.67505294, -0.73082045]])

In [21]:
lr_b_cancer.intercept_

array([0.23582794])

In [22]:
# 로지스틱 회귀 모형 성능 확인

from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [23]:
# 오차행렬 확인
confusion_matrix(Y_test, Y_predict)

array([[ 61,   3],
       [  4, 103]], dtype=int64)

In [24]:
accuracy = accuracy_score(Y_test, Y_predict)
precision = precision_score(Y_test, Y_predict)
recall = recall_score(Y_test, Y_predict)
f1 = f1_score(Y_test, Y_predict)

In [25]:
print('정확도: {0:.3f}, 정밀도: {1:.3f}, 재현율: {2:.3f}, F1: {3:.3f}'.format(accuracy, precision, recall, f1))

정확도: 0.959, 정밀도: 0.972, 재현율: 0.963, F1: 0.967


In [26]:
lr_b_cancer.predict([[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]])

array([1])

In [27]:
lr_b_cancer.predict([[1,0,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0,1,1,0,1,0]])

array([0])