## 분류분석

### 로지스틱 회귀분석
> 1. 데이터 수집 - 사이킷런이 제공하는 유방암 진단 데이터

In [1]:
# 필요 라이브러리 등록
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_breast_cancer

In [2]:
# 데이터 준비
bCancer = load_breast_cancer()

In [3]:
bCancer.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [4]:
len(bCancer.data)

569

In [5]:
# DataFrame 으로 전환
dfBrCancer = pd.DataFrame(bCancer.data, columns=bCancer.feature_names)

In [6]:
dfBrCancer.tail()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
564,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,0.1726,0.05623,...,25.45,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115
565,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,0.1752,0.05533,...,23.69,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637
566,16.6,28.08,108.3,858.1,0.08455,0.1023,0.09251,0.05302,0.159,0.05648,...,18.98,34.12,126.7,1124.0,0.1139,0.3094,0.3403,0.1418,0.2218,0.0782
567,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,0.2397,0.07016,...,25.74,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124
568,7.76,24.54,47.92,181.0,0.05263,0.04362,0.0,0.0,0.1587,0.05884,...,9.456,30.37,59.16,268.6,0.08996,0.06444,0.0,0.0,0.2871,0.07039


In [7]:
# 종속변수로 암 판단 결과 추가
dfBrCancer['result'] = bCancer.target

In [8]:
# 암판단결과(result)
dfBrCancer.tail()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,result
564,21.56,22.39,142.0,1479.0,0.111,0.1159,0.2439,0.1389,0.1726,0.05623,...,26.4,166.1,2027.0,0.141,0.2113,0.4107,0.2216,0.206,0.07115,0
565,20.13,28.25,131.2,1261.0,0.0978,0.1034,0.144,0.09791,0.1752,0.05533,...,38.25,155.0,1731.0,0.1166,0.1922,0.3215,0.1628,0.2572,0.06637,0
566,16.6,28.08,108.3,858.1,0.08455,0.1023,0.09251,0.05302,0.159,0.05648,...,34.12,126.7,1124.0,0.1139,0.3094,0.3403,0.1418,0.2218,0.0782,0
567,20.6,29.33,140.1,1265.0,0.1178,0.277,0.3514,0.152,0.2397,0.07016,...,39.42,184.6,1821.0,0.165,0.8681,0.9387,0.265,0.4087,0.124,0
568,7.76,24.54,47.92,181.0,0.05263,0.04362,0.0,0.0,0.1587,0.05884,...,30.37,59.16,268.6,0.08996,0.06444,0.0,0.0,0.2871,0.07039,1


In [9]:
# 결측치는 없다.
dfBrCancer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [10]:
# 사이킷런 필요라이브러리 사용등록
from sklearn.preprocessing import StandardScaler

In [11]:
scaler = StandardScaler()

In [12]:
# 특성으로 사용할 데이터의 평균이 0, 분산이 1이 되는 정규분포형태로 스케일링
# 로지스틱 호귀 모델에서 X축으로 사용하기 위해서
brCancerScaled = scaler.fit_transform(bCancer.data)

In [13]:
bCancer.data[0]

array([1.799e+01, 1.038e+01, 1.228e+02, 1.001e+03, 1.184e-01, 2.776e-01,
       3.001e-01, 1.471e-01, 2.419e-01, 7.871e-02, 1.095e+00, 9.053e-01,
       8.589e+00, 1.534e+02, 6.399e-03, 4.904e-02, 5.373e-02, 1.587e-02,
       3.003e-02, 6.193e-03, 2.538e+01, 1.733e+01, 1.846e+02, 2.019e+03,
       1.622e-01, 6.656e-01, 7.119e-01, 2.654e-01, 4.601e-01, 1.189e-01])

In [14]:
brCancerScaled[0]

array([ 1.09706398, -2.07333501,  1.26993369,  0.9843749 ,  1.56846633,
        3.28351467,  2.65287398,  2.53247522,  2.21751501,  2.25574689,
        2.48973393, -0.56526506,  2.83303087,  2.48757756, -0.21400165,
        1.31686157,  0.72402616,  0.66081994,  1.14875667,  0.90708308,
        1.88668963, -1.35929347,  2.30360062,  2.00123749,  1.30768627,
        2.61666502,  2.10952635,  2.29607613,  2.75062224,  1.93701461])

#### 로지스틱 회귀로 분석모델 구축
> `LogisticRegression` 사용

In [15]:
# 분석용 라이브러리 사용등록
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [16]:
# 독립변수 X , 종속변수 Y(악성인지/양성인지 결과값)
y = dfBrCancer['result']  # 암판단결과를 종속변수에 담는다.
X = brCancerScaled

In [17]:
# 훈련용데이터와 검증용데이터로 분리
# test_size = 0.3 -> 30% 는 검증용 데이터로 사용
# random_state 는 0 에서 43,42 사이로만 하면 된다.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [18]:
# 로지스틱 회귀 분석 모델 생성
model = LogisticRegression()

In [19]:
model.fit(X_train, y_train)

In [20]:
# 데이터 예측
y_predict = model.predict(X_test)

#### 분석평가지표로 예측결과와 실제결과를 비교
- 정확도, 정밀도, 재현율, F1, ROC_AUC_Score

In [21]:
# 평가지표용 라이브러리 사용등록
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [22]:
# 1. 오차행렬(실제결과/예측결과를 행렬)
## 171개 확인결과 0을 0으로 예측한게 60건
## 0인데 1로 잘못 예측한게 3건
## 1건을 1로 예측한게 107건
confusion_matrix(y_test, y_predict)

array([[ 60,   3],
       [  1, 107]])

In [23]:
# 정확도부터 ROC기반 AUC스코어까지
accuracy = accuracy_score(y_test, y_predict)  # 정확도
precision = precision_score(y_test, y_predict)  # 정밀도
recall = recall_score(y_test, y_predict)  # 재현율
f1 = f1_score(y_test, y_predict)  # 정밀도와 재현율을 결합한 평가지표
roc_auc = roc_auc_score(y_test, y_predict)

In [24]:
f'정확도: {accuracy:.3f}, 정밀도: {precision:.3f}, 재현율: {recall:.3f}, F1: {f1:.3f}, ROC_AUC: {roc_auc:.3f}'

'정확도: 0.977, 정밀도: 0.973, 재현율: 0.991, F1: 0.982, ROC_AUC: 0.972'

#### 결론
> 1 또는 0 으로 참/거짓 등의 값을 판단할 때는 로지스틱회귀를 사용할 것