In [33]:
# 모듈 준비
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from collections import Counter
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline

# EDA

In [34]:
breast_cancer = load_breast_cancer()

In [35]:
dir(breast_cancer)

['DESCR',
 'data',
 'feature_names',
 'filename',
 'frame',
 'target',
 'target_names']

In [36]:
breast_cancer.DESCR



In [37]:
# 인스턴스 갯수
len(breast_cancer.target)

569

In [38]:
# label 상당히 균등한 것을 확인할 수 있다
# 키 값 2개 -> Binaray Classification
Counter(breast_cancer.target)

Counter({0: 212, 1: 357})

In [39]:
# 악성, 양성
breast_cancer.target_names

array(['malignant', 'benign'], dtype='<U9')

In [40]:
breast_cancer.data

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [41]:
len(breast_cancer.data)

569

# Train,Test 데이터셋 설정

In [42]:
x_train, x_test, y_train, y_test = train_test_split(breast_cancer.data, breast_cancer.target, test_size = 0.2 , random_state = 777)

# Decision Tree ( Recall 값 87% )

In [43]:
decision_tree = DecisionTreeClassifier()

In [44]:
decision_tree.fit(x_train,y_train)

DecisionTreeClassifier()

In [45]:
y_pred = decision_tree.predict(x_test) 

In [46]:
accuracy_score(y_pred, y_test)

0.9035087719298246

In [47]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.87      0.86        38
           1       0.93      0.92      0.93        76

    accuracy                           0.90       114
   macro avg       0.89      0.89      0.89       114
weighted avg       0.90      0.90      0.90       114



In [48]:
confusion_matrix(y_test, y_pred)

array([[33,  5],
       [ 6, 70]])

# Random Forest  ( Recall 값 87% )

In [60]:
RandomForest = RandomForestClassifier(random_state=777)

In [61]:
RandomForest.fit(x_train, y_train)

RandomForestClassifier(random_state=777)

In [62]:
y_pred = RandomForest.predict(x_test)

In [63]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.87      0.93        38
           1       0.94      1.00      0.97        76

    accuracy                           0.96       114
   macro avg       0.97      0.93      0.95       114
weighted avg       0.96      0.96      0.96       114



In [64]:
confusion_matrix(y_test, y_pred)

array([[33,  5],
       [ 0, 76]])

# SVM  ( Recall 값 68% )

In [53]:
svm_model = svm.SVC()

In [54]:
svm_model.fit(x_test, y_test)

SVC()

In [55]:
y_pred = svm_model.predict(x_test)

In [56]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.68      0.81        38
           1       0.86      1.00      0.93        76

    accuracy                           0.89       114
   macro avg       0.93      0.84      0.87       114
weighted avg       0.91      0.89      0.89       114



# SGD  ( Recall 값 97% )

In [57]:
sgd_model = SGDClassifier()
sgd_model.fit(x_train, y_train)
y_pred = sgd_model.predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.97      0.86        38
           1       0.98      0.86      0.92        76

    accuracy                           0.89       114
   macro avg       0.88      0.91      0.89       114
weighted avg       0.91      0.89      0.90       114



# LogisticRegression  ( Recall 값 87% )

In [58]:
logistic_model = LogisticRegression()

In [59]:
logistic_model.fit(x_train, y_train)
y_pred = logistic_model.predict(x_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.87      0.89        38
           1       0.94      0.96      0.95        76

    accuracy                           0.93       114
   macro avg       0.93      0.91      0.92       114
weighted avg       0.93      0.93      0.93       114



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# 평가지표 선택 및 이유

Breast Cancer 분류에서 가장 중요한 점은 실제로 악성인 사람을 놓치지 않는 것이 중요하다.  <br/>
따라서, Recall 값을 볼 필요가 있다. <br/>
4가지 모델 중 SGD 모델이 Test 데이터에 대해 97%로 가장 높은 Recall 값을 가지고 있기 때문에,  <br/>
4가지 모델 중 SGD 모델이 유방함 환자 분류에 가장 적합한 것으로 판단된다 <br/>