In [6]:
# 유방암을 분류해 봅시다

# (1) 필요한 모듈 import하기
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report



# (2) 데이터 준비
breast_cancer = load_breast_cancer()
#Feature Data 지정하기
breast_cancer_data = breast_cancer.data




# (3) 데이터 이해하기
#Label Data 지정하기
breast_cancer_label = breast_cancer.target  # 타겟 데이터 확인
print('breast_cancer 분류\n')
print(breast_cancer.target_names)


# (4) train, test 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(breast_cancer_data, 
                                                    breast_cancer_label, 
                                                    test_size=0.2, 
                                                    random_state=7)



# (5) 다양한 모델로 학습시켜보기

# Decision Tree 사용해 보기
print('\n\n\n[Decision Tree]')
from sklearn.tree import DecisionTreeClassifier
decision_tree = DecisionTreeClassifier(random_state=32)
decision_tree.fit(X_train, y_train) # 모델 학습

y_pred = decision_tree.predict(X_test)
print(classification_report(y_test, y_pred))



# Random Forest 사용해 보기
print('\n\n\n[Random Forests]')
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))



# SVM 사용해 보기
print('\n\n\n[SVM]')
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

clf = make_pipeline(StandardScaler(), LinearSVC(random_state=0, tol=1e-5))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))



# SGD Classifier 사용해 보기
print('\n\n\n[SGD Classifier]')
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

clf = make_pipeline(StandardScaler(),SGDClassifier(max_iter=1000, tol=1e-3))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))



# Logistic Regression 사용해 보기
print('\n\n\n[Logistic Regression]')
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))




breast_cancer 분류

['malignant' 'benign']



[Decision Tree]
              precision    recall  f1-score   support

           0       0.92      0.82      0.87        40
           1       0.91      0.96      0.93        74

    accuracy                           0.91       114
   macro avg       0.91      0.89      0.90       114
weighted avg       0.91      0.91      0.91       114




[Random Forests]
              precision    recall  f1-score   support

           0       1.00      0.85      0.92        40
           1       0.93      1.00      0.96        74

    accuracy                           0.95       114
   macro avg       0.96      0.93      0.94       114
weighted avg       0.95      0.95      0.95       114




[SVM]
              precision    recall  f1-score   support

           0       0.97      0.95      0.96        40
           1       0.97      0.99      0.98        74

    accuracy                           0.97       114
   macro avg       0.97      0.97      

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


1 유방종양의 악성(malignant)과 양성(benign)을 판단할 때 악성종양을 놓치는 경우가 없어야 하므로 
이번 과제에서 중점적으로 볼 사항은 recall의 수치가 높은 알고리즘을 선택하는 것이다.
그러므로 유방종양의 분류에는 recall의 수치가 높은 SVM과 SGD Classifier가 적합하다고 할 수 있다.

2 Logistic Regression의 경우 수렴하는데 실패했다. 반복수를 높이라는 메시지가 있는데 
함수인자에 관련요소가 없어서 수정을 못하였다.