In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

cancer = load_breast_cancer()

# 데이터전처리
- 선형모델을 사용할때는 표준화를 진행하는 것이 좋다
  - M.M : 단위가 작고,최대 최소가 존재할때 사용
  - SC : Data 분포를 통일할 때 사용

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

scaler = StandardScaler()
data_scaled = scaler.fit_transform(cancer.data)

X_train, X_test, y_train, y_test = train_test_split(
    data_scaled,
    cancer.target,
    test_size=0.3,
    random_state=0
)

In [9]:
lr_clf = LogisticRegression() # 기본 solver는 lbfgs
lr_clf.fit(X_train,y_train)

lr_pred = lr_clf.predict(X_test)

In [12]:
from sklearn.metrics import accuracy_score, roc_auc_score

print("Accuracy : {:.3f}, ROC_AUC : {:.3f}".format(accuracy_score(y_test,lr_pred),roc_auc_score(y_test,lr_pred))) 

Accuracy : 0.977, ROC_AUC : 0.972


In [14]:
# solver에 따른 성능 변화 측정
solvers = ['lbfgs','liblinear','newton-cg','sag','saga']

for solver in solvers:
  lr_clf = LogisticRegression(solver=solver , max_iter=600) # max_iter : 최적화 횟수
  lr_clf.fit(X_train,y_train)

  lr_pred = lr_clf.predict(X_test)
  
  print(solver)
  print("Accuracy : {:.3f}, ROC_AUC : {:.3f}".format(accuracy_score(y_test,lr_pred),roc_auc_score(y_test,lr_pred))) 
  print()

lbfgs
Accuracy : 0.977, ROC_AUC : 0.972

liblinear
Accuracy : 0.982, ROC_AUC : 0.979

newton-cg
Accuracy : 0.977, ROC_AUC : 0.972

sag
Accuracy : 0.982, ROC_AUC : 0.979

saga
Accuracy : 0.982, ROC_AUC : 0.979



In [16]:
# GridsearchCV
from sklearn.model_selection import GridSearchCV

params = {'solver':["liblinear","lbfgs"],
           'penalty':['l2','l1'],
           'C':[0.01, 0.1, 1, 5, 10]}

grid_clf = GridSearchCV(
    LogisticRegression(),
    param_grid = params,
    scoring="accuracy",
    cv=3
)

grid_clf.fit(data_scaled, cancer.target)


15 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.9/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.9/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

 0.96131997        nan 0.97539218 0.97539

In [17]:
print('최적 하이퍼 파라미터:{}, 최적 평균 정확도:{:.3f}'.format(grid_clf.best_params_, 
                                                  grid_clf.best_score_))

최적 하이퍼 파라미터:{'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}, 최적 평균 정확도:0.979
