In [1]:
from scipy import sparse
import os
import mglearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from matplotlib import font_manager, rc
font_name = font_manager.FontProperties(fname="C:/Windows/Fonts/malgun.ttf").get_name()
rc('font', family=font_name)

In [5]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

iris = load_iris()
logreg = LogisticRegression()

kfold = KFold(n_splits=3)
print("교차 검증 점수 : \n{}".format(cross_val_score(logreg, iris.data, 
                                               iris.target, cv=kfold)))

교차 검증 점수 : 
[0. 0. 0.]


#### - iris 데이터가 각 클래스 별로 저장되어 있음을 알 수 있다.

In [8]:
kfold = KFold(n_splits=3, shuffle=True, random_state=0)
print("교차 검증 점수 : \n{}".format(cross_val_score(logreg, iris.data, 
                                               iris.target, cv=kfold)))

교차 검증 점수 : 
[0.98 0.96 0.96]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
from sklearn.model_selection import ShuffleSplit

shuffle_split = ShuffleSplit(test_size=.5, train_size=.5, n_splits=10)
scores = cross_val_score(logreg, iris.data, iris.target, cv=shuffle_split)
print('교차 검증 점수 :\n{}'.format(scores))
print('평균 정확도 : {:.2f}'.format(scores.mean()))

교차 검증 점수 :
[0.97333333 0.98666667 0.98666667 0.96       0.97333333 0.97333333
 0.98666667 0.94666667 0.94666667 0.98666667]
평균 정확도 : 0.97


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
from sklearn.model_selection import StratifiedShuffleSplit

shuffle_split = StratifiedShuffleSplit(test_size=.5, train_size=.5, n_splits=10)
print('교차 검증 점수 :\n{}'.format(scores))
print('평균 정확도 : {:.2f}'.format(scores.mean()))

교차 검증 점수 :
[0.97333333 0.98666667 0.98666667 0.96       0.97333333 0.97333333
 0.98666667 0.94666667 0.94666667 0.98666667]
평균 정확도 : 0.97


In [22]:
from sklearn.model_selection import GroupKFold
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=12, random_state=0)
groups = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]
scores = cross_val_score(logreg, X, y, groups, cv=GroupKFold(n_splits=3))
print("교차 검증 점수 :\n{}".format(scores))

교차 검증 점수 :
[0.75       0.6        0.66666667]




### 이진 분류

In [23]:
from sklearn.datasets import load_digits

# 불균형 데이터
digits = load_digits()
# 9는 true, 나머지는 false (1 : 9의 불균형 데이터)
y = digits.target == 9

X_train, X_test, y_train, y_test = train_test_split(digits.data, y, random_state=0)

In [24]:
from sklearn.dummy import DummyClassifier

dummy_majority = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
pred_most_frequent = dummy_majority.predict(X_test)
print("예측된 레이블의 높은 빈도 값 : {}".format(np.unique(pred_most_frequent)))
print("테스트 점수 : {:.2f}".format(dummy_majority.score(X_test, y_test)))

예측된 레이블의 높은 빈도 값 : [False]
테스트 점수 : 0.90


In [26]:
from sklearn.tree import DecisionTreeClassifier

# 결정 트리 / 더미 분류
tree = DecisionTreeClassifier(max_depth=2).fit(X_train, y_train)
pred_tree = tree.predict(X_test)
print("테스트 점수 : {:.2f}".format(tree.score(X_test, y_test)))

테스트 점수 : 0.92


In [27]:
from sklearn.dummy import DummyClassifier

# 훈련 세트와 같은 비율로 예측
dummy = DummyClassifier().fit(X_train, y_train)
pred_dummy = dummy.predict(X_test)
print("dummy 점수 : {:.2f}".format(dummy.score(X_test, y_test)))

dummy 점수 : 0.82




In [28]:
# 로지스틱 회귀 분류
logreg = LogisticRegression(C=0.1).fit(X_train, y_train)
pred_logreg = logreg.predict(X_test)
print("logreg 점수 : {:.2f}".format(logreg.score(X_test, y_test)))

logreg 점수 : 0.98


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
from sklearn.metrics import confusion_matrix

# 혼동 행렬
confusion = confusion_matrix(y_test, pred_logreg)
print("오차 행렬 : \n{}".format(confusion))

오차 행렬 : 
[[402   1]
 [  6  41]]
