# 분류 문제 데이터셋
## 아래의 예제 중 원하는 데이터 Cell 만 실행
- 필기체 숫자 데이터 분류 문제 (MNIST)
- 유방암 악성/양성 분류 문제
- 당뇨병 환자 분류 문제
- 와인 종류 분류 문제

In [1]:
### MNIST
from sklearn import datasets
digits=datasets.load_digits()

n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))
print(data.shape)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, digits.target, test_size=0.3, shuffle=True)


(1797, 64)


In [2]:
### 유방암
from sklearn import datasets
cancer=datasets.load_breast_cancer()

n_samples = len(cancer.data)
data = cancer.data.reshape((n_samples, -1))
print(data.shape)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, cancer.target, test_size=0.3, shuffle=True)


(569, 30)


In [3]:
### 당뇨병

from sklearn import datasets
diabetes=datasets.load_diabetes()

n_samples = len(diabetes.data)
data = diabetes.data.reshape((n_samples, -1))
print(data.shape)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, diabetes.target, test_size=0.3, shuffle=True)


(442, 10)


In [4]:
### 와인

from sklearn import datasets
wine=datasets.load_wine()

n_samples = len(wine.data)
data = wine.data.reshape((n_samples, -1))
print(data.shape)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, wine.target, test_size=0.3, shuffle=True)


(178, 13)


# 분류 알고리즘 별 성능 평가
## [1] 데이터 전처리 본인 스스로 공부!!
## [2] 각 알고리즘 마다 설정 가능한 파라미터는 본인 스스로 공부!
- Decision Tree Classifier
- DA Classifier
- Logistic Regression Classifier
- KNN Classifier

### Decision Tree

In [5]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier(criterion='gini', random_state=1)
dtc.fit(X_train, y_train)
y_train_pred = dtc.predict(X_train)
y_test_pred = dtc.predict(X_test)

from sklearn.metrics import accuracy_score
print(accuracy_score(y_train, y_train_pred))
print(accuracy_score(y_test, y_test_pred))

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,y_test_pred))

1.0
0.8888888888888888
[[16  0  0]
 [ 2 19  2]
 [ 0  2 13]]


### LDA

In [6]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

cld = LinearDiscriminantAnalysis()
cld.fit(X_train,y_train)
y_train_pred = cld.predict(X_train)
y_test_pred = cld.predict(X_test)

from sklearn.metrics import accuracy_score
print(accuracy_score(y_train, y_train_pred))
print(accuracy_score(y_test, y_test_pred))

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,y_test_pred))

1.0
1.0
[[16  0  0]
 [ 0 23  0]
 [ 0  0 15]]


### QDA

In [7]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

cld = QuadraticDiscriminantAnalysis()
cld.fit(X_train,y_train)
y_train_pred = cld.predict(X_train)
y_test_pred = cld.predict(X_test)

from sklearn.metrics import accuracy_score
print(accuracy_score(y_train, y_train_pred))
print(accuracy_score(y_test, y_test_pred))

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,y_test_pred))

1.0
0.9814814814814815
[[16  0  0]
 [ 0 23  0]
 [ 0  1 14]]


### Logistic Regression

In [8]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=1)
lr.fit(X_train, y_train)
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)


from sklearn.metrics import accuracy_score
print(accuracy_score(y_train, y_train_pred))
print(accuracy_score(y_test, y_test_pred))

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,y_test_pred))

0.9596774193548387
0.9629629629629629
[[16  0  0]
 [ 1 22  0]
 [ 0  1 14]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### KNN

In [9]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train,y_train)

y_train_pred = knn.predict(X_train)
y_test_pred = knn.predict(X_test)


from sklearn.metrics import accuracy_score
print(accuracy_score(y_train, y_train_pred))
print(accuracy_score(y_test, y_test_pred))

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,y_test_pred))

0.7983870967741935
0.7592592592592593
[[14  0  2]
 [ 2 15  6]
 [ 2  1 12]]
