In [71]:
import sklearn
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [61]:
# Load the bundled iris dataset; later cells read its .data, .target and
# .feature_names attributes.
iris = load_iris()

In [18]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps the
# split identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=234,
)

In [22]:
# Seed the tree as well as the split. scikit-learn randomly permutes the
# candidate features at every split, so an unseeded DecisionTreeClassifier can
# produce a different tree (and a different accuracy) on each run.
dt_clf = DecisionTreeClassifier(random_state=234)
dt_clf.fit(x_train, y_train)

In [32]:
# Predict class labels for the held-out test features with the fitted tree.
predict = dt_clf.predict(x_test)

In [24]:
# Show the predicted labels.
predict

array([2, 1, 1, 2, 1, 2, 2, 2, 2, 0, 0, 2, 0, 2, 2, 2, 1, 0, 0, 2, 2, 2,
       2, 0, 1, 2, 0, 0, 2, 1])

In [29]:
# Fraction of hold-out samples whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=predict)

In [27]:
# Show the hold-out accuracy.
accuracy

0.9666666666666667

In [42]:
# Put the hold-out features into a table whose columns carry the iris
# feature names.
df = pd.DataFrame(
    data=x_test,
    columns=iris.feature_names,
)

In [43]:
# Show the hold-out features as a labelled table.
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,6.9,3.1,5.4,2.1
1,5.5,2.5,4.0,1.3
2,5.5,2.4,3.7,1.0
3,5.9,3.0,5.1,1.8
4,5.6,2.7,4.2,1.3
5,6.3,2.9,5.6,1.8
6,6.4,3.1,5.5,1.8
7,6.1,3.0,4.9,1.8
8,7.7,2.6,6.9,2.3
9,5.1,3.8,1.6,0.2


In [44]:
# Attach the true labels and the model's predictions as two extra columns so
# they can be compared row by row.
df = df.assign(answer=y_test, predict=predict)

In [45]:
# Show the hold-out features next to the true and predicted labels.
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),answer,predict
0,6.9,3.1,5.4,2.1,2,2
1,5.5,2.5,4.0,1.3,1,1
2,5.5,2.4,3.7,1.0,1,1
3,5.9,3.0,5.1,1.8,2,2
4,5.6,2.7,4.2,1.3,1,1
5,6.3,2.9,5.6,1.8,2,2
6,6.4,3.1,5.5,1.8,2,2
7,6.1,3.0,4.9,1.8,2,2
8,7.7,2.6,6.9,2.3,2,2
9,5.1,3.8,1.6,0.2,0,0


In [52]:
# Short aliases for the full feature matrix and target vector used by the
# cross-validation cells.
features, label = iris.data, iris.target

In [53]:
# Collector for the per-fold accuracy scores.
cv_accuracy = list()

# Three-way stratified splitter used to generate train/test index pairs.
skfold = StratifiedKFold(n_splits=3)


In [55]:
def train_and_evaluate(train_index, test_index, model, features, label):
    """Fit ``model`` on one fold and score it on that fold's held-out rows.

    Args:
        train_index: Index selector for the training rows of ``features`` and
            ``label``.
        test_index: Index selector for the evaluation rows.
        model: Object exposing ``fit(X, y)`` and ``predict(X)``. ``fit`` is
            called on the object that is passed in, so the caller's estimator
            is the one that gets trained.
        features: Indexable collection of samples.
        label: Indexable collection of targets aligned with ``features``.

    Returns:
        A 4-tuple ``(accuracy, n_train, n_test, test_index)``: the value of
        ``accuracy_score`` on the evaluation rows, the number of training and
        evaluation rows, and ``test_index`` passed back unchanged.
    """
    fold_train_x = features[train_index]
    fold_train_y = label[train_index]
    fold_test_x = features[test_index]
    fold_test_y = label[test_index]

    # Train first, then predict on rows the model was not fitted on.
    model.fit(fold_train_x, fold_train_y)
    fold_score = accuracy_score(fold_test_y, model.predict(fold_test_x))

    return fold_score, len(fold_train_x), len(fold_test_x), test_index

In [56]:
# Score every stratified fold and collect the accuracies.
# - The list is rebuilt in a single expression, so re-running this cell no
#   longer appends duplicate scores to a list created in another cell.
# - Each fold trains an unfitted clone of dt_clf, so the tree fitted on the
#   hold-out split is not silently refit on fold data.
# - The comprehension keeps its loop variables local, so the hold-out
#   `accuracy` computed earlier is not overwritten by the last fold's score.
cv_accuracy = [
    train_and_evaluate(
        train_index, test_index, sklearn.base.clone(dt_clf), features, label
    )[0]
    for train_index, test_index in skfold.split(features, label)
]

In [57]:
# Show the per-fold accuracy scores.
cv_accuracy

[0.98, 0.92, 0.98]

In [66]:
# Same evaluation via scikit-learn's helper, which handles the splitting,
# fitting and scoring in one call.
data, label = iris.data, iris.target

scores = cross_val_score(
    estimator=dt_clf,
    X=data,
    y=label,
    cv=5,
    scoring="accuracy",
)

# Report each fold's accuracy, then the mean, rounded to five decimals.
print("교차 검증 별 정확도: ", np.round(scores, 5))
print("평균 검증 정확도: ", np.round(scores.mean(), 5))

교차 검증 별 정확도:  [0.96667 0.96667 0.9     1.      1.     ]
평균 검증 정확도:  0.96667


In [74]:
# Sample categorical values (Seoul district names) for the encoding demos.
items = [
    '강남구',
    '서초구',
    '송파구',
    '노원구',
    '마포구',
    '용산구',
]

In [70]:
# Learn the set of distinct district names, then translate every name into
# its integer code.
encoder = LabelEncoder().fit(items)
labels = encoder.transform(items)
print(labels)

[0 3 4 1 2 5]


In [80]:
# The encoder is fed a column-shaped (n_samples, 1) array. That array gets its
# own name instead of rebinding `items`, so `items` stays a plain list and the
# LabelEncoder cell receives the same input when cells are re-run.
items_2d = np.array(items).reshape(-1, 1)

oh_encoder = OneHotEncoder()
oh_encoder.fit(items_2d)
oh_labels = oh_encoder.transform(items_2d)

# .toarray() turns the encoder's result into a dense array for printing.
print(oh_labels.toarray())

[[1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1.]]


In [81]:
# Wrap the district names in a single-column frame for the pandas encoding
# demo below.
df = pd.DataFrame(data=items)

In [83]:
# One-hot encode the frame with pandas; the result is displayed, not stored.
pd.get_dummies(df)

Unnamed: 0,0_강남구,0_노원구,0_마포구,0_서초구,0_송파구,0_용산구
0,True,False,False,False,False,False
1,False,False,False,True,False,False
2,False,False,False,False,True,False
3,False,True,False,False,False,False
4,False,False,True,False,False,False
5,False,False,False,False,False,True


In [84]:
# Toy table with two categorical columns for the pandas one-hot encoding demo.
data = {
    'Fruit': ['Apple', 'Banana', 'Cherry', 'Apple', 'Cherry'],
    'Color': ['Red', 'Yellow', 'Red', 'Green', 'Red'],
}
df = pd.DataFrame(data)

# Encode both columns; each indicator column is named "<column>_<value>".
df_encoded = pd.get_dummies(
    df,
    columns=list(df.columns),
    prefix=list(df.columns),
)

In [85]:
# Show the one-hot encoded frame.
df_encoded

Unnamed: 0,Fruit_Apple,Fruit_Banana,Fruit_Cherry,Color_Green,Color_Red,Color_Yellow
0,True,False,False,False,True,False
1,False,True,False,False,False,True
2,False,False,True,False,True,False
3,True,False,False,True,False,False
4,False,False,True,False,True,False
