# Scikit-Learn Preprocessing Tool

In [22]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import  matplotlib.pyplot as plt
import seaborn as sns

# Apply the style sheets first. sns.set_theme() rewrites rcParams, including
# 'font.family' (reset to its default 'sans-serif'), so any font configured
# before it would be silently discarded.
plt.style.use("ggplot")
sns.set_theme(style="whitegrid")

# Configure the Korean-capable font last so it survives the style calls.
# 'axes.unicode_minus' is disabled so negative tick labels use the ASCII
# hyphen instead of the Unicode minus sign.
# NOTE(review): 'Malgun Gothic' is presumably only installed on Windows —
# pick another Hangul font on other platforms.
mpl.rcParams['font.family'] = 'Malgun Gothic'
mpl.rcParams['axes.unicode_minus'] = False

# Render inline figures in the "retina" (high-resolution) format.
from IPython.display import set_matplotlib_formats
set_matplotlib_formats("retina")

# NOTE(review): this silences every warning, deprecation notices included.
import warnings
warnings.filterwarnings("ignore")

from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, f1_score
from sklearn.metrics import precision_recall_curve, roc_auc_score, roc_curve

In [2]:
# Load the iris dataset and wrap its feature matrix in a DataFrame.
iris = load_iris()
iris_df = pd.DataFrame(
    data=iris.data,
    # Keep at most the first 12 characters of every feature name, trimmed of
    # surrounding whitespace, to get short column labels.
    columns=[feature_name[0:12].strip() for feature_name in iris.feature_names],
)
print(f'iris_df.shape : {iris_df.shape}')
iris_df.head()

iris_df.shape : (150, 4)


Unnamed: 0,sepal length,sepal width,petal length,petal width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


## 1. sklearn.model_selection

### train_test_split

In [3]:
# Hold out 20% of the samples as a test set. The split is shuffled with a
# fixed seed and stratified on the target labels.
X_train, X_test, y_train, y_test = train_test_split(
    iris_df.to_numpy(),
    iris.target,
    test_size=0.2,
    shuffle=True,
    stratify=iris.target,
    random_state=42,
)
print(f'X_train.shape {X_train.shape}')
print(f'y_train.shape {y_train.shape}')
print(f'X_test.shape {X_test.shape}')
print(f'y_test.shape {y_test.shape}')

X_train.shape (120, 4)
y_train.shape (120,)
X_test.shape (30, 4)
y_test.shape (30,)


### cross_val_score

In [4]:
# Seed the tree so the cross-validation scores are reproducible on every run
# (same seed as the train/test split above).
dt_clf = DecisionTreeClassifier(random_state=42)

# 5-fold cross-validation on the training data, scored by accuracy.
scores = cross_val_score(dt_clf, X_train, y_train, cv=5, scoring='accuracy')
print('교차검증 cv=5 정확도 :',np.round(scores,4))
print('교차검증 cv=5 평균정확도 :',np.round(np.mean(scores),4))

교차검증 cv=5 정확도 : [0.9167 0.9583 0.9583 0.9583 0.9167]
교차검증 cv=5 평균정확도 : 0.9417


### GridSearchCV

In [5]:
# Hyper-parameter grid: 3 depths x 2 leaf sizes = 6 candidate settings.
params = {'max_depth':[1,2,3],
          'min_samples_leaf':[2,3]}

# Seed the tree so the grid-search ranking is reproducible on every run.
dt_clf = DecisionTreeClassifier(random_state=42)
# 3-fold CV per candidate; refit=True retrains the best candidate afterwards
# so that grid_dtree.best_estimator_ is ready to predict.
grid_dtree = GridSearchCV(dt_clf, param_grid=params, cv=3, refit=True)
grid_dtree.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [1, 2, 3], 'min_samples_leaf': [2, 3]})

In [6]:
# Tabulate the grid-search results and show the best-ranked candidates first.
score_df = pd.DataFrame(grid_dtree.cv_results_)
(
    score_df
    .loc[:, ['params', 'mean_test_score', 'rank_test_score', 'split0_test_score']]
    .sort_values(by='rank_test_score')
    .head()
)

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score
4,"{'max_depth': 3, 'min_samples_leaf': 2}",0.958333,1,0.95
5,"{'max_depth': 3, 'min_samples_leaf': 3}",0.958333,1,0.95
2,"{'max_depth': 2, 'min_samples_leaf': 2}",0.933333,3,0.925
3,"{'max_depth': 2, 'min_samples_leaf': 3}",0.933333,3,0.925
0,"{'max_depth': 1, 'min_samples_leaf': 2}",0.658333,5,0.675


In [7]:
# Report the best parameter combination and its score from the grid search.
print(f'GridSearchCV 최적 파라미터 : {grid_dtree.best_params_}')
print(f'GridSearchCV 최고 정확도 : {grid_dtree.best_score_:.4f}')

GridSearchCV 최적 파라미터 : {'max_depth': 3, 'min_samples_leaf': 2}
GridSearchCV 최고 정확도 : 0.9583


In [8]:
# Score the best estimator found by the grid search on the held-out test set.
best_est = grid_dtree.best_estimator_
pred = best_est.predict(X_test)
print(f'Test Data ACC : {accuracy_score(y_test, pred):.4f}')

Test Data ACC : 0.9667


## 2. sklearn.preprocessing

### LabelEncoder

In [9]:
# Sample categorical values (with duplicates) used by the encoder demos.
item = [
    'TV',
    '냉장고',
    '전자레인지',
    '컴퓨터',
    '선풍기',
    '선풍기',
    '믹서',
    '믹서',
]

In [10]:
# Learn the distinct item names, then map every name to its integer code.
l_encoder = LabelEncoder().fit(item)
labels = l_encoder.transform(item)

# classes_ lists the learned names; a name's position is its integer code.
print(l_encoder.classes_)
print(labels)

['TV' '냉장고' '믹서' '선풍기' '전자레인지' '컴퓨터']
[0 1 4 5 3 3 2 2]


In [11]:
# Map the integer codes back to the original item names.
l_encoder.inverse_transform(labels)

array(['TV', '냉장고', '전자레인지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서'], dtype='<U5')

### OneHotEncoder

In [12]:
# Step 1: integer-encode the item names.
l_encoder = LabelEncoder()
labels = l_encoder.fit_transform(item)
print(l_encoder.classes_)
print(labels)

# Step 2: reshape the codes into a single column (n_samples, 1) and
# one-hot encode that column.
labels_2d = labels[:, np.newaxis]
oh_encoder = OneHotEncoder()
oh_labels = oh_encoder.fit_transform(labels_2d)
# Convert the encoder output to a dense array for display.
oh_labels.toarray()

['TV' '냉장고' '믹서' '선풍기' '전자레인지' '컴퓨터']
[0 1 4 5 3 3 2 2]


array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

In [13]:
# pandas equivalent of one-hot encoding: one indicator column per distinct item.
# The dtype is pinned because newer pandas versions return boolean indicator
# columns by default, while older versions returned uint8 (0/1).
pd.get_dummies(item, dtype=np.uint8).to_numpy()

array([[1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0]], dtype=uint8)

### StandardScaler

In [14]:
# Standardize each iris feature column and preview the first four rows.
std_scaler = StandardScaler()
iris_scaled = std_scaler.fit(iris.data).transform(iris.data)
iris_scaled[0:4]

array([[-0.90068117,  1.01900435, -1.34022653, -1.3154443 ],
       [-1.14301691, -0.13197948, -1.34022653, -1.3154443 ],
       [-1.38535265,  0.32841405, -1.39706395, -1.3154443 ],
       [-1.50652052,  0.09821729, -1.2833891 , -1.3154443 ]])

### MinMaxScaler

In [15]:
# Min-max scale each iris feature column and preview the first four rows.
norm_scaler = MinMaxScaler()
iris_normed = norm_scaler.fit(iris.data).transform(iris.data)
iris_normed[0:4]

array([[0.22222222, 0.625     , 0.06779661, 0.04166667],
       [0.16666667, 0.41666667, 0.06779661, 0.04166667],
       [0.11111111, 0.5       , 0.05084746, 0.04166667],
       [0.08333333, 0.45833333, 0.08474576, 0.04166667]])

## 3. sklearn.metrics

### confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

In [20]:
def get_clf_eval(y_test, pred, average='binary'):
    """Print the confusion matrix and the main classification metrics.

    Args:
        y_test: Ground-truth labels.
        pred: Predicted labels, aligned with ``y_test``.
        average: Averaging mode forwarded to ``precision_score``,
            ``recall_score`` and ``f1_score``. Defaults to ``'binary'``;
            pass e.g. ``'macro'`` when the target has more than two classes.

    Returns:
        None. The confusion matrix and one summary line are printed.
    """
    cf = confusion_matrix(y_test, pred)
    acc = accuracy_score(y_test, pred)
    pre = precision_score(y_test, pred, average=average)
    rec = recall_score(y_test, pred, average=average)
    f1 = f1_score(y_test, pred, average=average)

    print(cf)
    # Each metric needs its own positional index, and '.4f' (four decimals)
    # rather than '4f' (minimum field width of four).
    print('정확도 :{0:.4f}, 정밀도 :{1:.4f}, 재현율 :{2:.4f}, F1 :{3:.4f}'.format(acc, pre, rec, f1))

### precision_recall_curve

In [None]:
# Extract the predicted probability of the positive class (label 1).
# NOTE(review): `clf` is not defined in any visible cell — assign a fitted
# binary classifier (and binary y_test / X_test) before running this cell.
pred_proba = clf.predict_proba(X_test)[:,1]

precisions, recalls, thresholds = precision_recall_curve(y_test, pred_proba)

# Sample every 15th threshold. The upper bound must be the number of
# thresholds (an index count), not the first threshold value.
thr_idx = np.arange(0, thresholds.shape[0], 15)
print('threshold :',np.round(thresholds[thr_idx],2))
print('precision :',np.round(precisions[thr_idx],3))
print('recall :',np.round(recalls[thr_idx],3))

In [21]:
def precision_recall_curve_plot(y_test, pred_proba):
    """Plot precision and recall as functions of the decision threshold.

    The function carries a ``_plot`` suffix (like ``roc_curve_plot``) so that
    it does not shadow ``sklearn.metrics.precision_recall_curve``, which it
    calls to obtain the curve data.

    Args:
        y_test: Ground-truth binary labels.
        pred_proba: Predicted scores/probabilities of the positive class.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    precisions, recalls, thresholds = precision_recall_curve(y_test, pred_proba)

    plt.figure(figsize=(8,6))
    # Slice precision/recall to the number of thresholds so the x and y
    # arrays passed to plot() have the same length.
    threshold_boundary = thresholds.shape[0]
    plt.plot(thresholds, precisions[0:threshold_boundary], linestyle='--', label='precision')
    plt.plot(thresholds, recalls[0:threshold_boundary], label='recall')

    # Put an x tick every 0.1 across the current axis range.
    start,end = plt.xlim()
    plt.xticks(np.round(np.arange(start,end,0.1),2))
    plt.xlabel('Threshold value')
    plt.ylabel('Precision and Recall')
    plt.legend()
    plt.grid()
    plt.show()

### roc_curve, roc_auc_score

In [None]:
# Extract the predicted probability of the positive class (label 1).
# NOTE(review): `clf` is not defined in any visible cell — assign a fitted
# binary classifier (and binary y_test / X_test) before running this cell.
pred_proba = clf.predict_proba(X_test)[:,1]

fprs, tprs, thresholds = roc_curve(y_test, pred_proba)

# Sample every 5th threshold. The upper bound must be the number of
# thresholds (an index count), not the first threshold value.
thr_idx = np.arange(0, thresholds.shape[0], 5)
print('threshold :',np.round(thresholds[thr_idx],2))
print('fpr :',np.round(fprs[thr_idx],3))
print('tpr :',np.round(tprs[thr_idx],3))

In [23]:
def roc_curve_plot(y_test, pred_proba):
    """Plot the ROC curve together with the diagonal of a random classifier.

    Args:
        y_test: Ground-truth binary labels.
        pred_proba: Predicted scores/probabilities of the positive class.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    fprs, tprs, thresholds = roc_curve(y_test, pred_proba)

    plt.figure(figsize=(8,6))
    plt.plot(fprs, tprs, label='ROC')
    # 'k--' (black dashed) is a format string and must be passed
    # positionally; it is not a valid value for the `linestyle` keyword.
    plt.plot([0,1],[0,1],'k--', label='Random')

    # Put an x tick every 0.1 across the current axis range, then clamp
    # both axes to [0, 1].
    start, end = plt.xlim()
    plt.xticks(np.round(np.arange(start,end,0.1),2))
    plt.xlim(0,1)
    plt.ylim(0,1)
    # The false positive rate equals 1 - specificity (not sensitivity).
    plt.xlabel('FPR(1 - Specificity)')
    plt.ylabel('TPR(Recall)')
    plt.legend()
    plt.show()

In [None]:
# Area under the ROC curve computed from y_test and pred_proba.
roc_score = roc_auc_score(y_test, pred_proba)
print(f'ROC AUC Score : {roc_score:.4f}')