# 나이브 베이즈 분류 & KNN

In [14]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import  matplotlib.pyplot as plt
import seaborn as sns

# Plot styling. Order matters: sns.set_theme() rewrites rcParams and resets
# font.family to its `font` argument ('sans-serif' by default), so the Korean
# font has to be configured after the style/theme calls or it is silently lost.
plt.style.use("ggplot")
sns.set_theme(style="whitegrid")
mpl.rcParams['font.family'] = 'Malgun Gothic'  # font that ships Hangul glyphs
mpl.rcParams['axes.unicode_minus'] = False  # use ASCII '-' for negative ticks

# NOTE(review): IPython deprecated IPython.display.set_matplotlib_formats in
# 7.23 in favour of matplotlib_inline.backend_inline; migrate when convenient.
from IPython.display import set_matplotlib_formats
set_matplotlib_formats("retina")

import warnings
# Silences every warning for the rest of the session, deprecations included.
warnings.filterwarnings("ignore")

from sklearn.datasets import load_iris,load_breast_cancer, load_boston
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, f1_score, r2_score
from sklearn.metrics import precision_recall_curve, roc_auc_score, roc_curve, mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline

In [6]:
# Print the usual binary-classification metrics
def get_clf_eval(y_test, pred):
    """Print the confusion matrix, then accuracy, precision, recall and F1.

    Precision, recall and F1 are computed with scikit-learn's default
    averaging, so this helper is meant for binary targets. Nothing is
    returned; the results are only printed.
    """
    matrix = confusion_matrix(y_test, pred)

    # Same order as the placeholders in the report line below.
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    scores = [scorer(y_test, pred) for scorer in scorers]

    print(matrix)
    print('정확도 :{0:.4f}, 정밀도 :{1:.4f}, 재현율 :{2:.4f}, F1 :{3:.4f}'.format(*scores))

# Print multiclass classification metrics
def get_multiclf_eval(y_test, pred, average=None):
    """Print the confusion matrix, then accuracy, precision, recall and F1.

    Args:
        y_test: True class labels.
        pred: Predicted class labels.
        average: Averaging mode ('micro', 'macro' or 'weighted') applied to
            precision, recall and F1 alike. When None (the default) the
            historical mix is kept so existing output does not change:
            micro precision, macro recall and weighted F1. Those three
            numbers use different averaging and are not directly comparable,
            so pass an explicit mode when a consistent report is needed.

    Returns:
        None. The results are only printed.
    """
    if average is None:
        pre_avg, rec_avg, f1_avg = 'micro', 'macro', 'weighted'
    else:
        pre_avg = rec_avg = f1_avg = average

    cf = confusion_matrix(y_test, pred)
    acc = accuracy_score(y_test, pred)
    pre = precision_score(y_test, pred, average=pre_avg)
    rec = recall_score(y_test, pred, average=rec_avg)
    f1 = f1_score(y_test, pred, average=f1_avg)

    print(cf)
    print('정확도 :{0:.4f}, 정밀도 :{1:.4f}, 재현율 :{2:.4f}, F1 :{3:.4f}'.format(acc,pre,rec,f1))

# ROC curve
def roc_curve_plot(y_test, pred_proba):
    """Plot the ROC curve together with the chance diagonal.

    Args:
        y_test: True binary labels.
        pred_proba: Positive-class scores or probabilities, passed straight
            to roc_curve.

    Returns:
        None. The figure is shown with plt.show().
    """
    fprs, tprs, _ = roc_curve(y_test, pred_proba)

    plt.figure(figsize=(8, 6))
    plt.plot(fprs, tprs, label='ROC')
    plt.plot([0, 1], [0, 1], linestyle='--', color='k', label='Random')

    # Clamp both axes to the unit square and tick every 0.1 from 0 to 1.
    # Deriving the ticks from the auto-scaled limits (which include margins)
    # would place them off the 0.1 grid.
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.xticks(np.round(np.linspace(0, 1, 11), 2))
    # FPR = FP / (FP + TN), i.e. 1 - specificity (not 1 - sensitivity).
    plt.xlabel('FPR(1 - Specificity)')
    plt.ylabel('TPR(Recall)')
    plt.legend()
    plt.show()

## 나이브 베이즈 분류(Naive Bayes Classification)

In [63]:
# Load the iris data
X, y = load_iris(return_X_y=True)

# Split first, then fit the scaler on the training rows only. Fitting it on
# the full data set would leak test-set mean/std into the training features.
# The fixed random_state selects the same rows as before.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Kept so any later cell that reads X_scaled still works: every row, scaled
# with the training-set statistics.
X_scaled = scaler.transform(X)

In [64]:
# Gaussian naive Bayes: fit on the training split, then predict the test split
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

In [65]:
# Class priors of the fitted model, rounded to two decimals
print('클래스별 사전 확률 :', gnb.class_prior_.round(2))

클래스별 사전 확률 : [0.32 0.31 0.37]


In [66]:
# Evaluate the naive Bayes predictions on the test split
get_multiclf_eval(y_test=y_test, pred=y_pred)

[[11  0  0]
 [ 0 13  0]
 [ 0  1  5]]
정확도 :0.9667, 정밀도 :0.9667, 재현율 :0.9444, F1 :0.9658


## KNN(K-Nearest Neighbors)

In [67]:
# Load the iris data
X, y = load_iris(return_X_y=True)

# Split first, then fit the scaler on the training rows only. Fitting it on
# the full data set would leak test-set mean/std into the training features,
# which matters for a distance-based model such as KNN.
# The fixed random_state selects the same rows as before.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Kept so any later cell that reads X_scaled still works: every row, scaled
# with the training-set statistics.
X_scaled = scaler.transform(X)

In [75]:
# Exhaustive search over neighbor count and vote weighting, scored by
# cross-validated accuracy (cv=3); refit=True retrains the best combination
# on all of X_train once the search is done.
knn = KNeighborsClassifier()
param_grid = dict(
    n_neighbors=[3, 5, 7, 9, 12, 15],
    weights=['uniform', 'distance'],
)
grid_cv = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    refit=True,
)
grid_cv.fit(X_train,y_train)

GridSearchCV(cv=3, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [3, 5, 7, 9, 12, 15],
                         'weights': ['uniform', 'distance']},
             scoring='accuracy')

In [76]:
# Report the winning hyper-parameters and their mean cross-validated accuracy
best_params, best_score = grid_cv.best_params_, grid_cv.best_score_
print('grid_cv.best_params_ :', best_params)
print('grid_cv.best_score_ :', best_score)

grid_cv.best_params_ : {'n_neighbors': 9, 'weights': 'uniform'}
grid_cv.best_score_ : 0.9499999999999998


In [80]:
# Score the refit best estimator on the held-out test split
best = grid_cv.best_estimator_
pred = best.predict(X_test)
get_multiclf_eval(y_test=y_test, pred=pred)

[[11  0  0]
 [ 0 13  0]
 [ 0  0  6]]
정확도 :1.0000, 정밀도 :1.0000, 재현율 :1.0000, F1 :1.0000
