# Model Evaluation

## Basics

### Overfitting and Underfitting

#### Over/Underfitting in LinearRegression with virtual data

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# Ground Truth
def true_fun(X):
    """Return the noise-free target ``cos(1.5 * pi * X)``.

    ``X`` may be a scalar or a NumPy array; the cosine is applied
    element-wise by ``np.cos``.
    """
    angle = 1.5 * np.pi * X
    return np.cos(angle)

# Synthetic data: sorted uniform samples in [0, 1) plus small Gaussian noise
np.random.seed(0)
n_samples = 30
degrees = [1, 4, 16]

X = np.sort(np.random.rand(n_samples))
noise = np.random.randn(n_samples) * 0.1
y = true_fun(X) + noise

# scikit-learn estimators expect a 2-D feature matrix, so add a column axis
X_col = X[:, np.newaxis]
# Dense grid used to draw the fitted curve and the true function
X_test = np.linspace(0, 1, 100)

# One panel per polynomial degree
plt.figure(figsize=(14, 5))
tradeoffs=['x\nhigh-bias\n low variance', 'x\nmedian', 'x\nlow-bias\n high variance']
for panel, (degree, x_caption) in enumerate(zip(degrees, tradeoffs), start=1):
    ax = plt.subplot(1, len(degrees), panel)
    plt.setp(ax, xticks=(), yticks=())

    # Polynomial expansion followed by ordinary least squares
    pipeline = Pipeline(
        [
            ("polynomial_features", PolynomialFeatures(degree=degree, include_bias=False)),
            ("linear_regression", LinearRegression()),
        ]
    )
    pipeline.fit(X_col, y)

    # 10-fold cross-validation; the scorer returns negated MSE values
    scores = cross_val_score(
        pipeline, X_col, y, scoring="neg_mean_squared_error", cv=10
    )

    plt.plot(X_test, pipeline.predict(X_test[:, np.newaxis]), label="Model")
    plt.plot(X_test, true_fun(X_test), label="True function")
    plt.scatter(X, y, edgecolor="b", s=20, label="Samples")
    plt.xlabel(x_caption)
    plt.ylabel("y")
    plt.xlim((0, 1))
    plt.ylim((-2, 2))
    plt.legend(loc="best")
    # Negate the mean so the title shows a positive MSE
    plt.title(
        "Degree {}\nMSE = {:.2f}(+/- {:.2f})".format(
            degree, -scores.mean(), scores.std()
        )
    )
plt.show()

#### Over/Underfitting in DecisionTreeClassifier with virtual data


In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from matplotlib import pyplot

# Create a synthetic binary classification data set
X, y = make_classification(n_samples=10000, n_features=20, n_informative=5, n_redundant=15, random_state=1)

# Split into train and test sets; the seed keeps the curve identical across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Accuracy per depth, in the same order as `values`
train_scores, test_scores = [], []

# Tree depths to evaluate
values = list(range(1, 21))

# Fit one decision tree per depth and score it on both splits
for depth in values:
    # Seeded so that tie-breaking between equally good splits is repeatable
    model = DecisionTreeClassifier(max_depth=depth, random_state=1)
    model.fit(X_train, y_train)

    train_acc = accuracy_score(y_train, model.predict(X_train))
    train_scores.append(train_acc)

    test_acc = accuracy_score(y_test, model.predict(X_test))
    test_scores.append(test_acc)

    # Summarize progress
    print('>%d, train: %.3f, test: %.3f' % (depth, train_acc, test_acc))

# Train and test accuracy versus tree depth
pyplot.plot(values, train_scores, '-o', label='Train')
pyplot.plot(values, test_scores, '-o', label='Test')
pyplot.xlabel('max_depth')
pyplot.ylabel('Accuracy')
pyplot.legend()
pyplot.show()

### Bias and Variance

##### Setup

In [None]:
# common lib
import sklearn
import numpy as np

#### California Housing dataset

* The target variable is the median house value for California districts, expressed in hundreds of thousands of dollars.
* target값을 $100,000 기준으로 반올림하여 6개의 class로 범주화 함

In [None]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

# Load the data set; rounding the continuous target to the nearest integer
# turns the regression target into discrete class labels
housing = fetch_california_housing()
housing_X, housing_feature_names = housing.data, housing.feature_names
housing_y = np.round(housing.target).astype(int)

print('Number of target: ',len(set(housing_y)))

# One table: feature columns followed by the class column
housing_col_names = housing_feature_names + ['class']
housing_np = np.column_stack((housing_X, housing_y))

housing_pd = pd.DataFrame(housing_np, columns=housing_col_names)
housing_pd.head(3)

##### train_test_split

In [None]:
from sklearn.model_selection import train_test_split
# No test_size given, so train_test_split's default proportions apply;
# the fixed seed makes the split repeatable
housing_split = train_test_split(housing_X, housing_y, random_state=42)
housing_X_train, housing_X_test, housing_y_train, housing_y_test = housing_split

#### Models Complexity

* High Bias, Low Variance (Model Complexity is Low)
  * High k in kNN Classifier
  * Low max_depth in DecisionTree Classifier
* Low Bias, High Variance (Model Complexity is High)
  * Low k in kNN Classifier
  * High max_depth in DecisionTree Classifier

* visualization 함수

In [None]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

def viz_val_bar(param_range, test_acc, name):
    """Draw one colored bar per entry of ``test_acc`` and show the figure.

    param_range: legend labels, one per bar.
    test_acc: bar heights, plotted on the 'Accuracy' axis.
    name: text used as the x-axis label.
    """
    n_bars = len(param_range)
    plt.figure(figsize=(10, 5))
    # One distinct hue per bar
    palette = sns.color_palette('hls', n_bars)
    bar_container = plt.bar(np.arange(n_bars), test_acc, width=0.3, color=palette)
    plt.xlabel(name, fontsize=15, rotation=30)
    plt.ylabel('Accuracy')
    plt.legend(handles=bar_container, labels=param_range)
    plt.show()

**Model Complexity**

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


# k-NN from low to high model complexity (large k -> small k)
clf1 = KNeighborsClassifier(n_neighbors=100)
clf2 = KNeighborsClassifier(n_neighbors=5)
clf3 = KNeighborsClassifier(n_neighbors=1)

# Decision trees from low to high model complexity (shallow -> deep)
clf4 = DecisionTreeClassifier(max_depth=4, random_state=42)
clf5 = DecisionTreeClassifier(max_depth=8, random_state=42)
clf6 = DecisionTreeClassifier(max_depth=64, random_state=42)

from sklearn.metrics import accuracy_score

clfs = [clf1, clf2, clf3, clf4, clf5, clf6]


def _model_label(clf):
    """Return the class name plus the complexity parameter that sets it apart.

    Using the class name alone would give several models in `clfs` the same
    label, so the printed scores and the bar-chart legend could not be told
    apart.
    """
    name = clf.__class__.__name__
    for param in ('n_neighbors', 'max_depth'):
        if hasattr(clf, param):
            return '{0}({1}={2})'.format(name, param, getattr(clf, param))
    return name


raw_train_accs = []
raw_test_accs = []
class_names = []
for clf in clfs:
    class_name = _model_label(clf)
    class_names.append(class_name)

    clf.fit(housing_X_train, housing_y_train)

    # Accuracy on the data the model was fitted on
    raw_train_acc = accuracy_score(housing_y_train, clf.predict(housing_X_train))
    raw_train_accs.append(raw_train_acc)
    print('{0} Train 정확도: {1:.4f}'.format(class_name, raw_train_acc))

    # Accuracy on the held-out split
    raw_test_acc = accuracy_score(housing_y_test, clf.predict(housing_X_test))
    raw_test_accs.append(raw_test_acc)
    print('{0} Test 정확도: {1:.4f}'.format(class_name, raw_test_acc))

viz_val_bar(class_names, raw_train_accs, 'train')
viz_val_bar(class_names, raw_test_accs, 'test')


### Holdout method

##### Setup

In [None]:
# common lib
import sklearn
import numpy as np

#### train_test_split

**- sklearn.model_selection.[train_test_split(*arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None)](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)**

Split arrays or matrices into random train and test subsets.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

iris = sns.load_dataset("iris")

# NOTE(review): 0:3 keeps only columns 0-2 as features; column 3 is left out
# while column 4 is used as the label -- confirm this is intended.
iris_X = iris.iloc[:, :3]
iris_y = iris.iloc[:, [4]]

# Plain shuffled 50/50 split (no stratification)
from sklearn.model_selection import train_test_split
iris_X_train, iris_X_test, iris_y_train, iris_y_test = train_test_split(
    iris_X, iris_y, test_size=0.5, shuffle=True, random_state=4
)

fig, ax = plt.subplots(ncols=3, figsize=(10,5))

# Class counts: full data, training half, test half
for panel, label_frame in zip(ax, (iris_y, iris_y_train, iris_y_test)):
    sns.countplot(x="species", data=label_frame, ax=panel)
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

iris = sns.load_dataset("iris")

feature_cols, label_cols = slice(0, 3), [4]
iris_X = iris.iloc[:, feature_cols]
iris_y = iris.iloc[:, label_cols]

# 50/50 split that preserves the class proportions of iris_y
from sklearn.model_selection import train_test_split
iris_split = train_test_split(iris_X, iris_y, test_size=0.5, stratify=iris_y, random_state=42)
iris_X_train, iris_X_test, iris_y_train, iris_y_test = iris_split

fig, ax = plt.subplots(ncols=3, figsize=(10,5))

# Class counts: full data, training half, test half
label_frames = [iris_y, iris_y_train, iris_y_test]
for position, label_frame in enumerate(label_frames):
    sns.countplot(x="species", data=label_frame, ax=ax[position])
plt.show()

#### Repeated holdout

The following plots illustrate the issue of increasing the pessimistic bias of a performance estimate if we make the test set too large -- because we withhold too many examples from model training, the model doesn't reach its capacity -- this assumes that we would fit the model on the whole training set after model evaluation. On the other hand, if we decrease the size of the test set, the estimate of the generalization performance will have a larger variance.

In [None]:
import matplotlib.pyplot as plt
from mlxtend.data import iris_data
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

iris_X, iris_y = iris_data()

clf_1 = KNeighborsClassifier(n_neighbors=3,
                             weights='uniform',
                             algorithm='kd_tree',
                             leaf_size=30,
                             p=2,
                             metric='minkowski',
                             metric_params=None,
                             n_jobs=1)

test_sizes=[0.05, 0.2, 0.9]

# 50 split seeds drawn once from a fixed RNG. The RNG is seeded identically
# for every test size, so generating the seeds once outside the loop yields
# the same seeds as regenerating them per size.
rng = np.random.RandomState(seed=12345)
seeds = np.arange(10**5)
rng.shuffle(seeds)
seeds = seeds[:50]

for size in test_sizes:
    # Test accuracy of each of the 50 repetitions for this test size
    pred_2 = []
    for i in seeds:
        iris_X_train, iris_X_test, iris_y_train, iris_y_test = train_test_split(
            iris_X, iris_y,
            test_size=size,
            random_state=i,
            stratify=iris_y)
        y_pred_i = clf_1.fit(iris_X_train, iris_y_train).predict(iris_X_test)
        y_pred_i_acc = np.mean(iris_y_test == y_pred_i)
        pred_2.append(y_pred_i_acc)

    pred_2 = np.asarray(pred_2)
    print()
    with plt.style.context('fivethirtyeight'):
        plt.bar(range(0, pred_2.shape[0]), pred_2, color='gray', alpha=0.7)
        # Dashed lines and shaded band mark the min-max spread over repetitions
        plt.axhline(pred_2.max(), color='k', linewidth=1, linestyle='--')
        plt.axhline(pred_2.min(), color='k', linewidth=1, linestyle='--')
        plt.axhspan(pred_2.min(), pred_2.max(), alpha=0.2, color='steelblue')
        plt.xlabel('Repetition with test_size: {}'.format(size))
        plt.ylabel('Accuracy')
        # Same y-range for every test size so the panels are comparable
        plt.ylim([0.5, 1.05])
        plt.title('Average: {:.2f}%\n Variance: {:.2f}'.format(np.mean(pred_2)*100, np.var(pred_2)*100))
        plt.tight_layout()
        plt.show()

## Cross-Validation

### Setup

In [None]:
# Common imports
import sklearn
import numpy as np

###  KFold
**- sklearn.model_selection.[KFold(n_splits=5, *, shuffle=False, random_state=None)](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html)**

Provides train/test indices to split data in train/test sets. Split dataset into k consecutive folds (without shuffling by default). Each fold is then used once as a validation while the k - 1 remaining folds form the training set.

* n_splits: int, default=5
  * Number of folds. Must be at least 2.
* shuffle: bool, default=False
  * Whether to shuffle the data before splitting into batches.

**n_splits=5**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold

iris = sns.load_dataset("iris")

iris_X = iris.iloc[:, :3]
iris_y = iris.iloc[:, [4]]

# Consecutive (unshuffled) folds
n_folds = 5
kf = KFold(n_splits=n_folds)

fig, ax = plt.subplots(ncols=n_folds, figsize=(15,5))

# Class counts of the training part of each fold, one panel per fold
for fold_ax, (train_idx, test_idx) in zip(ax, kf.split(iris_X, iris_y)):
    train_labels = iris_y.iloc[train_idx]
    sns.countplot(x="species", data=train_labels, ax=fold_ax)

plt.show()

**shuffle=True**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold

iris = sns.load_dataset("iris")

iris_X = iris.iloc[:,0:3]
iris_y = iris.iloc[:,[4]]

# The explicit fold count keeps the subplot grid and the splitter in sync;
# the fixed seed makes the shuffled folds (and the figure) repeatable.
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

fig, ax = plt.subplots(ncols=n_splits, figsize=(15,5))

# Class counts of the training part of each fold, one panel per fold
for idx, (train_idx, test_idx) in enumerate(kf.split(iris_X, iris_y)):
    sns.countplot(x="species", data=iris_y.iloc[train_idx], ax=ax[idx])

plt.show()

### StratifiedKFold

**- sklearn.model_selection.[StratifiedKFold(n_splits=5, *, shuffle=False, random_state=None)](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html)**

Provides train/test indices to split data in train/test sets.

This cross-validation object is a variation of KFold that returns stratified folds. The folds are made by preserving the percentage of samples for each class.

*  As discussed in the lecture, it's important to stratify the splits (very crucial for small datasets!)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold

iris = sns.load_dataset("iris")

iris_X = iris.iloc[:,0:3]
iris_y = iris.iloc[:,[4]]

# The explicit fold count keeps the subplot grid and the splitter in sync;
# the fixed seed makes the shuffled stratified folds repeatable.
n_splits = 5
sf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

fig, ax = plt.subplots(ncols=n_splits, figsize=(15,5))

# Class counts of the training part of each fold, one panel per fold
for idx, (train_idx, test_idx) in enumerate(sf.split(iris_X, iris_y)):
    sns.countplot(x="species", data=iris_y.iloc[train_idx], ax=ax[idx])

plt.show()

### cross_val_score

**- sklearn.model_selection.[cross_val_score(estimator, X, y=None, *, groups=None, scoring=None, cv=None, n_jobs=None, verbose=0, fit_params=None, pre_dispatch='2*n_jobs', error_score=nan)](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html)**

**CV**
  * default 5-fold
  * Determines the cross-validation splitting strategy

* visualization 함수

In [None]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

def viz_val_bar(param_range, test_acc, name):
    """Show a bar chart of ``test_acc`` with one bar per entry of ``param_range``.

    param_range: only its length is used, to position the bars.
    test_acc: bar heights, plotted on the 'Accuracy' axis.
    name: text used as the x-axis label.
    """
    positions = np.arange(len(param_range))
    plt.figure(figsize=(10, 5))
    plt.bar(positions, test_acc, width=0.3)
    plt.xlabel(name, fontsize=15)
    plt.ylabel('Accuracy')
    plt.show()

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 10 consecutive folds, depth-limited tree, all available cores
tree_clf = DecisionTreeClassifier(random_state=123, max_depth=3)
fold_splitter = KFold(n_splits=10)
cv_acc = cross_val_score(estimator=tree_clf, X=iris_X, y=iris_y, cv=fold_splitter, n_jobs=-1)

# Mean and variance of the per-fold accuracies, both scaled by 100
mean_pct = np.mean(cv_acc)*100
var_scaled = np.var(cv_acc)*100
print('Average Accuracy: %.2f%%' % (mean_pct))
print('Variance: {:.2f}'.format(var_scaled))
viz_val_bar(range(len(cv_acc)),cv_acc,'KFold')

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

# Same model as the plain KFold run, scored on shuffled stratified folds
stratified_folds = StratifiedKFold(n_splits=10, random_state=123, shuffle=True)
cv_acc = cross_val_score(
    estimator=DecisionTreeClassifier(random_state=123, max_depth=3),
    X=iris_X,
    y=iris_y,
    cv=stratified_folds,
    n_jobs=-1,
)

fold_ids = range(len(cv_acc))
print('Average Accuracy: %.2f%%' % (np.mean(cv_acc)*100))
print('Variance: {:.2f}'.format(np.var(cv_acc)*100))
viz_val_bar(fold_ids,cv_acc,'StratifiedKFold')

### Validation_curve

**- sklearn.model_selection.[validation_curve(estimator, X, y, *, param_name, param_range, groups=None, cv=None, scoring=None, n_jobs=None, pre_dispatch='all', verbose=0, error_score=nan, fit_params=None)](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.validation_curve.html) : Returns Scores on training sets and Scores on test set.**

Determine training and test scores for varying parameter values.

Compute scores for an estimator with different values of a specified parameter. This is similar to grid search with one parameter. However, this will also compute training scores and is merely a utility for plotting the results.

#### California Housing dataset

* The target variable is the median house value for California districts, expressed in hundreds of thousands of dollars.
* target값을 $100,000 기준으로 반올림하여 6개의 class로 범주화 함

In [None]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

housing = fetch_california_housing()
housing_X = housing.data
housing_feature_names = housing.feature_names
# Round the continuous target to the nearest integer to get class labels
housing_y = np.round(housing.target).astype(int)

print('Number of target: ',len(set(housing_y)))

# Append the class labels as a last column next to the features
class_column = housing_y[:, np.newaxis]
housing_np = np.hstack((housing_X, class_column))
housing_col_names = housing_feature_names + ['class']

housing_pd = pd.DataFrame(housing_np, columns=housing_col_names)
housing_pd.head(3)

* Visualization 함수

In [None]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

# Plotting helper for numeric hyper-parameters
def viz_val_curve(param_range, train_mean, train_std, test_mean, test_std, param_name, xscale_log=False):
    """Plot training and validation accuracy curves with +/- one std bands.

    param_range: x values (the hyper-parameter settings).
    train_mean, train_std: mean and spread of the training scores per setting.
    test_mean, test_std: mean and spread of the validation scores per setting.
    param_name: text used as the x-axis label.
    xscale_log: when true, the x-axis uses a logarithmic scale.
    """
    curves = (
        (train_mean, train_std,
         dict(color='blue', marker='o', markersize=5, label='Training accuracy')),
        (test_mean, test_std,
         dict(color='green', linestyle='--', marker='s', markersize=5,
              label='Validation accuracy')),
    )
    for mean, std, line_style in curves:
        plt.plot(param_range, mean, **line_style)
        # Shaded band of one standard deviation around the mean
        plt.fill_between(param_range, mean + std, mean - std,
                         alpha=0.15, color=line_style['color'])

    plt.grid()
    plt.legend(loc='lower right')
    if xscale_log:
        plt.xscale('log')
    plt.xlabel(param_name)
    plt.ylabel('Accuracy')
    # Pad the y-range below the lowest validation mean and above the highest training mean
    lower, upper = np.min(test_mean)*0.8, np.max(train_mean)*1.2
    plt.ylim([lower, upper])
    plt.tight_layout()
    plt.show()

# Plotting helper for categorical hyper-parameters
def viz_val_bar(param_range, train_mean, train_std, test_mean, test_std, param_name):
    """Show a bar chart of the mean validation accuracy per parameter value.

    param_range: category names, used as x tick labels.
    train_mean, train_std, test_std: accepted but not used by this function.
    test_mean: bar heights.
    param_name: text used as the x-axis label.
    """
    positions = np.arange(len(param_range))
    plt.bar(positions, test_mean, width=0.3)
    plt.xlabel(param_name)
    plt.ylabel('Accuracy')
    # Zoom the y-axis to +/- 10 % around the observed range
    lower, upper = np.min(test_mean)*0.9, np.max(test_mean)*1.1
    plt.ylim([lower, upper])
    plt.xticks(positions, param_range, fontsize=15)
    plt.show()

**cv**

In [None]:
from sklearn.model_selection import validation_curve

# Sweep max_depth over 1..9
param_name='max_depth'
param_range = list(range(1, 10))

from sklearn.tree import DecisionTreeClassifier

dt_model_house_m = DecisionTreeClassifier(random_state=42)

train_scores, test_scores = validation_curve(
    estimator=dt_model_house_m,
    X=housing_X,
    y=housing_y,
    param_name=param_name,
    param_range=param_range,
    scoring='accuracy',
    cv=10,
)

# Aggregate along axis 1 to get one mean/std per parameter value
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

viz_val_curve(param_range, train_mean, train_std, test_mean, test_std, param_name)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import validation_curve

# Same max_depth sweep (1..9), scored on shuffled stratified folds
param_range = list(range(1, 10))
param_name='max_depth'

from sklearn.tree import DecisionTreeClassifier

dt_model_house_m = DecisionTreeClassifier(random_state=42)
stratified_folds = StratifiedKFold(n_splits=10, random_state=123, shuffle=True)

train_scores, test_scores = validation_curve(
    estimator=dt_model_house_m,
    X=housing_X,
    y=housing_y,
    param_name=param_name,
    param_range=param_range,
    scoring='accuracy',
    cv=stratified_folds,
)

# One mean/std per parameter value (aggregated along axis 1)
train_mean = np.mean(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_std = np.std(test_scores, axis=1)

viz_val_curve(param_range, train_mean, train_std, test_mean, test_std, param_name)

## Evaluation Metrics

### Loading the Breast Cancer Wisconsin dataset


*   In the Breast Cancer Wisconsin dataset, the first column stores the unique ID numbers of patients
*   The second column stores the corresponding cancer diagnoses (M = malignant, B = benign)
*   Columns 3-32 contain features that were extracted from digitized images of the nuclei of the cancer cells, which can be used to build a model to predict whether a tumor is benign or malignant.
*   The Breast Cancer Wisconsin dataset has been deposited in the UCI Machine Learning Repository, and more detailed information about this dataset can be found at https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic).

In [None]:
import pandas as pd

# The file has no header row, so columns get integer labels
wdbc_url = ('https://archive.ics.uci.edu/ml/'
            'machine-learning-databases'
            '/breast-cancer-wisconsin/wdbc.data')
df = pd.read_csv(wdbc_url, header=None)

print('shape: ',df.shape)
df.head()

*   First, we are converting the class labels from a string format into integers

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns labelled 2 and up are the features; column 1 holds the string labels
le = LabelEncoder()
X = df.loc[:, 2:].values
y = le.fit_transform(df.loc[:, 1].values)
le.classes_

*   Here, class "M" (malignant cancer) will be converted to class 1, and "B" will be converted into class 0 (the order the class labels are mapped depends on the alphabetical order of the string labels)

In [None]:
# Show the integer codes assigned to the two diagnosis labels
sample_labels = ['M', 'B']
le.transform(sample_labels)

*   Next, we split the data into 80% training data and 20% test data, using a stratified split


In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split with a fixed seed
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, test_size=0.20, stratify=y, random_state=1)

### Confusion Matrix




In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix

# Standardize the features, then classify with 5-nearest-neighbours
knn_steps = (StandardScaler(), KNeighborsClassifier(n_neighbors=5))
pipe_knn = make_pipeline(*knn_steps)

y_pred = pipe_knn.fit(X_train, y_train).predict(X_test)

# labels=[1,0] puts class 1 in the first row and column
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[1,0])
confmat

In [None]:
# With labels=[1,0], row and column 0 belong to the positive class (1), so the
# matrix is [[TP, FN], [FP, TN]] and ravel() yields TP, FN, FP, TN.
# Unpacking as tn, fp, fn, tp is only valid for the default label order [0, 1].
tp, fn, fp, tn = confusion_matrix(y_test, y_pred, labels=[1,0]).ravel()
(tn, fp, fn, tp)

### Multiclass to Binary

In [None]:
# Four-class toy example
y_target, y_predicted = (
    [1, 1, 1, 0, 0, 2, 0, 3],
    [1, 0, 1, 0, 0, 2, 1, 3],
)

cm1 = confusion_matrix(y_true=y_target, y_pred=y_predicted)
print(cm1)

#### Visualization

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Tick labels must follow the row/column order of each matrix:
# confmat was built with labels=[1,0]; cm1 used the default sorted order 0..3.
confmat_display = ConfusionMatrixDisplay(confmat, display_labels=[1, 0]).plot()
cm1_display = ConfusionMatrixDisplay(cm1, display_labels=[0, 1, 2, 3]).plot()

### More examples

*   https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html?highlight=confusion%20matrix#sklearn.metrics.confusion_matrix
*   https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html#sklearn.metrics.confusion_matrix
*   http://rasbt.github.io/mlxtend/user_guide/evaluate/confusion_matrix/
*   http://rasbt.github.io/mlxtend/user_guide/plotting/plot_confusion_matrix/

### Precision, Recall, F1 Score and Balanced accuracy

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix

# Scaling + 5-NN pipeline, refitted for the metric examples
scaler, knn = StandardScaler(), KNeighborsClassifier(n_neighbors=5)
pipe_knn = make_pipeline(scaler, knn)
pipe_knn.fit(X_train, y_train)

y_pred = pipe_knn.predict(X_test)

# Class 1 first, then class 0
label_order = [1,0]
confmat = confusion_matrix(y_test, y_pred, labels=label_order)

print(confmat)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, \
                            recall_score, f1_score, matthews_corrcoef, balanced_accuracy_score


# (output template, metric function) pairs, printed in this order
metric_reports = [
    ('Accuracy: %.3f', accuracy_score),
    ('Precision: %.3f', precision_score),
    ('Recall: %.3f', recall_score),
    ('F1: %.3f', f1_score),
    ('MCC: %.3f', matthews_corrcoef),
    ('Balanced accuracy: %.3f', balanced_accuracy_score),
]
for template, metric in metric_reports:
    print(template % metric(y_true=y_test, y_pred=y_pred))

### Using those Metrics in GridSearch

**- sklearn.model_selection.[GridSearchCV(estimator, param_grid, scoring=None, n_jobs=None, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', error_score=nan, return_train_score=False)](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) : Exhaustive search over specified parameter values for an estimator.**

**[scoring 종류](https://runebook.dev/ko/docs/scikit_learn/modules/model_evaluation)**

* F1 score

In [None]:
from sklearn.model_selection import GridSearchCV


# Candidate values for the two tuned k-NN hyper-parameters
param_range_n_neighbors = [3, 5, 7, 9, 15, 21, 31]
param_range_weights = ['uniform','distance']

pipe_knn = make_pipeline(StandardScaler(), KNeighborsClassifier())

# Keys use the '<step name>__<parameter>' form to reach into the pipeline
knn_grid = {
    'kneighborsclassifier__n_neighbors': param_range_n_neighbors,
    'kneighborsclassifier__weights': param_range_weights,
}
param_grid = [knn_grid]

# Exhaustive search scored by F1 with 10-fold cross-validation
gs = GridSearchCV(estimator=pipe_knn, param_grid=param_grid,
                  scoring='f1', cv=10, n_jobs=-1)

gs = gs.fit(X_train, y_train)
for result in (gs.best_score_, gs.best_params_):
    print(result)

* average_precision

In [None]:
from sklearn.model_selection import GridSearchCV


# Candidate values for the two tuned k-NN hyper-parameters
param_range_weights = ['uniform','distance']
param_range = [3, 5, 7, 9, 15, 21, 31]

knn_pipeline_steps = [StandardScaler(), KNeighborsClassifier()]
pipe_knn = make_pipeline(*knn_pipeline_steps)

param_grid = [dict([
    ('kneighborsclassifier__n_neighbors', param_range),
    ('kneighborsclassifier__weights', param_range_weights),
])]

# Exhaustive search scored by average precision with 10-fold cross-validation
search_options = dict(scoring='average_precision', cv=10, n_jobs=-1)
gs = GridSearchCV(estimator=pipe_knn, param_grid=param_grid, **search_options)

gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

* F1_score, average='micro'

In [None]:
from sklearn.metrics import make_scorer
from mlxtend.data import iris_data

X_iris, y_iris = iris_data()

# f1_score needs an explicit averaging mode for a multiclass target.
# 'micro' pools all decisions before computing F1; pass average='macro'
# instead to weight every class equally.
scorer = make_scorer(f1_score, average='micro')

from sklearn.model_selection import GridSearchCV

# Candidate values for the two tuned k-NN hyper-parameters
param_range = [3, 5, 7, 9, 15, 21, 31]
param_range_weights = ['uniform','distance']

pipe_knn = make_pipeline(StandardScaler(),
                         KNeighborsClassifier())

# Keys use the '<step name>__<parameter>' form to reach into the pipeline
param_grid = [{'kneighborsclassifier__n_neighbors': param_range, 'kneighborsclassifier__weights': param_range_weights}]

# Exhaustive search scored by the custom scorer with 10-fold cross-validation
gs = GridSearchCV(estimator=pipe_knn,
                  param_grid=param_grid,
                  scoring=scorer,
                  cv=10,
                  n_jobs=-1)


gs = gs.fit(X_iris, y_iris)
print(gs.best_score_)
print(gs.best_params_)

### Receiver Operating Characteristic curve(ROC curve)

#### Load the Breast Cancer Wisconsin dataset

In [None]:
import pandas as pd

# Reload the data so this section can run on its own
wdbc_url = ('https://archive.ics.uci.edu/ml/'
            'machine-learning-databases'
            '/breast-cancer-wisconsin/wdbc.data')
df = pd.read_csv(wdbc_url, header=None)

from sklearn.preprocessing import LabelEncoder

# Columns labelled 2 and up are the features; column 1 holds the string labels
le = LabelEncoder()
X = df.loc[:, 2:].values
y = le.fit_transform(df.loc[:, 1].values)
le.classes_

from sklearn.model_selection import train_test_split

# Stratified 80/20 split with a fixed seed
wdbc_split = train_test_split(X, y, test_size=0.20, stratify=y, random_state=1)
X_train, X_test, y_train, y_test = wdbc_split

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

# Keep only two feature columns (4 and 14) to make the curve more interesting
X_train2 = X_train[:, [4, 14]]
X_test2 = X_test[:, [4, 14]]

pipe_knn = make_pipeline(StandardScaler(),
                         KNeighborsClassifier())
pipe_knn.fit(X_train2, y_train)


def _plot_roc(y_true, y_score, name, **plot_kwargs):
    """Draw one ROC curve with two reference lines; return the display.

    y_true: true binary labels.
    y_score: scores for the positive class.
    name: legend name of the curve.
    plot_kwargs: forwarded to ``RocCurveDisplay.plot`` (e.g. ``color``).
    """
    fpr, tpr, _ = metrics.roc_curve(y_true, y_score)
    display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr,
                                      roc_auc=metrics.auc(fpr, tpr),
                                      estimator_name=name)
    display.plot(**plot_kwargs)
    # Dashed diagonal: TPR equal to FPR
    plt.plot([0, 1], [0, 1], linestyle='--', color='black')
    # Dotted corner path through (0, 1)
    plt.plot([0, 0, 1], [0, 1, 1], linestyle=':', color='black')
    return display


## train roc curve (column 1 of predict_proba is the score for class 1)
y_preds_train = pipe_knn.predict_proba(X_train2)
display_train = _plot_roc(y_train, y_preds_train[:, 1], 'Train', color='red')

## test roc curve
y_preds_test = pipe_knn.predict_proba(X_test2)
display_test = _plot_roc(y_test, y_preds_test[:, 1], 'Test')

plt.show()

# Exercise

## 1번 문제 (Confidence interval and Resampling)
iris 데이터를 아래 모델들을 사용하여 분류하고, cross validation curve함수를 사용하여 Test set의 accuracy가 가장 높은 파라미터 조합을 도출하시오.
  * GaussianNB
  * kNeighborsClassifier

* Iris dataset

```python
from sklearn.datasets import load_iris
iris = load_iris() 
```

## 1번 문제 답안

## 2번 문제

Breast cancer 데이터를 사용하여 아래 Evaluation Metrics를 표현하시오.

* KNeighborsClassifier를 이용하여 데이터를 분류하고 confusion matrix로 결과를 도출하시오.
* DecisionTreeClassifier를 이용하여 데이터를 분류하고 Precision, Recall, F1 Score, Balanced_accuracy를 도출하시오.



```python
from sklearn import datasets
import pandas as pd

breast = datasets.load_breast_cancer()
breast_X = breast["data"]
breast_y = breast["target"]
breast_feature_name = breast.feature_names

pd.DataFrame(breast_X, columns=breast_feature_name).head(5)
```



## 2번 문제 답안