## **실습 4. AI 모델링 최적화**
## 본 실습파일은 <u><b>학습자용</b></u> 입니다.
* 본 과정에서는 웹페이지에서 추출한 Feature(특징) 기반으로 악성사이트를 탐지하는 머신러닝 분류문제를 예제코드를 통해서 해결할 것입니다.
---


### **[실습 프로세스]**
### 0. 데이터 불러오기
### 1. 데이터 전처리
### 2. train_test_split을 이용하여, train_x, test_x, train_y, test_y로 데이터 분리
### 3. GridSearch 활용 AI모델링



# <b>Step 0. 라이브러리 import 및 데이터 불러오기</b>
### **가. 라이브러리 import**

* 데이터 프레임 관련 라이브러리

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings

# Silence every warning category for the rest of the session so cell output
# stays uncluttered.
# NOTE(review): this also hides library warnings that may point at real
# problems (deprecations, data-conversion notices) -- consider narrowing it.
warnings.filterwarnings(action='ignore')
# IPython magic (only valid inside Jupyter/IPython, not plain Python):
# sets the inline backend's figure format option to 'retina'.
%config InlineBackend.figure_format='retina'

### **나.  학습데이터 불러오기**

In [3]:
# Read the four pre-split CSV files in one pass; the first column of each
# file becomes the DataFrame index.
train_x, test_x, train_y, test_y = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('train_x.csv', 'test_x.csv', 'train_y.csv', 'test_y.csv')
)

In [4]:
# Keep the feature column labels of the training table as a plain list.
feature_names = train_x.columns.tolist()

### **다.  데이터 전처리**

In [5]:
from sklearn.preprocessing import StandardScaler

In [6]:
# Display per-column summary statistics (count, mean, std, min, quartiles, max)
# of the training features.
train_x.describe()

Unnamed: 0,url_len,url_num_hyphens_dom,url_hostname_len,url_num_dots,url_num_underscores,url_ip_present,url_entropy,url_port,html_num_tags('iframe'),html_num_tags('script'),html_num_tags('div'),html_num_tags('head'),html_num_tags('body'),html_num_tags('form'),html_num_tags('a'),html_num_tags('embed')_is,html_num_tags('object')_is,url_query_len_is
count,2586.0,2586.0,2586.0,2586.0,2586.0,2586.0,2586.0,2586.0,2586.0,2586.0,2586.0,2586.0,2586.0,2586.0,2586.0,2586.0,2586.0,2586.0
mean,56.508894,0.422274,20.50116,2.526295,0.29041,0.058778,4.234053,0.00232,0.218097,8.469838,72.482985,0.99768,1.016628,0.979892,65.245166,0.016241,0.022428,0.064578
std,85.838214,0.862921,10.078656,1.452898,1.20137,0.235254,0.396204,0.048122,0.937711,11.99236,430.759978,0.149772,0.23129,1.640747,396.814806,0.126427,0.148101,0.245828
min,6.0,0.0,4.0,1.0,0.0,0.0,2.737839,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,24.25,0.0,14.0,2.0,0.0,0.0,3.994701,0.0,0.0,2.0,5.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0
50%,36.0,0.0,17.0,2.0,0.0,0.0,4.198935,0.0,0.0,4.0,31.0,1.0,1.0,1.0,15.0,0.0,0.0,0.0
75%,57.0,1.0,24.0,3.0,0.0,0.0,4.474171,0.0,0.0,11.75,59.0,1.0,1.0,1.0,48.75,0.0,0.0,0.0
max,1837.0,14.0,109.0,26.0,18.0,1.0,5.821782,1.0,26.0,174.0,19941.0,3.0,3.0,57.0,13451.0,1.0,1.0,1.0


In [7]:
# Fit the standardizer on the training features only, then apply that same
# fitted transformation to both the training and the test features.
std_scaler = StandardScaler().fit(train_x)
train_x, test_x = std_scaler.transform(train_x), std_scaler.transform(test_x)

### **라. train_test_split을 이용하여 train/test  데이터 분리**

- test_size = 0.3
- random_state = 2021

In [None]:
# from sklearn.model_selection import train_test_split

In [None]:
# train_test_split 사용

# train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.3, random_state=2021)
# train_x.shape, val_x.shape, train_y.shape, val_y.shape

### **마. Confusion Matrix 함수 정의**
#### Confusion Matrix란 Training 을 통한 Prediction 성능을 측정하기 위해 예측 value와 실제 value를 비교하기 위한 표입니다.
#### 아래 함수는 이번 과제에서 confusion matrix 결과를 보기 쉽게 표현한 것으로 사용 예를 참고하여 모델 결과 확인에 사용하시기 바랍니다.

<span style="color:green">**[참고링크] 공식 Document**</span>
 
* confusion matrix(https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html)

In [8]:
from sklearn.metrics import classification_report as creport
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

In [9]:
def plot_confusion_matrix(ax, matrix, labels=('malicious', 'benign'), title='Confusion matrix', fontsize=9):
    """Draw a confusion matrix on ``ax`` as a heat map annotated with counts.

    Each cell is shaded by its share of its row total, and every non-zero
    count is written in the middle of its cell. The first row of ``matrix``
    is drawn at the top of the plot. The x axis is labelled 'prediction' and
    the y axis 'actual', so rows are actual classes and columns are
    predicted classes.

    Args:
        ax: Matplotlib Axes to draw on.
        matrix: 2-D array-like of counts (NumPy array or nested lists).
        labels: Class names, in the same order as the rows/columns of
            ``matrix``.
            NOTE(review): the default order ('malicious', 'benign') must
            match how the target classes are encoded -- confirm.
        title: Text shown above the plot.
        fontsize: Font size used for tick labels, counts, title and axis
            labels.

    Returns:
        None. The figure is rendered through ``plt.show()``.
    """
    # Work on private copies: the default is an immutable tuple, and nested
    # lists are accepted as well as arrays.
    labels = list(labels)
    counts = np.asarray(matrix, dtype=float)
    positions = range(len(labels))

    # Major ticks sit on the cell borders (they drive the grid lines).
    ax.set_xticks(list(positions))
    ax.set_yticks(list(positions))

    # Class names go on minor ticks, which sit at the cell centres.
    centres = [pos + 0.5 for pos in positions]
    ax.set_xticks(centres, minor=True)
    # Rotation must be numeric; newer matplotlib rejects the string '90'.
    ax.set_xticklabels(labels, rotation=90, fontsize=fontsize, minor=True)
    ax.set_yticks(centres, minor=True)
    # Reversed because the matrix is drawn with its first row at the top.
    ax.set_yticklabels(labels[::-1], fontsize=fontsize, minor=True)

    # Hide major tick labels. These flags must be booleans: the string 'off'
    # is truthy and leaves the labels visible on current matplotlib.
    ax.tick_params(which='major', labelbottom=False, labelleft=False)

    # Hide the minor tick marks but keep their labels.
    ax.tick_params(which='minor', width=0)

    # Row-normalised shading; a row with no samples is shaded as 0 instead of
    # producing NaN from a division by zero.
    row_totals = counts.sum(axis=1, keepdims=True)
    proportions = np.divide(
        counts,
        row_totals,
        out=np.zeros_like(counts),
        where=row_totals != 0,
    )
    ax.pcolor(proportions[::-1], cmap=plt.cm.Blues)

    # Write each non-zero count at the centre of its cell, using the same
    # vertical flip as the heat map.
    for row_idx, row_counts in enumerate(counts[::-1]):
        for col_idx, count in enumerate(row_counts):
            if count != 0:
                ax.text(col_idx + 0.5, row_idx + 0.5, int(count),
                        fontsize=fontsize,
                        horizontalalignment='center',
                        verticalalignment='center')

    # Finishing touches.
    ax.grid(True, linestyle=':')
    ax.set_title(title, fontsize=fontsize)
    ax.set_xlabel('prediction', fontsize=fontsize)
    ax.set_ylabel('actual', fontsize=fontsize)

    # Kept so existing callers that rely on this function to render the
    # figure keep working.
    plt.show()

### <span style="color:blue">[예시] Confusion Matrix 사용 방법</span>

- 샘플
#### > confusion = confusion_matrix(test_y, dt_pred)
#### > fig, ax = plt.subplots(figsize=(10,3))
#### > plot_confusion_matrix(ax, confusion, fontsize=30)

---

# <b>RandomForest GridSearchCV</b>
### 만족할만한 하이퍼파라미터 조합을 찾는 단순한 방법은 수동으로 하이퍼파라미터를 조정하면서 찾는 방법입니다.
### GridSearchCV는 자동으로 복수 개의 내부 모형을 생성하고 이를 모두 실행시켜서 최적의 하이퍼파라미터를 탐색해 줍니다.
### 탐색하고자 하는 하이퍼파라미터를 지정하면 가능한 모든 하이퍼파라미터 조합에 대해 교차 검증을 사용해 평가하게 됩니다.


* 주요 파라미터<br>
<table align="left">
    <tr>
        <td align="center">파라미터 명</td><td align="center">설명</td>
    </tr>
     <tr>
        <td align="center">param_grid</td><td>파라미터 딕셔너리</td>
    </tr>
    <tr>
        <td align="center">scoring</td><td>예측 성능을 측정할 평가 방법</td>
    </tr>
    <tr>
        <td align="center">cv</td><td>교차 검증을 위해 분할되는 폴드 수</td>
    </tr>
</table>

<span style="color:green">**[참고링크] 공식 Document**</span>
 
* GridSearchCV(https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)
* model evaluation(https://scikit-learn.org/stable/modules/model_evaluation.html)

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [12]:
# Grid search over max_depth = 1..4, evaluated with 5-fold cross-validation.
param = {'max_depth': range(1, 5)}
model_RFC = RandomForestClassifier(random_state=1)
model = GridSearchCV(estimator=model_RFC, param_grid=param, cv=5)
model.fit(train_x, train_y)
# Last expression of the cell: show the best setting and its CV score.
model.best_params_, model.best_score_

({'max_depth': 4}, 0.9006183580651665)

In [13]:
# Grid search over max_depth = 3..7, evaluated with 5-fold cross-validation.
param = {'max_depth': range(3, 8)}
model_RFC = RandomForestClassifier(random_state=1)
model = GridSearchCV(estimator=model_RFC, param_grid=param, cv=5)
model.fit(train_x, train_y)
# Last expression of the cell: show the best setting and its CV score.
model.best_params_, model.best_score_

({'max_depth': 7}, 0.9203393501265842)

In [14]:
# Grid search over max_depth = 6..11, evaluated with 5-fold cross-validation.
param = {'max_depth': range(6, 12)}
model_RFC = RandomForestClassifier(random_state=1)
model = GridSearchCV(estimator=model_RFC, param_grid=param, cv=5)
model.fit(train_x, train_y)
# Last expression of the cell: show the best setting and its CV score.
model.best_params_, model.best_score_

({'max_depth': 11}, 0.9361956042807107)

In [15]:
# Grid search over max_depth = 10..19, evaluated with 5-fold cross-validation.
param = {'max_depth': range(10, 20)}
model_RFC = RandomForestClassifier(random_state=1)
model = GridSearchCV(estimator=model_RFC, param_grid=param, cv=5)
model.fit(train_x, train_y)
# Last expression of the cell: show the best setting and its CV score.
model.best_params_, model.best_score_

({'max_depth': 18}, 0.9400633294250316)

In [16]:
# Grid search over max_depth = 16..24, evaluated with 5-fold cross-validation.
param = {'max_depth': range(16, 25)}
model_RFC = RandomForestClassifier(random_state=1)
model = GridSearchCV(estimator=model_RFC, param_grid=param, cv=5)
model.fit(train_x, train_y)
# Last expression of the cell: show the best setting and its CV score.
model.best_params_, model.best_score_

({'max_depth': 18}, 0.9400633294250316)

In [17]:
# Joint grid search over four hyperparameters; every combination is
# evaluated with 5-fold cross-validation.
param = {
    'n_estimators': [100, 200],
    'max_depth': [4, 7, 11, 18],
    # default: 1 (minimum number of samples required at a leaf node)
    'min_samples_leaf': [1, 3, 5, 7, 10],
    # default: 2 (minimum number of samples required to split a node)
    'min_samples_split': [2, 3, 5, 7, 10],
}
model_RFC = RandomForestClassifier(random_state=1)
model = GridSearchCV(estimator=model_RFC, param_grid=param, cv=5)
model.fit(train_x, train_y)
# Last expression of the cell: show the best combination and its CV score.
model.best_params_, model.best_score_

({'max_depth': 18,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 100},
 0.9400633294250316)