# 1번 문제

* Biomechanical features of orthopedic patients 데이터를 분석하고, 강의에서 배운 지도학습 모델들을 토대로 classification 하시오.
  * seaborn 라이브러리 import and usage
    * dataset의 features의 상관 관계를 seaborn 라이브러리를 통해 간단히 시각화 할 수 있습니다.
    * 
    ```python
    import matplotlib.pyplot as plt
    import seaborn as sns
    sns.heatmap(data.corr(),annot=True)
    plt.show()
    ```
  * feature의 분포와 features 사이의 상관관계를 확인하고 여러 모델의 정확도를 측정하시오.
    * dataset을 train set과 test set으로 분할하고, test set을 통해 각 모델의 정확도를 측정하시오.
  * 가장 높은 정확도를 가지는 모델을 확인하시오.



# Biomechanical features of orthopedic patients dataset

**Content**

각 환자는 골반과 요추의 모양과 방향에서 도출된 6가지 생체역학적 속성으로 데이터 세트에 표현됨 (각 속성이 하나의 column):

* pelvic incidence
* pelvic tilt
* lumbar lordosis angle
* sacral slope
* pelvic radius
* grade of spondylolisthesis

**Inspiration**

이러한 생체역학 features를 사용하여 label에 따라 환자를 분류할 수 있음.

## Setup

In [None]:
# common lib
import sklearn
import numpy as np

## Dataset

### read csv by pandas

#### github

In [None]:
import pandas as pd

# Raw-GitHub location of the CSV used throughout this problem.
url = 'https://raw.githubusercontent.com/EugeneYoo/practice_file/main/column_2C_weka.csv'
# Download and parse the file (decoded as UTF-8) into a DataFrame.
data = pd.read_csv(url, encoding='utf8')
# Last expression of the cell: displays the first rows as a quick sanity check.
data.head()

### Data Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Correlate the numeric feature columns only. `data` still contains the
# 'class' label column at this point (it is used as `hue` in the pairplot and
# split off as the target later); with a non-numeric column present,
# DataFrame.corr() raises in pandas >= 2.0 instead of silently skipping it.
# select_dtypes keeps the result identical on older pandas versions.
numeric_features = data.select_dtypes(include='number')
sns.heatmap(numeric_features.corr(), annot=True)
plt.show()

In [None]:
# Pairwise scatter plots / per-feature distributions, colored by the 'class' label.
sns.pairplot(data,hue="class", palette='husl')
plt.show()

## Preprocess

### Split to data, target



In [None]:
# Name of the label column; every other column is treated as a feature.
target_name = 'class'

# Feature table: all columns except the label.
patients_X = data.drop([target_name],axis = 1) 
# Label values taken straight from the label column (no encoding applied here).
patients_y = data[target_name].values

### Min, Max, Var, Std of features

In [None]:
# Largest value of each feature column (axis=0 reduces over the rows).
patients_X.max(axis=0)

In [None]:
# Smallest value of each feature column (axis=0 reduces over the rows).
patients_X.min(axis=0)

In [None]:
# Variance of each feature column; used below to pick a high-variance feature.
patients_X.var(axis=0)

### Select feature and split

**<font color='red'>feature 두 개를 선택하시오.</font>**
* 조건
  1. Variance가 높은 feature 하나를 선택
  2. 선택된 feature와 상관 관계가 없는 feature 선택

In [None]:
# List the available feature columns.
print(patients_X.columns)

# Candidate two-feature subset for the exercise.
features = ['pelvic_radius','degree_spondylolisthesis'] # you can choose

patients_X_selected = patients_X[features]

# NOTE(review): this rebinding discards the two-feature subset selected just
# above, so everything below is trained on ALL feature columns. Presumably a
# deliberate toggle — confirm; remove this line to actually use `features`.
patients_X_selected = patients_X

from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as the test set; random_state pins the shuffle so the
# split is the same on every run.
patients_X_train, patients_X_test, patients_y_train, patients_y_test = train_test_split(patients_X_selected, patients_y, test_size=0.3, random_state=42)

### Scaling 


In [None]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

# Every scaler learns its statistics from the training split only; the fitted
# scaler is then applied unchanged to both the training and the test split.

# StandardScaler variant.
scaler_standard = StandardScaler().fit(patients_X_train)
patients_X_train_standard = scaler_standard.transform(patients_X_train)
patients_X_test_standard = scaler_standard.transform(patients_X_test)

# RobustScaler variant.
scaler_robust = RobustScaler().fit(patients_X_train)
patients_X_train_robust = scaler_robust.transform(patients_X_train)
patients_X_test_robust = scaler_robust.transform(patients_X_test)

# MinMaxScaler variant.
scaler_minmax = MinMaxScaler().fit(patients_X_train)
patients_X_train_minmax = scaler_minmax.transform(patients_X_train)
patients_X_test_minmax = scaler_minmax.transform(patients_X_test)

## Accuracy for each model

#### Visualization 함수

In [None]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

def viz_val_bar(param_range, test_acc, param_name):
  """Draw one colored bar per entry of ``param_range`` showing its accuracy.

  Args:
    param_range: Sequence of labels, one per bar; shown as the legend entries.
    test_acc: Accuracy values in the same order as ``param_range``.
    param_name: Text used for the x-axis label.
  """
  n_bars = len(param_range)
  positions = np.arange(n_bars)
  palette = sns.color_palette('hls', n_bars)
  bar_container = plt.bar(positions, test_acc, width=0.3, color=palette)
  plt.xlabel(param_name)
  plt.ylabel('Accuracy')
  # Zoom the y-axis to 1% beyond the lowest/highest accuracy so that small
  # differences between the bars stay visible.
  lowest, highest = np.min(test_acc), np.max(test_acc)
  plt.ylim([lowest*0.99, highest*1.01])
  plt.legend(handles=bar_container, labels=param_range)
  plt.show()

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# One untuned instance of every candidate model; the tree-based ones are
# seeded so their scores are reproducible.
clf1 = KNeighborsClassifier()
clf2 = GaussianNB()
clf3 = LogisticRegression()
clf4 = DecisionTreeClassifier(random_state=1)
clf5 = RandomForestClassifier(random_state=1)

from sklearn.metrics import accuracy_score

clfs = [clf1, clf2, clf3, clf4, clf5]
# Legend labels for the bar charts, in the same order as `clfs`.
class_names = [clf.__class__.__name__ for clf in clfs]


def _test_accuracies(models, X_train, X_test):
  """Fit every model on ``X_train`` and return its accuracy on ``X_test``.

  Labels are the notebook-level ``patients_y_train`` / ``patients_y_test``.
  Each call re-fits the given estimator instances, so the same list can be
  scored against several differently scaled versions of the features.

  Args:
    models: Iterable of scikit-learn classifiers.
    X_train: Training features.
    X_test: Test features, preprocessed the same way as ``X_train``.

  Returns:
    List of test accuracies, in the order of ``models``.
  """
  scores = []
  for model in models:
    model.fit(X_train, patients_y_train)
    scores.append(accuracy_score(patients_y_test, model.predict(X_test)))
  return scores


# Same models, four versions of the features: unscaled, standard-, robust-
# and min-max-scaled.
raw_accs = _test_accuracies(clfs, patients_X_train, patients_X_test)
viz_val_bar(class_names, raw_accs, 'raw')

standard_accs = _test_accuracies(clfs, patients_X_train_standard, patients_X_test_standard)
viz_val_bar(class_names, standard_accs, 'standard')

robust_accs = _test_accuracies(clfs, patients_X_train_robust, patients_X_test_robust)
viz_val_bar(class_names, robust_accs,'robust')

minmax_accs = _test_accuracies(clfs, patients_X_train_minmax, patients_X_test_minmax)
viz_val_bar(class_names, minmax_accs,'minmax')


**<font color='red'>정확도가 높게 나타나는 모델 2개를 선택하고 스케일링 기법과 파라미터를 조정하여 더 높은 정확도를 도출하시오.</font>**

# 1번 문제 답안

## kNN

### Setup

In [None]:
# common lib
import sklearn
import numpy as np

### Dataset

#### github

In [None]:
import pandas as pd

# Raw-GitHub location of the CSV (same file as in the problem statement above).
url = 'https://raw.githubusercontent.com/EugeneYoo/practice_file/main/column_2C_weka.csv'
# Download and parse the file (decoded as UTF-8) into a DataFrame.
data = pd.read_csv(url, encoding='utf8')
# Last expression of the cell: displays the first rows.
data.head()

### Preprocess

#### Split to data, target



In [None]:
# Name of the label column; every other column is treated as a feature.
target_name = 'class'

# Feature table: all columns except the label.
patients_X = data.drop([target_name],axis = 1) 
# Label values taken straight from the label column.
patients_y = data[target_name].values

#### Select feature and train_test_split

In [None]:
# Candidate two-feature subset.
features = ['pelvic_radius','degree_spondylolisthesis']

patients_X_selected = patients_X[features]

# NOTE(review): this rebinding discards the subset selected just above, so the
# kNN answer is trained on ALL feature columns. Presumably intentional — confirm.
patients_X_selected = patients_X

from sklearn.model_selection import train_test_split
# 70/30 train/test split; random_state pins the shuffle for reproducibility.
patients_X_train, patients_X_test, patients_y_train, patients_y_test = train_test_split(patients_X_selected, patients_y, test_size=0.3, random_state=42)

#### Scaling 


In [None]:
from sklearn.preprocessing import RobustScaler

# Fit the scaler on the training split only, then apply the same fitted
# transformation to the test split.
scaler_robust = RobustScaler()
patients_X_train_scale = scaler_robust.fit_transform(patients_X_train)
patients_X_test_scale = scaler_robust.transform(patients_X_test)

### Make kNN model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline kNN with library-default hyperparameters, trained on the scaled training split.
knn_model_scaled = KNeighborsClassifier()
knn_model_scaled.fit(patients_X_train_scale, patients_y_train)

#### Evaluation

In [None]:
from sklearn import metrics

# Score the baseline kNN on the split it was trained on and on the held-out
# split, printing one accuracy line per split.
for message, split_X, split_y in (
    ('Train Accuracy(scale): {}', patients_X_train_scale, patients_y_train),
    ('Test Accuracy(scale): {}', patients_X_test_scale, patients_y_test),
):
  predict = knn_model_scaled.predict(split_X)
  acc = metrics.accuracy_score(split_y, predict)
  print(message.format(acc))

### Set Hyperparameter(n_neighbors, weights)

#### Visualization 함수

In [None]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

# Plotting helper for numeric hyperparameters.
def viz_val_curve(param_range, train_mean, train_std, test_mean, test_std, param_name, xscale_log=False):
  """Plot training and validation accuracy against a numeric hyperparameter.

  Each mean curve is drawn together with a translucent band spanning
  mean +/- std.

  Args:
    param_range: Hyperparameter values used as x coordinates.
    train_mean: Mean training accuracy per value; must support ``+``/``-``
      with ``train_std`` (e.g. NumPy arrays).
    train_std: Spread of the training accuracy per value.
    test_mean: Mean validation accuracy per value.
    test_std: Spread of the validation accuracy per value.
    param_name: Label for the x-axis.
    xscale_log: If true, the x-axis is switched to a log scale.
  """
  series = (
      (train_mean, train_std, 'blue',
       {'marker': 'o', 'markersize': 5, 'label': 'Training accuracy'}),
      (test_mean, test_std, 'green',
       {'linestyle': '--', 'marker': 's', 'markersize': 5, 'label': 'Validation accuracy'}),
  )
  for mean, std, color, line_kwargs in series:
    plt.plot(param_range, mean, color=color, **line_kwargs)
    plt.fill_between(param_range, mean + std, mean - std, alpha=0.15, color=color)

  plt.grid()
  plt.legend(loc='lower right')
  if xscale_log:
    plt.xscale('log')
  plt.xlabel(param_name)
  plt.ylabel('Accuracy')
  # y-range: from 20% below the lowest validation mean to 20% above the
  # highest training mean.
  y_low = np.min(test_mean)*0.8
  y_high = np.max(train_mean)*1.2
  plt.ylim([y_low, y_high])
  plt.tight_layout()
  plt.show()

# Plotting helper for categorical hyperparameters.
def viz_val_bar(param_range, train_mean, train_std, test_mean, test_std, param_name):
  """Show the mean validation accuracy of each categorical value as a bar.

  Only ``test_mean`` is drawn. ``train_mean``, ``train_std`` and ``test_std``
  are accepted but not used (presumably to mirror ``viz_val_curve``).

  Args:
    param_range: Category labels, one per bar; used as x tick labels.
    train_mean: Unused.
    train_std: Unused.
    test_mean: Mean validation accuracy per category.
    test_std: Unused.
    param_name: Label for the x-axis.
  """
  positions = np.arange(len(param_range))
  plt.bar(positions, test_mean, width=0.3)
  plt.xticks(positions, param_range, fontsize=15)
  plt.xlabel(param_name)
  plt.ylabel('Accuracy')
  # Leave 10% headroom below the smallest and above the largest bar.
  lowest, highest = np.min(test_mean), np.max(test_mean)
  plt.ylim([lowest*0.9, highest*1.1])
  plt.show()

#### Validation_curve(n_neighbors)

##### **n_neighbors**
* int, default=5
* Number of neighbors to use by default for kneighbors queries

In [None]:
from sklearn.model_selection import validation_curve
from sklearn.pipeline import make_pipeline
# Candidate neighbor counts: 1 .. 16.
param_range = list(range(1, 17))
param_name='n_neighbors'

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import RobustScaler

# Scaler and classifier are tuned as ONE pipeline so that, inside every CV
# fold, the scaler is fitted on that fold's training part only. The original
# cell scaled the complete dataset up front and cross-validated on it, which
# let the held-out test rows influence the hyperparameter choice.
knn_model_k = make_pipeline(RobustScaler(), KNeighborsClassifier())

# Kept from the original cell for backward compatibility: the `weights`
# validation cell further down reads `patients_X_scale`.
scaler_robust = RobustScaler()
patients_X_scale = scaler_robust.fit_transform(patients_X)

# Tune on the training split only; the test split stays untouched until the
# final evaluation. make_pipeline names the step after the lower-cased class,
# hence the 'kneighborsclassifier__' prefix.
train_scores, test_scores = validation_curve(
                estimator=knn_model_k,
                X=patients_X_train,
                y=patients_y_train,
                param_name='kneighborsclassifier__' + param_name,
                param_range=param_range,
                cv=10)

# Aggregate over the CV folds (axis=1): one mean/std per candidate value.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

viz_val_curve(param_range, train_mean, train_std, test_mean, test_std, param_name)

#### Evaluation

In [None]:
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

# kNN with n_neighbors=6 (presumably the value read off the validation curve),
# trained on the scaled training split.
proper_model_k = KNeighborsClassifier(n_neighbors=6)
proper_model_k.fit(patients_X_train_scale, patients_y_train)

# Report accuracy on the training split first, then on the held-out split.
for message, split_X, split_y in (
    ('Train Accuracy(n_neighbors): {}', patients_X_train_scale, patients_y_train),
    ('Test Accuracy(n_neighbors): {}', patients_X_test_scale, patients_y_test),
):
  predict = proper_model_k.predict(split_X)
  acc = metrics.accuracy_score(split_y, predict)
  print(message.format(acc))

#### Validation_curve(weights)

##### **weights**
* default='uniform'
* Weight function used in prediction. 
  * 'uniform': uniform weights. All points in each neighborhood are weighted equally.
  * 'distance': weight points by the inverse of their distance
  * [callable]: a user-defined function which accepts an array of distances, and returns an array of the same shape containing the weights

In [None]:
from sklearn.model_selection import validation_curve
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
# The two built-in weighting schemes to compare.
param_range = ['uniform', 'distance']
param_name='weights'

from sklearn.neighbors import KNeighborsClassifier

# Scaler + classifier in one pipeline: the scaler is re-fitted inside every CV
# fold, and tuning uses the training split only, so the held-out test rows
# cannot influence the choice of `weights`.
# NOTE(review): n_neighbors is 5 here although the preceding evaluation cell
# uses n_neighbors=6 — confirm which value is intended.
knn_model_w = make_pipeline(RobustScaler(), KNeighborsClassifier(n_neighbors=5))

# make_pipeline names the step after the lower-cased class, hence the prefix.
train_scores, test_scores = validation_curve(
                estimator=knn_model_w,
                X=patients_X_train,
                y=patients_y_train,
                param_name='kneighborsclassifier__' + param_name,
                param_range=param_range,
                cv=5)

# Aggregate over the CV folds (axis=1): one mean/std per candidate value.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

viz_val_bar(param_range, train_mean, train_std, test_mean, test_std, param_name)

#### Evaluation

In [None]:
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

# kNN with explicit n_neighbors and weights settings, trained on the scaled
# training split.
proper_model_w = KNeighborsClassifier(n_neighbors=5, weights='uniform')
proper_model_w.fit(patients_X_train_scale, patients_y_train)

# Report accuracy on the training split first, then on the held-out split.
for message, split_X, split_y in (
    ('Train Accuracy(weights): {}', patients_X_train_scale, patients_y_train),
    ('Test Accuracy(weights): {}', patients_X_test_scale, patients_y_test),
):
  predict = proper_model_w.predict(split_X)
  acc = metrics.accuracy_score(split_y, predict)
  print(message.format(acc))

## DecisionTree

### Setup

In [None]:
# common lib
import sklearn
import numpy as np

### Dataset

#### github

In [None]:
import pandas as pd

# Raw-GitHub location of the CSV (same file as in the problem statement above).
url = 'https://raw.githubusercontent.com/EugeneYoo/practice_file/main/column_2C_weka.csv'
# Download and parse the file (decoded as UTF-8) into a DataFrame.
data = pd.read_csv(url, encoding='utf8')
# Last expression of the cell: displays the first rows.
data.head()

### Preprocess

#### Split to data, target



In [None]:
# Name of the label column; every other column is treated as a feature.
target_name = 'class'

# Feature table: all columns except the label.
patients_X = data.drop([target_name],axis = 1) 
# Label values taken straight from the label column.
patients_y = data[target_name].values

#### Select feature and train_test_split

In [None]:
# Candidate two-feature subset.
features = ['pelvic_radius','degree_spondylolisthesis']

patients_X_selected = patients_X[features]

# NOTE(review): this rebinding discards the subset selected just above, so the
# decision tree is trained on ALL feature columns. Presumably intentional — confirm.
patients_X_selected = patients_X

from sklearn.model_selection import train_test_split
# 70/30 train/test split; random_state pins the shuffle for reproducibility.
patients_X_train, patients_X_test, patients_y_train, patients_y_test = train_test_split(patients_X_selected, patients_y, test_size=0.3, random_state=42)

#### Scaling 


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, then apply the same fitted
# transformation to the test split.
scaler_minmax = MinMaxScaler()
patients_X_train_scale = scaler_minmax.fit_transform(patients_X_train)
patients_X_test_scale = scaler_minmax.transform(patients_X_test)

### Make DecisionTree model

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Baseline decision tree with default hyperparameters. The seed is fixed
# because tree construction is randomized (features are permuted at each
# split), so an unseeded tree can report different accuracies from run to run.
# Seed 1 matches the DecisionTreeClassifier used in the model-comparison cell.
dt_model_scaled = DecisionTreeClassifier(random_state=1)
dt_model_scaled.fit(patients_X_train_scale, patients_y_train)

#### Evaluation

In [None]:
from sklearn import metrics

# Score the baseline tree on the split it was trained on and on the held-out
# split, printing one accuracy line per split.
for message, split_X, split_y in (
    ('Train Accuracy(scale): {}', patients_X_train_scale, patients_y_train),
    ('Test Accuracy(scale): {}', patients_X_test_scale, patients_y_test),
):
  predict = dt_model_scaled.predict(split_X)
  acc = metrics.accuracy_score(split_y, predict)
  print(message.format(acc))

### Set Hyperparameter(max_depth)

#### Visualization 함수

In [None]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

# Plotting helper for numeric hyperparameters.
def viz_val_curve(param_range, train_mean, train_std, test_mean, test_std, param_name, xscale_log=False):
  """Plot training and validation accuracy against a numeric hyperparameter.

  Each mean curve is drawn together with a translucent band spanning
  mean +/- std.

  Args:
    param_range: Hyperparameter values used as x coordinates.
    train_mean: Mean training accuracy per value; must support ``+``/``-``
      with ``train_std`` (e.g. NumPy arrays).
    train_std: Spread of the training accuracy per value.
    test_mean: Mean validation accuracy per value.
    test_std: Spread of the validation accuracy per value.
    param_name: Label for the x-axis.
    xscale_log: If true, the x-axis is switched to a log scale.
  """
  series = (
      (train_mean, train_std, 'blue',
       {'marker': 'o', 'markersize': 5, 'label': 'Training accuracy'}),
      (test_mean, test_std, 'green',
       {'linestyle': '--', 'marker': 's', 'markersize': 5, 'label': 'Validation accuracy'}),
  )
  for mean, std, color, line_kwargs in series:
    plt.plot(param_range, mean, color=color, **line_kwargs)
    plt.fill_between(param_range, mean + std, mean - std, alpha=0.15, color=color)

  plt.grid()
  plt.legend(loc='lower right')
  if xscale_log:
    plt.xscale('log')
  plt.xlabel(param_name)
  plt.ylabel('Accuracy')
  # y-range: from 20% below the lowest validation mean to 20% above the
  # highest training mean.
  y_low = np.min(test_mean)*0.8
  y_high = np.max(train_mean)*1.2
  plt.ylim([y_low, y_high])
  plt.tight_layout()
  plt.show()

# Plotting helper for categorical hyperparameters.
def viz_val_bar(param_range, train_mean, train_std, test_mean, test_std, param_name):
  """Show the mean validation accuracy of each categorical value as a bar.

  Only ``test_mean`` is drawn. ``train_mean``, ``train_std`` and ``test_std``
  are accepted but not used (presumably to mirror ``viz_val_curve``).

  Args:
    param_range: Category labels, one per bar; used as x tick labels.
    train_mean: Unused.
    train_std: Unused.
    test_mean: Mean validation accuracy per category.
    test_std: Unused.
    param_name: Label for the x-axis.
  """
  positions = np.arange(len(param_range))
  plt.bar(positions, test_mean, width=0.3)
  plt.xticks(positions, param_range, fontsize=15)
  plt.xlabel(param_name)
  plt.ylabel('Accuracy')
  # Leave 10% headroom below the smallest and above the largest bar.
  lowest, highest = np.min(test_mean), np.max(test_mean)
  plt.ylim([lowest*0.9, highest*1.1])
  plt.show()

#### Validation_curve(max_depth)

In [None]:
from sklearn.model_selection import validation_curve
from sklearn.pipeline import make_pipeline
# Candidate tree depths: 1 .. 16.
param_range = list(range(1, 17))
param_name='max_depth'

from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler

# Scaler + tree are tuned as ONE pipeline on the training split only. The
# original cell scaled the complete dataset up front and cross-validated on
# it, which let the held-out test rows influence the choice of max_depth.
# random_state is fixed so the curve is reproducible (seed 1 matches the
# DecisionTreeClassifier in the model-comparison cell).
dt_model_scaled = make_pipeline(MinMaxScaler(), DecisionTreeClassifier(random_state=1))

# make_pipeline names the step after the lower-cased class, hence the
# 'decisiontreeclassifier__' prefix.
train_scores, test_scores = validation_curve(
                estimator=dt_model_scaled,
                X=patients_X_train,
                y=patients_y_train,
                param_name='decisiontreeclassifier__' + param_name,
                param_range=param_range,
                cv=10)

# Aggregate over the CV folds (axis=1): one mean/std per candidate depth.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

viz_val_curve(param_range, train_mean, train_std, test_mean, test_std, param_name)

In [None]:
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier

# Depth-limited tree. The seed is fixed so the reported accuracies are
# reproducible (seed 1 matches the DecisionTreeClassifier in the
# model-comparison cell).
dt_model_scaled = DecisionTreeClassifier(max_depth=4, random_state=1)
dt_model_scaled.fit(patients_X_train_scale, patients_y_train)

# The labels say "max_depth" (the original printed "scale", copied from the
# baseline cell) so this output can be told apart from the baseline tree's.
predict = dt_model_scaled.predict(patients_X_train_scale)
acc = metrics.accuracy_score(patients_y_train, predict)
print('Train Accuracy(max_depth): {}'.format(acc))

predict = dt_model_scaled.predict(patients_X_test_scale)
acc = metrics.accuracy_score(patients_y_test, predict)
print('Test Accuracy(max_depth): {}'.format(acc))

# 2번 문제

* Email spam 데이터를 분석하고, 강의에서 배운 지도학습 모델들을 토대로 classification 하시오.
  * Vectorize
    * 
    ```python
    from sklearn.feature_extraction.text import CountVectorizer
    count_vectorizer = CountVectorizer() 
    ```
  * dataset을 train set과 test set으로 분할하고, test set을 통해 각 모델의 정확도를 측정하시오.
  * 가장 높은 정확도를 가지는 모델을 확인하시오.



# Email spam dataset

**Content**

이 데이터 세트는 임의로 수집한 메일을 스팸(spam) 또는 햄(ham)으로 분류해 놓은 데이터 세트입니다. 첫 번째 열은 스팸/햄 클래스, 두 번째 열은 메일 본문입니다 (each one is a column):

* v1(class)
* v2(text)

**Inspiration**

이러한 text를 사용하여 label에 따라 Email을 분류할 수 있음.

## Setup

In [None]:
# common lib
import sklearn
import numpy as np

## Dataset

### read csv by pandas

#### github

In [None]:
import pandas as pd

# Raw-GitHub location of the spam CSV.
url = 'https://raw.githubusercontent.com/EugeneYoo/practice_file/main/spam.csv'
# The file is decoded as ISO-8859-1 rather than UTF-8.
data_email = pd.read_csv(url, encoding="ISO-8859-1")
# Rename the columns positionally; this assignment requires the CSV to have
# exactly five columns.
data_email.columns = ['class', 'sms1', 'sms2', 'sms3', 'sms4']
# Last expression of the cell: displays the first rows.
data_email.head()

## Preprocess

### Split to data, target



In [None]:
# Name of the label column; the remaining columns hold the message text.
target_name = 'class'

# All columns except the label.
email_X_raw = data_email.drop([target_name],axis = 1) 
# Labels reshaped to a single column (n, 1): the encoder applied below works on 2-D input.
email_y_raw = data_email[target_name].values.reshape(-1,1)

### Missing value deletion

In [None]:
# Keep only the 'sms1' column as a flat Python list of messages; 'sms2'-'sms4'
# are discarded (presumably mostly-missing overflow columns — TODO confirm).
email_X = email_X_raw['sms1'].tolist()

### Ordinal encoding

In [None]:
from sklearn.preprocessing import OrdinalEncoder
# Encode the class strings as numeric codes. ravel() flattens the (n, 1)
# encoder output back to the 1-D shape used as the target below.
# NOTE(review): scikit-learn's LabelEncoder is the encoder intended for target
# labels; OrdinalEncoder works here only because of the reshape — consider switching.
ordinal_encoder = OrdinalEncoder()
email_y = ordinal_encoder.fit_transform(email_y_raw).ravel()
# Last expression of the cell: peek at the first ten encoded labels.
email_y[:10]

### Split to train and test set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the messages as the test set; random_state pins the shuffle.
email_X_train, email_X_test, email_y_train, email_y_test = train_test_split(email_X, email_y, test_size=0.3, random_state=42)

### Vectorization for text data
* sklearn.feature_extraction.text.[CountVectorizer()](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)
  * 문서를 토큰 리스트로 변환
  * 각 문서에서 토큰의 출현 빈도 카운트
  * 각 문서를 Bag of words(BOW) 인코딩 벡터로 변환


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training messages only, then encode both
# splits with that same vocabulary.
# NOTE(review): .toarray() materializes a dense matrix, presumably because
# GaussianNB in the comparison below cannot take sparse input — confirm
# before removing it.
count_vectorizer = CountVectorizer() 
email_X_train_count = count_vectorizer.fit_transform(email_X_train).toarray()
email_X_test_count = count_vectorizer.transform(email_X_test).toarray()

## Accuracy for each model

#### Visualization 함수

In [None]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

def viz_val_bar(param_range, test_acc, param_name):
  """Draw one colored bar per entry of ``param_range`` on a 10x5 figure.

  Args:
    param_range: Sequence of labels, one per bar; shown as the legend entries.
    test_acc: Accuracy values in the same order as ``param_range``.
    param_name: Text used for the x-axis label.
  """
  n_bars = len(param_range)
  positions = np.arange(n_bars)
  plt.figure(figsize=(10,5))
  palette = sns.color_palette('hls', n_bars)
  bar_container = plt.bar(positions, test_acc, width=0.3, color=palette)
  plt.xlabel(param_name)
  plt.ylabel('Accuracy')
  # Zoom the y-axis to 1% beyond the lowest/highest accuracy so that small
  # differences between the bars stay visible.
  lowest, highest = np.min(test_acc), np.max(test_acc)
  plt.ylim([lowest*0.99, highest*1.01])
  # Bars are identified through the legend rather than through x tick labels.
  plt.legend(handles=bar_container, labels=param_range)
  plt.show()

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


from sklearn.metrics import accuracy_score

# One untuned instance of every candidate model; the tree-based ones are
# seeded so their scores are reproducible.
clfs = [
    KNeighborsClassifier(),
    GaussianNB(),
    MultinomialNB(),
    BernoulliNB(),
    LogisticRegression(),
    DecisionTreeClassifier(random_state=1),
    RandomForestClassifier(random_state=1),
]
clf1, clf2, clf3, clf4, clf5, clf6, clf7 = clfs

# Fit each model on the bag-of-words training matrix, score it on the test
# matrix, and print one accuracy line per model.
raw_accs = []
class_names = []
for model in clfs:
  pred = model.fit(email_X_train_count, email_y_train).predict(email_X_test_count)
  class_name = type(model).__name__
  class_names.append(class_name)
  raw_acc = accuracy_score(email_y_test, pred)
  raw_accs.append(raw_acc)
  print('{0} 정확도: {1:.4f}'.format(class_name, raw_acc))

viz_val_bar(class_names, raw_accs, 'models')

* Visualization 함수

In [None]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

# Plotting helper for numeric hyperparameters.
def viz_val_curve(param_range, train_mean, train_std, test_mean, test_std, param_name, xscale_log=False):
  """Plot training and validation accuracy against a numeric hyperparameter.

  Each mean curve is drawn together with a translucent band spanning
  mean +/- std.

  Args:
    param_range: Hyperparameter values used as x coordinates.
    train_mean: Mean training accuracy per value; must support ``+``/``-``
      with ``train_std`` (e.g. NumPy arrays).
    train_std: Spread of the training accuracy per value.
    test_mean: Mean validation accuracy per value.
    test_std: Spread of the validation accuracy per value.
    param_name: Label for the x-axis.
    xscale_log: If true, the x-axis is switched to a log scale.
  """
  series = (
      (train_mean, train_std, 'blue',
       {'marker': 'o', 'markersize': 5, 'label': 'Training accuracy'}),
      (test_mean, test_std, 'green',
       {'linestyle': '--', 'marker': 's', 'markersize': 5, 'label': 'Validation accuracy'}),
  )
  for mean, std, color, line_kwargs in series:
    plt.plot(param_range, mean, color=color, **line_kwargs)
    plt.fill_between(param_range, mean + std, mean - std, alpha=0.15, color=color)

  plt.grid()
  plt.legend(loc='lower right')
  if xscale_log:
    plt.xscale('log')
  plt.xlabel(param_name)
  plt.ylabel('Accuracy')
  # y-range: from 20% below the lowest validation mean to 20% above the
  # highest training mean.
  y_low = np.min(test_mean)*0.8
  y_high = np.max(train_mean)*1.2
  plt.ylim([y_low, y_high])
  plt.tight_layout()
  plt.show()

# Plotting helper for categorical hyperparameters.
# NOTE(review): running this cell rebinds `viz_val_bar` to a six-parameter
# function, replacing the three-parameter version the model-comparison cell
# above calls; re-running that cell afterwards would fail.
def viz_val_bar(param_range, train_mean, train_std, test_mean, test_std, param_name):
  """Show the mean validation accuracy of each categorical value as a bar.

  Only ``test_mean`` is drawn. ``train_mean``, ``train_std`` and ``test_std``
  are accepted but not used (presumably to mirror ``viz_val_curve``).

  Args:
    param_range: Category labels, one per bar; used as x tick labels.
    train_mean: Unused.
    train_std: Unused.
    test_mean: Mean validation accuracy per category.
    test_std: Unused.
    param_name: Label for the x-axis.
  """
  positions = np.arange(len(param_range))
  plt.bar(positions, test_mean, width=0.3)
  plt.xticks(positions, param_range, fontsize=15)
  plt.xlabel(param_name)
  plt.ylabel('Accuracy')
  # Leave 10% headroom below the smallest and above the largest bar.
  lowest, highest = np.min(test_mean), np.max(test_mean)
  plt.ylim([lowest*0.9, highest*1.1])
  plt.show()

**<font color='red'>정확도가 높게 나타나는 모델 1개를 선택하고 파라미터를 조정하여 더 높은 정확도를 도출하시오.</font>**

# 2번 문제 답안

## Multinomial Naive Bayes

### Setup

In [None]:
# Common imports
import sklearn
import numpy as np

### Dataset

#### github

In [None]:
import pandas as pd

# Raw-GitHub location of the spam CSV.
url = 'https://raw.githubusercontent.com/EugeneYoo/practice_file/main/spam.csv'
# The file is decoded as ISO-8859-1 rather than UTF-8.
data_email = pd.read_csv(url, encoding="ISO-8859-1")
# Rename the columns positionally; this assignment requires the CSV to have
# exactly five columns.
data_email.columns = ['class', 'sms1', 'sms2', 'sms3', 'sms4']
# Last expression of the cell: displays the first rows.
data_email.head()

### Preprocess

#### Split to data, target



In [None]:
# Name of the label column; the remaining columns hold the message text.
target_name = 'class'

# All columns except the label.
email_X_raw = data_email.drop([target_name],axis = 1) 
# Labels reshaped to a single column (n, 1): the encoder applied below works on 2-D input.
email_y_raw = data_email[target_name].values.reshape(-1,1)

#### Missing value deletion

In [None]:
# Keep only the 'sms1' column as a flat Python list of messages; 'sms2'-'sms4'
# are discarded (presumably mostly-missing overflow columns — TODO confirm).
email_X = email_X_raw['sms1'].tolist()

#### Ordinal encoding

In [None]:
from sklearn.preprocessing import OrdinalEncoder
# Encode the class strings as numeric codes. ravel() flattens the (n, 1)
# encoder output back to the 1-D shape used as the target below.
ordinal_encoder = OrdinalEncoder()
email_y = ordinal_encoder.fit_transform(email_y_raw).ravel()
# Last expression of the cell: peek at the first ten encoded labels.
email_y[:10]

#### Split to train and test set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the messages as the test set; random_state pins the shuffle.
email_X_train, email_X_test, email_y_train, email_y_test = train_test_split(email_X, email_y, test_size=0.3, random_state=42)

#### Vectorization for text data
* sklearn.feature_extraction.text.[CountVectorizer()](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)
  * 문서를 토큰 리스트로 변환
  * 각 문서에서 토큰의 출현 빈도 카운트
  * 각 문서를 Bag of words(BOW) 인코딩 벡터로 변환


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training messages only, then encode both
# splits with that same vocabulary.
# The matrices are kept sparse: this answer section trains only MultinomialNB,
# which accepts scipy sparse input, and a dense (documents x vocabulary) array
# costs far more memory for the same predictions.
# NOTE(review): the earlier model-comparison cell builds its own dense copies
# for GaussianNB; re-run that section's vectorization cell before re-running
# the comparison.
count_vectorizer = CountVectorizer()
email_X_train_count = count_vectorizer.fit_transform(email_X_train)
email_X_test_count = count_vectorizer.transform(email_X_test)

### Validation_curve

* Visualization 함수

In [None]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

# Plotting helper for numeric hyperparameters.
def viz_val_curve(param_range, train_mean, train_std, test_mean, test_std, param_name, xscale_log=False):
  """Plot training and validation accuracy against a numeric hyperparameter.

  Each mean curve is drawn together with a translucent band spanning
  mean +/- std.

  Args:
    param_range: Hyperparameter values used as x coordinates.
    train_mean: Mean training accuracy per value; must support ``+``/``-``
      with ``train_std`` (e.g. NumPy arrays).
    train_std: Spread of the training accuracy per value.
    test_mean: Mean validation accuracy per value.
    test_std: Spread of the validation accuracy per value.
    param_name: Label for the x-axis.
    xscale_log: If true, the x-axis is switched to a log scale.
  """
  series = (
      (train_mean, train_std, 'blue',
       {'marker': 'o', 'markersize': 5, 'label': 'Training accuracy'}),
      (test_mean, test_std, 'green',
       {'linestyle': '--', 'marker': 's', 'markersize': 5, 'label': 'Validation accuracy'}),
  )
  for mean, std, color, line_kwargs in series:
    plt.plot(param_range, mean, color=color, **line_kwargs)
    plt.fill_between(param_range, mean + std, mean - std, alpha=0.15, color=color)

  plt.grid()
  plt.legend(loc='lower right')
  if xscale_log:
    plt.xscale('log')
  plt.xlabel(param_name)
  plt.ylabel('Accuracy')
  # y-range: from 20% below the lowest validation mean to 20% above the
  # highest training mean.
  y_low = np.min(test_mean)*0.8
  y_high = np.max(train_mean)*1.2
  plt.ylim([y_low, y_high])
  plt.tight_layout()
  plt.show()

# Plotting helper for categorical hyperparameters.
def viz_val_bar(param_range, train_mean, train_std, test_mean, test_std, param_name):
  """Show the mean validation accuracy of each categorical value as a bar.

  Only ``test_mean`` is drawn. ``train_mean``, ``train_std`` and ``test_std``
  are accepted but not used (presumably to mirror ``viz_val_curve``).

  Args:
    param_range: Category labels, one per bar; used as x tick labels.
    train_mean: Unused.
    train_std: Unused.
    test_mean: Mean validation accuracy per category.
    test_std: Unused.
    param_name: Label for the x-axis.
  """
  positions = np.arange(len(param_range))
  plt.bar(positions, test_mean, width=0.3)
  plt.xticks(positions, param_range, fontsize=15)
  plt.xlabel(param_name)
  plt.ylabel('Accuracy')
  # Leave 10% headroom below the smallest and above the largest bar.
  lowest, highest = np.min(test_mean), np.max(test_mean)
  plt.ylim([lowest*0.9, highest*1.1])
  plt.show()

**alpha**
* float, default=1.0
* Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing).
* 학습 데이터에 등장하지 않은 단어의 조건부 확률이 0으로 추정되어 예측이 극단적으로 치우치는 것을 막기 위해, 단어 빈도(분자)에 alpha를, 전체 빈도(분모)에 alpha × (단어 종류 수)를 더해줌

In [None]:
from sklearn.model_selection import validation_curve
from sklearn.pipeline import make_pipeline
# Candidate smoothing strengths: one per decade from 1e-3 to 1e4.
param_range= [10**i for i in range(-3,5)]
param_name='alpha'

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer + classifier are tuned as ONE pipeline so that, inside every CV
# fold, the vocabulary is learned from that fold's training part only. The
# original cell vectorized the complete corpus up front and cross-validated on
# it, which let the held-out test messages influence the choice of alpha.
multinomial_model_count = make_pipeline(CountVectorizer(), MultinomialNB())

# Tune on the raw training messages only. make_pipeline names the step after
# the lower-cased class, hence the 'multinomialnb__' prefix.
train_scores, test_scores = validation_curve(
                estimator=multinomial_model_count,
                X=email_X_train,
                y=email_y_train,
                param_name='multinomialnb__' + param_name,
                param_range=param_range,
                cv=10)

# Aggregate over the CV folds (axis=1): one mean/std per candidate alpha.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# Log-scaled x-axis because the candidates span several orders of magnitude.
viz_val_curve(param_range, train_mean, train_std, test_mean, test_std, param_name, True)

#### Evaluation

In [None]:
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

# Multinomial NB with alpha=0.1 (presumably the value read off the validation
# curve), trained on the bag-of-words training matrix.
multinomial_model_count = MultinomialNB(alpha=1e-1)
multinomial_model_count.fit(email_X_train_count, email_y_train)

# Report accuracy on the training split first, then on the held-out split.
for message, split_X, split_y in (
    ('Train Accuracy(alpha): {}', email_X_train_count, email_y_train),
    ('Test Accuracy(alpha): {}', email_X_test_count, email_y_test),
):
  predict = multinomial_model_count.predict(split_X)
  acc = metrics.accuracy_score(split_y, predict)
  print(message.format(acc))