# Lab 3. Machine Learning - Linear and Logistic Regression, and Decision Tree

## Table of Contents
- Linear regression
- Logistic regression
- Decision Tree

## Download dataset

In [None]:
%%capture
# happiness
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1g-kEqWUe7zYGrxQ2tGTcuT5MN4XDCXFX' -O happiness_train.csv
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1bFKKosFdHXGvU5lPIyjjnhv-aS5aPLj1' -O happiness_test.csv
# iris
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1vThUwXXgL_PtMpYq5PfVlygtLD46I6kA' -O Iris_Train.csv
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1wdwS1iksXAkHlVa3aZwzpkYapATb5nZr' -O Iris_Test.csv
# spam
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=19KW6xooGxbUOq-f6C-5U3_0qhfmcY7zp' -O Spam_train.csv
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1VyD2cL8GSyr8tkpWk8XsgbhpR-W10rpp' -O Spam_test.csv
# titanic
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1Afn3qiznaOEX437j5lEIV8SVmCwqy1Rl' -O Titanic_train.csv
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1CgDkX3xT53xwxTeUOI6MT0YqO2L9mfV4' -O Titanic_test.csv

## Import packages

In [None]:
import os
import copy
from os.path import join

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt

np.random.seed(0)

# 1. Linear Regression

## SKLearn을 이용한 Linear Regression

In [None]:
# 데이터 준비
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=100, n_features=1,
                       bias=100, noise=40, random_state=1)


# 데이터 시각화
plt.scatter(X, y, label="data")
plt.xlabel("x")
plt.ylabel("y")
plt.legend()
plt.show()

In [None]:
# bias를 한번에 계산하기 위해, 1을 X에 추가해 줍니다.
new_X = np.insert(X, 0, 1, axis=1)

# train, test 나눔
train_x, test_x, train_y, test_y = train_test_split(new_X, y, test_size=0.3, shuffle=True, random_state=1)

In [None]:
# Scikit-Learn 으로 학습
from sklearn.linear_model import LinearRegression

simple_linear = LinearRegression()
simple_linear.fit(train_x, train_y)

In [None]:
# train data에 대한 학습 모델 시각화
plt.scatter(train_x[:,1], train_y, label="train data")
plt.plot(train_x[:,1], simple_linear.predict(train_x), 'r-', label="model")
plt.xlabel("x")
plt.ylabel("y")
plt.legend()
plt.show()

In [None]:
# test data에 대한 학습 모델 시각화
plt.scatter(test_x[:,1], test_y, label="test data")
plt.plot(test_x[:,1], simple_linear.predict(test_x), 'r',  label="model")
plt.xlabel("x")
plt.ylabel("y")
plt.legend()
plt.show()

In [None]:
# 평가
def RMSE(pred, target):
    """Return the root-mean-square error between predictions and targets.

    Args:
        pred: Predicted values (NumPy array, list, or scalar).
        target: Ground-truth values (NumPy array, list, or scalar).

    Returns:
        The square root of the mean squared difference, as a NumPy float.
    """
    pred = np.asarray(pred, dtype=float)
    target = np.asarray(target, dtype=float)

    # A column vector (n, 1) minus a flat vector (n,) would broadcast into an
    # (n, n) matrix and silently produce a wrong score, so equal-sized inputs
    # with different shapes are aligned element-wise first.
    if pred.shape != target.shape and pred.size == target.size:
        pred = pred.reshape(-1)
        target = target.reshape(-1)

    error = pred - target
    return np.sqrt(np.mean(error * error))

pred = simple_linear.predict(train_x)
print("Train RMSE =", RMSE(pred, train_y))

pred = simple_linear.predict(test_x)
print("Test RMSE =", RMSE(pred, test_y))

## Linear Regression의 Numerical Solution

In [None]:
# 데이터 준비
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=100, n_features=1,
                       bias=100, noise=40, random_state=1)


# 데이터 시각화
plt.scatter(X, y, label="data")
plt.xlabel("x")
plt.ylabel("y")
plt.legend()
plt.show()

In [None]:
# bias를 한번에 계산하기 위해, 1을 X에 추가해 줍니다.
new_X = np.insert(X, 0, 1, axis=1)

# train, test 나눔
train_x, test_x, train_y, test_y = train_test_split(new_X, y, test_size=0.3, shuffle=True, random_state=1)

In [None]:
class LinearRegression_gd:
    """Linear regression fitted with full-batch gradient descent.

    The model is ``prediction = X @ W`` with no separate intercept term, so
    callers that want a bias must include a constant column in ``X``.
    """

    def __init__(self, num_features, seed):
        """Initialise the weights uniformly in [0, 1).

        Args:
            num_features: Number of columns of the design matrix.
            seed: Seed passed to ``np.random.seed`` (this reseeds NumPy's
                global random state).
        """
        np.random.seed(seed)
        self.W = np.random.rand(num_features, 1)

    def train(self, train_x, train_y, num_epochs, learning_rate):
        """Run ``num_epochs`` gradient-descent steps over the whole data set.

        Args:
            train_x: Design matrix of shape (num_data, num_features).
            train_y: Targets, either flat (num_data,) or a column (num_data, 1).
            num_epochs: Number of full-batch updates to perform.
            learning_rate: Step size applied to the gradient.

        Returns:
            List with one loss value per epoch (half the mean squared error,
            measured before that epoch's weight update).
        """
        loss_memory = []
        # Force a column vector so that `prediction - train_y` is element-wise
        # whether the targets arrive flat or already as a column.
        train_y = np.asarray(train_y, dtype=float).reshape(-1, 1)

        for _ in range(num_epochs):
            prediction = np.matmul(train_x, self.W)

            error = prediction - train_y
            loss = np.mean(error * error) / 2

            # dL/dW: per-feature mean of x * error, shaped (num_features, 1).
            gradient = np.mean(train_x * error, axis=0, keepdims=True).T

            self.W -= learning_rate * gradient
            loss_memory.append(loss)

        return loss_memory

    def predict(self, test_x):
        """Return predictions for ``test_x`` as a 1-D array.

        ``reshape(-1)`` is used instead of ``squeeze()`` so that a single-row
        input still yields an array of shape (1,) rather than a 0-d scalar.
        """
        pred = np.matmul(test_x, self.W).reshape(-1)
        return pred

In [None]:
# Hyper-parameter
num_epochs = 1500
learning_rate = 0.01
seed = 15

num_data, num_features = train_x.shape
model = LinearRegression_gd(num_features, seed)
loss_memory = model.train(train_x, train_y, num_epochs, learning_rate)

In [None]:
# Plot Loss
x_axis = list(range(num_epochs))

plt.plot(x_axis, loss_memory)
plt.title('Loss')
plt.xlabel('Epochs')
plt.ylabel('MSE Loss')

plt.show()

In [None]:
# train data에 대한 학습 모델 시각화
plt.scatter(train_x[:,1], train_y, label="train data")
plt.plot(train_x[:,1], model.predict(train_x), 'r-', label="model")
plt.xlabel("x")
plt.ylabel("y")
plt.legend()
plt.show()

In [None]:
# test data에 대한 학습 모델 시각화
plt.scatter(test_x[:,1], test_y, label="test data")
plt.plot(test_x[:,1], model.predict(test_x), 'r-', label="model")
plt.xlabel("x")
plt.ylabel("y")
plt.legend()
plt.show()

In [None]:
# 평가
def RMSE(pred, target):
    """Return the root-mean-square error between predictions and targets.

    Args:
        pred: Predicted values (NumPy array, list, or scalar).
        target: Ground-truth values (NumPy array, list, or scalar).

    Returns:
        The square root of the mean squared difference, as a NumPy float.
    """
    pred = np.asarray(pred, dtype=float)
    target = np.asarray(target, dtype=float)

    # A column vector (n, 1) minus a flat vector (n,) would broadcast into an
    # (n, n) matrix and silently produce a wrong score, so equal-sized inputs
    # with different shapes are aligned element-wise first.
    if pred.shape != target.shape and pred.size == target.size:
        pred = pred.reshape(-1)
        target = target.reshape(-1)

    error = pred - target
    return np.sqrt(np.mean(error * error))

pred = model.predict(train_x)
print("Train RMSE =", RMSE(pred, train_y))

pred = model.predict(test_x)
print("Test RMSE =", RMSE(pred, test_y))

## Linear Regression의 Analytical Solution

In [None]:
# 데이터 준비
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=100, n_features=1,
                       bias=100, noise=40, random_state=1)


# 데이터 시각화
plt.scatter(X, y, label="data")
plt.xlabel("x")
plt.ylabel("y")
plt.legend()
plt.show()

In [None]:
# bias를 한번에 계산하기 위해, 1을 X에 추가해 줍니다.
new_X = np.insert(X, 0, 1, axis=1)

# train, test 나눔
train_x, test_x, train_y, test_y = train_test_split(new_X, y, test_size=0.3, shuffle=True, random_state=1)

In [None]:
# Normal Equation
# Inverse: np.linalg.inv()
# Mat. Multiplication: np.matmul()
first = None
second = None
W = None

pred_train = None # train set에 대한 예측 값
pred_test = None # test set에 대한 예측 값

In [None]:
#@title 정답코드: Normal Equation
# Reference solution: closed-form least squares, W = (X^T X)^-1 X^T y.
# NOTE(review): np.linalg.inv fails on a singular X^T X; this assumes the
# columns of train_x are linearly independent -- confirm for new data sets.
first = np.linalg.inv(np.matmul(train_x.T, train_x))  # (X^T X)^-1
second = np.matmul(train_x.T, train_y)  # X^T y
W = np.matmul(first, second)

# Predictions for each split are X @ W.
pred_train = np.matmul(train_x, W)
pred_test = np.matmul(test_x, W)

In [None]:
# train data에 대한 학습 모델 시각화
plt.scatter(train_x[:,1], train_y, label="train data")
plt.plot(train_x[:,1], pred_train, 'r-', label="model")
plt.xlabel("x")
plt.ylabel("y")
plt.legend()
plt.show()

In [None]:
# test data에 대한 학습 모델 시각화
plt.scatter(test_x[:,1], test_y, label="test data")
plt.plot(test_x[:,1], pred_test, 'r',  label="model")
plt.xlabel("x")
plt.ylabel("y")
plt.legend()
plt.show()

In [None]:
# 평가
def RMSE(pred, target):
    """Return the root-mean-square error between predictions and targets.

    Args:
        pred: Predicted values (NumPy array, list, or scalar).
        target: Ground-truth values (NumPy array, list, or scalar).

    Returns:
        The square root of the mean squared difference, as a NumPy float.
    """
    pred = np.asarray(pred, dtype=float)
    target = np.asarray(target, dtype=float)

    # A column vector (n, 1) minus a flat vector (n,) would broadcast into an
    # (n, n) matrix and silently produce a wrong score, so equal-sized inputs
    # with different shapes are aligned element-wise first.
    if pred.shape != target.shape and pred.size == target.size:
        pred = pred.reshape(-1)
        target = target.reshape(-1)

    error = pred - target
    return np.sqrt(np.mean(error * error))

print("Train RMSE =", RMSE(pred_train, train_y))

print("Test RMSE =", RMSE(pred_test, test_y))

## 당뇨병 데이터에서, SKLearn, Numerical Solution, Analytical Solution 비교


In [None]:
# 데이터 불러오기
from sklearn.datasets import load_diabetes
datasets = load_diabetes()

# bias를 한번에 계산하기 위해, 1을 X에 추가해 줍니다.
new_X = np.insert(datasets.data, 0, 1, axis=1)

# train, test 나눔
train_x, test_x, train_y, test_y = train_test_split(new_X, datasets.target, test_size=0.3, shuffle=True, random_state=1)

In [None]:
# Scikit-Learn 으로 학습
from sklearn.linear_model import LinearRegression

simple_linear = LinearRegression()
simple_linear.fit(train_x, train_y)

In [None]:
# 평가
def RMSE(pred, target):
    """Return the root-mean-square error between predictions and targets.

    Args:
        pred: Predicted values (NumPy array, list, or scalar).
        target: Ground-truth values (NumPy array, list, or scalar).

    Returns:
        The square root of the mean squared difference, as a NumPy float.
    """
    pred = np.asarray(pred, dtype=float)
    target = np.asarray(target, dtype=float)

    # A column vector (n, 1) minus a flat vector (n,) would broadcast into an
    # (n, n) matrix and silently produce a wrong score, so equal-sized inputs
    # with different shapes are aligned element-wise first.
    if pred.shape != target.shape and pred.size == target.size:
        pred = pred.reshape(-1)
        target = target.reshape(-1)

    error = pred - target
    return np.sqrt(np.mean(error * error))

pred = simple_linear.predict(train_x)
print("Train RMSE =", RMSE(pred, train_y))

pred = simple_linear.predict(test_x)
print("Test RMSE =", RMSE(pred, test_y))

In [None]:
# TODO: Numerical Solution 코드 추가하기

In [None]:
# TODO: Analytical Solution 코드 추가하기

## 행복지수 데이터에서, SKLearn, Numerical Solution, Analytical Solution 비교


In [None]:
import os                            # 데이터 파일 경로 설정
import csv                           # 데이터 파일 로드

def Load_Dataset(filename):
    """Read a CSV file into a bias-augmented feature matrix and a target vector.

    For every data row, column index 2 is parsed as the target and columns
    from index 6 onward are parsed as float features, with a constant 1.0
    prepended as the bias term. Blank rows are skipped.

    Args:
        filename: Path of the CSV file; its first row is treated as a header.

    Returns:
        Tuple ``(header, x_array, y_array)``: the header row as a list of
        strings, the feature matrix, and the target vector (NumPy arrays).
    """
    x_data = []
    y_data = []

    # newline='' lets the csv module do its own line-ending handling, as the
    # csv documentation requires for files passed to csv.reader.
    with open(filename, 'r', newline='') as f:
        csv_reader = csv.reader(f)
        header = next(csv_reader)

        for line in csv_reader:
            if not line:
                # csv.reader yields [] for a blank line (e.g. a trailing
                # newline); indexing it would raise IndexError.
                continue
            x_data.append([1.0] + [float(value) for value in line[6:]])
            y_data.append(float(line[2]))

    return header, np.array(x_data), np.array(y_data)

In [None]:
# Load the happiness train/test splits.
# The download cell saves these files into the current working directory, so
# use relative paths (as the other loaders in this notebook do) instead of the
# Colab-only absolute '/content/...' location.
_, train_x, train_y = Load_Dataset('./happiness_train.csv')
_, test_x, test_y = Load_Dataset('./happiness_test.csv')

In [None]:
# Scikit-Learn 으로 학습
from sklearn.linear_model import LinearRegression

simple_linear = LinearRegression()
simple_linear.fit(train_x, train_y)

In [None]:
# 평가
def RMSE(pred, target):
    """Return the root-mean-square error between predictions and targets.

    Args:
        pred: Predicted values (NumPy array, list, or scalar).
        target: Ground-truth values (NumPy array, list, or scalar).

    Returns:
        The square root of the mean squared difference, as a NumPy float.
    """
    pred = np.asarray(pred, dtype=float)
    target = np.asarray(target, dtype=float)

    # A column vector (n, 1) minus a flat vector (n,) would broadcast into an
    # (n, n) matrix and silently produce a wrong score, so equal-sized inputs
    # with different shapes are aligned element-wise first.
    if pred.shape != target.shape and pred.size == target.size:
        pred = pred.reshape(-1)
        target = target.reshape(-1)

    error = pred - target
    return np.sqrt(np.mean(error * error))

pred = simple_linear.predict(train_x)
print("Train RMSE =", RMSE(pred, train_y))

pred = simple_linear.predict(test_x)
print("Test RMSE =", RMSE(pred, test_y))

In [None]:
# TODO: Numerical Solution 코드 추가하기

In [None]:
# TODO: Analytical Solution 코드 추가하기

# 2. Logistic Regression

## SKLearn을 이용한 Logistic Regression 살펴보기

In [None]:
# 데이터 준비
import os                            # 데이터 파일 경로 설정
import csv                           # 데이터 파일 로드
import numpy as np                   # numpy 행렬 조작
import matplotlib.pyplot as plt      # 그래프 그리기(선택 사항)

def Load_Iris_Dataset(filename):
    """Read a two-column CSV into a bias-augmented feature matrix and labels.

    For every data row, column 0 is parsed as the label and column 1 as the
    single float feature, with a constant 1.0 prepended as the bias term.
    Blank rows are skipped.
    (Presumably the feature is petal width and the label is an
    "is Iris-Virginica" flag, going by the plot labels used in this notebook.)

    Args:
        filename: Path of the CSV file; its first row is treated as a header.

    Returns:
        Tuple ``(header, x_array, y_array)``: the header row as a list of
        strings, the feature matrix, and the label vector (NumPy arrays).
    """
    x_data = []
    y_data = []

    # newline='' lets the csv module do its own line-ending handling, as the
    # csv documentation requires for files passed to csv.reader.
    with open(filename, 'r', newline='') as f:
        csv_reader = csv.reader(f)
        header = next(csv_reader)

        for line in csv_reader:
            if not line:
                # csv.reader yields [] for a blank line; indexing it would
                # raise IndexError.
                continue
            x_data.append([1.0, float(line[1])])
            y_data.append(float(line[0]))

    return header, np.array(x_data), np.array(y_data)

In [None]:
_, x_train, y_train = Load_Iris_Dataset('./Iris_Train.csv')
_, x_test, y_test = Load_Iris_Dataset('./Iris_Test.csv')


# train 데이터 시각화
is_virginica_x = x_train[y_train == 1]
is_virginica_y = y_train[y_train == 1]

not_virginica_x = x_train[y_train == 0]
not_virginica_y = y_train[y_train == 0]

plt.scatter(is_virginica_x[:, 1], is_virginica_y, color='b', label='Iris-Virginica')
plt.scatter(not_virginica_x[:, 1], not_virginica_y, color='r', label='Not Iris-Virginica')

plt.xlabel("Petal width", fontsize=14)
plt.ylabel("Class", fontsize=14)
plt.legend(loc="upper left", fontsize=10)
plt.show()

In [None]:
# Scikit-Learn 으로 학습
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(x_train, y_train)

In [None]:
# Visualise the fitted scikit-learn model on the training data.
is_virginica_x = x_train[y_train == 1]
is_virginica_y = y_train[y_train == 1]

not_virginica_x = x_train[y_train == 0]
not_virginica_y = y_train[y_train == 0]

# Evaluation grid over the feature range, with the bias column prepended so it
# matches the layout of x_train.
plot_feat = np.linspace(0, 3, 100).reshape(-1, 1)
plot_x = np.insert(plot_feat, 0, 1, axis=1)

# lr.predict() returns hard class labels, which would draw a step function
# under a "Probability" axis. predict_proba() returns one column per class in
# lr.classes_ order; column 1 is the positive class for 0/1 labels.
plot_prob = lr.predict_proba(plot_x)[:, 1]

plt.scatter(is_virginica_x[:, 1], is_virginica_y, color='b', label='Iris-Virginica')
plt.scatter(not_virginica_x[:, 1], not_virginica_y, color='r', label='Not Iris-Virginica')


# Probability curve learned by the model
plt.plot(plot_feat, plot_prob, 'g-', label='Hypothesis')

plt.xlabel("Petal width", fontsize=14)
plt.ylabel("Probability", fontsize=14)
plt.legend(loc="upper left", fontsize=10)
plt.show()

In [None]:
# 평가
from sklearn.metrics import accuracy_score

y_pred = lr.predict(x_test)
print('Test Accuracy: {:.2f}%'.format(accuracy_score(y_test, y_pred)*100))

## Numerical Solution 구현

In [None]:
# 데이터 준비
import os                            # 데이터 파일 경로 설정
import csv                           # 데이터 파일 로드
import numpy as np                   # numpy 행렬 조작
import matplotlib.pyplot as plt      # 그래프 그리기(선택 사항)

def Load_Iris_Dataset(filename):
    """Read a two-column CSV into a bias-augmented feature matrix and labels.

    For every data row, column 0 is parsed as the label and column 1 as the
    single float feature, with a constant 1.0 prepended as the bias term.
    Blank rows are skipped.
    (Presumably the feature is petal width and the label is an
    "is Iris-Virginica" flag, going by the plot labels used in this notebook.)

    Args:
        filename: Path of the CSV file; its first row is treated as a header.

    Returns:
        Tuple ``(header, x_array, y_array)``: the header row as a list of
        strings, the feature matrix, and the label vector (NumPy arrays).
    """
    x_data = []
    y_data = []

    # newline='' lets the csv module do its own line-ending handling, as the
    # csv documentation requires for files passed to csv.reader.
    with open(filename, 'r', newline='') as f:
        csv_reader = csv.reader(f)
        header = next(csv_reader)

        for line in csv_reader:
            if not line:
                # csv.reader yields [] for a blank line; indexing it would
                # raise IndexError.
                continue
            x_data.append([1.0, float(line[1])])
            y_data.append(float(line[0]))

    return header, np.array(x_data), np.array(y_data)

In [None]:
_, x_train, y_train = Load_Iris_Dataset('./Iris_Train.csv')
_, x_test, y_test = Load_Iris_Dataset('./Iris_Test.csv')


# train 데이터 시각화
is_virginica_x = x_train[y_train == 1]
is_virginica_y = y_train[y_train == 1]

not_virginica_x = x_train[y_train == 0]
not_virginica_y = y_train[y_train == 0]

plt.scatter(is_virginica_x[:, 1], is_virginica_y, color='b', label='Iris-Virginica')
plt.scatter(not_virginica_x[:, 1], not_virginica_y, color='r', label='Not Iris-Virginica')

plt.xlabel("Petal width", fontsize=14)
plt.ylabel("Class", fontsize=14)
plt.legend(loc="upper left", fontsize=10)
plt.show()

### 모델 정의
1.   __\_\_init\_\___


> *   인자: 모델 설정 (feature 수, random seed)
> *   출력: 없음
> *   기능: 모델 초기화

> weight *W*를 random하게 initialization

2.   __train__


> *   입력: 학습 데이터, 학습 설정 (epoch 수, learning rate)
> *   출력: epoch별 Loss 리스트
> *   기능: 데이터로 모델 학습

> 매 epoch마다 전체 데이터에 대해 loss, grad 계산하여 학습


3. __predict__

> *   입력: 검증 데이터
> *   출력: 모델의 예측값
> *   기능: train으로 학습된 모델로 검증, 예측값 생성

> 검증 데이터에 대해 분류 예측 결과 산출

4. __\_sigmoid__

> *   입력: 실수형 numpy array
> *   출력: sigmoid를 취한 array
> *   기능: 주어진 array에 대한 모든 sigmoid 값 계산

> $\mathrm{sigmoid}(x) = \frac{1}{1 + e^{-x}}$





In [None]:
class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    The model is ``P(y=1 | x) = sigmoid(x @ W)`` with no separate intercept,
    so callers that want a bias must include a constant column in ``x``.

    NOTE: this class has the same name as
    ``sklearn.linear_model.LogisticRegression``; whichever is defined or
    imported last in the session is the one the name refers to.
    """

    def __init__(self, num_features, seed):
        """Initialise the weights uniformly in [0, 1).

        Args:
            num_features: Number of columns of the design matrix.
            seed: Seed passed to ``np.random.seed`` (this reseeds NumPy's
                global random state).
        """
        np.random.seed(seed)
        self.W = np.random.rand(num_features, 1)

    def train(self, train_x, train_y, num_epochs, learning_rate):
        """Run ``num_epochs`` gradient-descent steps over the whole data set.

        Args:
            train_x: Design matrix of shape (num_data, num_features).
            train_y: 0/1 labels, flat (num_data,) or a column (num_data, 1).
            num_epochs: Number of full-batch updates to perform.
            learning_rate: Step size applied to the gradient.

        Returns:
            List with one binary cross-entropy loss value per epoch, measured
            before that epoch's weight update.
        """
        loss_memory = []
        # Force a column vector so that `prob - train_y` is element-wise.
        train_y = np.asarray(train_y, dtype=float).reshape(-1, 1)
        eps = np.finfo(float).eps

        for _ in range(num_epochs):
            # Linear score X W followed by the sigmoid.
            prob = self._sigmoid(np.matmul(train_x, self.W))

            # A saturated sigmoid returns exactly 0 or 1, and log(0) would
            # turn the loss into inf/NaN. Clip only for the loss; the gradient
            # below keeps using the unclipped probabilities.
            safe_prob = np.clip(prob, eps, 1 - eps)
            loss = -np.mean(train_y * np.log(safe_prob)
                            + (1 - train_y) * np.log(1 - safe_prob))

            # Gradient of the cross-entropy: per-feature mean of x * (p - y).
            error = prob - train_y
            grad = np.mean(train_x * error, axis=0, keepdims=True).T

            self.W -= grad * learning_rate

            loss_memory.append(loss)
        return loss_memory

    def predict_prob(self, test_x):
        """Return P(y=1) for every row of ``test_x`` as a 1-D array."""
        prob = self._sigmoid(np.matmul(test_x, self.W))
        return prob.flatten()

    def predict(self, test_x):
        """Return hard 0/1 predictions as a list of ints.

        A row is labelled 1 only when its probability is strictly greater
        than 0.5.
        """
        return (self.predict_prob(test_x) > 0.5).astype(int).tolist()

    def _sigmoid(self, x):
        """Element-wise logistic function ``1 / (1 + exp(-x))``.

        Evaluated through ``exp(-|x|)`` so that large-magnitude inputs never
        overflow ``np.exp``.
        """
        x = np.asarray(x, dtype=float)
        z = np.exp(-np.abs(x))
        return np.where(x >= 0, 1 / (1 + z), z / (1 + z))

In [None]:
# Hyper-parameter 설정 및 학습
num_epochs = 1000
learning_rate = 1e-1
seed = 2

# Training
num_data, num_features = x_train.shape

model = LogisticRegression(num_features, seed)
loss_memory = model.train(x_train, y_train, num_epochs, learning_rate)

In [None]:
# Plot the per-epoch training loss.
# Use the length of loss_memory so the x-axis always matches the recorded
# history, even if num_epochs was edited after training.
x_axis = list(range(len(loss_memory)))

plt.plot(x_axis, loss_memory)
plt.title('Loss')
plt.xlabel('Epochs')
# The logistic model's train() records binary cross-entropy, not MSE.
plt.ylabel('Cross-Entropy Loss')

plt.show()

In [None]:
# train data에 대한 학습 모델 시각화
is_virginica_x = x_train[y_train == 1]
is_virginica_y = y_train[y_train == 1]

not_virginica_x = x_train[y_train == 0]
not_virginica_y = y_train[y_train == 0]

plot_feat = np.linspace(0, 3, 100).reshape(-1, 1)
plot_x = np.insert(plot_feat, 0, 1, axis=1)
plot_prob = model.predict_prob(plot_x)

plt.scatter(is_virginica_x[:, 1], is_virginica_y, color='b', label='Iris-Virginica')
plt.scatter(not_virginica_x[:, 1], not_virginica_y, color='r', label='Not Iris-Virginica')

# 모델이 학습한 Probability
plt.plot(plot_feat, plot_prob, 'g-', label='Hypothesis')

plt.xlabel("Petal width", fontsize=14)
plt.ylabel("Probability", fontsize=14)
plt.legend(loc="upper left", fontsize=10)
plt.show()

In [None]:
# Visualise the trained model's hard decisions on the training data.
is_virginica_x = x_train[y_train == 1]
is_virginica_y = y_train[y_train == 1]

not_virginica_x = x_train[y_train == 0]
not_virginica_y = y_train[y_train == 0]

# Evaluation grid over the feature range, with the bias column prepended.
plot_feat = np.linspace(0, 3, 100).reshape(-1, 1)
plot_x = np.insert(plot_feat, 0, 1, axis=1)
# model.predict() thresholds the probability at 0.5, so plot_prob holds 0/1
# class decisions here (a step function), not probabilities.
plot_prob = model.predict(plot_x)

plt.scatter(is_virginica_x[:, 1], is_virginica_y, color='b', label='Iris-Virginica')
plt.scatter(not_virginica_x[:, 1], not_virginica_y, color='r', label='Not Iris-Virginica')

# Decision (0 or 1) produced by the model at each grid point
plt.plot(plot_feat, plot_prob, 'g-', label='Hypothesis')

plt.xlabel("Petal width", fontsize=14)
plt.ylabel("Probability", fontsize=14)
plt.legend(loc="upper left", fontsize=10)
plt.show()

In [None]:
# 평가
from sklearn.metrics import accuracy_score

y_pred = model.predict(x_test)
print('Test Accuracy: {:.2f}%'.format(accuracy_score(y_test, y_pred)*100))

## 스팸메일 데이터에서 SKLearn과 Numerical solution 비교


In [None]:
# 데이터 준비
import os                            # 데이터 파일 경로 설정
import csv                           # 데이터 파일 로드
import numpy as np                   # numpy 행렬 조작

def Load_Spam_Dataset(filename):
    """Read a CSV file into a bias-augmented feature matrix and labels.

    For every data row, the last column is parsed as the label and all
    preceding columns as float features, with a constant 1.0 prepended as the
    bias term. Blank rows are skipped.

    Args:
        filename: Path of the CSV file; its first row is treated as a header.

    Returns:
        Tuple ``(header, x_array, y_array)``: the header row as a list of
        strings, the feature matrix, and the label vector (NumPy arrays).
    """
    x_data = []
    y_data = []

    # newline='' lets the csv module do its own line-ending handling, as the
    # csv documentation requires for files passed to csv.reader.
    with open(filename, 'r', newline='') as f:
        csv_reader = csv.reader(f)
        header = next(csv_reader)

        for line in csv_reader:
            if not line:
                # csv.reader yields [] for a blank line; indexing it would
                # raise IndexError.
                continue
            x_data.append([1.0] + [float(value) for value in line[:-1]])
            y_data.append(float(line[-1]))

    return header, np.array(x_data), np.array(y_data)

In [None]:
_, x_train, y_train = Load_Spam_Dataset('./Spam_train.csv')
_, x_test, y_test = Load_Spam_Dataset('./Spam_test.csv')

print(x_train.shape) # 데이터 수, feature 수
print(y_train.shape) # 데이터 수

In [None]:
# Logistic regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=10000)
lr.fit(x_train, y_train)

In [None]:
# 평가
from sklearn.metrics import accuracy_score

y_pred = lr.predict(x_test)
print('Test Accuracy: {:.2f}%'.format(accuracy_score(y_test, y_pred)*100))

In [None]:
# TODO: Numerical solution 구현 및 하이퍼파라미터 튜닝

## 유방암 데이터에서 SKLearn과 Numerical solution 비교


In [None]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
print(cancer.feature_names) # feature 이름 출력

# bias를 한번에 계산하기 위해, 1을 X에 추가해 줍니다.
import numpy as np
new_X = np.insert(cancer.data, 0, 1, axis=1)

# train, test 나눔
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(new_X, cancer.target, test_size=0.3, shuffle=True, random_state=1)

In [None]:
# Logistic regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter=10000)
lr.fit(x_train, y_train)


In [None]:
# 평가
from sklearn.metrics import accuracy_score

y_pred = lr.predict(x_test)
print('Test Accuracy: {:.2f}%'.format(accuracy_score(y_test, y_pred)*100))

In [None]:
# TODO: Numerical solution 구현 및 하이퍼파라미터 튜닝

# 3. Decision Tree

In [None]:
# 데이터 준비
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.datasets import load_breast_cancer
import matplotlib.pyplot as plt
import numpy as np


# 유방암(Breast Cancer) 환자 데이터, 569개 데이터, 30개의 feature, 환자 class 2 (음성, 양성)
cancer = load_breast_cancer()

x = cancer.data
y = cancer.target
print(cancer.feature_names)

# train, test 나눔
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, shuffle=True, random_state=1)


In [None]:
# 모델 학습
depth=3
max_leaf_nodes=10
model = DecisionTreeClassifier(criterion='entropy', max_depth=depth, max_leaf_nodes=max_leaf_nodes, random_state=2021) # decision tree
# model = RandomForestClassifier(criterion='entropy', max_depth=depth, max_leaf_nodes=max_leaf_nodes, random_state=2021) # random forest

model.fit(x_train, y_train)

In [None]:
# 모델 평가
pred = model.predict(x_test)
print("Accuracy: %.6f" % accuracy_score(y_true=y_test, y_pred=pred))

In [None]:
# 모델 분석

from sklearn.tree import plot_tree, export_text
from IPython import display

feature_names = cancer.feature_names
target_names = cancer.target_names

# 시각화(텍스트)
text_representation = export_text(model)
print(text_representation)


# 시각화(그림)
plt.figure(figsize=(20,20))
tree_plot = plot_tree(model,
                      feature_names=feature_names,
                      class_names = target_names,
                      label = 'all',
                      rounded=True,
                      proportion = True,
                      filled=True)

In [None]:
#  Depth에 따른 정확도 변화
accuracy = []
max_depth=10
max_leaf_nodes=10

# depth 1부터 10까지
for depth in range(1, max_depth + 1):
    model = DecisionTreeClassifier(criterion='entropy', max_depth=depth, max_leaf_nodes=max_leaf_nodes, random_state=2021)
    # model = RandomForestClassifier(criterion='entropy', max_depth=depth, max_leaf_nodes=max_leaf_nodes, random_state=2021)
    model.fit(x_train, y_train)

    pred = model.predict(x_test)

    acc = accuracy_score(y_true=y_test, y_pred=pred)
    accuracy.append(acc)

# 시각화
plt.plot(list(range(1, max_depth + 1)), accuracy)
plt.xlabel('Depth')
plt.ylabel('Accuracy')
plt.show()

## 와인 데이터에서의 Decision Tree

In [None]:
# 와인 데이터, 178개 데이터, 13개의 feature, 와인 종류 3가지
from sklearn.datasets import load_wine
from sklearn.metrics import accuracy_score

wine = load_wine()

x = wine.data
y = wine.target

# train, test 나눔
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, shuffle=True, random_state=1)

In [None]:
# Tree 기반 모델
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# 모델 튜닝
depth=3
max_leaf_nodes=10
model = DecisionTreeClassifier(criterion='entropy', max_depth=depth, max_leaf_nodes=max_leaf_nodes, random_state=2021) # decision tree
# model = RandomForestClassifier(criterion='entropy', max_depth=depth, max_leaf_nodes=max_leaf_nodes, random_state=2021) # random forest

# 모델 학습
model.fit(x_train, y_train)

# 모델 평가
pred = model.predict(x_test)
print("Accuracy: %.6f" % accuracy_score(y_true=y_test, y_pred=pred))

## 스팸 데이터에서의 Decision Tree

In [None]:
# 데이터 준비
import os                            # 데이터 파일 경로 설정
import csv                           # 데이터 파일 로드
import numpy as np                   # numpy 행렬 조작
from sklearn.metrics import accuracy_score

def Load_Spam_Dataset(filename):
    """Read a CSV file into a bias-augmented feature matrix and labels.

    For every data row, the last column is parsed as the label and all
    preceding columns as float features, with a constant 1.0 prepended as the
    bias term. Blank rows are skipped.

    Args:
        filename: Path of the CSV file; its first row is treated as a header.

    Returns:
        Tuple ``(header, x_array, y_array)``: the header row as a list of
        strings, the feature matrix, and the label vector (NumPy arrays).
    """
    x_data = []
    y_data = []

    # newline='' lets the csv module do its own line-ending handling, as the
    # csv documentation requires for files passed to csv.reader.
    with open(filename, 'r', newline='') as f:
        csv_reader = csv.reader(f)
        header = next(csv_reader)

        for line in csv_reader:
            if not line:
                # csv.reader yields [] for a blank line; indexing it would
                # raise IndexError.
                continue
            x_data.append([1.0] + [float(value) for value in line[:-1]])
            y_data.append(float(line[-1]))

    return header, np.array(x_data), np.array(y_data)

In [None]:
_, x_train, y_train = Load_Spam_Dataset('./Spam_train.csv')
_, x_test, y_test = Load_Spam_Dataset('./Spam_test.csv')

print(x_train.shape) # 데이터 수, feature 수
print(y_train.shape) # 데이터 수

In [None]:
# Tree 기반 모델
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
depth=3
max_leaf_nodes=10
model = DecisionTreeClassifier(criterion='entropy', max_depth=depth, max_leaf_nodes=max_leaf_nodes, random_state=2021) # decision tree
# model = RandomForestClassifier(criterion='entropy', max_depth=depth, max_leaf_nodes=max_leaf_nodes, random_state=2021) # random forest

# 모델 학습
model.fit(x_train, y_train)

# 모델 평가
pred = model.predict(x_test)
print("Accuracy: %.6f" % accuracy_score(y_true=y_test, y_pred=pred))