# Support Vector Machine

In [None]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt

# Toy 2-D dataset: twelve points, each paired with a binary class label in Y.
X = np.array([
    [2, 2], [1, 3], [2, 3], [5, 3],
    [2, 4], [3, 4], [6, 4], [1, 5],
    [5, 5], [4, 6], [6, 6], [5, 7],
])
Y = np.array([
    0, 0, 0, 1,
    0, 0, 1, 0,
    1, 1, 1, 1,
])

# X.T unpacks into the two coordinate columns; points are coloured by label.
plt.scatter(*X.T, c=Y, cmap=plt.cm.Paired)

In [None]:
from sklearn.svm import SVC

# Hyper-parameters for the classifier; swap in one of the commented
# alternatives below to experiment.
svm_params = {'C': 100, 'kernel': 'linear'}
# svm_params = {'C': 0.1, 'kernel': 'linear'}
# svm_params = {'C': 10, 'kernel': 'rbf'}  # beware of overfitting

# Create the model
model = SVC(**svm_params)

# Train the model on the toy dataset
model.fit(X, Y)

In [None]:
# Size of the plotting plane: take the min and max of each coordinate of X
# and pad them by 1. Column 0 of X is the horizontal axis, column 1 the
# vertical axis.
horizontal_min, vertical_min = X.min(axis=0) - 1
horizontal_max, vertical_max = X.max(axis=0) + 1

# Grid of points covering the plane, sampled every 0.02 units.
h = .02
horizontal_ticks = np.arange(horizontal_min, horizontal_max, h)
vertical_ticks = np.arange(vertical_min, vertical_max, h)
xx, yy = np.meshgrid(horizontal_ticks, vertical_ticks)

# Evaluate the model's decision function at every grid point.
fig, ax = plt.subplots()
xy = np.column_stack([xx.ravel(), yy.ravel()])
Z = model.decision_function(xy).reshape(xx.shape)

# Data points, then the contour lines of the decision function at -1, 0, 1
# (dashed, solid, dashed).
ax.scatter(X[:, 0], X[:, 1], c=Y)
ax.contour(xx, yy, Z, colors='k', levels=[-1, 0, 1], alpha=0.5,
           linestyles=['--', '-', '--'])

# Highlight the support vectors
support_vectors = model.support_vectors_
ax.scatter(support_vectors[:, 0], support_vectors[:, 1], s=60, facecolors='r')
ax.set_title('Support Vector Machine')

# Decision Tree

In [None]:
# Data preparation
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

import matplotlib.pyplot as plt
import numpy as np


# Breast cancer patient data: 569 samples, 30 features, 2 patient classes
# (negative, positive)
cancer = load_breast_cancer()

x, y = cancer.data, cancer.target
print(cancer.feature_names)

# Split into train and test sets (30% of the data is held out for testing)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, shuffle=True, random_state=1
)


In [None]:
# Train the model
depth = 3
max_leaf_nodes = 10

# Both candidate estimators take the same settings, so they are kept in one place.
tree_params = dict(criterion='entropy', max_depth=depth,
                   max_leaf_nodes=max_leaf_nodes, random_state=2021)
model = DecisionTreeClassifier(**tree_params)  # decision tree
# model = RandomForestClassifier(**tree_params)  # random forest

model.fit(x_train, y_train)

In [None]:
# Evaluate the model: predict on the test split and report accuracy.
pred = model.predict(x_test)
print("Accuracy: %.6f" % accuracy_score(y_test, pred))

In [None]:
# Model analysis: render the fitted tree as text and as a figure.

from sklearn.tree import plot_tree, export_text
from IPython import display

feature_names = cancer.feature_names
target_names = cancer.target_names

# Visualisation (text).
# Pass the feature names so the rules are printed with real column names
# instead of generic "feature_N" placeholders. They are converted to a plain
# list because export_text documents this parameter as a list of strings.
text_representation = export_text(model, feature_names=list(feature_names))
print(text_representation)


# Visualisation (figure).
# NOTE(review): some scikit-learn releases (1.3.0 is believed to be one)
# reject NumPy arrays for feature_names / class_names, so plain lists are
# passed here as well — confirm against the installed version.
plt.figure(figsize=(20, 20))
tree_plot = plot_tree(model,
                      feature_names=list(feature_names),
                      class_names=list(target_names),
                      label='all',
                      rounded=True,
                      proportion=True,
                      filled=True)

In [None]:
# Accuracy as a function of tree depth
max_depth = 10
max_leaf_nodes = 10
accuracy = []

# Settings shared by every model in the sweep; only max_depth varies.
sweep_params = dict(criterion='entropy', max_leaf_nodes=max_leaf_nodes,
                    random_state=2021)

# Sweep depth from 1 through max_depth
depths = range(1, max_depth + 1)
for depth in depths:
    model = DecisionTreeClassifier(max_depth=depth, **sweep_params)
    # model = RandomForestClassifier(max_depth=depth, **sweep_params)
    model.fit(x_train, y_train)

    # Score this depth on the test split and record it.
    pred = model.predict(x_test)
    acc = accuracy_score(y_test, pred)
    accuracy.append(acc)

# Visualisation
plt.plot(list(depths), accuracy)
plt.xlabel('Depth')
plt.ylabel('Accuracy')
plt.show()

# 와인 데이터에서의 SVM과 Decision Tree

In [None]:
# Wine data: 178 samples, 13 features, 3 kinds of wine
from sklearn.datasets import load_wine
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

wine = load_wine()

x, y = wine.data, wine.target

# Split into train and test sets (30% of the data is held out for testing)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, shuffle=True, random_state=1
)

In [None]:
# Tree-based models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Model tuning: both candidate estimators take the same settings.
depth = 3
max_leaf_nodes = 10
tree_params = dict(criterion='entropy', max_depth=depth,
                   max_leaf_nodes=max_leaf_nodes, random_state=2021)
model = DecisionTreeClassifier(**tree_params)  # decision tree
# model = RandomForestClassifier(**tree_params)  # random forest

# Train the model
model.fit(x_train, y_train)

# Evaluate the model on the test split
pred = model.predict(x_test)
print("Accuracy: %.6f" % accuracy_score(y_test, pred))

In [None]:
# SVM
from sklearn.svm import SVC

# Create the model; these hyper-parameters are the values to tune.
svm_params = {'C': 10000, 'kernel': 'linear'}
model = SVC(**svm_params)

# Train the model
model.fit(x_train, y_train)

# Evaluate the model on the test split
pred = model.predict(x_test)
print("Accuracy: %.6f" % accuracy_score(y_test, pred))

# 스팸 데이터에서의 SVM과 Decision Tree

In [None]:
# 데이터 준비
import os                            # 데이터 파일 경로 설정
import csv                           # 데이터 파일 로드
import numpy as np                   # numpy 행렬 조작
from sklearn.metrics import accuracy_score

def Load_Spam_Dataset(filename):
    """Load a labelled CSV dataset into NumPy arrays.

    The first row of the file is treated as the header. In every following
    row, all columns except the last are parsed as float features and the
    last column is parsed as the float label. A constant ``1`` is prepended
    to each feature row to serve as a bias term. Blank lines are skipped.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A ``(header, x_array, y_array)`` tuple: the header row as a list of
        strings, the array of feature rows (bias column first) and the array
        of labels.

    Raises:
        ValueError: If the file contains no header row, or if a field cannot
            be converted to ``float``.
    """
    # newline='' leaves newline handling to the csv module, as its
    # documentation recommends for file objects passed to csv.reader.
    with open(filename, 'r', newline='') as f:
        csv_reader = csv.reader(f)

        # An empty file would otherwise leak a bare StopIteration to the caller.
        header = next(csv_reader, None)
        if header is None:
            raise ValueError("%r is empty: no header row found" % (filename,))

        x_data = []
        y_data = []
        for line in csv_reader:
            if not line:
                # csv.reader yields an empty row for a blank line; there is
                # no label to read from it, so skip it.
                continue

            *features, label = line
            # Leading 1 is the bias term.
            x_data.append([1] + [float(value) for value in features])
            y_data.append(float(label))

    return header, np.array(x_data), np.array(y_data)

In [None]:
# Load the train and test splits; the header row is discarded.
_, x_train, y_train = Load_Spam_Dataset('./Spam_train.csv')
_, x_test, y_test = Load_Spam_Dataset('./Spam_test.csv')

# First line printed: (number of samples, number of features) for x_train.
# Second line printed: (number of samples,) for y_train.
for train_array in (x_train, y_train):
    print(train_array.shape)

In [None]:
# Tree-based models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Both candidate estimators take the same settings.
depth = 3
max_leaf_nodes = 10
tree_params = dict(criterion='entropy', max_depth=depth,
                   max_leaf_nodes=max_leaf_nodes, random_state=2021)
model = DecisionTreeClassifier(**tree_params)  # decision tree
# model = RandomForestClassifier(**tree_params)  # random forest

# Train the model
model.fit(x_train, y_train)

# Evaluate the model on the test split
pred = model.predict(x_test)
print("Accuracy: %.6f" % accuracy_score(y_test, pred))

In [None]:
# SVM
from sklearn.svm import SVC

# Create the model
svm_params = {'C': 0.1, 'kernel': 'linear'}
model = SVC(**svm_params)

# Train the model
model.fit(x_train, y_train)

# Evaluate the model on the test split
pred = model.predict(x_test)
print("Accuracy: %.6f" % accuracy_score(y_test, pred))