## SVM程式教學

### SVM 在 scikit-learn（匯入名稱為 sklearn）裡面就有套件可以使用，使用 pip install scikit-learn 即可安裝

In [1]:
! pip install sklearn



### 將套件(Package)匯入到程式之中

In [2]:
#輸入datasets
from sklearn import datasets
#pandas可以提供高效能、簡易使用的資料格式(DataFrame)，讓使用者可以快速操作及分析資料
import pandas as pd
#數學公式計算都靠它
import numpy as np
#畫圖都靠它
import matplotlib.pyplot as plt
#此套件可將資料自由切分成 訓練資料集 和 測試資料集
from sklearn.model_selection import train_test_split
#標準化資料集
from sklearn.preprocessing import minmax_scale
#SVM分類器演算法的套件
from sklearn.svm import SVC
#計算accuracy,recall,precision測量指標
from sklearn.metrics import accuracy_score,recall_score,precision_score,confusion_matrix

### 使用安德森鳶尾花卉數據集(Iris dataset)來做數據分析-資料前處理-雙類別
Iris 資料集的介紹：http://bit.ly/2ptEM0N （連結到 wiki）

In [3]:
# Load the Iris dataset that ships with scikit-learn's datasets module.
iris = datasets.load_iris()

In [4]:
# iris.data is the feature matrix (the same entry as iris['data']).
# Normalization puts features measured on different scales onto a common footing.
# Common choices: min-max scaler, z-score (standard) scaler, max-abs scaler, robust scaler.
# Min-max scaling is demonstrated here: each column (axis=0) is mapped into [0, 1].
# NOTE(review): the scaling is computed over all samples, before the train/test
# split further down, so test rows influence the min/max. Fine for a demo, but
# fit the scaler on the training split only for a real evaluation.
x_iris = minmax_scale(
    iris.data,
    axis=0,
    feature_range=(0, 1),
    copy=True,
)

In [5]:
# Wrap the scaled array in a DataFrame; iris.feature_names supplies the column titles.
x_iris = pd.DataFrame(data=x_iris, columns=iris.feature_names)
# .head(n) is the pandas call used throughout to preview the first n rows.
x_iris.head(n=3)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,0.222222,0.625,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.5,0.050847,0.041667


In [6]:
# Store the class labels (iris.target) in a one-column DataFrame named 'target'.
y_iris = pd.DataFrame({'target': iris.target})
y_iris.head(n=3)

Unnamed: 0,target
0,0
1,0
2,0


In [7]:
# Place the features and the labels side by side in a single table.
iris_data = pd.concat((x_iris, y_iris), axis='columns')
# The original SVM formulation is a two-class classifier, so keep only classes 0 and 1 here.
# (scikit-learn can also handle multi-class problems; choose according to the task.)
iris_data = iris_data.loc[iris_data['target'].isin([0, 1])]
iris_data.head(n=3)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,0.222222,0.625,0.067797,0.041667,0
1,0.166667,0.416667,0.067797,0.041667,0
2,0.111111,0.5,0.050847,0.041667,0


In [8]:
# Randomly split the Iris rows: 70% training set, 30% test set (seed fixed at 0).
X_train, X_test, y_train, y_test = train_test_split(
    iris_data.loc[:, [
        'sepal length (cm)',
        'sepal width (cm)',
        'petal length (cm)',
        'petal width (cm)',
    ]],
    iris_data.loc[:, ['target']],
    random_state=0,
    test_size=0.3,
)

## SVM分類演算法

In [9]:
# Configure the SVM classifier; the kernel function, C, gamma, etc. are all set here.
# A linear kernel is used, and probability=True requests probability estimates.
# NOTE(review): no cell shown here calls predict_proba; confirm the flag is
# needed, since the scikit-learn docs state that it slows down fit().
svm = SVC(kernel='linear', probability=True)

In [10]:
# Train the SVM on the training split.
# y_train is a one-column DataFrame, so .values is a column vector of shape
# (n_samples, 1). ravel() flattens it into the 1-D label vector fit() expects,
# which avoids the DataConversionWarning raised for a column-vector y.
svm.fit(X_train, y_train.values.ravel())

  y = column_or_1d(y, warn=True)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [11]:
# Feed the test split to the trained SVM to obtain predicted class labels.
y_pred = svm.predict(X_test)
y_pred

array([0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 1])

In [12]:
# Measure how well the predictions match the true test labels.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
# Confusion matrix over classes 0 and 1.
confu_mat = confusion_matrix(y_test, y_pred, labels=[0, 1])

# print() stringifies each argument; sep='' reproduces plain concatenation.
print('Accuracy = ', accuracy, '\nRecall = ', recall, '\nPrecision = ', precision, sep='')

print('confusion matrix = \n', confu_mat, sep='')

Accuracy = 1.0
Recall = 1.0
Precision = 1.0
confusion matrix = 
[[15  0]
 [ 0 15]]


### 使用安德森鳶尾花卉數據集(Iris dataset)來做數據分析-資料前處理-多類別

In [13]:
# Load the Iris dataset again so the multi-class walkthrough starts from fresh data.
iris = datasets.load_iris()

In [14]:
# iris.data is the feature matrix (the same entry as iris['data']).
# Normalization puts features measured on different scales onto a common footing.
# Common choices: min-max scaler, z-score (standard) scaler, max-abs scaler, robust scaler.
# Min-max scaling is demonstrated here: each column (axis=0) is mapped into [0, 1].
# NOTE(review): the scaling is computed over all samples, before the train/test
# split further down, so test rows influence the min/max. Fine for a demo, but
# fit the scaler on the training split only for a real evaluation.
x_iris = minmax_scale(
    iris.data,
    axis=0,
    feature_range=(0, 1),
    copy=True,
)

In [15]:
# Wrap the scaled array in a DataFrame; iris.feature_names supplies the column titles.
x_iris = pd.DataFrame(data=x_iris, columns=iris.feature_names)
# .head(n) is the pandas call used throughout to preview the first n rows.
x_iris.head(n=3)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,0.222222,0.625,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.5,0.050847,0.041667


In [16]:
# Store the class labels (iris.target) in a one-column DataFrame named 'target'.
y_iris = pd.DataFrame({'target': iris.target})
y_iris.head(n=3)

Unnamed: 0,target
0,0
1,0
2,0


In [17]:
# Place the features and the labels side by side in a single table.
iris_data = pd.concat((x_iris, y_iris), axis='columns')
# Multi-class classification: keep the rows labelled 0, 1 or 2.
iris_data = iris_data.loc[iris_data['target'].isin([0, 1, 2])]
iris_data.head(n=3)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,0.222222,0.625,0.067797,0.041667,0
1,0.166667,0.416667,0.067797,0.041667,0
2,0.111111,0.5,0.050847,0.041667,0


In [18]:
# Randomly split the Iris rows: 70% training set, 30% test set (seed fixed at 0).
X_train, X_test, y_train, y_test = train_test_split(
    iris_data.loc[:, [
        'sepal length (cm)',
        'sepal width (cm)',
        'petal length (cm)',
        'petal width (cm)',
    ]],
    iris_data.loc[:, ['target']],
    random_state=0,
    test_size=0.3,
)

## SVM分類演算法

In [19]:
# Configure the SVM classifier; the kernel function, C, gamma, etc. are all set here.
# A linear kernel is used, and probability=True requests probability estimates.
# NOTE(review): no cell shown here calls predict_proba; confirm the flag is
# needed, since the scikit-learn docs state that it slows down fit().
svm = SVC(kernel='linear', probability=True)

In [20]:
# Train the SVM on the training split.
# y_train is a one-column DataFrame, so .values is a column vector of shape
# (n_samples, 1). ravel() flattens it into the 1-D label vector fit() expects,
# which avoids the DataConversionWarning raised for a column-vector y.
svm.fit(X_train, y_train.values.ravel())

  y = column_or_1d(y, warn=True)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [21]:
# Feed the test split to the trained SVM to obtain predicted class labels.
y_pred = svm.predict(X_test)
y_pred

array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,
       0, 0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 2, 1, 1, 2, 0, 2, 0,
       0])

In [22]:
# Check prediction quality with a confusion matrix over classes 0, 1 and 2.
confu_mat = confusion_matrix(y_test, y_pred, labels=[0, 1, 2])

# print() stringifies each argument; sep='' reproduces plain concatenation.
print('confusion matrix = \n', confu_mat, sep='')

confusion matrix = 
[[16  0  0]
 [ 0 17  1]
 [ 0  1 10]]
