In [1]:
import os
from os.path import join

import pandas as pd
from copy import deepcopy

In [2]:
inputPath = join("./","DataSet")
inputPath

'./DataSet'

In [3]:
os.listdir(inputPath)

['iris.csv']

In [4]:
# 데이터 불러들이기
iris = pd.read_csv(join(inputPath,'iris.csv'), names=("sepal length", "sepal width", "petal length", "petal width", "Class"))
iris.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
#클래스 이름 확인
classes = iris["Class"].unique()

In [6]:
# 클래스 이름을 Iris-setosa : 1, Iris-versicolor : 2, Iris-virginica : 3 으로 대체
iris["Class"].replace(classes[0], 1, inplace = True) 
iris["Class"].replace(classes[1], 2, inplace = True) 
iris["Class"].replace(classes[2], 3, inplace = True) 

# 복원 추출로 10개를 샘플링해 제대로 클래스 이름이 숫자로 대체 되었는지 확인.
iris.sample(10)

Unnamed: 0,sepal length,sepal width,petal length,petal width,Class
56,6.3,3.3,4.7,1.6,2
96,5.7,2.9,4.2,1.3,2
132,6.4,2.8,5.6,2.2,3
105,7.6,3.0,6.6,2.1,3
52,6.9,3.1,4.9,1.5,2
137,6.4,3.1,5.5,1.8,3
63,6.1,2.9,4.7,1.4,2
60,5.0,2.0,3.5,1.0,2
36,5.5,3.5,1.3,0.2,1
117,7.7,3.8,6.7,2.2,3


One vs Rest Multiclass Classification https://en.wikipedia.org/wiki/Multiclass_classification#One-vs.-rest

선형 분리로 One vs Rest 다중 클래스 분류를 할것이기 때문에 SVM Classifier의 Kernel 중 LinearKernel을 사용합니다.

In [7]:
iris_x = deepcopy(iris)
iris_y = deepcopy(iris['Class'])
del iris_x['Class']

In [8]:
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(iris_x,iris_y, test_size=0.33, random_state = 11)
# 훈련데이터와 테스트 데이터로 분리


In [9]:
# 분류기에 넣을 데이터 준비. 1번 분류기는 1번 클래스에 대해서만 True로 판별하고 2,3 번 클래스에 대해서는 False로 판별해야 하므로,
# 1번 클래스만 1로 2,3번 클래스는 0으로 변경하여 데이터를 넣어줄 예정. 2번 3번 분류기에 대해서도 동일하게 데이터를 넣어주려고 한다.
x_train = list()
y_train = list()
x_test = list()
y_test = list()
for i in range(len(classes)):
    x_train.append(deepcopy(train_x))
    y_train.append(deepcopy(train_y))
    x_test.append(deepcopy(test_x))
    y_test.append(deepcopy(test_y))
    if i == 0:
        y_train[i].replace(i,1, inplace = True)
        y_train[i].replace(2,0, inplace = True)
        y_train[i].replace(3,0, inplace = True)
        y_test[i].replace(i,1, inplace = True)
        y_test[i].replace(2,0, inplace = True)
        y_test[i].replace(3,0, inplace = True)
    elif i == 1:
        y_train[i].replace(1,0, inplace = True)
        y_train[i].replace(i,2, inplace = True)
        y_train[i].replace(3,0, inplace = True)
        y_test[i].replace(1,0, inplace = True)
        y_test[i].replace(i,2, inplace = True)
        y_test[i].replace(3,0, inplace = True)
    else :
        y_train[i].replace(1,0, inplace = True)
        y_train[i].replace(2,0, inplace = True)
        y_train[i].replace(i,3, inplace = True)
        y_test[i].replace(1,0, inplace = True)
        y_test[i].replace(2,0, inplace = True)
        y_test[i].replace(i,3, inplace = True)

In [10]:
from sklearn.svm import SVC

clf1 = SVC(kernel='linear')
clf1.fit(x_train[0], y_train[0])
clf2 = SVC(kernel='linear')
clf2.fit(x_train[1], y_train[1])
clf3 = SVC(kernel='linear')
clf3.fit(x_train[2], y_train[2])



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [11]:
clf1.score(x_test[0], y_test[0])

1.0

In [12]:
clf2.score(x_test[1], y_test[1])

0.68

In [13]:
clf3.score(x_test[2], y_test[2])

0.98

In [14]:
from sklearn.ensemble import VotingClassifier

eclf = VotingClassifier(estimators=[('SVM_for_1', clf1), ('SVM_for_2', clf2), ('SVM_for_3', clf3)], voting='hard')

eclf.fit(train_x, train_y)

VotingClassifier(estimators=[('SVM_for_1', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)), ('SVM_for_2', SVC(C=1.0, cache_size=200, cla...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=1, voting='hard', weights=None)

In [15]:
eclf.score(train_x, train_y)

  if diff:


0.99