# Multiclass SVM 구현

### 데이터 불러오기

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

#IRIS 데이터 로드
iris =  sns.load_dataset('iris') 
X= iris.iloc[:,:4] #학습할데이터
y = iris.iloc[:,-1] #타겟
print(y)

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 150, dtype: object


In [2]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=48)

In [3]:
def standardization(train, test):
    scaler = StandardScaler()
    train = scaler.fit_transform(train)
    test = scaler.transform(test)
    return train, test

X_train, X_test = standardization(X_train, X_test)

In [4]:
X_train

array([[ 0.78522493,  0.32015325,  0.77221097,  1.04726529],
       [-0.26563371, -1.29989934,  0.0982814 , -0.11996537],
       [ 0.43493872,  0.78302542,  0.94069336,  1.43634218],
       [-0.84944407,  0.78302542, -1.24957775, -1.28719604],
       [-0.38239578, -1.7627715 ,  0.15444219,  0.13941922],
       [ 0.55170079, -0.374155  ,  1.05301496,  0.7878807 ],
       [ 0.31817664, -0.14271892,  0.65988937,  0.7878807 ],
       [ 0.20141457, -0.374155  ,  0.43524618,  0.39880381],
       [-1.66677857, -0.14271892, -1.36189934, -1.28719604],
       [-0.14887164, -0.60559109,  0.21060299,  0.13941922],
       [-0.14887164, -1.06846325, -0.12636179, -0.24965767],
       [ 0.31817664, -0.60559109,  0.15444219,  0.13941922],
       [ 0.66846286, -0.83702717,  0.88453256,  0.91757299],
       [ 0.0846525 , -0.14271892,  0.77221097,  0.7878807 ],
       [-0.49915786, -0.14271892,  0.43524618,  0.39880381],
       [-0.26563371, -0.60559109,  0.65988937,  1.04726529],
       [ 2.18636979,  1.

In [5]:
X_test

array([[-0.14887164, -0.374155  ,  0.26676379,  0.13941922],
       [ 0.31817664, -0.60559109,  0.54756778,  0.00972692],
       [ 0.31817664, -1.06846325,  1.05301496,  0.26911151],
       [-1.5500165 , -1.7627715 , -1.36189934, -1.15750374],
       [ 0.0846525 ,  0.32015325,  0.60372857,  0.7878807 ],
       [ 0.78522493, -0.14271892,  0.99685416,  0.7878807 ],
       [-0.84944407,  1.70876975, -1.24957775, -1.15750374],
       [ 0.20141457, -0.14271892,  0.60372857,  0.7878807 ],
       [-0.38239578,  2.63451409, -1.30573855, -1.28719604],
       [-0.38239578, -1.29989934,  0.15444219,  0.13941922],
       [ 0.66846286,  0.08871717,  0.99685416,  0.7878807 ],
       [-0.38239578,  1.0144615 , -1.36189934, -1.28719604],
       [-0.49915786,  0.78302542, -1.13725615, -1.28719604],
       [ 0.43493872, -0.60559109,  0.60372857,  0.7878807 ],
       [ 0.55170079, -1.7627715 ,  0.37908538,  0.13941922],
       [ 0.55170079,  0.55158933,  0.54756778,  0.52849611],
       [-1.19973028,  0.

### y값 원 핫 인코딩 해주기

라그랑주 승수...내가 너무 심각하게 받아들였나보다...네....

In [6]:
#일단 y값들을 다 원핫 인코딩 해주기
dummies_y_train=pd.get_dummies(y_train)
dummies_y_test=pd.get_dummies(y_test)

print(dummies_y_train.head())
print(dummies_y_test.head())

     setosa  versicolor  virginica
110       0           0          1
69        0           1          0
148       0           0          1
39        1           0          0
53        0           1          0
     setosa  versicolor  virginica
96        0           1          0
73        0           1          0
134       0           0          1
41        1           0          0
70        0           1          0


### 그리드서치로 파라미터 정해보기

In [7]:
from sklearn.model_selection import GridSearchCV

In [8]:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
             'gamma': [0.001, 0.01, 0.1, 1, 10, 100] }

In [9]:
model = SVC()
grid_search = GridSearchCV(model, param_grid, cv=5, return_train_score=True)
grid_search.fit(X_train, dummies_y_train['setosa'])

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100],
                         'gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
             return_train_score=True)

In [10]:
test_score = grid_search.score(X_test, dummies_y_test['setosa'])
print("테스트 세트 점수: {:.2f}".format( test_score ))
print("최적 매개변수: {}".format(grid_search.best_params_))
print("최고 교차 검증 점수: {:.2f}".format(grid_search.best_score_))

테스트 세트 점수: 1.00
최적 매개변수: {'C': 0.1, 'gamma': 0.1}
최고 교차 검증 점수: 1.00


c랑 ganmma 둘 다 작으면 과소적합의 가능성이 있다고 하는데 왜 점수가 높을까 뭘 잘못한걸까

참고 사이트: https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html<br>
https://inuplace.tistory.com/616

In [11]:
#일단 다른 품종도 이렇게 나오는지 함 보자....
grid_search = GridSearchCV(model, param_grid, cv=5, return_train_score=True)
grid_search.fit(X_train, dummies_y_train['versicolor'])

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100],
                         'gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
             return_train_score=True)

In [12]:
test_score = grid_search.score(X_test, dummies_y_test['versicolor'])
print("테스트 세트 점수: {:.2f}".format( test_score ))
print("최적 매개변수: {}".format(grid_search.best_params_))
print("최고 교차 검증 점수: {:.2f}".format(grid_search.best_score_))

테스트 세트 점수: 0.93
최적 매개변수: {'C': 100, 'gamma': 0.01}
최고 교차 검증 점수: 0.97


점수가 좀 더 떨어졌다 (오히려 좋아)<br>
근데 이번에는 c값이 좀 다르게 나온다...근데 0.1에서 100은 너무 극단적이자나요...

In [13]:
#이왕 이렇게 된거 걍 마지막 품종도 그리드 서치 돌려보자,,~~
grid_search = GridSearchCV(model, param_grid, cv=5, return_train_score=True)
grid_search.fit(X_train, dummies_y_train['virginica'])

GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100],
                         'gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
             return_train_score=True)

In [28]:
type(dummies_y_train[target])

pandas.core.series.Series

In [14]:
test_score = grid_search.score(X_test, dummies_y_test['virginica'])
print("테스트 세트 점수: {:.2f}".format( test_score ))
print("최적 매개변수: {}".format(grid_search.best_params_))
print("최고 교차 검증 점수: {:.2f}".format(grid_search.best_score_))

테스트 세트 점수: 0.97
최적 매개변수: {'C': 10, 'gamma': 0.01}
최고 교차 검증 점수: 0.97


참고사이트: https://bskyvision.com/entry/%EC%84%9C%ED%8F%AC%ED%8A%B8-%EB%B2%A1%ED%84%B0-%EB%A8%B8%EC%8B%A0SVM%EC%9D%98-%EC%82%AC%EC%9A%A9%EC%9E%90%EB%A1%9C%EC%84%9C-%EA%BC%AD-%EC%95%8C%EC%95%84%EC%95%BC%ED%95%A0-%EA%B2%83%EB%93%A4-%EB%A7%A4%EA%B0%9C%EB%B3%80%EC%88%98-C%EC%99%80-gamma

파라미터를 통일하고 싶었는데, 값이 다 다르게 나오고, 일일이 지정해주기에는 클래스에 무관한 모델을 만드는게 목표여서...애매한것 같다.

### SVM 만드는데 필요한 부분들 만들어보기

In [None]:
#grid_search1 = GridSearchCV(model, grid_search.best_params_, cv=5, return_train_score=True) 
#이거 안돌아가는 코드여서 주석처리 했습니다.
#params에 best params 직접 넣으려고 했는데 타입이 달라서 안되는것 같다. 그래서 list로 타입 바꿔서 했는데 그것도 안되는것 같다..

In [17]:
a=list(grid_search.best_params_.values())
a
#grid_search.best_params_가 dictionary형식으로 나오는데, dictionary의 value만 뽑아서 그때그때 parameter에 넣어줘도 될것 같다...

[10, 0.01]

In [18]:
#일단 y값별로 원핫 인코딩 하기 전에 해당 값들을 list 만들어 놓기
y_train_list=list(y_train.unique())
y_train_list

['virginica', 'versicolor', 'setosa']

In [15]:
#그리드 서치 범위 정해놓기
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
             'gamma': [0.001, 0.01, 0.1, 1, 10, 100] }

In [33]:
# list 첫번째 요소(virginica)인지 아닌지 test후 예측값 만들기
model = SVC()
##하이퍼 파라미터 구하기
grid_search = GridSearchCV(model, param_grid, cv=5, return_train_score=True)
grid_search.fit(X_train, dummies_y_train[y_train_list[0]])
best_params = list(grid_search.best_params_.values())

##위에서 구한 파라미터 이용하여 모델 만들기
model1=SVC(C=best_params[0],gamma=best_params[1],kernel='rbf')
model1.fit(X_train, dummies_y_train[y_train_list[0]])
pred=model1.predict(X_test)

pred

array([0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0], dtype=uint8)

위의 내용을 클래스의 숫자만큼 반복하면 될것 같다

In [37]:
#해당 예측값이 array로 나오니깐 돌린 결과값을 array에 추가하는 형식으로 하면 될것 같다
semi_final=[]
for i in range(len(y_train_list)):
  # list 첫번째 요소(virginica)인지 아닌지 test후 예측값 만들기
  model = SVC()
  ##하이퍼 파라미터 구하기
  grid_search = GridSearchCV(model, param_grid, cv=5, return_train_score=True)
  grid_search.fit(X_train, dummies_y_train[y_train_list[i]])
  best_params = list(grid_search.best_params_.values())

  ##위에서 구한 파라미터 이용하여 모델 만들기
  model1=SVC(C=best_params[0],gamma=best_params[1],kernel='rbf')
  model1.fit(X_train, dummies_y_train[y_train_list[i]])
  pred=model1.predict(X_test)

  semi_final.append(pred)
semi_final

[array([0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
        1, 0, 1, 0, 0, 1, 1, 0], dtype=uint8),
 array([1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0,
        1, 1, 0, 0, 0, 0, 0, 1], dtype=uint8),
 array([0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
        0, 0, 0, 1, 1, 0, 0, 0], dtype=uint8)]

In [44]:
df=pd.DataFrame(np.transpose(semi_final))
df.columns = y_train_list
df

Unnamed: 0,virginica,versicolor,setosa
0,0,1,0
1,0,1,0
2,1,1,0
3,0,0,1
4,0,0,0
5,1,0,0
6,0,0,1
7,1,0,0
8,0,0,1
9,0,1,0


결과를 보면 다는 아니지만 본인이 true 값이라고 동시에 주장하는 관측치가 있다<br>
이런 경우에는 마지막에 1이 나온 열에 대해서 답을 가진다.<br>


In [55]:
df.loc[0,'virginica']+df.loc[0,'versicolor']+df.loc[0,'setosa']

1

In [59]:
df['conclu']=0
df['final']="I don't know"

for i in range(len(df)):
  df['conclu'][i]=df.loc[i,'virginica']+df.loc[i,'versicolor']+df.loc[i,'setosa']

for i in range(len(df)):
  if df['conclu'][i]==1:
    for j in range(len(y_train_list)):
      if df.loc[i,y_train_list[j]]==1:
        df['final'][i]=y_train_list[j]
  else:
    for j in range(len(y_train_list)):
      if df.loc[i,y_train_list[j]]==1:
        df['final'][i]=y_train_list[j]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [60]:
df #다 0이 나오는 경우도 있구나...그런 경우에는 걍 virginica로 합시다!! 

Unnamed: 0,virginica,versicolor,setosa,final,conclu
0,0,1,0,versicolor,1
1,0,1,0,versicolor,1
2,1,1,0,versicolor,2
3,0,0,1,setosa,1
4,0,0,0,I don't know,0
5,1,0,0,virginica,1
6,0,0,1,setosa,1
7,1,0,0,virginica,1
8,0,0,1,setosa,1
9,0,1,0,versicolor,1


In [62]:
#위랑 동일한 코드인데, 다 0이 나오는 경우의 조건을 추가
df['conclu']=0
df['final']="I don't know"

for i in range(len(df)):
  df['conclu'][i]=df.loc[i,'virginica']+df.loc[i,'versicolor']+df.loc[i,'setosa']

for i in range(len(df)):
  if df['conclu'][i]==1:
    for j in range(len(y_train_list)):
      if df.loc[i,y_train_list[j]]==1:
        df['final'][i]=y_train_list[j]
  elif df['conclu'][i]==0: #이 조건문을 추가해줌
    df['final'][i]=y_train_list[0]
  else:
    for j in range(len(y_train_list)):
      if df.loc[i,y_train_list[j]]==1:
        df['final'][i]=y_train_list[j]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [63]:
df #초기값 idon't know 없음

Unnamed: 0,virginica,versicolor,setosa,final,conclu
0,0,1,0,versicolor,1
1,0,1,0,versicolor,1
2,1,1,0,versicolor,2
3,0,0,1,setosa,1
4,0,0,0,virginica,0
5,1,0,0,virginica,1
6,0,0,1,setosa,1
7,1,0,0,virginica,1
8,0,0,1,setosa,1
9,0,1,0,versicolor,1


### 위 과정 합쳐서 SVM 구현하기

In [64]:
def self_SVM(X_train, X_test, y_train, y_test):
  #일단 y값별로 원핫 인코딩 하기 전에 해당 값들을 list 만들어 놓기
  y_train_list=list(y_train.unique())
  y_train_list

  #그리드 서치 범위 정해놓기
  param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
                'gamma': [0.001, 0.01, 0.1, 1, 10, 100] }

  #해당 예측값이 array로 나오니깐 돌린 결과값을 array에 추가하는 형식으로 하면 될것 같다
  semi_final=[]
  for i in range(len(y_train_list)):
    # list 첫번째 요소(virginica)인지 아닌지 test후 예측값 만들기
    model = SVC()
    ##하이퍼 파라미터 구하기
    grid_search = GridSearchCV(model, param_grid, cv=5, return_train_score=True)
    grid_search.fit(X_train, dummies_y_train[y_train_list[i]])
    best_params = list(grid_search.best_params_.values())

    ##위에서 구한 파라미터 이용하여 모델 만들기
    model1=SVC(C=best_params[0],gamma=best_params[1],kernel='rbf')
    model1.fit(X_train, dummies_y_train[y_train_list[i]])
    pred=model1.predict(X_test)

    semi_final.append(pred)

  #다루기 쉽게 데이터 프레임으로 만들어 놓기
  df=pd.DataFrame(np.transpose(semi_final))
  df.columns = y_train_list
  
  #1이 하나만 있는 경우는 그 값을 결과 값으로, 0만 있는 경우에는 첫번째 열의 값으로, 1이 두개 이상은 맨 마지막 값으로
  df['conclu']=0
  df['final']="I don't know"

  for i in range(len(df)):
    df['conclu'][i]=df.loc[i,'virginica']+df.loc[i,'versicolor']+df.loc[i,'setosa']

  for i in range(len(df)):
    if df['conclu'][i]==1:
      for j in range(len(y_train_list)):
        if df.loc[i,y_train_list[j]]==1:
          df['final'][i]=y_train_list[j]
    elif df['conclu'][i]==0: 
      df['final'][i]=y_train_list[0]
    else:
      for j in range(len(y_train_list)):
        if df.loc[i,y_train_list[j]]==1:
          df['final'][i]=y_train_list[j]
    
  return df['final']
  

In [65]:
self_SVM(X_train, X_test, y_train, y_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0     versicolor
1     versicolor
2     versicolor
3         setosa
4      virginica
5      virginica
6         setosa
7      virginica
8         setosa
9     versicolor
10     virginica
11        setosa
12        setosa
13     virginica
14    versicolor
15    versicolor
16        setosa
17    versicolor
18    versicolor
19     virginica
20        setosa
21     virginica
22    versicolor
23    versicolor
24     virginica
25        setosa
26        setosa
27     virginica
28     virginica
29    versicolor
Name: final, dtype: object