In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import *
from sklearn.metrics import classification_report, confusion_matrix

In [31]:
#default값 설정
plt.rcParams['figure.figsize'] = [7, 7] 
sns.set(style='darkgrid')
plt.rcParams['scatter.edgecolors'] = 'black'
pd.set_option('display.max_columns', None)
pd.set_option('display.max_row', None)
pd.set_option('display.unicode.east_asian_width', True)

In [32]:
iris_dataset = load_iris()
iris = pd.DataFrame(iris_dataset.data,
        columns=iris_dataset.feature_names)
labels = iris_dataset.target_names
iris.info()
print(iris.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
dtypes: float64(4)
memory usage: 4.8 KB
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2


In [33]:
label = iris_dataset.target
print(label)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [34]:
#스케일링
scaler = StandardScaler()
iris = scaler.fit_transform(iris)
Features = pd.DataFrame(iris, columns=['SL','SW','PL','PW'])
print(Features.shape) #150개 자료 컬럼은 4개

(150, 4)


In [15]:
from sklearn import svm
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
kernel_list = ['linear','poly','rbf','sigmoid'] #svm만들때 kernel을 선택할 수 있음
params = dict(kernel=kernel_list)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=2985)
irisSVM = svm.SVC(C=1.0, random_state=2985, class_weight='balanced')
kernel_Type = GridSearchCV(irisSVM, params, cv=cv)
kernel_Type.fit(Features, label)
scores = kernel_Type.cv_results_['mean_test_score']

for score, kernel in zip(scores, kernel_list):
  print(f'{kernel} {score:3f}')

linear 0.973333
poly 0.946667
rbf 0.960000
sigmoid 0.913333


In [9]:
print(params)

{'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}


In [35]:
#kerner이 poly일때 degree에 따른 차이
degree_list = [1,2,3,4,5,6,7,8,9,10]
params = dict(degree=degree_list)
irisSVM = svm.SVC(C=1.0, random_state=2985, class_weight='balanced', kernel='poly')
degree_Type = GridSearchCV(irisSVM, params, cv=cv)
degree_Type.fit(Features, label)
scores = degree_Type.cv_results_['mean_test_score']

for score, degree in zip(scores, degree_list):
  print(f'{degree} {score:3f}')

1 0.966667
2 0.820000
3 0.893333
4 0.780000
5 0.866667
6 0.786667
7 0.840000
8 0.746667
9 0.813333
10 0.733333


In [36]:
kernel_list = ['linear','poly','rbf','sigmoid'] #svm만들때 kernel을 선택할 수 있음
params = dict(kernel=kernel_list)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=2985)
irisSVM = svm.SVC(C=1.0, random_state=2985, class_weight='balanced', degree=1)  #C 마진.오차 를 얼마나 허용할건지 마진이 좁아지면 과적합의 위험이 있음 그래서 soft마진으로.. 적당한값을 아래서 찾아야함
kernel_Type = GridSearchCV(irisSVM, params, cv=cv)
kernel_Type.fit(Features, label)
scores = kernel_Type.cv_results_['mean_test_score']

for score, kernel in zip(scores, kernel_list):
  print(f'{kernel} {score:3f}')

linear 0.973333
poly 0.966667
rbf 0.960000
sigmoid 0.913333


In [20]:
np.geomspace(1, 1000, num=4)

array([   1.,   10.,  100., 1000.])

In [37]:
np.geomspace(1, 256, num=9)


array([  1.,   2.,   4.,   8.,  16.,  32.,  64., 128., 256.])

In [39]:
X_train, X_test, Y_train, Y_test = train_test_split(
      Features, label, test_size=0.2)

In [40]:
C_list = np.geomspace(10000, 0.0001, num=50) #50개 구간으로 나눔   //log스케일..?
print(C_list)

[1.00000000e+04 6.86648845e+03 4.71486636e+03 3.23745754e+03
 2.22299648e+03 1.52641797e+03 1.04811313e+03 7.19685673e+02
 4.94171336e+02 3.39322177e+02 2.32995181e+02 1.59985872e+02
 1.09854114e+02 7.54312006e+01 5.17947468e+01 3.55648031e+01
 2.44205309e+01 1.67683294e+01 1.15139540e+01 7.90604321e+00
 5.42867544e+00 3.72759372e+00 2.55954792e+00 1.75751062e+00
 1.20679264e+00 8.28642773e-01 5.68986603e-01 3.90693994e-01
 2.68269580e-01 1.84206997e-01 1.26485522e-01 8.68511374e-02
 5.96362332e-02 4.09491506e-02 2.81176870e-02 1.93069773e-02
 1.32571137e-02 9.10298178e-03 6.25055193e-03 4.29193426e-03
 2.94705170e-03 2.02358965e-03 1.38949549e-03 9.54095476e-04
 6.55128557e-04 4.49843267e-04 3.08884360e-04 2.12095089e-04
 1.45634848e-04 1.00000000e-04]


In [None]:
params = dict(C=C_list)
cv = StratifiedShuffleSplit(n_splits=5, 
            test_size=0.2, random_state=2958)
iris_SVM = svm.SVC(random_state=2958, class_weight='balanced',
            kernel='poly', degree=1)
C_Type = GridSearchCV(iris_SVM, params, cv=cv) #params를 바꿔가며 값을 찾아줌, 교차검증할 때, stratified...이걸씀
C_Type.fit(X_train, Y_train)
scores = C_Type.cv_results_['mean_test_score']
for score, C in zip(scores, C_list):
    print(f' {C: .3f} {score: .3f}')

In [47]:
for c in C_list:
    model1 = svm.SVC(C=c, kernel='poly', class_weight='balanced',
                     gamma='scale', degree=1)
    model1.fit(X_train, Y_train)
    train_score = model1.score(X_train, Y_train)
    test_score = model1.score(X_test, Y_test)
    print('The Accuracy @ {} C parameter, Train score is {}, Test score is {}'.format(c, train_score, test_score))

The Accuracy @ 10000.0 C parameter, Train score is 0.9833333333333333, Test score is 1.0
The Accuracy @ 6866.488450042998 C parameter, Train score is 0.9833333333333333, Test score is 1.0
The Accuracy @ 4714.866363457394 C parameter, Train score is 0.9833333333333333, Test score is 1.0
The Accuracy @ 3237.457542817643 C parameter, Train score is 0.9833333333333333, Test score is 1.0
The Accuracy @ 2222.9964825261955 C parameter, Train score is 0.9833333333333333, Test score is 1.0
The Accuracy @ 1526.4179671752333 C parameter, Train score is 0.975, Test score is 1.0
The Accuracy @ 1048.1131341546852 C parameter, Train score is 0.975, Test score is 1.0
The Accuracy @ 719.6856730011522 C parameter, Train score is 0.975, Test score is 1.0
The Accuracy @ 494.17133613238383 C parameter, Train score is 0.975, Test score is 1.0
The Accuracy @ 339.32217718953297 C parameter, Train score is 0.975, Test score is 1.0
The Accuracy @ 232.99518105153717 C parameter, Train score is 0.975, Test score 

In [48]:
accuracies = cross_val_score(svm.SVC(kernel='poly', degree=1,
                C=75.43), Features, label, cv=cv)
print('Cross-Validation accuracy scores:{}'.format(accuracies))
print('Mean Cross-Validation accuracy score: {}'.format(round(
                    accuracies.mean(), 5)))

Cross-Validation accuracy scores:[0.96666667 0.96666667 0.96666667 1.         0.96666667]
Mean Cross-Validation accuracy score: 0.97333


In [None]:
for i in range(1, 1000):
    X_train, X_test, Y_train, Y_test = train_test_split(
        Features, label, test_size=0.2, random_state=i)
    iris_SVM = svm.SVC(kernel='poly', degree=1,
                C=75.43, class_weight='balanced')
    iris_SVM.fit(X_train, Y_train)
    train_score = iris_SVM.score(X_train, Y_train)
    test_score = iris_SVM.score(X_test, Y_test)
    if test_score >= train_score:
      print('Test: {} Train: {} RandomState: {}'.format(test_score, train_score, i))

In [57]:
X_train, X_test, Y_train, Y_test = train_test_split(
        Features, label, test_size=0.2, random_state=22)
iris_SVM = svm.SVC(kernel='poly', degree=1,
                C=75.43, class_weight='balanced')
iris_SVM.fit(X_train, Y_train)
train_score = iris_SVM.score(X_train, Y_train)
test_score = iris_SVM.score(X_test, Y_test)
print(train_score)
print(test_score)

0.9833333333333333
1.0


In [58]:
pd.DataFrame(confusion_matrix(Y_test, iris_SVM.predict(X_test)),
                              columns=['P_setosa','P_versicolor','P_virsinica'],
                              index=['A_setosa','A_versicolor','A_virsinica'])

Unnamed: 0,P_setosa,P_versicolor,P_virsinica
A_setosa,6,0,0
A_versicolor,0,10,0
A_virsinica,0,0,14


In [59]:
print(classification_report(Y_test, iris_SVM.predict(X_test)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00        10
           2       1.00      1.00      1.00        14

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

