In [1]:
#lib
import numpy as np
from sklearn import svm
from sklearn import model_selection
import matplotlib.pyplot as plt
import matplotlib as mpl

# Path of the comma-separated data file that is read with np.loadtxt.
data_path = "tumor.data"

In [27]:
# Function
def tumor_type(s):
    """Map a tumor-grade label to its integer class code.

    Written for use as an ``np.loadtxt`` converter. Depending on the NumPy
    version and the ``encoding`` argument, converters are handed either
    ``bytes`` or ``str`` values, so both are accepted.

    Parameters
    ----------
    s : bytes or str
        Raw label text, e.g. ``b'Grade_1'`` or ``'Grade_1'``. Leading and
        trailing whitespace is ignored.

    Returns
    -------
    int
        0 for ``Grade_1``, 1 for ``Grade_2_invasion``,
        2 for ``Grade_2_noninvasion``.

    Raises
    ------
    KeyError
        If the label is not one of the three known grades.
    """
    it = {'Grade_1': 0, 'Grade_2_invasion': 1, 'Grade_2_noninvasion': 2}
    # Normalise to str so the lookup works whether bytes or text is passed in;
    # latin1 can decode any byte sequence, so decoding itself never fails.
    label = s.decode('latin1') if isinstance(s, (bytes, bytearray)) else str(s)
    return it[label.strip()]

### Import Data

In [28]:
# Read the comma-separated file as floats; the text in column 7 is turned
# into a numeric class code by the tumor_type converter.
data = np.loadtxt(
    data_path,
    delimiter=',',
    converters={7: tumor_type},
    dtype=float,
)

# x: feature columns 0-6, y: label column(s) from index 7 onwards
x = data[:, :7]
y = data[:, 7:]

In [21]:
# Keep only the first four feature columns, then display the result.
# NOTE(review): this rebinds x, so it relies on the loading cell having run first.
x = x[:, 0:4]
x

array([[ 0. , 51. ,  5.2,  1. ],
       [ 0. , 41. ,  5.2,  1. ],
       [ 0. , 64. ,  3.5,  1. ],
       [ 0. , 60. ,  2.5,  1. ],
       [ 0. , 46. ,  6.8,  1. ],
       [ 1. , 57. ,  6.2,  1. ],
       [ 1. , 60. ,  5. ,  2. ],
       [ 0. , 63. ,  4. ,  1. ],
       [ 0. , 60. ,  2.3,  2. ],
       [ 0. , 56. ,  2.8,  2. ],
       [ 0. , 59. ,  3.1,  1. ],
       [ 0. , 46. ,  2. ,  2. ],
       [ 0. , 57. ,  3.6,  2. ],
       [ 0. , 52. ,  5.8,  1. ],
       [ 1. , 69. ,  5.2,  1. ],
       [ 0. , 52. ,  2. ,  2. ],
       [ 0. , 39. ,  3.8,  1. ],
       [ 0. , 33. ,  2.8,  1. ],
       [ 0. , 55. ,  3.2,  1. ],
       [ 1. , 66. ,  4.6,  1. ],
       [ 0. , 70. ,  4.6,  3. ],
       [ 0. , 54. ,  2.6,  2. ],
       [ 0. , 59. ,  3. ,  1. ],
       [ 1. , 53. ,  1.5,  2. ],
       [ 0. , 60. ,  3.5,  2. ],
       [ 0. , 61. ,  4.5,  2. ],
       [ 0. , 62. ,  5. ,  2. ],
       [ 0. , 62. ,  2.6,  1. ],
       [ 0. , 63. ,  3.2,  1. ],
       [ 0. , 49. ,  4.5,  1. ],
       [ 0

---

#### Ref from iris_cls.py

In [22]:
# Hold out 30% of the samples for testing; the fixed random_state keeps the
# split repeatable between runs.
data_train, data_test, tag_train, tag_test = model_selection.train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

def classifier():
    """Return an unfitted SVC with a linear kernel, C=1000 and an 'ovr' decision function."""
    return svm.SVC(kernel='linear', C=1000, decision_function_shape='ovr')


clf = classifier()

def train(clf, x_train, y_train):
    """Fit ``clf`` in place on the training features.

    The labels are flattened with ``.ravel()`` before being passed to
    ``clf.fit``. Nothing is returned.
    """
    flat_labels = y_train.ravel()
    clf.fit(x_train, flat_labels)
# Fit the classifier on the training split.
train(clf, data_train, tag_train)

def print_accuracy(clf, x_train, y_train, x_test, y_test):
    """Print ``clf.score`` for the training split, then for the test split.

    Each score is written on its own line, formatted to three decimal places.
    Nothing is returned.
    """
    train_score = clf.score(x_train, y_train)
    print('training prediction:%.3f' % train_score)
    test_score = clf.score(x_test, y_test)
    print('test data prediction:%.3f' % test_score)
# Report the score on the training split and on the held-out test split.
print_accuracy(clf, data_train, tag_train, data_test, tag_test)

training prediction:0.656
test data prediction:0.574


### LinearSVC: fastest
### SGDClassifier: slower
### SVC: slowest

---

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import SGDClassifier

In [31]:
# Pipeline: degree-3 polynomial feature expansion -> standardisation -> SVC
# with a polynomial kernel; fitted on the training split with flattened labels.
Poly_svm_clf = Pipeline(
    steps=[
        ("poly_features", PolynomialFeatures(degree=3)),
        ("scaler", StandardScaler()),
        ("svm_clf", SVC(kernel='poly', C=100, decision_function_shape='ovr')),
    ]
)
Poly_svm_clf.fit(data_train, tag_train.ravel())


from sklearn.metrics import accuracy_score

# Predict on the training split first, then on the test split.
train_pred, test_pred = (
    Poly_svm_clf.predict(split) for split in (data_train, data_test)
)

# Accuracy on the training split, followed by accuracy on the test split.
print(accuracy_score(tag_train, train_pred))
print(accuracy_score(tag_test, test_pred))

0.8
0.5555555555555556


---
### Find appropriate hyperparameter values
Use `GridSearchCV` to search over the SVC kernel and `C`.

In [30]:
from sklearn.model_selection import GridSearchCV

# Candidate settings: three kernels crossed with eight values of C
# (24 combinations in total).
parameters = {
    'kernel': ('poly', 'rbf', 'sigmoid'),
    'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
}
svc = svm.SVC()
# NOTE: this rebinds `clf`, which earlier in the notebook held the linear SVC.
clf = GridSearchCV(svc, parameters)
clf.fit(data_train, tag_train.ravel())

# Best estimator and its score, then the mean test score of every candidate.
# (A bare `sorted(clf.cv_results_.keys())` used to sit here; as a mid-cell
# expression it was never displayed and had no effect, so it was removed.)
print(clf.best_estimator_, clf.best_score_)
print(clf.cv_results_["mean_test_score"])

[0.368 0.368 0.368 0.368 0.368 0.368 0.368 0.368 0.368 0.464 0.368 0.368
 0.52  0.528 0.368 0.552 0.592 0.36  0.6   0.616 0.368 0.608 0.624 0.368]


In [8]:
# Display the full cross-validation results of the grid search.
# NOTE(review): the saved output for this cell lists 12 candidates (C from 0.001
# to 100), whereas the grid-search cell defines 24 - the output looks stale and
# should be regenerated by re-running the notebook top to bottom.
clf.cv_results_

{'mean_fit_time': array([0.00104122, 0.00121131, 0.00107503, 0.0011302 , 0.00103683,
        0.00114183, 0.00106368, 0.00111136, 0.00110621, 0.00115023,
        0.00207238, 0.00116205]),
 'std_fit_time': array([1.44612014e-04, 4.68361390e-05, 1.19675963e-04, 3.09953689e-05,
        1.02066924e-04, 5.48912685e-05, 1.01170035e-04, 5.62808456e-05,
        1.94700874e-04, 7.77688514e-05, 1.18410624e-03, 7.82547745e-05]),
 'mean_score_time': array([0.00045538, 0.00069909, 0.00046458, 0.00068374, 0.00050769,
        0.00068898, 0.00047307, 0.00069141, 0.00040898, 0.00063372,
        0.00044661, 0.00058589]),
 'std_score_time': array([4.29807271e-05, 2.01140517e-05, 3.11395475e-05, 3.51077814e-05,
        9.86629663e-05, 3.06889817e-05, 6.15194597e-05, 7.04616252e-05,
        9.89204402e-06, 3.29497582e-05, 3.56115960e-05, 2.75493055e-05]),
 'param_C': masked_array(data=[0.001, 0.001, 0.01, 0.01, 0.1, 0.1, 1, 1, 10, 10, 100,
                    100],
              mask=[False, False, False, F