## Loading an example dataset

In [1]:
from sklearn import datasets

# Load two of the datasets that ship with scikit-learn; `digits` is the one
# used by the cells that follow.
iris = datasets.load_iris()
digits = datasets.load_digits()

In [2]:
# Show the `data` array of the digits dataset (one row per sample).
digits.data

array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ..., 10.,  0.,  0.],
       [ 0.,  0.,  0., ..., 16.,  9.,  0.],
       ...,
       [ 0.,  0.,  1., ...,  6.,  0.,  0.],
       [ 0.,  0.,  2., ..., 12.,  0.,  0.],
       [ 0.,  0., 10., ..., 12.,  1.,  0.]])

In [3]:
# Show the `target` array: the label paired with each row of `digits.data`.
digits.target

array([0, 1, 2, ..., 8, 9, 8])

## Learning and predicting

In [4]:
from sklearn import svm

# Fit on every sample except the last one. fit() returns the estimator itself,
# so the fitted model can be assigned directly and then displayed.
clf = svm.SVC(C=100.0, gamma=0.001).fit(digits.data[:-1], digits.target[:-1])
clf

SVC(C=100.0, gamma=0.001)

In [5]:
# Predict the label of the last sample, the one left out when fitting above.
clf.predict(digits.data[-1:])

array([8])

## Model persistence


In [6]:
from sklearn import svm
from sklearn import datasets

# Load iris as a (features, labels) pair, then fit a default SVC on it.
# fit() returns the estimator, which is displayed as the cell result.
X, y = datasets.load_iris(return_X_y=True)
clf = svm.SVC().fit(X, y)
clf

SVC()

In [7]:
import pickle

# Serialize the fitted classifier to a byte string and restore it in memory.
# Only unpickle data you trust: loading a pickle can execute arbitrary code.
s = pickle.dumps(clf)
clf2 = pickle.loads(s)
# Print the restored model's predictions next to the true labels.
print("yhat:", clf2.predict(X[0:3]), " y:", y[0:3])


yhat: [0 0 0]  y: [0 0 0]


- joblib: more efficient on big data, but it can only pickle to disk and not to a string

In [8]:
from joblib import dump, load

# Write the fitted classifier to 'filename.joblib' (a relative path), then
# load it back from that file into a second variable.
dump(clf, 'filename.joblib')
clf2 = load('filename.joblib')

## Conventions

### 1. Type casting

In [9]:
import numpy as np
from sklearn import random_projection  # dimensionality reduction

# Seeded legacy generator so the sample matrix is the same on every run.
rng = np.random.RandomState(0)
# Draw a 10 x 2000 array of random floats and convert it to float32.
X = rng.rand(10, 2000).astype('float32')
# Display the dtype and shape as the cell result.
X.dtype, X.shape

(dtype('float32'), (10, 2000))

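- 차원 축소됨 (features: 2000 → 1973)
- In this run, the float32 input X is cast to float64 by fit_transform(X)
- Regression targets are cast to float64 when an estimator is fitted
- Classification targets are maintained when an estimator is fitted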

In [10]:
# Seed the projection so the random matrix, and therefore X_new, is the same
# on every run; without random_state each execution gives different values.
transformer = random_projection.GaussianRandomProjection(random_state=0)
X_new = transformer.fit_transform(X)
# Display the dtype and shape of the projected array as the cell result.
X_new.dtype, X_new.shape

(dtype('float64'), (10, 1973))

In [11]:
from sklearn import datasets
from sklearn.svm import SVC

# Reload iris and create a new SVC with default parameters; it is fitted in
# the cells below.
iris = datasets.load_iris()
clf = SVC()

- 종속변수 데이터(정수 레이블 `iris.target`)로 피팅 → 예측값도 정수

In [12]:
# Fit on the integer class labels, then predict the first three samples.
# fit() returns the estimator, so predict() can be chained onto it.
list(clf.fit(iris.data, iris.target).predict(iris.data[:3]))

[0, 0, 0]

- 종속변수 이름(문자열 `iris.target_names`)으로 피팅 → 예측값도 문자열

In [13]:
# Refit on string class names (target_names indexed by the integer labels),
# then predict the first three samples.
list(clf.fit(iris.data, iris.target_names[iris.target]).predict(iris.data[:3]))

['setosa', 'setosa', 'setosa']

### 2. Refitting and updating parameters

In [14]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.svm import SVC

# Features and labels as separate arrays (return_X_y=True), plus an unfitted
# SVC whose kernel is changed through set_params in the next cells.
X, y = load_iris(return_X_y=True)
clf = SVC()

- Calling fit() more than once will overwrite what was learned by any previous fit()

In [15]:
# Switch the existing estimator to a linear kernel, refit it, and predict the
# first 15 samples. set_params() and fit() both return the estimator.
clf.set_params(kernel='linear').fit(X, y).predict(X[:15])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [16]:
# Switch the kernel back to rbf and refit; this second fit() replaces what the
# previous fit learned. Then predict the first 15 samples.
clf.set_params(kernel='rbf').fit(X, y).predict(X[:15])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

### 3. Multiclass vs. multilabel fitting

- OneVsRestClassifier: https://bit.ly/3i2Faw4
- LabelBinarizer: One-hot encoding 해줌 (범주를 0과 1로 표현)
- MultiLabelBinarizer: 레이블 집합의 리스트(2d array)를 이진 indicator 행렬로 변환해준다

In [17]:
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer

# Five two-feature training samples shared by the examples below.
X = [
    [1, 2],
    [2, 4],
    [4, 5],
    [3, 2],
    [3, 1],
]
# One-vs-rest wrapper around a seeded SVC (the base estimator is passed as the
# first positional argument).
classif = OneVsRestClassifier(SVC(random_state=0))

In [18]:
# 1-D list of class labels: one label per sample in X.
y = [0, 0, 1, 1, 2]
# Fit first, then predict on the same samples; only the prediction is shown.
classif.fit(X, y)
classif.predict(X)

array([0, 0, 1, 1, 2])

In [19]:
# Binarize the raw class labels explicitly instead of re-reading `y`, so this
# cell gives the same result no matter which cell last assigned `y`.
y = LabelBinarizer().fit_transform([0, 0, 1, 1, 2])  # one-hot encoding
print(y)
# With a 2-D binary target, predict() returns a 2-D indicator matrix; an
# all-zero row means no class was predicted for that sample.
classif.fit(X, y).predict(X)

[[1 0 0]
 [1 0 0]
 [0 1 0]
 [0 1 0]
 [0 0 1]]


array([[1, 0, 0],
       [1, 0, 0],
       [0, 1, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [20]:
# Each sample carries a set of labels; MultiLabelBinarizer turns those sets
# into a binary indicator matrix.
y = MultiLabelBinarizer().fit_transform(
    [[0, 1], [0, 2], [1, 3], [0, 2, 3], [2, 4]]
)
print(y)
# Predictions are made on the training data itself, so they are expected to be
# overfit to it.
classif.fit(X, y).predict(X)

[[1 1 0 0 0]
 [1 0 1 0 0]
 [0 1 0 1 0]
 [1 0 1 1 0]
 [0 0 1 0 1]]


array([[1, 1, 0, 0, 0],
       [1, 0, 1, 0, 0],
       [0, 1, 0, 1, 0],
       [1, 0, 1, 0, 0],
       [1, 0, 1, 0, 0]])