In [85]:
from sklearn import datasets

# Load the diabetes regression dataset and hold out the last 20 samples
# as a test split; everything before them is used for training.
diabetes = datasets.load_diabetes()
diabetes_X_train, diabetes_X_test = diabetes.data[:-20], diabetes.data[-20:]
diabetes_y_train, diabetes_y_test = diabetes.target[:-20], diabetes.target[-20:]

In [86]:
from sklearn import linear_model
# Build an ordinary least-squares model and fit it on the training split.
# fit() returns the estimator itself, so the chained call binds the fitted model.
regr = linear_model.LinearRegression().fit(diabetes_X_train, diabetes_y_train)

# Last expression of the cell: display the fitted estimator's repr.
regr

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [87]:
# Inspect the coefficients learned by the fit above.
regr.coef_

array([  3.03499549e-01,  -2.37639315e+02,   5.10530605e+02,
         3.27736980e+02,  -8.14131709e+02,   4.92814588e+02,
         1.02848452e+02,   1.84606489e+02,   7.43519617e+02,
         7.60951722e+01])

In [88]:
# List the estimator's constructor parameter names through the public API.
# get_params() returns a {name: value} dict; sorting its keys gives the same
# alphabetical list without relying on the private _get_param_names() helper.
sorted(regr.get_params(deep=False))

['copy_X', 'fit_intercept', 'n_jobs', 'normalize']

In [89]:
import numpy as np
# Mean squared error of the model on the held-out test split.
residuals = regr.predict(diabetes_X_test) - diabetes_y_test
np.mean(residuals ** 2)

2004.5676026898207

In [90]:
# Show the predicted targets followed by the true targets for the test split,
# so the two arrays can be compared by eye.
diabetes_y_pred = regr.predict(diabetes_X_test)
print(diabetes_y_pred)
print(diabetes_y_test)

[ 197.61846908  155.43979328  172.88665147  111.53537279  164.80054784
  131.06954875  259.12237761  100.47935157  117.0601052   124.30503555
  218.36632793   61.19831284  132.25046751  120.3332925    52.54458691
  194.03798088  102.57139702  123.56604987  211.0346317    52.60335674]
[ 233.   91.  111.  152.  120.   67.  310.   94.  183.   66.  173.   72.
   49.   64.   48.  178.  104.  132.  220.   57.]


In [91]:
# Evaluate the fitted model on the held-out test split; score() returns the
# coefficient of determination R^2 of the prediction.
regr.score(diabetes_X_test, diabetes_y_test)

0.58507530226905746

In [92]:
%%html
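위의 결과들을 보면 알 수 있듯이 예측 정확도가 그리 높지 않다.
<br>
score 메서드는 "Returns the coefficient of determination R^2 of the prediction."라고 설명되어 있다. R^2 = 1 - SS_res / SS_tot 로 정의되며, 위에서 np.mean(~~)으로 계산한 평균제곱오차(MSE)와 같은 잔차를 사용하지만 값 자체는 서로 다른 지표이다 (MSE는 낮을수록, R^2는 1에 가까울수록 좋다).
<br>
차원당 data point 수가 적으면 관측 노이즈가 fit 결과에 큰 영향을 주어 예측한 값들의 variance 가 높아진다 (= 학습 데이터가 조금만 달라져도 예측이 크게 흔들림).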

In [93]:
# Tiny synthetic 1-D regression problem: two training points as a (2, 1)
# column, their targets, and two query points used to draw the fitted line.
X = np.array([.5, 1.]).reshape(-1, 1)
y = [.5, 1]
test = np.array([0, 2]).reshape(-1, 1)
print(X, y, test)

[[ 0.5]
 [ 1. ]] [0.5, 1] [[0]
 [2]]


In [94]:
%matplotlib notebook

import matplotlib.pyplot as plt
plt.figure()

np.random.seed(0)
for _ in range(6):
    this_X = .1 * np.random.normal(size=(2, 1)) + X  # high variance
    print('this_X : ', this_X)
    print(X)
    regr.fit(X, y)
    plt.plot(test, regr.predict(test))
    plt.scatter(this_X, y, s=3)

<IPython.core.display.Javascript object>

this_X :  [[ 0.67640523]
 [ 1.04001572]]
[[ 0.5]
 [ 1. ]]


this_X :  [[ 0.5978738 ]
 [ 1.22408932]]
[[ 0.5]
 [ 1. ]]
this_X :  [[ 0.6867558 ]
 [ 0.90227221]]
[[ 0.5]
 [ 1. ]]
this_X :  [[ 0.59500884]
 [ 0.98486428]]
[[ 0.5]
 [ 1. ]]
this_X :  [[ 0.48967811]
 [ 1.04105985]]
[[ 0.5]
 [ 1. ]]
this_X :  [[ 0.51440436]
 [ 1.14542735]]
[[ 0.5]
 [ 1. ]]


In [95]:
%matplotlib notebook

regr = linear_model.Ridge(alpha=.1)

plt.figure()

np.random.seed(0)
for _ in range(6):
    this_X = .1*np.random.normal(size=(2, 1)) + X
    regr.fit(this_X, y)
    plt.plot(test, regr.predict(test))
    plt.scatter(this_X, y, s=3)

<IPython.core.display.Javascript object>

In [96]:
%%html
high-dimensional 통계적 학습에서 높은 variance를 낮추는 해결책으로, 회귀 계수를 0 쪽으로 축소(shrink)하는 Ridge 회귀가 있음.
<br>
bias/variance tradeoff 의 예시임.
<br>
Ridge의 alpha 값이 높을수록 bias 는 높아지고 variance 는 낮아짐 (alpha 가 지나치게 크면 underfitting).

In [97]:
# We can choose alpha to minimize left out error, this time using the diabetes dataset rather than our synthetic data:
alphas = np.logspace(-4, -1, 6)
from __future__ import print_function
print([regr.set_params(alpha=alpha
                       ).fit(diabetes_X_train, diabetes_y_train,
                             ).score(diabetes_X_test, diabetes_y_test) for alpha in alphas])

[0.58511106838835314, 0.58520730154446743, 0.58546775406984897, 0.58555120365039148, 0.58307170855541612, 0.570589994372801]


In [98]:
%%html
Ridge 는 계수를 0 쪽으로 축소만 하지만, Lasso 는 일부 계수를 정확히 0으로 만들어(sparsity) feature 를 선택하므로 차원을 줄이는 용도로 쓸 수 있음

In [99]:
from sklearn import linear_model
import numpy as np

# Sweep a log-spaced grid of alphas with a Lasso model, keep the alpha that
# scores best on the held-out split, refit with it and show the coefficients.
regr = linear_model.Lasso()
alphas = np.logspace(-4, -1, 6)

scores = []
for alpha in alphas:
    regr.set_params(alpha=alpha)
    regr.fit(diabetes_X_train, diabetes_y_train)
    scores.append(regr.score(diabetes_X_test, diabetes_y_test))

# First alpha reaching the highest test score.
top_score = max(scores)
best_alpha = alphas[scores.index(top_score)]

regr.set_params(alpha=best_alpha).fit(diabetes_X_train, diabetes_y_train)

print(regr.coef_)

[   0.         -212.43764548  517.19478111  313.77959962 -160.8303982    -0.
 -187.19554705   69.38229038  508.66011217   71.84239008]


In [100]:
from sklearn import datasets
# Load iris, shuffle it with a fixed seed, hold out the last 10 shuffled
# samples for testing, and fit a weakly regularised logistic regression.
iris = datasets.load_iris()
iris_X, iris_y = iris.data, iris.target
np.unique(iris_y)  # not the last expression of the cell, so it is not displayed

np.random.seed(0)
indices = np.random.permutation(len(iris_X))
train_idx, test_idx = indices[:-10], indices[-10:]
iris_X_train, iris_y_train = iris_X[train_idx], iris_y[train_idx]
iris_X_test, iris_y_test = iris_X[test_idx], iris_y[test_idx]

logistic = linear_model.LogisticRegression(C=1e5)
logistic.fit(iris_X_train, iris_y_train)

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [101]:
# Predicted classes for the held-out iris samples, followed by the true classes.
iris_y_pred = logistic.predict(iris_X_test)
print(iris_y_pred)
print(iris_y_test)

[1 2 1 0 0 0 2 1 2 0]
[1 1 1 0 0 0 2 1 2 0]


In [115]:
from sklearn import datasets, neighbors, linear_model

# Compare k-nearest-neighbours and logistic regression on the digits data,
# training on everything except the last 50 samples and testing on those 50.
digits = datasets.load_digits()
X_digits, y_digits = digits.data, digits.target

X_digits_train, X_digits_test = X_digits[:-50], X_digits[-50:]
y_digits_train, y_digits_test = y_digits[:-50], y_digits[-50:]

# fit() returns the estimator, so each name is bound to the fitted model.
knn = neighbors.KNeighborsClassifier().fit(X_digits_train, y_digits_train)
logistic = linear_model.LogisticRegression().fit(X_digits_train, y_digits_train)

knn_score = knn.score(X_digits_test, y_digits_test)
print('Knn Score : %f' % knn_score)
logistic_score = logistic.score(X_digits_test, y_digits_test)
print('LogisticRegression Score : %f' % logistic_score)

Knn Score : 0.980000
LogisticRegression Score : 0.980000


In [125]:
from sklearn import datasets, neighbors, linear_model

# Same KNN vs. logistic-regression comparison on the digits data, this time
# training on the first 90% of the samples and testing on the remaining 10%.
digits = datasets.load_digits()
X_digits, y_digits = digits.data, digits.target

n_samples = len(X_digits)
split_at = int(.9 * n_samples)  # index of the first test sample

X_train, X_test = X_digits[:split_at], X_digits[split_at:]
y_train, y_test = y_digits[:split_at], y_digits[split_at:]

knn = neighbors.KNeighborsClassifier()
logistic = linear_model.LogisticRegression()

# Fit and score one model at a time so the output order stays the same.
knn.fit(X_train, y_train)
print('KNN score: %f' % knn.score(X_test, y_test))
logistic.fit(X_train, y_train)
print('LogisticRegression score: %f' % logistic.score(X_test, y_test))

KNN score: 0.961111


LogisticRegression score: 0.938889


In [127]:
from sklearn import svm
# Support vector classifier with a degree-3 polynomial kernel, trained on
# the iris training split; fit() returns the estimator itself.
svc = svm.SVC(degree=3, kernel='poly').fit(iris_X_train, iris_y_train)

# Last expression of the cell: display the fitted estimator's repr.
svc

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [128]:
# Rebind `svc` to a new classifier that uses the RBF (radial) kernel.
# No gamma is passed here, so the estimator's default is used.
svc = svm.SVC(kernel='rbf')
# gamma: inverse of the size of the
# radial kernel