In [None]:
## kNN on Iris
# Fit a k-nearest-neighbours classifier on the Iris data and compare its
# predictions with the true labels of a 10-sample hold-out set.

import numpy as np
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier

iris = datasets.load_iris()
iris_X = iris.data
iris_y = iris.target
# A bare expression in the middle of a cell is evaluated but never displayed,
# so the distinct class labels are printed explicitly.
print(np.unique(iris_y))

# Shuffle the sample indices with a fixed seed so the split is reproducible,
# then keep the last 10 shuffled samples as the test set.
np.random.seed(0)
indices = np.random.permutation(len(iris_X))
train_idx, test_idx = indices[:-10], indices[-10:]
iris_X_train, iris_y_train = iris_X[train_idx], iris_y[train_idx]
iris_X_test, iris_y_test = iris_X[test_idx], iris_y[test_idx]

# The classifier is used with the library's default hyper-parameters.
knn = KNeighborsClassifier()
knn.fit(iris_X_train, iris_y_train)

# Predicted labels first, ground truth second, for a visual comparison.
print(knn.predict(iris_X_test))
print(iris_y_test)

In [None]:
## linear regression
# Ordinary least squares on the diabetes data; the last 20 samples are held
# out for evaluation.

from sklearn import linear_model

diabetes = datasets.load_diabetes()
diabetes_X_train = diabetes.data[:-20]
diabetes_y_train = diabetes.target[:-20]
diabetes_X_test = diabetes.data[-20:]
diabetes_y_test = diabetes.target[-20:]

# NOTE: the original cell also instantiated a throw-away
# LinearRegression(..., normalize = False) copied from printed output; the
# `normalize` keyword no longer exists in current scikit-learn and made the
# cell raise TypeError, so that statement is gone.
regr = linear_model.LinearRegression()
regr.fit(diabetes_X_train, diabetes_y_train)
print(regr.coef_)

# Mean squared error on the hold-out samples (printed, because only the last
# expression of a cell is displayed automatically).
print(np.mean((regr.predict(diabetes_X_test) - diabetes_y_test) ** 2))

# Score of the fitted model on the hold-out samples (shown as cell output).
regr.score(diabetes_X_test, diabetes_y_test)

In [None]:
## Ridge regression
# Fit two training points perturbed by noise six times, once with plain least
# squares and once with ridge regression, and draw every fitted line in a
# separate figure per estimator.

import matplotlib.pyplot as plt

X = np.c_[.5, 1].T
y = [.5, 1]
test = np.c_[0, 2].T

# Both estimators go through exactly the same procedure; re-seeding before
# each one gives them the identical sequence of noisy training sets.
for regr in (linear_model.LinearRegression(), linear_model.Ridge(alpha = .1)):
    plt.figure()
    np.random.seed(0)
    for _ in range(6):
        noise = .1 * np.random.normal(size = (2, 1))
        this_X = noise + X
        regr.fit(this_X, y)
        plt.plot(test, regr.predict(test))
        plt.scatter(this_X, y, s = 3)

In [None]:
## LASSO regression
# Scan a log-spaced grid of regularisation strengths, keep the one with the
# best hold-out score and refit the model with it.

regr = linear_model.Lasso()
alphas = np.logspace(-4, -1, 6)
scores = [
    regr.set_params(alpha = alpha)
        .fit(diabetes_X_train, diabetes_y_train)
        .score(diabetes_X_test, diabetes_y_test)
    for alpha in alphas
]
# NOTE: alpha is chosen on the same samples used for scoring, so the selected
# model's hold-out score is optimistic.
best_alpha = alphas[int(np.argmax(scores))]
regr.set_params(alpha = best_alpha)

# NOTE: the original cell also instantiated a throw-away
# Lasso(..., normalize = False) copied from printed output; `normalize` no
# longer exists in current scikit-learn and raised TypeError.
regr.fit(diabetes_X_train, diabetes_y_train)
print(regr.coef_)

In [None]:
## logistic regression
# Multiclass logistic regression on the Iris training split. A large C means
# very weak regularisation.

# `multi_class` is deprecated in recent scikit-learn releases; with the lbfgs
# solver a multinomial model is already what gets fitted for a multiclass
# target, so the argument is simply dropped.
log = linear_model.LogisticRegression(solver = 'lbfgs', C = 1e5)

# fit() returns the estimator, so the fitted model is shown as the cell
# output (the original showed a hand-pasted, unfitted copy instead).
log.fit(iris_X_train, iris_y_train)

In [None]:
## support vector machine
# RBF-kernel support vector classifier on the Iris training split, checked
# against the 10 held-out samples.

from sklearn import svm

svc = svm.SVC(kernel = 'rbf')
svc.fit(iris_X_train, iris_y_train)
# NOTE: a pasted `svm.SVC(..., gamma = 'auto_deprecated', ...)` repr used to
# follow here; it built an unused estimator with a setting that current
# scikit-learn no longer recognises, so it was removed.

# Predicted labels first, ground truth second.
print(svc.predict(iris_X_test))
print(iris_y_test)

In [None]:
## score

from sklearn import datasets, svm, linear_model
import numpy as np

digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target
svc = svm.SVC(C = 1, kernel = 'linear')
svc.fit(X_digits[:-100], y_digits[:-100]).score(X_digits[-100:], y_digits[-100:])

X_folds = np.array_split(X_digits, 3)
y_folds = np.array_split(y_digits, 3)
scores = list()

for k in range(3):
    X_train = list(X_folds)
    X_test = X_train.pop(k)
    X_train = np.concatenate(X_train)
    y_train = list(y_folds)
    y_test = y_train.pop(k)
    y_train = np.concatenate(y_train)
    scores.append(svc.fit(X_train, y_train).score(X_test, y_test))
print(scores)

In [None]:
## cross-validation
# Demonstrates KFold index generation on a toy list, then three ways of
# cross-validating the linear SVC on the digits data.

from sklearn.model_selection import KFold, cross_val_score

# Toy data: only its length matters for the index split shown below.
X = ['a', 'a', 'a', 'b', 'b', 'c', 'c', 'c', 'c', 'c']
k_fold = KFold(n_splits = 5)
for train_indices, test_indices in k_fold.split(X):
    print('train: %s | test %s' % (train_indices, test_indices))

# Manual loop over the folds. Intermediate results are printed: only the
# last expression of a cell is displayed, so these used to be computed and
# silently thrown away.
fold_scores = [
    svc.fit(X_digits[train], y_digits[train]).score(X_digits[test], y_digits[test])
    for train, test in k_fold.split(X_digits)
]
print(fold_scores)

# Same evaluation through the helper, with n_jobs = -1 for parallel folds.
print(cross_val_score(svc, X_digits, y_digits, cv = k_fold, n_jobs = -1))

# Alternative metric; as the last expression this is the cell output.
cross_val_score(svc, X_digits, y_digits, cv = k_fold, scoring = 'precision_macro')

In [None]:
## grid search

from sklearn.model_selection import GridSearchCV, cross_val_score

Cs = np.logspace(-6, -1, 10)
clf = GridSearchCV(estimator = svc, param_grid = dict(C = Cs), n_jobs = -1)
clf.fit(X_digits[:1000], y_digits[:1000])

clf.best_score_

clf.best_estimator_.C

clf.score(X_digits[1000:], y_digits[1000:])

cross_val_score(clf, X_digits, y_digits)


lasso = linear_model.LassoCV(cv = 3)
diabetes = datasets.load_diabetes()
X_diabetes = diabetes.data
y_diabetes = diabetes.target
lasso.fit(X_diabetes, y_diabetes)
linear_model.LassoCV(alphas = None, copy_X = True, cv = 3, eps = 0.001, fit_intercept = True,
                    max_iter = 1000, n_alphas = 100, n_jobs = None, normalize = False,
                    positive = False, precompute = 'auto', random_state = None,
                    selection = 'cyclic', tol = 0.0001, verbose = False)
lasso.alpha_

In [None]:
## k-means clustering
# Cluster the Iris measurements into three groups without using the labels,
# then print every 10th cluster id next to every 10th true label.

from sklearn import cluster, datasets

iris = datasets.load_iris()
X_iris = iris.data
y_iris = iris.target

k_means = cluster.KMeans(n_clusters = 3)
k_means.fit(X_iris)
# NOTE: a pasted `cluster.KMeans(algorithm = 'auto', ...)` repr used to follow
# here; it only built an unused estimator (with an option value that current
# scikit-learn no longer accepts at fit time) and was removed.

# Cluster ids are just indices assigned by the algorithm, so they need not
# coincide numerically with the class labels printed below them.
print(k_means.labels_[::10])
print(y_iris[::10])

In [None]:
## vector quantization
# Compress a grayscale sample image to 5 gray levels by clustering its pixel
# intensities and replacing every pixel with its cluster centre.

import scipy as sp  # alias kept so that cells relying on `sp` keep working
import numpy as np

# The sample image is provided by scipy.datasets in current SciPy (it may be
# downloaded on first use); scipy.misc.face is the legacy location and is
# only tried as a fallback for old installations. The original code called
# `sp.face`, which does not exist, and then fell back to scipy.misc only.
try:
    from scipy.datasets import face as load_face
except ImportError:
    from scipy.misc import face as load_face

face = load_face(gray = True)

# One row per pixel, a single intensity feature per row.
X = face.reshape((-1, 1))
k_means = cluster.KMeans(n_clusters = 5, n_init = 1)
k_means.fit(X)

values = k_means.cluster_centers_.squeeze()
labels = k_means.labels_
# Look up each pixel's cluster centre by fancy indexing (np.choose gives the
# same result but is limited to 32 choices), then restore the image shape.
face_compressed = values[labels].reshape(face.shape)

In [None]:
## connectivity-constrained clustering
# Prepare the coins image (smooth, downscale, flatten) and build the pixel
# adjacency graph used to constrain agglomerative clustering.

import matplotlib.pyplot as plt
from scipy import ndimage
from skimage.data import coins
from skimage.transform import rescale
from sklearn.feature_extraction.image import grid_to_graph
from sklearn.cluster import AgglomerativeClustering

orig_coins = coins()

# Smooth before downscaling. The filter is called through scipy.ndimage
# directly: the `scipy.ndimage.filters` namespace is deprecated, and the
# explicit import removes the dependency on an `sp` alias from another cell.
smoothed_coins = ndimage.gaussian_filter(orig_coins, sigma = 2)
rescaled_coins = rescale(smoothed_coins, 0.2, mode = 'reflect')

# One row per pixel, a single intensity feature per row.
X = np.reshape(rescaled_coins, (-1, 1))

# Connectivity graph built from the dimensions of the rescaled image.
connectivity = grid_to_graph(*rescaled_coins.shape)

In [None]:
## principal component analysis
# Build a 3-feature data set whose third column is the sum of the first two,
# inspect the explained variance, then project onto two components.

from sklearn import decomposition

x1 = np.random.normal(size = 100)
x2 = np.random.normal(size = 100)
x3 = x1 + x2  # exact linear combination of the other two features
X = np.c_[x1, x2, x3]

pca = decomposition.PCA()
pca.fit(X)
# NOTE: a pasted `decomposition.PCA(copy = True, ...)` repr used to follow
# here; it only built an unused estimator and was removed.
print(pca.explained_variance_)

# Keep two components and refit; set_params is the estimator API for
# changing a hyper-parameter after construction.
pca.set_params(n_components = 2)
X_reduced = pca.fit_transform(X)
X_reduced.shape

In [None]:
## independent component analysis
# Mix three noisy synthetic signals with a known matrix, un-mix them with
# FastICA, and check that the estimated model reproduces the observations.

from scipy import signal

# Source signals on a shared time axis: sinusoid, square wave, sawtooth.
time = np.linspace(0, 10, 2000)
s1 = np.sin(2 * time)
s2 = np.sign(np.sin(3 * time))
s3 = signal.sawtooth(2 * np.pi * time)
S = np.column_stack((s1, s2, s3))

# Add Gaussian noise, then scale every column to unit standard deviation.
noise = np.random.normal(size = S.shape)
S += 0.2 * noise
S /= S.std(axis = 0)

# Mixing matrix and the resulting observations.
A = np.array([
    [1, 1, 1],
    [0.5, 2, 1],
    [1.5, 1, 2],
])
X = S.dot(A.T)

ica = decomposition.FastICA()
S_ = ica.fit_transform(X)   # estimated sources
A_ = ica.mixing_.T          # estimated mixing matrix, transposed
# Cell output: whether the estimated sources and mixing reproduce X.
np.allclose(X, S_.dot(A_) + ica.mean_)