In [1]:
# Basic supervised classification, following the scikit-learn tutorial:
# http://scikit-learn.org/stable/tutorial/basic/tutorial.html#introduction

from sklearn import datasets
iris = datasets.load_iris()
digits = datasets.load_digits()

# Each loaded dataset is a dict-like object holding the data plus some metadata.
# The samples are stored in the .data member, an (n_samples, n_features) array.
# For a supervised problem, the response variable(s) are stored in the
# .target member.

In [2]:
# .data holds the features used to classify the digit samples
# (one row per sample, as the printed array below shows).
print(digits.data)

[[  0.   0.   5. ...,   0.   0.   0.]
 [  0.   0.   0. ...,  10.   0.   0.]
 [  0.   0.   0. ...,  16.   9.   0.]
 ..., 
 [  0.   0.   1. ...,   6.   0.   0.]
 [  0.   0.   2. ...,  12.   0.   0.]
 [  0.   0.  10. ...,  12.   1.   0.]]


In [3]:
# .target holds the ground truth for the digits dataset: the label we want
# to learn for each sample.
digits.target

array([0, 1, 2, ..., 8, 9, 8])

In [4]:
# .data is always a 2D array of shape (n_samples, n_features).
# Each digit sample started out as an image of shape (8, 8); the original
# images are still available through .images.
digits.images[0]

array([[  0.,   0.,   5.,  13.,   9.,   1.,   0.,   0.],
       [  0.,   0.,  13.,  15.,  10.,  15.,   5.,   0.],
       [  0.,   3.,  15.,   2.,   0.,  11.,   8.,   0.],
       [  0.,   4.,  12.,   0.,   0.,   8.,   8.,   0.],
       [  0.,   5.,   8.,   0.,   0.,   9.,   8.,   0.],
       [  0.,   4.,  11.,   0.,   1.,  12.,   7.,   0.],
       [  0.,   2.,  14.,   5.,  10.,  12.,   0.,   0.],
       [  0.,   0.,   6.,  13.,  10.,   0.,   0.,   0.]])

In [5]:
# LEARNING & PREDICTING
# To predict the classes of unseen samples we first have to fit an estimator.
# In scikit-learn an estimator exposes fit(X, y) and predict(T).
# One such estimator is the class sklearn.svm.SVC (support vector classification).
from sklearn.svm import SVC

# Model parameters: gamma is set by hand here, although a good value can
# also be found automatically.
clf = SVC(C=100.0, gamma=0.001)  # named "clf" because the estimator is a classifier


In [6]:
# The estimator has to be fitted to the data, i.e. it must learn from it.
# Training set: every image in the dataset except the last one ([:-1]).
clf.fit(X=digits.data[:-1], y=digits.target[:-1])

SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [8]:
# Predict the class of the last image, the one sample left out of training.
clf.predict(X=digits.data[-1:])

array([8])

<img src="http://scikit-learn.org/stable/_images/plot_digits_last_image_0011.png" alt="Image of the last digit in the dataset, the sample whose class was predicted above">

In [None]:
# Next example:
# http://scikit-learn.org/stable/auto_examples/classification/plot_digits_classification.html#example-classification-plot-digits-classification-py