In [1]:
# This is the jupyter notebook.
#
# The jupyter notebook is comparable to an interactive python shell.
# There are some major differences, however.

In [2]:
# The basic unit of a jupyter notebook is a cell.  We are in a cell right now.
# A cell should be thought of as a list of instructions or code.
#
# A cell may be edited at any time; to execute its contents,
# press Shift + Enter.

In [3]:
# For example: assign a string to a variable, then print it.
my_words = "Hello MDST!"
print(my_words)

Hello MDST!


In [4]:
# The output of the code is displayed below the cell once it is executed.
#
# Variables that are set in one cell are available to any other cell in the notebook.
# This cell only assigns variables, so it produces no output of its own.
my_variable_1 = "Lorem Ipsum"
my_variable_2 = 23
my_variable_3 = 19.6732

In [5]:
# The value of a cell's last expression is shown as the cell's output.
my_variable_2 + my_variable_3

42.6732

In [6]:
# my_variable_1 was assigned in an earlier cell and is still available here.
print(my_variable_1)

Lorem Ipsum


In [7]:
# Variables are also still available even if the cell from which they come is changed or disappears entirely!

In [9]:
# NOTE(review): my_variable_4 is not assigned in any cell shown in this notebook.
# On a freshly restarted kernel this cell would presumably raise a NameError -
# keep that in mind for the question in the next cell.
my_variable_4

'Where did I come from?'

In [10]:
# Well?  Where did my_variable_4 come from?  Can you give a likely explanation?
# 

In [11]:
# What are some potential problems with variable names in a notebook?
# 

In [12]:
# Python has lots of libraries that we can import and make use of.
# For example, numpy has important numerical operations.
# Another library is sklearn, which includes powerful machine learning capabilities.
import numpy as np
import sklearn
import sklearn.datasets


In [14]:
# The notebook has lots of useful features, like tab completion.
# For example, put the cursor at the end of the next line, type a "." and press Tab
# to list everything that is available in np.linalg.
# (The line is written without the trailing dot so that the cell is valid Python
# and does not stop a "Restart & Run All" with a SyntaxError.)
np.linalg

In [15]:
# We will now do a simple machine learning task.
# If you are new to python or data science, DON'T PANIC if this looks overwhelming.

# A useful exercise would be to fill in the comment lines in the following code.

# Load the digits dataset that ships with sklearn, then pull out two of its entries.
digits = sklearn.datasets.load_digits()
Xdata, Ydata = digits['data'], digits['target']
# What data type is digits?
# What are Xdata and Ydata?
#

In [16]:
# train_test_split lives in sklearn.model_selection; the old sklearn.cross_validation
# module was deprecated in scikit-learn 0.18 and removed in 0.20.
from sklearn.model_selection import train_test_split

# The split is shuffled at random; a fixed random_state makes it identical on every run.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(Xdata, Ydata, random_state=0)
# Explain what Xtrain, Xtest, Ytrain, Ytest are and how they will be used.
#

In [17]:
from sklearn.neighbors import KNeighborsClassifier

# Only n_jobs is passed explicitly; no other constructor arguments are given.
k_neighbor_classifier = KNeighborsClassifier(n_jobs=4)
k_neighbor_classifier.fit(Xtrain, Ytrain)

# What is k_neighbor_classifier?
# What does the fit method do?
#

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=4, n_neighbors=5, p=2,
           weights='uniform')

In [18]:
# Call predict on the classifier fitted above, passing the Xtest part of the split.
yhat = k_neighbor_classifier.predict(Xtest)

# In your own words, what is yhat?
#

In [21]:
from sklearn.metrics import confusion_matrix

# Ytest comes from the train/test split and yhat from predict; the result is
# displayed as the cell's output because it is the cell's last expression.
confusion_matrix(Ytest, yhat)
# What do the entries of the confusion matrix represent?
#

array([[46,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 54,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0, 39,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0, 46,  0,  0,  0,  0,  0,  1],
       [ 0,  0,  0,  0, 39,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0, 55,  1,  0,  0,  1],
       [ 0,  0,  0,  0,  0,  0, 37,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0, 43,  0,  0],
       [ 0,  2,  0,  0,  0,  0,  0,  0, 41,  0],
       [ 0,  0,  0,  0,  0,  1,  0,  0,  1, 43]])