# Machine Learning Basics

# 1. A linear classifier

In [None]:
%pdb

In [None]:
import numpy as np
import sklearn

from matplotlib import pyplot as plt
from sklearn import datasets

### Data Acquisition

In [None]:
# Load the iris data set through scikit-learn's `datasets` module.
iris = datasets.load_iris()

# Show the names of the measured features and of the class labels.
# The values are wrapped in 1-tuples so that each is formatted as a single item.
print("Features: %s" % (iris.feature_names,))
print("Labels  : %s" % (iris.target_names,))

### Data Exploration

In [None]:
# Unpack the dimensions of the feature matrix (samples x features);
# `n_samples` is needed later when the data is shuffled and split.
n_samples, n_features = iris.data.shape

# Print the shape of the feature matrix, then the shape of the label vector.
print((n_samples, n_features))
print(iris.target.shape)

In [None]:
# Scatter plot of the first two feature columns, coloured by class label.
plt.figure(2, figsize=(8, 6))
plt.clf()  # wipe whatever was drawn on figure 2 before

# Optional: uncomment to distinguish only two kinds of iris flowers.
# iris.target[iris.target > 0] = 1

sepal_length = iris.data[:, 0]
sepal_width = iris.data[:, 1]
plt.scatter(sepal_length, sepal_width, c=iris.target, cmap=plt.cm.Set1)

plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.show()

In [None]:
# Same kind of scatter plot as before, this time for the last two feature
# columns, again coloured by class label.
plt.figure(2, figsize=(8, 6))
plt.clf()  # wipe whatever was drawn on figure 2 before

petal_length = iris.data[:, 2]
petal_width = iris.data[:, 3]
plt.scatter(petal_length, petal_width, c=iris.target, cmap=plt.cm.Set1)

plt.xlabel('Petal length')
plt.ylabel('Petal width')
plt.show()

### Preprocessing

This would include things like feature extraction, dealing with missing numbers, outlier detection, data normalization, etc.

### Modeling

In [None]:
from sklearn import linear_model

# Positional hold-out: the final ten rows are kept back for testing and
# everything before them is used for training (no shuffling takes place).
train_rows = slice(None, -10)
test_rows = slice(-10, None)

# Create a logistic-regression classifier and fit it on the training rows.
clf = linear_model.LogisticRegression()
clf.fit(iris.data[train_rows], iris.target[train_rows])

# Let the fitted classifier label the held-out rows.
predictions = clf.predict(iris.data[test_rows])

# Show the predictions next to the known labels of the held-out rows.
print("Predicted label(s): %s" % (predictions,))
print("True label(s): %s" % (iris.target[test_rows],))

😮 ... Oops

### Modeling, again

In [None]:
# Draw a random ordering of all sample indices.
shuffle_index = np.random.permutation(n_samples)

# Number of test samples: 10 percent of the data (anything in the 10-20
# percent range is a reasonable choice); the remainder is used for training.
split = int(n_samples * 0.1)

# The first `split` shuffled indices form the test set, the rest the training set.
test_idx, train_idx = shuffle_index[:split], shuffle_index[split:]

# Select the matching rows of the feature matrix and entries of the label vector.
X_train, y_train = iris.data[train_idx], iris.target[train_idx]
X_test, y_test = iris.data[test_idx], iris.target[test_idx]

In [None]:
# Build a fresh classifier and fit it on the shuffled training split.
# Things to try: pass C=1e5 to LogisticRegression, or swap in
# sklearn.svm.SVC(gamma=0.001, C=100.) instead.
clf = linear_model.LogisticRegression().fit(X_train, y_train)

# Label the test split with the fitted classifier.
y_pred = clf.predict(X_test)

# Show the predictions next to the true labels of the test split.
print("Predicted labels: %s" % (y_pred,))
print("True labels     : %s" % (y_test,))

### Model Evaluation

In [None]:
from sklearn import metrics

In [None]:
# To keep the evaluation simple, only two classes are distinguished here,
# i.e. the task becomes a so-called binary classification problem.

# Merge class 2 into class 1, in place, for both label vectors.
for labels in (y_train, y_test):
    labels[labels == 2] = 1

# Build a fresh classifier and fit it on the relabelled training split.
clf = linear_model.LogisticRegression()
clf.fit(X_train, y_train)

# Label the test split with the fitted classifier.
y_pred = clf.predict(X_test)

# Show the predictions next to the true (binary) labels.
print("Predicted labels: %s" % (y_pred,))
print("True labels     : %s" % (y_test,))

#### Accuracy

The number of true positives plus the number of true negatives, divided by the total number of predictions.

In [None]:
# Count the predictions that agree with the true labels; this covers the
# true positives and the true negatives in one go.
true_predictions = int(np.count_nonzero(y_pred == y_test))

# Accuracy is the share of correct predictions among all predictions.
print(true_predictions / y_pred.size)

In [None]:
# Same quantity as computed by hand above, this time via scikit-learn.
accuracy = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

#### F1-Score

Task: Calculate the number of false positives and the number of false negatives in the predictions.

<img width="400px" align="left" src="files/fscore.png">

In [None]:
# Restore the original three-class labels for the existing train/test split.
y_train, y_test = iris.target[train_idx], iris.target[test_idx]

# Build a fresh classifier and fit it on the three-class training labels.
clf = linear_model.LogisticRegression().fit(X_train, y_train)

# Label the test split with the fitted classifier.
y_pred = clf.predict(X_test)

# Show the predictions next to the true labels of the test split.
print("Predicted labels: %s" % (y_pred,))
print("True labels     : %s" % (y_test,))

In [None]:
# F1 score of the predictions; average=None requests the scores without
# averaging them into a single number.
f1_score = metrics.f1_score(y_true=y_test, y_pred=y_pred, average=None)
print(f1_score)

### Task for those who finished the rest 😉

In [None]:
# Advanced snippet from http://scikit-learn.org/stable/auto_examples/linear_model/plot_iris_logistic.html

# Task: understand the following code which plots the decision boundaries of the trained classifier for two features
# and adapt it such that it shows the decision boundaries aka rules for the other two features.

# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.linear_model` is available as an attribute, so the cell would
# otherwise only work if an earlier cell happened to import it.
from sklearn import linear_model

# load the data set
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only use the first two features because it's easier to visualize 2D than 4D
Y = iris.target

h = .02  # step size in the mesh

# instantiate a classifier and train it
logreg = linear_model.LogisticRegression(C=1e5)
logreg.fit(X, Y)

# plot the decision boundary
# therefore assign a color to each point in the mesh [x_min, x_max]x[y_min, y_max]
# (the mesh extends half a unit beyond the data range on every side)
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# flatten the mesh into a list of (x, y) points and classify every one of them
Z = logreg.predict(np.c_[xx.ravel(), yy.ravel()])

# put the result into a color plot
# (reshape the flat predictions back to the mesh layout first)
Z = Z.reshape(xx.shape)

plt.figure(2, figsize=(8, 6))
plt.clf()

plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

# plot the training samples
plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors='k', cmap=plt.cm.Paired)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.show()