Logistic regressions
---

In [None]:
from sklearn import datasets

# Load the iris data set bundled with scikit-learn
iris = datasets.load_iris()
# Show the type of the container object returned by load_iris()
print('Type:', type(iris))

In [None]:
# Inspect object: list the keys available on the loaded data set
iris.keys()

In [None]:
# Get the names of the input features
iris.feature_names

In [None]:
import pandas as pd

# Category distribution: number of samples per target class.
# The top-level pd.value_counts() helper is deprecated (pandas >= 2.1),
# so wrap the label array in a Series and use the method instead.
pd.Series(iris['target']).value_counts()

In [None]:
# Categories: print the names of the target classes
print('Target names:', iris.target_names)

In [None]:
# Create the binary target vector: 1 where the original label is 0, else 0
# (the plotting code in this notebook treats y == 1 as "setosa")
y = (iris['target'] == 0).astype(int)

In [None]:
# Build the input matrix from columns 2 and 3 of the full feature matrix;
# these are the petal features (plotted later as petal length / petal width)
X = iris['data'][:, [2, 3]]

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Boolean mask selecting the positive class (setosa)
setosa_idx = (y == 1)

# Setosa samples
plt.scatter(
    X[setosa_idx, 0], X[setosa_idx, 1], color='C3', label='setosa')
# Everything else (versicolor and virginica)
plt.scatter(
    X[~setosa_idx, 0], X[~setosa_idx, 1], color='C0', label='other')

# Axis labels and legend
plt.xlabel('petal length (cm)')
plt.ylabel('petal width (cm)')
plt.legend()
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Split into train/test sets: hold out 30% of the samples for testing;
# the fixed random_state makes the split reproducible across runs
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.3, random_state=0)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Create the estimator: standardize the features, then apply a logistic
# regression with its default settings
logreg = make_pipeline(StandardScaler(), LogisticRegression())

# Fit it to train data (the trailing ';' suppresses the cell's output)
logreg.fit(X_tr, y_tr);

In [None]:
# Rebuild the pipeline, this time naming the solver explicitly
logreg = make_pipeline(
    StandardScaler(),
    LogisticRegression(solver='liblinear'),
)

# Train on the training split
logreg.fit(X_tr, y_tr)

# Evaluate on the held-out test split and report the accuracy
accuracy = logreg.score(X_te, y_te)
print('Accuracy: {:.3f}'.format(accuracy))

In [None]:
# Measurements of a new flower: [petal length (cm), petal width (cm)]
new_flower = [1.5, 0.3]

# Classify it; the sample is wrapped in a list to form a one-row 2D input
logreg.predict([new_flower])

In [None]:
# Get the predicted probability of each class for the new flower
logreg.predict_proba([new_flower])

In [None]:
import numpy as np

# Helper function
def decision_surface(ax, X, y, logreg):
    """Draw the data points and the classifier's decision surface on ``ax``.

    All artists are added to the given axes (not to whatever axes pyplot
    currently considers active), so the evaluation grid built from the axes
    limits always matches the plotted data.

    Parameters
    ----------
    ax : matplotlib axes
        Axes to draw on.
    X : array-like with two columns
        Column 0 is plotted on the x-axis (petal length), column 1 on the
        y-axis (petal width).
    y : array-like
        Binary labels; samples with ``y == 1`` are drawn as setosa.
    logreg : fitted classifier
        Must provide ``predict_proba`` for two-feature inputs; column 1 of
        its output is used as the setosa probability (assumes the positive
        class is the second class of the estimator).
    """
    # Accept plain lists as well as arrays for the boolean masking below
    X = np.asarray(X)
    y = np.asarray(y)

    # Plot data points
    setosa_idx = (y == 1)  # Setosa data points

    ax.scatter(X[:, 0][setosa_idx], X[:, 1][setosa_idx],
               color='C3', label='setosa')  # Setosa
    ax.scatter(X[:, 0][~setosa_idx], X[:, 1][~setosa_idx],
               color='C0', label='other')  # Versicolor and virginica

    # Create a grid of values covering the area shown after the scatter plots
    xlim, ylim = ax.get_xlim(), ax.get_ylim()
    x_values = np.linspace(*xlim, num=40)
    y_values = np.linspace(*ylim, num=40)
    xx, yy = np.meshgrid(x_values, y_values)
    points = np.c_[xx.flatten(), yy.flatten()]

    # Probability for positive class (setosa)
    probs = logreg.predict_proba(points)
    prob_setosa = probs[:, 1]

    # Draw decision boundary (p=0.5)
    zz = prob_setosa.reshape(xx.shape)
    ax.contour(xx, yy, zz, levels=[0.5], colors='gray')

    # Plot decision surface with level curves; keep the handle so that the
    # colorbar is tied to this surface explicitly
    surface = ax.contourf(xx, yy, zz, 10, alpha=0.3, cmap=plt.cm.coolwarm)

    # Add labels
    ax.set_xlabel('petal length (cm)')
    ax.set_ylabel('petal width (cm)')
    ax.legend()
    ax.figure.colorbar(surface, ax=ax, label='probability')
    plt.show()

In [None]:
# Plot decision surface: create a fresh figure and hand its axes, the full
# data set and the fitted pipeline to the helper
fig = plt.figure()
decision_surface(fig.gca(), X, y, logreg)