Following the tutorial at https://chrisalbon.com/machine_learning/logistic_regression/logistic_regression/

In [40]:
# Load the library with the iris dataset
from sklearn.datasets import load_iris

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

# Load pandas
import pandas as pd

# Load numpy
import numpy as np

# Create an object called iris with the iris data
iris = load_iris()

# Create a dataframe with the four feature variables
df = pd.DataFrame(iris.data, columns=iris.feature_names)

# View the top 5 rows
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [41]:
# Add a new column with the species names, this is what we are going to try to predict
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)

# View the top 5 rows
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [42]:
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [43]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'],
      dtype='<U10')

## Create training and test data

In [44]:
# Create a new column that for each row, generates a random number between 0 and 1, and
# if that value is less than or equal to .75, then sets the value of that cell as True
# and false otherwise. This is a quick and dirty way of randomly assigning some rows to
# be used as the training data and some as the test data.
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75

# View the top 5 rows
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
0,5.1,3.5,1.4,0.2,setosa,True
1,4.9,3.0,1.4,0.2,setosa,False
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True


In [45]:
# Create two new dataframes, one with the training rows, one with the test rows
train, test = df[df['is_train']==True], df[df['is_train']==False]
train.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
0,5.1,3.5,1.4,0.2,setosa,True
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True
5,5.4,3.9,1.7,0.4,setosa,True


In [46]:
# Show the number of observations for the test and training dataframes
print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:',len(test))

Number of observations in the training data: 119
Number of observations in the test data: 31


## Preprocess Data

In [47]:
# Create a list of the feature column's names

# print df.columns
# Index([u'sepal length (cm)', u'sepal width (cm)', u'petal length (cm)',
#        u'petal width (cm)', u'species', u'is_train'],
#       dtype='object')

features = df.columns[:4]

features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [48]:
train[features].head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4


In [49]:
# Standarize features
scaler = StandardScaler()
train_features_std = scaler.fit_transform(train[features])
train_features_std[:10]

array([[-0.9373903 ,  1.10834653, -1.35245055, -1.32794147],
       [-1.4353789 ,  0.34742401, -1.40910018, -1.32794147],
       [-1.55987605,  0.09378317, -1.29580091, -1.32794147],
       [-1.06188745,  1.36198737, -1.35245055, -1.32794147],
       [-0.56389885,  2.12290989, -1.18250164, -1.06760369],
       [-1.55987605,  0.85470569, -1.35245055, -1.19777258],
       [-1.06188745,  0.85470569, -1.29580091, -1.32794147],
       [-1.1863846 ,  0.09378317, -1.29580091, -1.45811037],
       [-0.56389885,  1.61562821, -1.29580091, -1.32794147],
       [-1.31088175,  0.85470569, -1.23915128, -1.32794147]])

In [50]:
test_features_std = scaler.fit_transform(test[features])
test_features_std[:10]

array([[-1.00261275, -0.03502131, -1.3010662 , -1.26410887],
       [-1.55565538, -0.21596475, -1.3010662 , -1.26410887],
       [-1.6662639 , -0.03502131, -1.47466614, -1.40209258],
       [-0.11774456,  2.49818686, -1.24319956, -0.98814144],
       [-1.33443833,  1.05063933, -1.53253279, -1.26410887],
       [-0.67078718,  1.95535654, -1.24319956, -1.40209258],
       [-0.33896161,  2.13629998, -1.3010662 , -1.26410887],
       [-1.00261275,  0.14592213, -1.24319956, -1.40209258],
       [-1.44504685, -1.3016254 , -1.35893285, -1.12612515],
       [-0.56017866,  1.23158277, -1.24319956, -1.26410887]])

In [51]:
# train['species'] contains the actual species names. Before we can use it,
# we need to convert each species name into a digit. So, in this case there
# are three species, which have been coded as 0, 1, or 2.

# print pd.factorize(train['species'])
# (array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
#        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
#        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
#        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
#        2, 2, 2, 2]), Index([u'setosa', u'versicolor', u'virginica'], dtype='object'))

y = pd.factorize(train['species'])[0]

y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2])

## Train the LR Classifier

In [52]:
# Create a random forest classifier. By convention, clf means 'classifier'
clf = LogisticRegression(n_jobs=2)
# n_jobs parameter tells the engine how many processors is it allowed to use. 
# A value of “-1” means there is no restriction whereas a value of “1” means it can only use one processor.

# Train the classifier to take the training features and learn how they relate
# to the training y (the species)
clf.fit(train_features_std, y)

  " = {}.".format(self.n_jobs))


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=2,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

## Apply Classifier to Test Data

In [53]:
# Apply the classifier we trained to the test data (which, remember, it has never seen before)
clf.predict(test_features_std)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2,
       2, 2, 2, 2, 2, 1, 2, 2])

In [54]:
# View the predicted probabilities of the first 10 observations
clf.predict_proba(test_features_std)[0:10]
# There are three species of plant, thus [ 1. , 0. , 0. ] tells us that the classifier is certain that 
# the plant is the first class. Taking another example, [ 0.9, 0.1, 0. ] tells us that the classifier gives 
# a 90% probability the plant belongs to the first class and a 10% probability the plant belongs to the 
# second class. Because 90 is greater than 10, the classifier predicts the plant is the first class.

array([[  7.65036622e-01,   2.34596344e-01,   3.67034355e-04],
       [  7.26916168e-01,   2.72772967e-01,   3.10865824e-04],
       [  7.61294591e-01,   2.38539331e-01,   1.66077915e-04],
       [  9.83775517e-01,   1.52949260e-02,   9.29556595e-04],
       [  9.14403112e-01,   8.53510085e-02,   2.45879745e-04],
       [  9.59997229e-01,   3.97009832e-02,   3.01788121e-04],
       [  9.72390278e-01,   2.71919142e-02,   4.17807365e-04],
       [  7.77532053e-01,   2.22188100e-01,   2.79847230e-04],
       [  5.60140245e-01,   4.39440552e-01,   4.19202973e-04],
       [  9.22351785e-01,   7.71939498e-02,   4.54265148e-04]])

## Evaluate Classifier

In [55]:
# Create actual english names for the plants for each predicted plant class
preds = iris.target_names[clf.predict(test_features_std)]
preds

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'virginica', 'virginica',
       'versicolor', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'versicolor', 'virginica', 'versicolor', 'versicolor',
       'versicolor', 'virginica', 'virginica', 'virginica', 'virginica',
       'virginica', 'virginica', 'versicolor', 'virginica', 'virginica'],
      dtype='<U10')

In [56]:
# View the ACTUAL species for the first five observations
test['species'].head()

1     setosa
8     setosa
13    setosa
15    setosa
22    setosa
Name: species, dtype: category
Categories (3, object): [setosa, versicolor, virginica]

In [57]:
# Create confusion matrix
pd.crosstab(test['species'], preds, rownames=['Actual Species'], colnames=['Predicted Species'])

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,10,0,0
versicolor,0,9,3
virginica,0,1,8


## Compute Accuracy, Precision, Recall

In [58]:
print(classification_report(test['species'], preds))
print(accuracy_score(test['species'], preds))

             precision    recall  f1-score   support

     setosa       1.00      1.00      1.00        10
 versicolor       0.90      0.75      0.82        12
  virginica       0.73      0.89      0.80         9

avg / total       0.88      0.87      0.87        31

0.870967741935
