In [1]:
# Load the library with the iris dataset
from sklearn.datasets import load_iris

from sklearn.linear_model import Lasso
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

# Load pandas
import pandas as pd

# Load numpy
import numpy as np

# Create an object called iris with the iris data
iris = load_iris()

# Create a dataframe with the four feature variables
df = pd.DataFrame(iris.data, columns=iris.feature_names)

# View the top 5 rows
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [2]:
# Add a new column with the species names, this is what we are going to try to predict
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)

# View the top 5 rows
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [4]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'],
      dtype='<U10')

## Create training and test data

In [5]:
# Create a new column that for each row, generates a random number between 0 and 1, and
# if that value is less than or equal to .75, then sets the value of that cell as True
# and false otherwise. This is a quick and dirty way of randomly assigning some rows to
# be used as the training data and some as the test data.
df['is_train'] = np.random.uniform(0, 1, len(df)) <= .75

# View the top 5 rows
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
0,5.1,3.5,1.4,0.2,setosa,True
1,4.9,3.0,1.4,0.2,setosa,True
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True


In [6]:
# Create two new dataframes, one with the training rows, one with the test rows
train, test = df[df['is_train']==True], df[df['is_train']==False]
train.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
0,5.1,3.5,1.4,0.2,setosa,True
1,4.9,3.0,1.4,0.2,setosa,True
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True


In [7]:
# Show the number of observations for the test and training dataframes
print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:',len(test))

Number of observations in the training data: 104
Number of observations in the test data: 46


## Preprocess Data

In [8]:
# Create a list of the feature column's names

# print df.columns
# Index([u'sepal length (cm)', u'sepal width (cm)', u'petal length (cm)',
#        u'petal width (cm)', u'species', u'is_train'],
#       dtype='object')

features = df.columns[:4]

features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [9]:
train[features].head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [10]:
# Standarize features
scaler = StandardScaler()
train_features_std = scaler.fit_transform(train[features])
train_features_std[:10]

array([[-0.94206709,  1.12109594, -1.46138035, -1.38532643],
       [-1.17561888, -0.11926553, -1.46138035, -1.38532643],
       [-1.40917068,  0.37687906, -1.51899352, -1.38532643],
       [-1.52594657,  0.12880677, -1.40376717, -1.38532643],
       [-1.05884299,  1.36916823, -1.46138035, -1.38532643],
       [-1.52594657,  0.87302364, -1.46138035, -1.25482466],
       [-1.05884299,  0.87302364, -1.40376717, -1.38532643],
       [-1.75949837, -0.36733782, -1.46138035, -1.38532643],
       [-1.17561888,  0.12880677, -1.40376717, -1.5158282 ],
       [-1.29239478,  0.87302364, -1.346154  , -1.38532643]])

In [11]:
test_features_std = scaler.fit_transform(test[features])
test_features_std[:10]

array([[-0.41073612,  1.69487278, -0.95017196, -0.90193065],
       [-0.41073612,  1.2877493 , -1.06489089, -1.17669098],
       [ 0.13691204,  1.89843453, -1.23696927, -1.17669098],
       [-0.41073612,  1.69487278, -1.17960981, -0.90193065],
       [-0.82147223,  1.2877493 , -1.06489089, -0.90193065],
       [-0.82147223,  0.47350232, -0.95017196, -0.76455048],
       [-0.95838427,  0.67706406, -1.00753142, -0.90193065],
       [-0.68456019,  0.88062581, -1.06489089, -1.17669098],
       [-0.68456019,  0.67706406, -1.12225035, -1.17669098],
       [-1.36912038,  0.26994057, -1.00753142, -1.17669098]])

In [12]:
# train['species'] contains the actual species names. Before we can use it,
# we need to convert each species name into a digit. So, in this case there
# are three species, which have been coded as 0, 1, or 2.

# print pd.factorize(train['species'])
# (array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
#        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
#        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
#        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
#        2, 2, 2, 2]), Index([u'setosa', u'versicolor', u'virginica'], dtype='object'))

y = pd.factorize(train['species'])[0]

y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

## Train the LR Classifier

In [13]:
# Create a random forest classifier. By convention, clf means 'classifier'
clf = Lasso(alpha=0.1)
# n_jobs parameter tells the engine how many processors is it allowed to use. 
# A value of “-1” means there is no restriction whereas a value of “1” means it can only use one processor.

# Train the classifier to take the training features and learn how they relate
# to the training y (the species)
clf.fit(train_features_std, y)

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

## Apply Classifier to Test Data

In [22]:
# Apply the classifier we trained to the test data (which, remember, it has never seen before)
est = clf.predict(test_features_std)
# est = est>=0.5
# Stop here: How do we deal with multi-class problem in Lasso regression??
est

array([ 0.44714458,  0.29678287,  0.25303831,  0.3888185 ,  0.41798154,
        0.50774392,  0.43256306,  0.29678287,  0.28220135,  0.31136439,
        0.41798154,  0.28220135,  0.25303831,  0.26761983,  0.32821916,
        0.32821916,  0.55376173,  0.34280069,  0.31136439,  1.53660139,
        1.04404493,  1.41540271,  1.49058357,  1.44683901,  1.52201986,
        1.52201986,  1.32791359,  1.58034595,  1.07320798,  1.10237102,
        1.67010833,  1.47600205,  1.26731425,  1.04404493,  1.29647729,
        1.06089971,  1.8642146 ,  2.13577499,  2.36131756,  2.15490301,
        2.35677105,  2.1817928 ,  1.76214395,  1.62181726,  2.09430367,
        1.85190633])

In [15]:
# # View the predicted probabilities of the first 10 observations
# clf.predict_proba(test_features_std)[0:10]
# # There are three species of plant, thus [ 1. , 0. , 0. ] tells us that the classifier is certain that 
# # the plant is the first class. Taking another example, [ 0.9, 0.1, 0. ] tells us that the classifier gives 
# # a 90% probability the plant belongs to the first class and a 10% probability the plant belongs to the 
# # second class. Because 90 is greater than 10, the classifier predicts the plant is the first class.

AttributeError: 'Lasso' object has no attribute 'predict_proba'

## Evaluate Classifier

In [21]:
# Create actual english names for the plants for each predicted plant class
preds = iris.target_names[est]
preds

IndexError: boolean index did not match indexed array along dimension 0; dimension is 3 but corresponding boolean dimension is 46

In [56]:
# View the ACTUAL species for the first five observations
test['species'].head()

1     setosa
8     setosa
13    setosa
15    setosa
22    setosa
Name: species, dtype: category
Categories (3, object): [setosa, versicolor, virginica]

In [57]:
# Create confusion matrix
pd.crosstab(test['species'], preds, rownames=['Actual Species'], colnames=['Predicted Species'])

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,10,0,0
versicolor,0,9,3
virginica,0,1,8


## Compute Accuracy, Precision, Recall

In [58]:
print(classification_report(test['species'], preds))
print(accuracy_score(test['species'], preds))

             precision    recall  f1-score   support

     setosa       1.00      1.00      1.00        10
 versicolor       0.90      0.75      0.82        12
  virginica       0.73      0.89      0.80         9

avg / total       0.88      0.87      0.87        31

0.870967741935
