# Logistic Regression

### Objective: Model Setup and Tuning 

10/4/2018 <br>
Mooyoung Lee

## Iris dataset

In [1]:
from sklearn import datasets
iris = datasets.load_iris()
print(iris['DESCR'])

Iris Plants Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20  0.76     0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :Date: July, 1988

This is a copy of UCI ML iris d

In [2]:
import pandas as pd
import numpy as np

X = pd.DataFrame(iris.data, columns = iris.feature_names)
y = iris.target.astype(np.int)
X.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [3]:
X.isnull().sum()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
dtype: int64

# Scale data

In [4]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range = (0,1))
X_scaled = scaler.fit_transform(X)

In [5]:
X_scaled.min(), X_scaled.max()

(0.0, 1.0)

# Logistic Regression

Accuracy check w/ stratified 10-fold cross-validation

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
clf = LogisticRegression()
accuracy = cross_val_score(clf, X_scaled, y, cv = 10)
print('Accuracy: ', np.round(accuracy,3))
print('Mean Accuracy: ', np.mean(accuracy))

Accuracy:  [0.733 0.8   0.867 0.8   0.867 0.733 0.8   0.867 0.933 1.   ]
Mean Accuracy:  0.8400000000000001


# Tuning w/ penalty and regularization stringth

The smaller value of 'C' specify stronger regularization.

In [7]:
from sklearn.model_selection import RandomizedSearchCV

tuned_parameters = {'penalty' : ['l1','l2'],
                   'C' : [.0001, .001, .01, .1, .5, 1, 10, 100]}

clf = RandomizedSearchCV(LogisticRegression(), tuned_parameters, cv=10, n_iter = 10)
clf.fit(X,y)


## Print Results
print("Best parameters set found on development set:\n")
print(clf.best_params_)
print("Grid scores on development set:\n")
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))

Best parameters set found on development set:

{'penalty': 'l2', 'C': 100}
Grid scores on development set:

0.947 (+/-0.100) for {'penalty': 'l2', 'C': 0.5}
0.980 (+/-0.085) for {'penalty': 'l2', 'C': 100}
0.980 (+/-0.085) for {'penalty': 'l1', 'C': 10}
0.507 (+/-0.122) for {'penalty': 'l2', 'C': 0.001}
0.960 (+/-0.122) for {'penalty': 'l1', 'C': 1}
0.333 (+/-0.000) for {'penalty': 'l1', 'C': 0.01}
0.813 (+/-0.100) for {'penalty': 'l1', 'C': 0.1}
0.833 (+/-0.107) for {'penalty': 'l2', 'C': 0.1}
0.333 (+/-0.000) for {'penalty': 'l2', 'C': 0.0001}
0.947 (+/-0.080) for {'penalty': 'l1', 'C': 0.5}


# Tuning test for L2 penalty solvers

In [8]:
from sklearn.model_selection import GridSearchCV

tuned_parameters = {'penalty' : ['l2'],
                    'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                   'C' : [10]}

clf = GridSearchCV(LogisticRegression(), tuned_parameters, cv=10)
clf.fit(X,y)


## Print Results
print("Best parameters set found on development set:\n")
print(clf.best_params_)
print("Grid scores on development set:\n")
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))



Best parameters set found on development set:

{'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
Grid scores on development set:

0.960 (+/-0.088) for {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.960 (+/-0.088) for {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.967 (+/-0.123) for {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.967 (+/-0.123) for {'C': 10, 'penalty': 'l2', 'solver': 'sag'}
0.960 (+/-0.122) for {'C': 10, 'penalty': 'l2', 'solver': 'saga'}


