In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 

import warnings
warnings.filterwarnings(action = "ignore")

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing, model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [8]:
# Load data
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [9]:
X[:3]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2]])

In [10]:
np.unique(y) #setosa versicolor virginica

array([0, 1, 2])

In [11]:
# Standarize features
scaler = StandardScaler()
X_std = scaler.fit_transform(X)
X_std = pd.DataFrame(data = X_std, columns=iris.feature_names)
y = pd.Series(data = y, name = "Species" )

In [12]:
X_std.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444


- multi_class : str, {‘ovr’, ‘multinomial’, 'auto'}, default: ‘ovr’
If the option chosen is ‘ovr’, then a binary problem is fit for each label. ‘auto’ selects ‘ovr’ if the data is binary, or if solver=’liblinear’, and otherwise selects ‘multinomial’.

solver : str, {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default: ‘liblinear’.

- For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and ‘saga’ are faster for large ones.

- For ‘multinomial’ the loss minimised is the multinomial loss fit across the entire probability distribution, even when the data is binary. ‘multinomial’ is unavailable when solver=’liblinear’. Please use 'newton-cg','sag', 'saga' and 'lbfs' handle multinomial loss.   Choosing to minimize softmax loss. ‘liblinear’ is limited to one-versus-rest schemes.


In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size = .25, random_state = 1)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((112, 4), (38, 4), (112,), (38,))

In [15]:
clf = LogisticRegression(random_state=0) #default: multi_class will take ovr and solver will take liblinear value
clf.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [16]:
ypred = clf.predict(X_test)
ypred

array([0, 1, 1, 0, 2, 2, 2, 0, 0, 2, 1, 0, 2, 2, 2, 0, 1, 2, 0, 0, 1, 2,
       2, 0, 2, 1, 0, 0, 1, 2, 2, 2, 1, 2, 2, 0, 1, 0])

In [17]:
pd.crosstab(y_test, ypred)

col_0,0,1,2
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,13,0,0
1,0,9,7
2,0,0,9


In [18]:
from sklearn.metrics import classification_report

In [20]:
print(classification_report(y_test, ypred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      0.56      0.72        16
           2       0.56      1.00      0.72         9

   micro avg       0.82      0.82      0.82        38
   macro avg       0.85      0.85      0.81        38
weighted avg       0.90      0.82      0.82        38



In [17]:
# Create one-vs-rest / multinomial logistic regression object
clf = LogisticRegression(random_state=0, 
                         multi_class='multinomial', 
                         solver='newton-cg')
#Algorithm to use in the optimization problem.
clf.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'max_iter': 100,
 'multi_class': 'multinomial',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 0,
 'solver': 'newton-cg',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [18]:
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=0, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False)

In [19]:
pd.crosstab(clf.predict(X_test), y_test)

Species,0,1,2
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,13,0,0
1,0,15,0
2,0,1,9


In [22]:
params = [{
    'C': np.logspace(-5,5,10),
    'multi_class': ['ovr', 'multinomial'], 
    'solver': ['newton-cg', 'lbfgs','sag', 'saga']
}]

In [23]:
import sklearn
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'accuracy', 'roc_auc', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'brier_score_loss', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted'])

In [24]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(estimator = clf, param_grid=params, scoring = "accuracy")
grid.fit(X_std, y)

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'C': array([1.00000e-05, 1.29155e-04, 1.66810e-03, 2.15443e-02, 2.78256e-01,
       3.59381e+00, 4.64159e+01, 5.99484e+02, 7.74264e+03, 1.00000e+05]), 'multi_class': ['ovr', 'multinomial'], 'solver': ['newton-cg', 'lbfgs', 'sag', 'saga']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [25]:
grid.best_params_

{'C': 3.593813663804626, 'multi_class': 'multinomial', 'solver': 'newton-cg'}

In [24]:
grid.best_score_

0.9665683020683021

In [25]:
# Train model
model = grid.best_estimator_
model.fit(X_train, y_train)

LogisticRegression(C=3.593813663804626, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='multinomial', n_jobs=None, penalty='l2',
          random_state=0, solver='newton-cg', tol=0.0001, verbose=0,
          warm_start=False)

In [26]:
ypred = model.predict(X_test)
ypred

array([0, 1, 1, 0, 2, 1, 2, 0, 0, 2, 1, 0, 2, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       2, 0, 2, 1, 0, 0, 1, 2, 1, 2, 1, 2, 2, 0, 1, 0])

In [27]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [28]:
accuracy_score(y_test, ypred)

0.9736842105263158

In [29]:
df = pd.DataFrame({"y": y_test, "ypred": ypred})

In [30]:
pd.crosstab(df.y, df.ypred)

ypred,0,1,2
y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,13,0,0
1,0,15,1
2,0,0,9


In [31]:
# View predicted probabilities
p_values = model.predict_proba(X_test)
p_values[:3]

array([[9.95477274e-01, 4.52272575e-03, 5.01195607e-11],
       [1.00624611e-01, 8.97602563e-01, 1.77282601e-03],
       [1.78570919e-03, 9.46270958e-01, 5.19433327e-02]])