In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 

import warnings
warnings.filterwarnings(action = "ignore")

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing, model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [34]:
# Load data
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [35]:
X[:3]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2]])

In [36]:
np.unique(y) #setosa versicolor virginica

array([0, 1, 2])

In [37]:
# Standarize features
scaler = StandardScaler()
X_std = scaler.fit_transform(X)
X_std = pd.DataFrame(data = X_std, columns=iris.feature_names)
y = pd.Series(data = y, name = "Species" )

In [38]:
X_std.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444


- multi_class : str, {‘ovr’, ‘multinomial’, 'auto'}, default: ‘ovr’
If the option chosen is ‘ovr’, then a binary problem is fit for each label. ‘auto’ selects ‘ovr’ if the data is binary, or if solver=’liblinear’, and otherwise selects ‘multinomial’.

solver : str, {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default: ‘liblinear’.

- For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and ‘saga’ are faster for large ones.

- For ‘multinomial’ the loss minimised is the multinomial loss fit across the entire probability distribution, even when the data is binary. ‘multinomial’ is unavailable when solver=’liblinear’. Please use 'newton-cg','sag', 'saga' and 'lbfs' handle multinomial loss.   Choosing to minimize softmax loss. ‘liblinear’ is limited to one-versus-rest schemes.


In [39]:
from sklearn.model_selection import train_test_split

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size = .25, random_state = 1)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((112, 4), (38, 4), (112,), (38,))

In [41]:
clf = LogisticRegression(random_state=0) #default: multi_class will take ovr and solver will take liblinear value
clf.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [42]:
ypred = clf.predict(X_test)
ypred

array([0, 1, 1, 0, 2, 2, 2, 0, 0, 2, 1, 0, 2, 2, 2, 0, 1, 2, 0, 0, 1, 2,
       2, 0, 2, 1, 0, 0, 1, 2, 2, 2, 1, 2, 2, 0, 1, 0])

In [43]:
pd.crosstab(y_test, ypred)

col_0,0,1,2
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,13,0,0
1,0,9,7
2,0,0,9


In [44]:
# Create one-vs-rest logistic regression object
clf = LogisticRegression(random_state=0, 
                         multi_class='multinomial', 
                         solver='newton-cg')
#Algorithm to use in the optimization problem.
clf.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'multinomial',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 0,
 'solver': 'newton-cg',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [45]:
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=0, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [46]:
pd.crosstab(clf.predict(X_test), y_test)

Species,0,1,2
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,13,0,0
1,0,15,0
2,0,1,9


In [47]:
params = [{
    'C': np.logspace(-5,5,10),
    'multi_class': ['ovr', 'multinomial'], 
    'solver': ['newton-cg', 'lbfgs','sag', 'saga']
}]

In [48]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(estimator = clf, param_grid=params, scoring = "accuracy")
grid.fit(X_std, y)

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100,
                                          multi_class='multinomial',
                                          n_jobs=None, penalty='l2',
                                          random_state=0, solver='newton-cg',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid=[{'C': array([1.00000000e-05, 1.29154967e-04, 1.66810054e-03, 2.15443469e-02,
       2.78255940e-01, 3.59381366e+00, 4.64158883e+01, 5.99484250e+02,
       7.74263683e+03, 1.00000000e+05]),
                          'multi_class': ['ovr', 'multinomial'],
        

In [49]:
grid.best_params_

{'C': 3.593813663804626, 'multi_class': 'multinomial', 'solver': 'newton-cg'}

In [50]:
grid.best_score_

0.9666666666666667

In [51]:
# Train model
model = grid.best_estimator_
model.fit(X_train, y_train)

LogisticRegression(C=3.593813663804626, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='multinomial', n_jobs=None,
                   penalty='l2', random_state=0, solver='newton-cg', tol=0.0001,
                   verbose=0, warm_start=False)

In [52]:
ypred = model.predict(X_test)
ypred

array([0, 1, 1, 0, 2, 1, 2, 0, 0, 2, 1, 0, 2, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       2, 0, 2, 1, 0, 0, 1, 2, 1, 2, 1, 2, 2, 0, 1, 0])

In [53]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [54]:
accuracy_score(y_test, ypred)

0.9736842105263158

In [109]:
df = pd.DataFrame({"y": y_test, "ypred": ypred})

In [110]:
pd.crosstab(df.y, df.ypred)

ypred,0,1,2
y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,13,0,0
1,0,15,1
2,0,0,9


In [111]:
accuracy_score(y_test, ypred)

0.9736842105263158

In [112]:
# View predicted probabilities
p_values = model.predict_proba(X_test)
p_values[:3]

array([[9.95477274e-01, 4.52272575e-03, 5.01195607e-11],
       [1.00624611e-01, 8.97602563e-01, 1.77282601e-03],
       [1.78570919e-03, 9.46270958e-01, 5.19433327e-02]])

----

### Trying out different models

In [135]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [150]:
lore = LogisticRegression()
rf = RandomForestClassifier()
dt = DecisionTreeClassifier()
ada = AdaBoostClassifier()
bag = BaggingClassifier()
gb = GradientBoostingClassifier()
#vot = VotingClassifier()
knn = KNeighborsClassifier()
nb = GaussianNB()
svc = SVC()
xgb = XGBClassifier()

In [138]:
classifiers = [lore, dt, rf, ada, bag, gb, knn, nb, svc, xgb]

In [139]:
y_train.value_counts()

2    41
0    37
1    34
Name: Species, dtype: int64

In [140]:
from sklearn.metrics import f1_score, accuracy_score
d_train = {}
d_test = {}

for model in classifiers:
    m = str(model)
    model.fit(X_train, y_train)
    d_train[m[:m.find("(")]] = accuracy_score(y_train, model.predict(X_train))
    d_test[m[:m.find("(")]] = accuracy_score(y_test, model.predict(X_test))
    

In [141]:
d_train

{'LogisticRegression': 0.9196428571428571,
 'DecisionTreeClassifier': 1.0,
 'RandomForestClassifier': 1.0,
 'AdaBoostClassifier': 0.9553571428571429,
 'BaggingClassifier': 1.0,
 'GradientBoostingClassifier': 1.0,
 'KNeighborsClassifier': 0.9553571428571429,
 'GaussianNB': 0.9464285714285714,
 'SVC': 0.9642857142857143,
 'XGBClassifier': 1.0}

In [142]:
d_test

{'LogisticRegression': 0.8157894736842105,
 'DecisionTreeClassifier': 0.9736842105263158,
 'RandomForestClassifier': 0.9736842105263158,
 'AdaBoostClassifier': 0.9736842105263158,
 'BaggingClassifier': 0.9736842105263158,
 'GradientBoostingClassifier': 0.9736842105263158,
 'KNeighborsClassifier': 0.9736842105263158,
 'GaussianNB': 0.9736842105263158,
 'SVC': 0.9736842105263158,
 'XGBClassifier': 0.9736842105263158}

In [143]:
confusion_matrix(y_test, lore.predict(X_test))

array([[13,  0,  0],
       [ 0,  9,  7],
       [ 0,  0,  9]], dtype=int64)

----

### Implementing XGBoost

In [16]:
import xgboost as xgb
from xgboost import XGBClassifier

In [17]:
mod = XGBClassifier(n_jobs = -1, objective = "multi:softmax", random_state = 1, silent = 1)

In [18]:
mod.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 3,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 100,
 'n_jobs': -1,
 'nthread': None,
 'objective': 'multi:softmax',
 'random_state': 1,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': None,
 'silent': 1,
 'subsample': 1,
 'verbosity': 1}

In [19]:
mod.fit(X_test, y_test)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
              nthread=None, objective='multi:softprob', random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=1, subsample=1, verbosity=1)

In [20]:
accuracy_score(y_test,mod.predict(X_test))

1.0

----

### Voting classifier

In [144]:
from sklearn.ensemble import VotingClassifier

In [174]:
lore = LogisticRegression()
rf = RandomForestClassifier()
dt = DecisionTreeClassifier()
ada = AdaBoostClassifier()
bag = BaggingClassifier()
gb = GradientBoostingClassifier()
#vot = VotingClassifier()
knn = KNeighborsClassifier()
nb = GaussianNB()
svc = SVC()
xgb = XGBClassifier()

In [175]:
classifiers = [lore, dt, rf, ada, bag, gb, knn, nb, svc, xgb]
model_name = [str(model)[:str(model).find("(")] for model in classifiers]
print(model_name)
estimators = list(zip(model_name, classifiers))

['LogisticRegression', 'DecisionTreeClassifier', 'RandomForestClassifier', 'AdaBoostClassifier', 'BaggingClassifier', 'GradientBoostingClassifier', 'KNeighborsClassifier', 'GaussianNB', 'SVC', 'XGBClassifier']


In [178]:
vc = VotingClassifier(estimators)

In [179]:
vc.fit(X_train,y_train)

VotingClassifier(estimators=[('LogisticRegression',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='warn', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('DecisionTreeClassifier',
                              DecisionTreeClassifier(class_weight=N...
                                            colsample_bynode=1,
                                            colsample_bytree=1, gamma=0,
           

In [180]:
confusion_matrix(y_test, y_pred = vc.predict(X_test))

array([[13,  0,  0],
       [ 0, 15,  1],
       [ 0,  0,  9]], dtype=int64)

In [181]:
vc.estimators

[('LogisticRegression',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                     intercept_scaling=1, l1_ratio=None, max_iter=100,
                     multi_class='warn', n_jobs=None, penalty='l2',
                     random_state=None, solver='warn', tol=0.0001, verbose=0,
                     warm_start=False)),
 ('DecisionTreeClassifier',
  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                         max_features=None, max_leaf_nodes=None,
                         min_impurity_decrease=0.0, min_impurity_split=None,
                         min_samples_leaf=1, min_samples_split=2,
                         min_weight_fraction_leaf=0.0, presort=False,
                         random_state=None, splitter='best')),
 ('RandomForestClassifier',
  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                         max_depth=None, max_features='auto', max_leaf_nodes=None,
   