Task:
Apply and compare, Naïve Bayesian, DT's, Logistic regression  and SVM by *hyper parameter tuning* for the classification of IRIS flower category.

In [1]:
#Importing packages
from sklearn.model_selection import train_test_split
from sklearn import svm, datasets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

In [2]:
iris = datasets.load_iris()

In [3]:
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [4]:
iris.target_names #label

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [5]:
df = pd.DataFrame(iris.data,columns=iris.feature_names) #columns=iris.feature_names : used to get column names in 2D array
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [6]:
df.shape

(150, 4)

In [7]:
df['target'] = iris.target #adding target column to the dataset
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [8]:
df[df.target==1].head() # print top 5 rows with target==1

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
50,7.0,3.2,4.7,1.4,1
51,6.4,3.2,4.5,1.5,1
52,6.9,3.1,4.9,1.5,1
53,5.5,2.3,4.0,1.3,1
54,6.5,2.8,4.6,1.5,1


Spliting into X & Y

In [9]:
X = df.drop(['target'],axis='columns') #dropping label from X
Y = df.target

In [10]:
X.shape

(150, 4)

In [11]:
Y.shape

(150,)

In [12]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=5)

In [13]:
len(X_train)

112

In [14]:
len(X_test)

38

Grid Search CV

In [15]:
#Python scikit-learn provides a Pipeline utility to help automate machine learning workflows.
#Pipelines work by allowing for a linear sequence of data transforms to be chained together 
#culminating in a modeling process that can be evaluated.

pipe = Pipeline([('classifier' , svm.SVC())])

# Create param grid
param_grid = [
    {'classifier' : [LogisticRegression()],
     'classifier__penalty' : ['l1', 'l2'],
    'classifier__C' : np.logspace(-4, 4, 20),
    'classifier__solver' : ['liblinear']},
     {'classifier' : [DecisionTreeClassifier()]},
    {'classifier' : [GaussianNB()]},
    {'classifier' : [svm.SVC(C=[0.1,1,10,100],
                             gamma=[1,0.1,0.01,0.001],
                             kernel=['linear', 'poly', 'rbf', 'sigmoid'])]}]


# Create grid search object
clf = GridSearchCV(pipe, param_grid = param_grid, cv = 2, verbose=True, n_jobs=-1)

# Fit on data
best_clf = clf.fit(X_train, Y_train)

Fitting 2 folds for each of 43 candidates, totalling 86 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  86 out of  86 | elapsed:    1.8s finished


In [16]:
best_clf.best_estimator_.get_params()['classifier'] #To see which algo is best

LogisticRegression(C=4.281332398719396, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:
grid_predictions = best_clf.predict(X_test) 

In [18]:
print('Model accuracy is',best_clf.score(X_test, Y_test)) #Test accuracy

Model accuracy is 0.8947368421052632


In [19]:
print(best_clf.score(X_train,Y_train)) #Train accuracy

0.9732142857142857


In [20]:
print(classification_report(Y_test,grid_predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       0.92      0.79      0.85        14
           2       0.79      0.92      0.85        12

    accuracy                           0.89        38
   macro avg       0.90      0.90      0.90        38
weighted avg       0.90      0.89      0.89        38

