### Classification Metrics

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Binary classification: versicolor vs. not versicolor

In [5]:
# Load the Iris data shipped with scikit-learn; its fields are read by key in later cells.
dataset = load_iris()

In [6]:
# Print the description text stored alongside the data under the 'DESCR' key.
print(dataset['DESCR'])

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

:Number of Instances: 150 (50 in each of three classes)
:Number of Attributes: 4 numeric, predictive attributes and the class
:Attribute Information:
    - sepal length in cm
    - sepal width in cm
    - petal length in cm
    - petal width in cm
    - class:
            - Iris-Setosa
            - Iris-Versicolour
            - Iris-Virginica

:Summary Statistics:

                Min  Max   Mean    SD   Class Correlation
sepal length:   4.3  7.9   5.84   0.83    0.7826
sepal width:    2.0  4.4   3.05   0.43   -0.4194
petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

:Missing Attribute Values: None
:Class Distribution: 33.3% for each of 3 classes.
:Creator: R.A. Fisher
:Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
:Date: July, 1988

The famous Iris database, first used by Sir R.A. Fisher. The dataset is taken
from Fis

In [7]:
# Feature matrix: one row per sample, four measurements per row.
X = dataset["data"]

In [8]:
# Preview the first five feature rows.
X[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [9]:
# Integer class label for each sample.
y = dataset["target"]

In [10]:
# Preview the first five labels.
y[:5]

array([0, 0, 0, 0, 0])

In [13]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score, classification_report

In [21]:
# Distinct class labels present in the target.
np.unique(y)

array([0, 1, 2])

In [22]:
# Collapse the three-class target into a binary one: versicolor against everything else.
y_b = (y == 1).astype(int)
# class 1 = versicolor: y == 1 -> True -> 1, any other class -> False -> 0
# y_b = 1 -> versicolor, 0 -> not versicolor

In [23]:
# Sanity check: the binarised target should contain only 0 and 1.
np.unique(y_b)

array([0, 1])

In [24]:
# Preview the first five binary labels.
y_b[:5]

array([0, 0, 0, 0, 0])

In [35]:
# Candidate classifiers, keyed by the display name printed in the evaluation loop.
# The tree-based estimators use randomness while fitting, so their seed is pinned
# here; otherwise a Restart & Run All can yield different fitted models each time.
RANDOM_STATE = 42

models = {
    "logistic regression": LogisticRegression(),
    "Support vector classifier": SVC(),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE)
}

In [40]:
# Train each candidate on the binary target and score it against the very same
# samples it was fitted on, so every figure printed here is a training-set score.
# Confusion-matrix layout: 1st row -> not versicolor (neg),
#                          2nd row -> versicolor (pos).
for model_name, model in models.items():
    # fit() hands back the estimator itself, so fitting and predicting can be chained.
    y_hat = model.fit(X, y_b).predict(X)

    banner = model_name.center(90)
    rule = "-" * 90

    print(banner)
    print("Confusion Matrix", confusion_matrix(y_b, y_hat), sep="\n")
    for label, scorer in (
        ("precision: ", precision_score),
        ("Recall: ", recall_score),
        ("F1 Score: ", f1_score),
        ("Accuracy: ", accuracy_score),
    ):
        print(label, scorer(y_b, y_hat))
    print(rule)

                                   logistic regression                                    
Confusion Matrix
[[88 12]
 [29 21]]
precision:  0.6363636363636364
Recall:  0.42
F1 Score:  0.5060240963855421
Accuracy:  0.7266666666666667
------------------------------------------------------------------------------------------
                                Support vector classifier                                 
Confusion Matrix
[[94  6]
 [ 2 48]]
precision:  0.8888888888888888
Recall:  0.96
F1 Score:  0.9230769230769231
Accuracy:  0.9466666666666667
------------------------------------------------------------------------------------------
                                      Decision Tree                                       
Confusion Matrix
[[100   0]
 [  0  50]]
precision:  1.0
Recall:  1.0
F1 Score:  1.0
Accuracy:  1.0
------------------------------------------------------------------------------------------
                                      Random Forest                      

In [32]:
# Per-class report for the logistic-regression model specifically.
# Predictions are taken from that fitted model directly rather than from the
# loop variable `y_hat`, which after the evaluation loop holds the predictions
# of whichever model was processed last.
y_hat_logreg = models["logistic regression"].predict(X)
print(classification_report(y_b, y_hat_logreg))

              precision    recall  f1-score   support

           0       0.75      0.88      0.81       100
           1       0.64      0.42      0.51        50

    accuracy                           0.73       150
   macro avg       0.69      0.65      0.66       150
weighted avg       0.71      0.73      0.71       150

