# Confusion Matrix

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Misalkan terdapat data aktual (y) & data hasil prediksi model (yp)
y = ['Yes', 'No', 'Yes', 'No', 'Yes', 'No']
yp = ['Yes', 'Yes', 'Yes', 'No', 'Yes', 'Yes']

<hr>

### 1. Create confusion matrix manually

- | __pred "Yes"__ | __pred "No"__
- | - | -
__aktual "Yes"__ | __*3*__ | __*0*__ 
__aktual "No"__ | __*2*__ | __*1*__

<hr>

### 2. Create confusion matrix using Sklearn

In [4]:
from sklearn.metrics import confusion_matrix

x = confusion_matrix(y, yp, labels=["Yes", 'No'])
df = pd.DataFrame(
    x, columns=['Pred YES', 'Pred NO'], index=['Actual YES', 'Actual NO'])
df

Unnamed: 0,Pred YES,Pred NO
Actual YES,3,0
Actual NO,2,1


<hr>

### True, False, Positif, Negatif

- __True Positives (TP)__: Prediksi YES, Aktual YES.
- __True Negatives (TN)__: Prediksi NO, Aktual NO.
- __False Positives (FP)__: Prediksi YES, Aktual NO.
- __False Negatives (FN)__: Prediksi NO, Aktual YES.


- | __pred "Yes"__ | __pred "No"__
- | - | -
__aktual "Yes"__ | __*TP = 3*__ | __*FN = 0*__ 
__aktual "No"__ | __*FP = 2*__ | __*TN = 1*__


In [13]:
# show TN, FP, FN, TP from confusion matrix 
tn, fp, fn, tp = confusion_matrix(y, yp).ravel()
tn, fp, fn, tp

(1, 2, 0, 3)

<hr>

### 3. Evaluation Metrics from Confusion Matrix

- __Accuracy__: Overall, how often is the classifier correct?
    
    ```bash
    (TP + TN) / total_data = (3 + 1)/6 = 0.67
    ```
    
    
- __Misclassification Rate / Error Rate__: Overall, how often is it wrong?

    ```bash
    (FP + FN) / total_data = (2 + 0)/6 = 0.33
    
    or
    
    1 - Accuracy = 1 - 0.67 = 0.33
    ```
    
    
- __TP Rate / Sensitivity / Recall (positif)__: When it's actually yes, how often does it predict yes?

    ```bash
    TP / total_actual_YES = TP / (TP + FN) = 3/3 = 1 = 100%
    ```
    
    
- __FP Rate__: When it's actually no, how often does it predict yes?

    ```bash
    FP / total_actual_NO = FP / (FP + TN) = 2/3 = 0.67
    ```
    
    
- __TN Rate / Specificity / Selectivity / Recall (negatif)__: When it's actually no, how often does it predict no?

    ```bash
    TN / total_actual_NO = TN / (FP + TN) = 1/3 = 0.33
    
    or
    
    1 - False Positive Rate
    ```
     
    
- __FN Rate__: When it's actually yes, how often does it predict no?

    ```bash
    FN / total_actual_YES = FN / (TP + FN) = 0/3 = ~
    
    or
    
    1 - True Positive Rate
    ```
    
    
- __Precision(+)__: When it predicts yes, how often is it correct?

    ```bash
    TP / total_predict_YES = TP / (TP + FP) = 3/5 = 0.6
    ```
    
    
- __Precision(-)__: When it predicts no, how often is it correct?

    ```bash
    TN / total_predict_NO = TN / (TN + FN) = 1/1 = 1
    ```
    
    
- __Prevalence__: How often does the YES condition actually occur in our sample?

    ```bash
    actual_YES / total_data = (TP + FN) / total_data = 3/6 = 0.5 = 50%
    ```
    
    
- __Null Error Rate__: How often does the NO condition actually occur in our sample? How often you would be wrong if you always predicted the majority class. In our example, the null error rate would be: 
    
    ```bash
    aktual_NO / total_data = (FP + TN) / total_data = 3/6 = 0.5 
    
    *Coz if you always predict YES, you would only be wrong for the 3 "NO" cases!
    
    1 - Prevalence
    ```
    
    
- __F1 Score__: weighted average of the true positive rate (recall) and precision. The F1 score is the harmonic mean of the precision and recall, where an F1 score reaches its best value at 1 (perfect precision and recall) and worst at 0.
    
    ```bash
    F1 Score = 2 * ((precision(+) * recall(+)) / (precision(+) + recall(+)))

             = 2 * ((0.6 * 1) / (0.6 + 1)) = 2 * 0.375 = 0.75
    ```
    
    
- __Balanced Accuracy__: Accuracy is not a good metric for imbalanced data sets. For example, if you have 95 negative and 5 positive samples, classifying all as negative gives 0.95 accuracy score. Balanced Accuracy[9] (bACC) overcomes this problem, by normalizing true positive and true negative predictions by the number of positive and negative samples, respectively, and divides their sum into two.

    ```bash
    Balanced Acc = recallP (TP rate) + recallN (TN rate) / 2 = 1 + 0.3 / 2 = 0.66 
    ```
    

In [14]:
# akurasi = (TP + TN) / total_data
print('Akurasi =', (tp + tn) / (len(y)))

# Misclassification Rate / Error Rate = (FP + FN) / total_data
print('Error Rate =', (fp + fn) / (len(y)))

# TP Rate / Sensitivity / Recall (positif) = TP / total_actual_YES = TP / (TP + FN)
print('Recall(+) = ', tp / (tp + fn))

# FP Rate = FP / total_actual_NO = FP / (FP + TN)
print('FP Rate =', fp / (fp + tn))

# TN Rate / Specificity / Recall (negatif) = TN / total_actual_NO = TN / (FP + TN)
print('Recall(-) =', tn / (fp + tn))

# FN Rate = FN / total_actual_YES = FN / (TP + FN) = 0/3 = ~
print('FN Rate =', fn / (tp + fn))

# Precision(+) = TP / total_predict_YES = TP / (TP + FP)
print('Precision(+) =', tp / (tp + fp))

# Precision(-) = TN / total_predict_NO = TN / (TN + FN)
print('Precision(-) =', tn / (tn + fn))

# Prevalence = actual_YES / total_data = (TP + FN) / total_data
print('Prevalence =', (tp + fn) / len(y))

# Null Error Rate = aktual_NO / total_data = (FP + TN) / total_data
print('Null Error Rate =', (fp + tn) / len(y))

# F1 score = 2 * ((precision * recall) / (precision + recall))
print(
    'F1score =', 2 * ( ((tp / (tp + fp)) * (tp / (tp + fn))) / ((tp / (tp + fp)) + (tp / (tp + fn))) )
)

# Balanced accuracy = recall(+) + recall(-) / 2
print('Balanced acc =', ((tp / (tp + fn)) + (tn / (fp + tn))) / 2 )

Akurasi = 0.6666666666666666
Error Rate = 0.3333333333333333
Recall(+) =  1.0
FP Rate = 0.6666666666666666
Recall(-) = 0.3333333333333333
FN Rate = 0.0
Precision(+) = 0.6
Precision(-) = 1.0
Prevalence = 0.5
Null Error Rate = 0.5
F1score = 0.7499999999999999
Balanced acc = 0.6666666666666666


<hr>

### 4. Evaluation Metrics from Confusion Matrix with Sklearn

- #### __Accuracy & Misclassification Rate/Error Rate__

In [15]:
from sklearn.metrics import accuracy_score
print('Akurasi =', accuracy_score(y, yp))

# same as accuracy score
# print('Akurasi =', model.score(y, yp))

print('Error Rate =', 1 - accuracy_score(y, yp))

Akurasi = 0.6666666666666666
Error Rate = 0.33333333333333337


- #### __Recall__

In [16]:
from sklearn.metrics import recall_score

print('Recall(+) =', recall_score(y, yp, pos_label='Yes')) # TP Rate / Recall positif
print('Recall(-) =', recall_score(y, yp, pos_label='No'))  # TN Rate / Recall Negatif
# pos_label = positif label

Recall(+) = 1.0
Recall(-) = 0.3333333333333333


- #### __Precision__

In [17]:
from sklearn.metrics import precision_score

print('Precision(+) =', precision_score(y, yp, pos_label='Yes')) # Precision positif
print('Precision(-) =', precision_score(y, yp, pos_label='No'))  # Precision Negatif

Precision(+) = 0.6
Precision(-) = 1.0


- #### __F1 Score__

In [18]:
from sklearn.metrics import f1_score

print('F1 Score (+) =', f1_score(y, yp, pos_label='Yes'))
# 2 * ((precision(+) * recall(+)) / (precision(+) + recall(+)))

print('F1 Score (-) =', f1_score(y, yp, pos_label='No'))
# 2 * ((precision(-) * recall(-)) / (precision(-) + recall(-)))

F1 Score (+) = 0.7499999999999999
F1 Score (-) = 0.5


- #### __Balanced accuracy__

In [19]:
from sklearn.metrics import balanced_accuracy_score

print('Balanced acc =', balanced_accuracy_score(y, yp))

Balanced acc = 0.6666666666666666


<hr>

### 5. Evaluation Metrics from precision_recall_fscore_support

In [20]:
from sklearn.metrics import precision_recall_fscore_support
precision, recall, fscore, support = precision_recall_fscore_support(y, yp)
# SUPPORT is the number of occurrences of each class in y_true

print(precision_recall_fscore_support(y, yp))
print()
print('Precision ["No", "Yes"] =', precision)
print('Recall    ["No", "Yes"] =', recall)
print('Fscore    ["No", "Yes"] =', fscore)
print('Support   ["No", "Yes"] =', support)

(array([1. , 0.6]), array([0.33333333, 1.        ]), array([0.5 , 0.75]), array([3, 3], dtype=int64))

Precision ["No", "Yes"] = [1.  0.6]
Recall    ["No", "Yes"] = [0.33333333 1.        ]
Fscore    ["No", "Yes"] = [0.5  0.75]
Support   ["No", "Yes"] = [3 3]


In [21]:
print(precision_recall_fscore_support(y, yp, average='micro'))
print(precision_recall_fscore_support(y, yp, average='macro'))
print(precision_recall_fscore_support(y, yp, average='weighted'))

(0.6666666666666666, 0.6666666666666666, 0.6666666666666666, None)
(0.8, 0.6666666666666666, 0.625, None)
(0.7999999999999999, 0.6666666666666666, 0.6249999999999999, None)


In [23]:
# Micro average vs macro average vs weighted average
# tp, fp, fn, tn
# (3, 2, 0, 1)

precisionP = tp / (tp + fp)
precisionN = tn / (tn + fn)

preMicro = (tp + tn) / (tp + fp + tn + fn)
preMacro = (precisionP + precisionN) / 2
preWeighted = ((1 * precisionP) + (1 * precisionN)) / (1 + 1)

print(preMicro)
print(preMacro)
print(preWeighted)

0.6666666666666666
0.8
0.8


<hr>

### 6. Evaluation Metrics from Classification Report

In [24]:
from sklearn.metrics import classification_report

print(classification_report(y, yp))
# SUPPORT is the number of occurrences of each class in y_true

              precision    recall  f1-score   support

          No       1.00      0.33      0.50         3
         Yes       0.60      1.00      0.75         3

    accuracy                           0.67         6
   macro avg       0.80      0.67      0.62         6
weighted avg       0.80      0.67      0.62         6

