# Evaluation Lesson

In [1]:
#Imports
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
#Build the mock dataset as one (prediction, actual label) pair per observation
glasses = pd.DataFrame(
    [
        ('no', 'no'),
        ('no', 'yes'),
        ('yes', 'yes'),
        ('no', 'no'),
        ('no', 'no'),
        ('yes', 'no'),
        ('yes', 'yes'),
        ('no', 'no'),
        ('yes', 'no'),
        ('yes', 'yes'),
    ],
    columns=['preds', 'actual'],
)

## Confusion matrix

In [3]:
#Display the whole frame (it only has 10 rows) to confirm that the
#'preds' and 'actual' columns were built as expected
glasses

Unnamed: 0,preds,actual
0,no,no
1,no,yes
2,yes,yes
3,no,no
4,no,no
5,yes,no
6,yes,yes
7,no,no
8,yes,no
9,yes,yes


In [4]:
#Cross-tabulate with pandas: predictions index the rows and
#actual labels index the columns
pd.crosstab(index=glasses['preds'], columns=glasses['actual'])

actual,no,yes
preds,Unnamed: 1_level_1,Unnamed: 2_level_1
no,4,1
yes,2,3


In [5]:
#Generate confusion matrix using sklearn function
#sklearn's signature is confusion_matrix(y_true, y_pred): rows are the actual
#labels and columns are the predicted labels, so actual labels go first.
#Laid out this way the result is the transpose of the crosstab above,
#which put predictions on the rows.
confusion_matrix(glasses['actual'], glasses['preds'])

array([[4, 2],
       [1, 3]])

## Baseline

In [6]:
#Tally the actual labels, most frequent first; the top entry is the
#label a baseline model would always predict
glasses['actual'].value_counts(sort=True, ascending=False)

no     6
yes    4
Name: actual, dtype: int64

In [7]:
#Most prevalent label becomes baseline
#Derive it from the data instead of hard-coding it, so the baseline stays
#correct if the mock labels are edited; idxmax() on the value counts
#returns the most frequent label ('no' for this data)
glasses['base'] = glasses['actual'].value_counts().idxmax()
glasses

Unnamed: 0,preds,actual,base
0,no,no,no
1,no,yes,no
2,yes,yes,no
3,no,no,no
4,no,no,no
5,yes,no,no
6,yes,yes,no
7,no,no,no
8,yes,no,no
9,yes,yes,no


## Accuracy

The number of correct predictions divided by the total number of predictions: (TP + TN) / (TP + TN + FP + FN)

In [8]:
#Row-by-row check: True where the baseline guess equals the actual label
glasses['base'].eq(glasses['actual'])

0     True
1    False
2    False
3     True
4     True
5     True
6    False
7     True
8     True
9    False
dtype: bool

In [9]:
#Baseline accuracy: the mean of a boolean Series is the fraction of True
#values, i.e. the share of rows the baseline gets right
baseline_acc = glasses['base'].eq(glasses['actual']).mean()

In [10]:
#Express the baseline accuracy as a percentage
100 * baseline_acc

60.0

In [11]:
#Same accuracy calculation for the model's predictions, as a percentage
100 * glasses['preds'].eq(glasses['actual']).mean()

70.0

## Precision

Of all the times I guess the positive case, how many times am I correct?

TP / (TP + FP)

In [12]:
#Boolean mask: True on every row where the model predicted the positive class
subset = glasses['preds'].eq('yes')
subset

0    False
1    False
2     True
3    False
4    False
5     True
6     True
7    False
8     True
9     True
Name: preds, dtype: bool

In [13]:
#Keep only the rows flagged by the mask (all positive predictions)
prec_df = glasses.loc[subset]

In [14]:
#Among the positive predictions, True marks a true positive and
#False marks a false positive
prec_df['preds'].eq(prec_df['actual'])

2     True
5    False
6     True
8    False
9     True
dtype: bool

In [15]:
#Precision: share of positive predictions that were actually positive
prec_df['preds'].eq(prec_df['actual']).mean()

0.6

## Recall

Of all actual positive cases, how many did I correctly identify?

TP / (TP + FN)

In [16]:
#Boolean mask: True on every row whose actual label is the positive class
subset2 = glasses['actual'].eq('yes')
subset2

0    False
1     True
2     True
3    False
4    False
5    False
6     True
7    False
8    False
9     True
Name: actual, dtype: bool

In [17]:
#Keep only the rows flagged by the mask (all actual positives)
recall_df = glasses.loc[subset2]

In [18]:
#Among the actual positives, True marks a true positive and
#False marks a false negative
recall_df['preds'].eq(recall_df['actual'])

1    False
2     True
6     True
9     True
dtype: bool

In [19]:
#Recall: share of actual positives that the model identified
recall_df['preds'].eq(recall_df['actual']).mean()

0.75

## Other metrics

Misclassification rate

1 - accuracy


Sensitivity

Detection of the positive class, aka recall


Specificity

Detection of the negative class, i.e. recall for the negative class: TN / (TN + FP)


False positive rate

The proportion of actual negatives that the model incorrectly labels as positive: FP / (FP + TN), which equals 1 - specificity


F1 score

Harmonic mean of precision and recall: 2 * (precision * recall) / (precision + recall)


AUC-ROC

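Area under the ROC curve, which plots the true positive rate against the false positive rate across classification thresholds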

## Multiclass classification

Multiclass classification will repeat the calculations for precision and recall, treating each possible outcome as the positive case once. These results can be averaged (simple average, or weighted average factoring in support) to evaluate the performance of the model overall.

In [20]:
#Per-class precision, recall, f1-score and support, plus overall averages
#(classification_report returns a formatted string, so print it)
print(classification_report(y_true=glasses['actual'], y_pred=glasses['preds']))

              precision    recall  f1-score   support

          no       0.80      0.67      0.73         6
         yes       0.60      0.75      0.67         4

    accuracy                           0.70        10
   macro avg       0.70      0.71      0.70        10
weighted avg       0.72      0.70      0.70        10

