# Model Evaluation Exercises

In [8]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

#### 1. Given the following confusion matrix, evaluate (by hand) the model's performance.


|               | pred dog   | pred cat   |
|:------------  |-----------:|-----------:|
| actual dog    |         46 |         7  |
| actual cat    |         13 |         34 |

- In the context of this problem, what is a false positive?
- In the context of this problem, what is a false negative?
- How would you describe this model?

##### False Positive:
A false positive would be it predicted a dog but it was actually a cat (count = 13)

##### False Negative:
A false negative would be that it predicted a cat but it was actuall a dog (count = 7)

##### Model Description:
Accuracy = (46 + 34) / (46 + 7 + 13 + 34) = 80 / 100 = 0.8

Precision = 46 / (46 + 13) = 46 / 59 = 0.78

Recall = 46 / (46 + 7) = 46 / 53 = 0.87

---

#### 2. You are working as a datascientist working for Codeup Cody Creator (C3 for short), a rubber-duck manufacturing plant.

Unfortunately, some of the rubber ducks that are produced will have defects. Your team has built several models that try to predict those defects, and the data from their predictions can be found here.

Use the predictions dataset and pandas to help answer the following questions:

- An internal team wants to investigate the cause of the manufacturing defects. They tell you that they want to identify as many of the ducks that have a defect as possible. Which evaluation metric would be appropriate here? Which model would be the best fit for this use case?
- Recently several stories in the local news have come out highlighting customers who received a rubber duck with a defect, and portraying C3 in a bad light. The PR team has decided to launch a program that gives customers with a defective duck a vacation to Hawaii. They need you to predict which ducks will have defects, but tell you the really don't want to accidentally give out a vacation package when the duck really doesn't have a defect. Which evaluation metric would be appropriate here? Which model would be the best fit for this use case?

In [4]:
c3_df = pd.read_csv('c3.csv')

In [6]:
c3_df.head()

Unnamed: 0,actual,model1,model2,model3
0,No Defect,No Defect,Defect,No Defect
1,No Defect,No Defect,Defect,Defect
2,No Defect,No Defect,Defect,No Defect
3,No Defect,Defect,Defect,Defect
4,No Defect,No Defect,Defect,No Defect


In [22]:
c3_df.actual.value_counts()

No Defect    184
Defect        16
Name: actual, dtype: int64

In [7]:
#model 1 evaluation
# - positive = has a defect

#create a crosstab of the actual against model1
m1 = pd.crosstab(c3_df.model1, c3_df.actual)
m1

actual,Defect,No Defect
model1,Unnamed: 1_level_1,Unnamed: 2_level_1
Defect,8,2
No Defect,8,182


In [12]:
m1_acc = (c3_df.model1 == c3_df.actual).mean()
subset1 = c3_df[c3_df.model1 == 'Defect']
m1_pre = (subset1.model1 == subset1.actual).mean()
subset2 = c3_df[c3_df.actual == 'Defect']
m1_rec = (subset2.model1 == subset2.actual).mean()

print(f'model 1 accuracy: {m1_acc:.2%}')
print('-----')
print(f'model 1 recall: {m1_rec:.2%}')
print('-----')
print(f'model 1 precision: {m1_pre:.2%}')

model 1 accuracy: 95.00%
-----
model 1 recall: 50.00%
-----
model 1 precision: 80.00%


In [13]:
#model 2 evaluation
# - positive = has a defect

#create a crosstab of the actual against model2
m2 = pd.crosstab(c3_df.model2, c3_df.actual)
m2

actual,Defect,No Defect
model2,Unnamed: 1_level_1,Unnamed: 2_level_1
Defect,9,81
No Defect,7,103


In [14]:
m2_acc = (c3_df.model2 == c3_df.actual).mean()
subset3 = c3_df[c3_df.model2 == 'Defect']
m2_pre = (subset3.model2 == subset3.actual).mean()
subset4 = c3_df[c3_df.actual == 'Defect']
m2_rec = (subset4.model2 == subset4.actual).mean()

print(f'model 2 accuracy: {m2_acc:.2%}')
print('-----')
print(f'model 2 recall: {m2_rec:.2%}')
print('-----')
print(f'model 2 precision: {m2_pre:.2%}')

model 2 accuracy: 56.00%
-----
model 2 recall: 56.25%
-----
model 2 precision: 10.00%


In [15]:
#model 3 evaluation
# - positive = has a defect

#create a crosstab of the actual against model3
m3 = pd.crosstab(c3_df.model3, c3_df.actual)
m3

actual,Defect,No Defect
model3,Unnamed: 1_level_1,Unnamed: 2_level_1
Defect,13,86
No Defect,3,98


In [16]:
m3_acc = (c3_df.model3 == c3_df.actual).mean()
subset5 = c3_df[c3_df.model3 == 'Defect']
m3_pre = (subset5.model3 == subset5.actual).mean()
subset6 = c3_df[c3_df.actual == 'Defect']
m3_rec = (subset6.model3 == subset6.actual).mean()

print(f'model 3 accuracy: {m3_acc:.2%}')
print('-----')
print(f'model 3 recall: {m3_rec:.2%}')
print('-----')
print(f'model 3 precision: {m3_pre:.2%}')

model 3 accuracy: 55.50%
-----
model 3 recall: 81.25%
-----
model 3 precision: 13.13%


In [18]:
#add a baseline column to df to make comparisons
#baseline value = "No Defect"
c3_df['baseline'] = 'No Defect'
c3_df.head()

Unnamed: 0,actual,model1,model2,model3,baseline
0,No Defect,No Defect,Defect,No Defect,No Defect
1,No Defect,No Defect,Defect,Defect,No Defect
2,No Defect,No Defect,Defect,No Defect,No Defect
3,No Defect,Defect,Defect,Defect,No Defect
4,No Defect,No Defect,Defect,No Defect,No Defect


In [19]:
base_acc = (c3_df.baseline == c3_df.actual).mean()
subset7 = c3_df[c3_df.baseline == 'Defect']
base_pre = (subset7.baseline == subset7.actual).mean()
subset8 = c3_df[c3_df.actual == 'Defect']
base_rec = (subset8.baseline == subset8.actual).mean()

print(f'baseline accuracy: {base_acc:.2%}')
print('-----')
print(f'baseline recall: {base_rec:.2%}')
print('-----')
print(f'baseline precision: {base_pre:.2%}')

baseline accuracy: 92.00%
-----
baseline recall: 0.00%
-----
baseline precision: nan%


In [20]:
#Summary for all three models together
#baseline = No Defect
print(f'baseline accuracy: {base_acc:.2%}')
print(f'model 1 accuracy: {m1_acc:.2%}')
print(f'model 2 accuracy: {m2_acc:.2%}')
print(f'model 3 accuracy: {m3_acc:.2%}')
print('-----')
print(f'baseline recall: {base_rec:.2%}')
print(f'model 1 recall: {m1_rec:.2%}')
print(f'model 2 recall: {m2_rec:.2%}')
print(f'model 3 recall: {m3_rec:.2%}')
print('-----')
print(f'baseline precision: {base_pre:.2%}')
print(f'model 1 precision: {m1_pre:.2%}')
print(f'model 2 precision: {m2_pre:.2%}')
print(f'model 3 precision: {m3_pre:.2%}')

baseline accuracy: 92.00%
model 1 accuracy: 95.00%
model 2 accuracy: 56.00%
model 3 accuracy: 55.50%
-----
baseline recall: 0.00%
model 1 recall: 50.00%
model 2 recall: 56.25%
model 3 recall: 81.25%
-----
baseline precision: nan%
model 1 precision: 80.00%
model 2 precision: 10.00%
model 3 precision: 13.13%


##### Which model would be the best to detect the most defects possible?
- Model 3 has the highest recall value meaning it would have the least chance of making a false negative prediction and would catch the most ducks with a defect

##### Which model would have the least chance of a false positive and thus giving a free vacation to someone with a duck that does not have a defect
- Model 1 has the highest precision value meaning it would have the least chance of making a false positive prediction

---

#### 3. You are working as a data scientist for Gives You Paws ™, a subscription based service that shows you cute pictures of dogs or cats (or both for an additional fee).

At Gives You Paws, anyone can upload pictures of their cats or dogs. The photos are then put through a two step process. First an automated algorithm tags pictures as either a cat or a dog (Phase I). Next, the photos that have been initially identified are put through another round of review, possibly with some human oversight, before being presented to the users (Phase II).

Several models have already been developed with the data, and you can find their results here.

Given this dataset, use pandas to create a baseline model (i.e. a model that just predicts the most common class) and answer the following questions:

- In terms of accuracy, how do the various models compare to the baseline model? Are any of the models better than the baseline?
- Suppose you are working on a team that solely deals with dog pictures. Which of these models would you recomend for Phase I? For Phase II?
- Suppose you are working on a team that solely deals with cat pictures. Which of these models would you recomend for - Phase I? For Phase II?

In [2]:
paws = pd.read_csv('gives_you_paws.csv')
paws.head()

Unnamed: 0,actual,model1,model2,model3,model4
0,cat,cat,dog,cat,dog
1,dog,dog,cat,cat,dog
2,dog,cat,cat,cat,dog
3,dog,dog,dog,cat,dog
4,cat,cat,cat,dog,dog


In [24]:
paws.actual.value_counts()

dog    3254
cat    1746
Name: actual, dtype: int64

In [25]:
#model 1 evaluation
# - positive = is dog

#create a crosstab of the actual against model1
pm1 = pd.crosstab(paws.model1, paws.actual)
pm1

actual,cat,dog
model1,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,1423,640
dog,323,2614


In [26]:
pm1_acc = (paws.model1 == paws.actual).mean()
p_subset1 = paws[paws.model1 == 'dog']
pm1_pre = (p_subset1.model1 == p_subset1.actual).mean()
p_subset2 = paws[paws.actual == 'dog']
pm1_rec = (p_subset2.model1 == p_subset2.actual).mean()

print(f'Paws model 1 accuracy: {pm1_acc:.2%}')
print('-----')
print(f'Paws model 1 recall: {pm1_rec:.2%}')
print('-----')
print(f'Paws model 1 precision: {pm1_pre:.2%}')

Paws model 1 accuracy: 80.74%
-----
Paws model 1 recall: 80.33%
-----
Paws model 1 precision: 89.00%


In [27]:
#model 2 evaluation
# - positive = is dog

#create a crosstab of the actual against model2
pm2 = pd.crosstab(paws.model2, paws.actual)
pm2

actual,cat,dog
model2,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,1555,1657
dog,191,1597


In [28]:
pm2_acc = (paws.model2 == paws.actual).mean()
p_subset3 = paws[paws.model2 == 'dog']
pm2_pre = (p_subset3.model2 == p_subset3.actual).mean()
p_subset4 = paws[paws.actual == 'dog']
pm2_rec = (p_subset4.model2 == p_subset4.actual).mean()

print(f'Paws model 2 accuracy: {pm2_acc:.2%}')
print('-----')
print(f'Paws model 2 recall: {pm2_rec:.2%}')
print('-----')
print(f'Paws model 2 precision: {pm2_pre:.2%}')

Paws model 2 accuracy: 63.04%
-----
Paws model 2 recall: 49.08%
-----
Paws model 2 precision: 89.32%


In [29]:
#model 3 evaluation
# - positive = is dog

#create a crosstab of the actual against model3
pm3 = pd.crosstab(paws.model3, paws.actual)
pm3

actual,cat,dog
model3,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,893,1599
dog,853,1655


In [30]:
pm3_acc = (paws.model3 == paws.actual).mean()
p_subset5 = paws[paws.model3 == 'dog']
pm3_pre = (p_subset5.model3 == p_subset5.actual).mean()
p_subset6 = paws[paws.actual == 'dog']
pm3_rec = (p_subset6.model3 == p_subset6.actual).mean()

print(f'Paws model 3 accuracy: {pm3_acc:.2%}')
print('-----')
print(f'Paws model 3 recall: {pm3_rec:.2%}')
print('-----')
print(f'Paws model 3 precision: {pm3_pre:.2%}')

Paws model 3 accuracy: 50.96%
-----
Paws model 3 recall: 50.86%
-----
Paws model 3 precision: 65.99%


In [31]:
#model 4 evaluation
# - positive = is dog

#create a crosstab of the actual against model3
pm4 = pd.crosstab(paws.model4, paws.actual)
pm4

actual,cat,dog
model4,Unnamed: 1_level_1,Unnamed: 2_level_1
cat,603,144
dog,1143,3110


In [32]:
pm4_acc = (paws.model4 == paws.actual).mean()
p_subset7 = paws[paws.model4 == 'dog']
pm4_pre = (p_subset7.model4 == p_subset7.actual).mean()
p_subset8 = paws[paws.actual == 'dog']
pm4_rec = (p_subset8.model4 == p_subset8.actual).mean()

print(f'Paws model 4 accuracy: {pm4_acc:.2%}')
print('-----')
print(f'Paws model 4 recall: {pm4_rec:.2%}')
print('-----')
print(f'Paws model 4 precision: {pm4_pre:.2%}')

Paws model 4 accuracy: 74.26%
-----
Paws model 4 recall: 95.57%
-----
Paws model 4 precision: 73.12%


In [5]:
#add a baseline column with the value of 'dog'
paws['baseline'] = 'dog'
paws.head()

Unnamed: 0,actual,model1,model2,model3,model4,baseline
0,cat,cat,dog,cat,dog,dog
1,dog,dog,cat,cat,dog,dog
2,dog,cat,cat,cat,dog,dog
3,dog,dog,dog,cat,dog,dog
4,cat,cat,cat,dog,dog,dog


In [34]:
p_base_acc = (paws.baseline == paws.actual).mean()
p_subset9 = paws[paws.baseline == 'dog']
p_base_pre = (p_subset9.baseline == p_subset9.actual).mean()
p_subset10 = paws[paws.actual == 'dog']
p_base_rec = (p_subset10.baseline == p_subset10.actual).mean()

print(f'Paws baseline accuracy: {p_base_acc:.2%}')
print('-----')
print(f'Paws baseline recall: {p_base_rec:.2%}')
print('-----')
print(f'Paws baseline precision: {p_base_pre:.2%}')

Paws baseline accuracy: 65.08%
-----
Paws baseline recall: 100.00%
-----
Paws baseline precision: 65.08%


In [36]:
#summary of all model evaluations with a prediction of 'dog'
print('Positive = dog')
print('=====')
print(f'Paws baseline accuracy: {p_base_acc:.2%}')
print(f'Paws model 1 accuracy: {pm1_acc:.2%}')
print(f'Paws model 2 accuracy: {pm2_acc:.2%}')
print(f'Paws model 3 accuracy: {pm3_acc:.2%}')
print(f'Paws model 4 accuracy: {pm4_acc:.2%}')
print('-----')
print(f'Paws baseline recall: {p_base_rec:.2%}')
print(f'Paws model 1 recall: {pm1_rec:.2%}')
print(f'Paws model 2 recall: {pm2_rec:.2%}')
print(f'Paws model 3 recall: {pm3_rec:.2%}')
print(f'Paws model 4 recall: {pm4_rec:.2%}')
print('-----')
print(f'Paws baseline precision: {p_base_pre:.2%}')
print(f'Paws model 1 precision: {pm1_pre:.2%}')
print(f'Paws model 2 precision: {pm2_pre:.2%}')
print(f'Paws model 3 precision: {pm3_pre:.2%}')
print(f'Paws model 4 precision: {pm4_pre:.2%}')

Positive = dog
=====
Paws baseline accuracy: 65.08%
Paws model 1 accuracy: 80.74%
Paws model 2 accuracy: 63.04%
Paws model 3 accuracy: 50.96%
Paws model 4 accuracy: 74.26%
-----
Paws baseline recall: 100.00%
Paws model 1 recall: 80.33%
Paws model 2 recall: 49.08%
Paws model 3 recall: 50.86%
Paws model 4 recall: 95.57%
-----
Paws baseline precision: 65.08%
Paws model 1 precision: 89.00%
Paws model 2 precision: 89.32%
Paws model 3 precision: 65.99%
Paws model 4 precision: 73.12%


In [7]:
#put it all together and make an option to enter the 'positive' variable value

positive = 'cat'
#evaluate baseline
p_base_acc = (paws.baseline == paws.actual).mean()
p_subset9 = paws[paws.baseline == positive]
p_base_pre = (p_subset9.baseline == p_subset9.actual).mean()
p_subset10 = paws[paws.actual == positive]
p_base_rec = (p_subset10.baseline == p_subset10.actual).mean()
#evaluate model 1
pm1_acc = (paws.model1 == paws.actual).mean()
p_subset1 = paws[paws.model1 == positive]
pm1_pre = (p_subset1.model1 == p_subset1.actual).mean()
p_subset2 = paws[paws.actual == positive]
pm1_rec = (p_subset2.model1 == p_subset2.actual).mean()
#evaluate model 2
pm2_acc = (paws.model2 == paws.actual).mean()
p_subset3 = paws[paws.model2 == positive]
pm2_pre = (p_subset3.model2 == p_subset3.actual).mean()
p_subset4 = paws[paws.actual == positive]
pm2_rec = (p_subset4.model2 == p_subset4.actual).mean()
#evaluate model 3
pm3_acc = (paws.model3 == paws.actual).mean()
p_subset5 = paws[paws.model3 == positive]
pm3_pre = (p_subset5.model3 == p_subset5.actual).mean()
p_subset6 = paws[paws.actual == positive]
pm3_rec = (p_subset6.model3 == p_subset6.actual).mean()
#evaluate model 4
pm4_acc = (paws.model4 == paws.actual).mean()
p_subset7 = paws[paws.model4 == positive]
pm4_pre = (p_subset7.model4 == p_subset7.actual).mean()
p_subset8 = paws[paws.actual == positive]
pm4_rec = (p_subset8.model4 == p_subset8.actual).mean()

#print the evaluation results
print(f'positive: {positive}')
print('=====')
print(f'Paws baseline accuracy: {p_base_acc:.2%}')
print(f'Paws model 1 accuracy: {pm1_acc:.2%}')
print(f'Paws model 2 accuracy: {pm2_acc:.2%}')
print(f'Paws model 3 accuracy: {pm3_acc:.2%}')
print(f'Paws model 4 accuracy: {pm4_acc:.2%}')
print('-----')
print(f'Paws baseline recall: {p_base_rec:.2%}')
print(f'Paws model 1 recall: {pm1_rec:.2%}')
print(f'Paws model 2 recall: {pm2_rec:.2%}')
print(f'Paws model 3 recall: {pm3_rec:.2%}')
print(f'Paws model 4 recall: {pm4_rec:.2%}')
print('-----')
print(f'Paws baseline precision: {p_base_pre:.2%}')
print(f'Paws model 1 precision: {pm1_pre:.2%}')
print(f'Paws model 2 precision: {pm2_pre:.2%}')
print(f'Paws model 3 precision: {pm3_pre:.2%}')
print(f'Paws model 4 precision: {pm4_pre:.2%}')

positive: cat
=====
Paws baseline accuracy: 65.08%
Paws model 1 accuracy: 80.74%
Paws model 2 accuracy: 63.04%
Paws model 3 accuracy: 50.96%
Paws model 4 accuracy: 74.26%
-----
Paws baseline recall: 0.00%
Paws model 1 recall: 81.50%
Paws model 2 recall: 89.06%
Paws model 3 recall: 51.15%
Paws model 4 recall: 34.54%
-----
Paws baseline precision: nan%
Paws model 1 precision: 68.98%
Paws model 2 precision: 48.41%
Paws model 3 precision: 35.83%
Paws model 4 precision: 80.72%


##### Conceptual question answers:
In terms of accuracy, how do the various models compare to the baseline model? Are any of the models better than the baseline?
- model 1 and 4 have better accuracy than baseline 

Suppose you are working on a team that solely deals with dog pictures. Which of these models would you recomend for Phase I? For Phase II?
- phase 1 should prioritize reducing false negatives and phase 2 should prioritize reducing false positives
- for dog pictures model 4 should be used for phase 1 because it has the highest recall value and model 2 should be used for phase 2 because it has the highest precision value

Suppose you are working on a team that solely deals with cat pictures. Which of these models would you recomend for - Phase I? For Phase II?
- phase 1 should prioritize reducing false negatives and phase 2 should prioritize reducing false positives
- for cat pictures model 2 should be used for phase 1 because it has the highest recall value and model 4 should be used for phase 2 because it has the highest precision value

#note - if we decided to remove models 2 and 3 because the accuracy was lower than baseline we would substitute model 1 for model 2 in the above responses.

---

#### 4. Follow the links below to read the documentation about each function, then apply those functions to the data from the previous problem.

- sklearn.metrics.accuracy_score
- sklearn.metrics.precision_score
- sklearn.metrics.recall_score
- sklearn.metrics.classification_report

In [15]:
from sklearn.metrics import accuracy_score
#sklearn.metrics.accuracy_score
paws_actual = paws.actual
paws_m1 = paws.model1
paws_m2 = paws.model2
paws_m3 = paws.model3
paws_m4 = paws.model4

print(f'paws model 1 accuracy: {accuracy_score(paws_actual, paws_m1): .2%}')
print(f'paws model 2 accuracy: {accuracy_score(paws_actual, paws_m2): .2%}')
print(f'paws model 3 accuracy: {accuracy_score(paws_actual, paws_m3): .2%}')
print(f'paws model 4 accuracy: {accuracy_score(paws_actual, paws_m4): .2%}')

paws model 1 accuracy:  80.74%
paws model 2 accuracy:  63.04%
paws model 3 accuracy:  50.96%
paws model 4 accuracy:  74.26%


In [26]:
from sklearn.metrics import precision_score
#sklearn.metrics.precision_score - using above created variables

positive = 'dog'

print(f'positive: {positive}')
print('=====')
print(f'paws model 1 precision: {precision_score(paws_actual, paws_m1, pos_label=positive): .2%}')
print(f'paws model 2 precision: {precision_score(paws_actual, paws_m2, pos_label=positive): .2%}')
print(f'paws model 3 precision: {precision_score(paws_actual, paws_m3, pos_label=positive): .2%}')
print(f'paws model 4 precision: {precision_score(paws_actual, paws_m4, pos_label=positive): .2%}')

positive: dog
=====
paws model 1 precision:  89.00%
paws model 2 precision:  89.32%
paws model 3 precision:  65.99%
paws model 4 precision:  73.12%


In [27]:
from sklearn.metrics import recall_score
#sklearn.metrics.recall_score

positive = 'dog'

print(f'positive: {positive}')
print('=====')
print(f'paws model 1 recall: {recall_score(paws_actual, paws_m1, pos_label=positive): .2%}')
print(f'paws model 2 recall: {recall_score(paws_actual, paws_m2, pos_label=positive): .2%}')
print(f'paws model 3 recall: {recall_score(paws_actual, paws_m3, pos_label=positive): .2%}')
print(f'paws model 4 recall: {recall_score(paws_actual, paws_m4, pos_label=positive): .2%}')

positive: dog
=====
paws model 1 recall:  80.33%
paws model 2 recall:  49.08%
paws model 3 recall:  50.86%
paws model 4 recall:  95.57%


In [28]:
from sklearn.metrics import classification_report
#sklearn.metrics.classification_report

#classification report for model 1
target_names = ['dog', 'cat']
print(classification_report(paws.actual, paws.model1, target_names=target_names))

              precision    recall  f1-score   support

         dog       0.69      0.82      0.75      1746
         cat       0.89      0.80      0.84      3254

    accuracy                           0.81      5000
   macro avg       0.79      0.81      0.80      5000
weighted avg       0.82      0.81      0.81      5000



In [29]:
#classification report for model 2
target_names = ['dog', 'cat']
print(classification_report(paws.actual, paws.model2, target_names=target_names))

              precision    recall  f1-score   support

         dog       0.48      0.89      0.63      1746
         cat       0.89      0.49      0.63      3254

    accuracy                           0.63      5000
   macro avg       0.69      0.69      0.63      5000
weighted avg       0.75      0.63      0.63      5000



In [30]:
#classification report for model 3
target_names = ['dog', 'cat']
print(classification_report(paws.actual, paws.model3, target_names=target_names))

              precision    recall  f1-score   support

         dog       0.36      0.51      0.42      1746
         cat       0.66      0.51      0.57      3254

    accuracy                           0.51      5000
   macro avg       0.51      0.51      0.50      5000
weighted avg       0.55      0.51      0.52      5000



In [31]:
#classification report for model 4
target_names = ['dog', 'cat']
print(classification_report(paws.actual, paws.model4, target_names=target_names))

              precision    recall  f1-score   support

         dog       0.81      0.35      0.48      1746
         cat       0.73      0.96      0.83      3254

    accuracy                           0.74      5000
   macro avg       0.77      0.65      0.66      5000
weighted avg       0.76      0.74      0.71      5000

