In [1]:
import pandas as pd
import numpy as np
from scipy import stats

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# import splitting and imputing functions
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# import local files
import env
import acquire

import os

# turn off pink boxes: ignore every Python warning so they are not rendered in cell output
import warnings
warnings.filterwarnings('ignore')

# seed NumPy's global random number generator with a fixed value
np.random.seed(123)

# 2. Hand Compute Confusion Matrix
Given the following confusion matrix, evaluate (by hand) the model's performance.


|               | pred dog   | pred cat   |
|:------------  |-----------:|-----------:|
| actual dog    |         46 |         7  |
| actual cat    |         13 |         34 |

### Positive and Negative Values
- Positive: actually a dog
- Negative: actually a cat

### T/F // P/N
- TP: Predict dog, and it is actually a dog.
    - 46
- TN: Predict cat, and it is actually a cat.
    - 34
- <b>FP: Predict dog, but it is actually a cat.</b>
    - <b>13</b>
- <b>FN: Predict cat, but it is actually a dog.</b>
    - <b>7</b>
    
### How to describe the model
The model predicts whether an animal is a dog (the positive class) or a cat (the negative class); we are evaluating how often those predictions match the actual labels.

### Predicting Performance

In [69]:
# confusion matrix cells, read row by row from the table above
tp, fn = 46, 7      # actual dog row: predicted dog, predicted cat
fp, tn = 13, 34     # actual cat row: predicted dog, predicted cat

# for baseline, actual value totals (row sums of the matrix)
actual_dog = tp + fn
actual_cat = fp + tn

# total observations: every cell of the matrix added together
total_obs = actual_dog + actual_cat

In [12]:
# accuracy
#     share of all predictions (positive or negative) that were correct
#     (TP + TN) / (total_obs)

correct_predictions = tp + tn
accuracy = correct_predictions / total_obs
print(f'Accuracy: {(accuracy * 100)}%')

Accuracy: 80.0%


In [16]:
# precision
#     of everything predicted positive, the share that really was positive
#     a higher precision means fewer FP's
#     TP / (TP + FP)

predicted_positive = tp + fp
precision = tp / predicted_positive
print(f'Precision: {(precision * 100)}%')

Precision: 77.96610169491525%


In [18]:
# recall
#     of everything that really was positive, the share we predicted as positive
#     a higher recall means fewer FN's
#     TP / (TP + FN)

actual_positive = tp + fn
recall = tp / actual_positive
print(f'Recall: {(recall * 100)}%')

Recall: 86.79245283018868%


# 3. C3 Rubber Ducks
You are working as a data scientist for Codeup Cody Creator (C3 for short), a rubber-duck manufacturing plant.<p>

Unfortunately, some of the rubber ducks that are produced will have defects. Your team has built several models that try to predict those defects.<p>
-  <b>Positive: Rubber duck is actually defected.</b>
    - TP: We predicted a defect and the rubber duck was actually defected.
    - FP: We predicted a defect but the rubber duck was not actually defected.
- <b>Negative: Rubber duck is not actually defected.</b>
    - TN: We predicted the rubber duck was not defected and it was not actually defected.
    - FN: We predicted the rubber duck was not defected but it was actually defected.
- <b>Costs</b>
    - FP: wasted money/resources; throwing away a good duck
    - FN: sending out a defected duck; customer dissatisfaction, time and money to resolve, lost customer, reputation

In [28]:
# reading in the c3.csv file as DataFrame
# the displayed output shows an `actual` label column plus one prediction
# column per model (model1, model2, model3)

rubber_ducks_df = pd.read_csv('c3.csv')
rubber_ducks_df

Unnamed: 0,actual,model1,model2,model3
0,No Defect,No Defect,Defect,No Defect
1,No Defect,No Defect,Defect,Defect
2,No Defect,No Defect,Defect,No Defect
3,No Defect,Defect,Defect,Defect
4,No Defect,No Defect,Defect,No Defect
...,...,...,...,...
195,No Defect,No Defect,Defect,Defect
196,Defect,Defect,No Defect,No Defect
197,No Defect,No Defect,No Defect,No Defect
198,No Defect,No Defect,Defect,Defect


In [30]:
# creating column for baseline: a constant 'Defect' prediction on every row
# (this adds the column to rubber_ducks_df in place, then displays the frame)
# NOTE(review): the displayed rows of `actual` are mostly 'No Defect' -- if the baseline
# is meant to be the most common class, confirm that 'Defect' is the intended constant

rubber_ducks_df['baseline'] = 'Defect'
rubber_ducks_df

Unnamed: 0,actual,model1,model2,model3,baseline
0,No Defect,No Defect,Defect,No Defect,Defect
1,No Defect,No Defect,Defect,Defect,Defect
2,No Defect,No Defect,Defect,No Defect,Defect
3,No Defect,Defect,Defect,Defect,Defect
4,No Defect,No Defect,Defect,No Defect,Defect
...,...,...,...,...,...
195,No Defect,No Defect,Defect,Defect,Defect
196,Defect,Defect,No Defect,No Defect,Defect
197,No Defect,No Defect,No Defect,No Defect,Defect
198,No Defect,No Defect,Defect,Defect,Defect


In [1]:
# accuracy
#     share of rows where a prediction column agrees with the actual label

def prediction_accuracy(predictions_df, prediction_col):
    """Return the fraction of rows in which `prediction_col` equals `actual`."""
    matches = predictions_df['actual'] == predictions_df[prediction_col]
    return matches.mean()


model1_accuracy = prediction_accuracy(rubber_ducks_df, 'model1')
print(f'model1 Accuracy: {model1_accuracy}')

model2_accuracy = prediction_accuracy(rubber_ducks_df, 'model2')
print(f'model2 Accuracy: {model2_accuracy}')

model3_accuracy = prediction_accuracy(rubber_ducks_df, 'model3')
print(f'model3 Accuracy: {model3_accuracy}')

baseline_accuracy = prediction_accuracy(rubber_ducks_df, 'baseline')
print(f'baseline Accuracy: {baseline_accuracy}')

NameError: name 'rubber_ducks_df' is not defined

In [66]:
# precision
#     minimizing FP's
#     of the rows where a model predicted the positive class ('Defect'),
#     the share that are actually 'Defect'  -->  TP / (TP + FP)

# creating precision subsets, only looking at the observations/rows where each model
# made a positive prediction
subset1 = rubber_ducks_df[rubber_ducks_df.model1 == 'Defect']
subset2 = rubber_ducks_df[rubber_ducks_df.model2 == 'Defect']
subset3 = rubber_ducks_df[rubber_ducks_df.model3 == 'Defect']

# calculating the precision by comparing the actual and model predictions;
# inside each subset every prediction is 'Defect', so the match rate is the precision
model1_precision = (subset1.actual == subset1.model1).mean()
print(f'model1 Precision: {model1_precision}')

model2_precision = (subset2.actual == subset2.model2).mean()
print(f'model2 Precision: {model2_precision}')

model3_precision = (subset3.actual == subset3.model3).mean()
print(f'model3 Precision: {model3_precision}')

model1 Precision: 0.8
model1 Precision: 0.1
model1 Precision: 0.13131313131313133
