In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import metrics 
import sys
import os
import time
import concurrent.futures
from functools import partial
from subprocess import check_output

# Step the working directory up one level so that the relative data path used
# later in this notebook ('well_data/src_data.csv') is resolved from there.
# NOTE(review): this is not idempotent -- re-running this cell without a
# kernel restart climbs one more directory each time and breaks relative paths.
os.chdir('..')

In [2]:
def gen_labels(df, threshold=10):
    """Attach a binary 'Label' column derived from the 'Arsenic' column.

    A row is labelled 'positive' when its 'Arsenic' value is strictly greater
    than ``threshold`` and 'negative' otherwise. Missing values never compare
    greater than the threshold, so they are labelled 'negative' as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an 'Arsenic' column. It is modified in place: a
        'Label' column is added (or overwritten if it already exists).
    threshold : number, optional
        Cutoff that 'Arsenic' must exceed for a row to be 'positive'.
        Defaults to 10, the value previously hard-coded here.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, now carrying 'Label'.
    """
    df['Label'] = np.where(df['Arsenic'] > threshold, 'positive', 'negative')
    return df

In [12]:
# Load the source data and attach the 'positive'/'negative' label column.
src = gen_labels(pd.read_csv('well_data/src_data.csv'))

# Count each class once. Using .get() with a default of 0 keeps this cell
# working when one of the two classes does not occur in the data, where
# attribute access (.positive / .negative) would raise AttributeError.
label_counts = src['Label'].value_counts()
positive_datapoints = label_counts.get('positive', 0)
negative_datapoints = label_counts.get('negative', 0)
datapoints = len(src)

# Share of rows labelled 'positive'; undefined (NaN) for an empty frame.
prevalence = positive_datapoints / datapoints if datapoints else float('nan')

print(f'positive datapoints: {positive_datapoints}')
print(f'negative datapoints: {negative_datapoints}')
print(f'total datapoints:    {datapoints}')
print(f'prevalence:          {prevalence}')

positive datapoints: 393496
negative datapoints: 475182
total datapoints:    868678
prevalence:          0.4529825781244604


In [60]:
def magic_model(df, accuracy):
    """Simulate a classifier of a given accuracy and print its confusion counts.

    A uniform random number in [0, 1) is drawn for every row. Rows whose
    number is <= ``accuracy`` get a prediction equal to their label; all other
    rows get the opposite label. The four confusion-matrix counts and the
    "net benefit" (correct predictions minus incorrect predictions) are then
    printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Label' column holding 'positive' / 'negative'. It is
        modified in place: a 'Prediction' column is added or overwritten.
    accuracy : float
        Threshold compared against the random draws; the fraction of rows
        whose prediction is kept equal to the label is approximately this
        value.

    Returns
    -------
    None
        Results are reported via ``print`` only.
    """
    # Fixed seed so the same rows are flipped on every call. Note that this
    # reseeds NumPy's global random state as a side effect.
    np.random.seed(0)
    random_numbers = np.random.rand(len(df))

    # The label each row would receive if the simulated model gets it wrong.
    inverted_label = np.where(df['Label'] == 'positive', 'negative', 'positive')

    # Draw at or below accuracy: keep the label. Draw above accuracy: invert it.
    df['Prediction'] = np.where(random_numbers <= accuracy, df['Label'], inverted_label)

    true_positive_count  = ((df['Label'] == 'positive') & (df['Prediction'] == 'positive')).sum()
    false_negative_count = ((df['Label'] == 'positive') & (df['Prediction'] == 'negative')).sum()
    true_negative_count  = ((df['Label'] == 'negative') & (df['Prediction'] == 'negative')).sum()
    false_positive_count = ((df['Label'] == 'negative') & (df['Prediction'] == 'positive')).sum()

    print(f'true positive count   : {true_positive_count}')
    print(f'false negative count  : {false_negative_count}')
    print(f'true negative count   : {true_negative_count}')
    print(f'false positive count  : {false_positive_count}')

    # Parenthesised so the expression can span two lines; a bare line break
    # after '=' is a SyntaxError.
    net_benefit = (
        (true_positive_count + true_negative_count)
        - (false_positive_count + false_negative_count)
    )

    print('\n')
    print(f'net benefit           : {net_benefit}')

In [61]:
magic_model(src, 0.51)

true positive count   : 200319
false negative count  : 193177
true negative count   : 242052
false positive count  : 233130


net benefit           : 16064


In [62]:
magic_model(src, 0.837)

true positive count   : 329570
false negative count  : 63926
true negative count   : 397381
false positive count  : 77801


net benefit           : 585224
