In [9]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [10]:
# Load the scored predictions. The preview below shows two leftover index
# columns ("Unnamed: 0.1", "Unnamed: 0") next to the `code`, `pred` and `prob`
# columns that the threshold analysis reads.
df = pd.read_csv(filepath_or_buffer='threshold_byerror_data.csv')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,code,pred,prob
0,8569,LAB RESULTS,LAB RESULTS,0.754411
1,4669,QUERY ON CURRENT APPOINTMENT,MEDICATION RELATED,0.356858
2,3916,NEW APPOINTMENT,NEW APPOINTMENT,0.824907
3,2586,MEDICATION RELATED,NEW APPOINTMENT,0.881867
4,2149,QUERIES FROM PHARMACY,REFILL,0.847968


In [11]:
# Given the error rate we are willing to accept, find the confidence threshold
# whose observed error is closest to it and report how much of the data would
# still be autocoded at that threshold.
def threshold_byerror(error_tolerance, code, pred, prob):
    """Select the confidence threshold whose error rate is closest to a target.

    Thresholds from 99% down to 50% (in 1% steps) are evaluated. For each one,
    only predictions with ``prob`` strictly greater than the threshold are
    kept, and two figures are computed on that subset: the error rate
    (100 - accuracy, in percent) and the autocoding rate (share of all rows
    that were kept, in percent).

    Parameters
    ----------
    error_tolerance : float
        Target error rate in percent (e.g. ``10`` for 10%).
    code : sequence
        True labels.
    pred : sequence
        Predicted labels, aligned with ``code``.
    prob : sequence of float
        Confidence of each prediction, compared against thresholds in [0.5, 0.99].

    Returns
    -------
    tuple of (float, float, float)
        ``(threshold, error, rate)``, all in percent and rounded to 2 decimals.
        If no prediction exceeds even the lowest threshold, nothing can be
        autocoded and ``(99.0, 0.0, 0.0)`` is returned.

    Raises
    ------
    ValueError
        If the inputs contain no rows.

    Notes
    -----
    Thresholds that keep no predictions are ignored during selection, since
    they have no measurable error. Ties on distance to ``error_tolerance`` are
    broken in favour of the higher autocoding rate, then the higher threshold,
    so the result is deterministic.
    """
    df = pd.DataFrame({'code': code, 'pred': pred, 'prob': prob})
    total = df.shape[0]
    if total == 0:
        raise ValueError("threshold_byerror requires at least one prediction")

    # Element-wise correctness computed once; the mean over any subset is its accuracy.
    is_correct = df['code'].to_numpy() == df['pred'].to_numpy()
    probs = df['prob'].to_numpy()

    # Work in whole percents so each threshold is exactly pct / 100 rather than
    # an accumulated-float value such as 0.8600000000000003.
    percents = list(range(99, 49, -1))

    rows = []
    for percent in percents:
        kept = probs > percent / 100
        n_kept = int(kept.sum())
        if n_kept == 0:
            # No predictions at this confidence: error is undefined, so this
            # threshold must not compete as if it had 0% error.
            continue
        accuracy = round(float(is_correct[kept].mean()) * 100, 2)
        rows.append({
            'confidence': float(percent),
            'error': 100 - accuracy,
            'rate': round((n_kept / total) * 100, 2),
        })

    if not rows:
        # Every probability is at or below the lowest threshold.
        return float(percents[0]), 0.0, 0.0

    report = pd.DataFrame(rows)
    report['distance'] = (report['error'] - error_tolerance).abs()
    best = report.sort_values(
        ['distance', 'rate', 'confidence'],
        ascending=[True, False, False],
    ).iloc[0]

    threshold_selected = float(best['confidence'])
    error_selected = round(float(best['error']), 2)
    rate_selected = round(float(best['rate']), 2)
    return threshold_selected, error_selected, rate_selected

In [14]:
# Look up the threshold for a 10% target error rate and summarise it in one line.
threshold_selected, error_selected, rate_selected = threshold_byerror(
    10, df.code, df.pred, df.prob
)
text = (
    f"Threshold: {threshold_selected!s}%. "
    f"Error: {error_selected!s}%. "
    f"Autocoding: {rate_selected!s}%."
)
print(text)

Threshold: 86.0%. Error: 9.48%. Autocoding: 23.2%.
