In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from joblib import dump

# Tasks
- Build final random forest models for each problem
- Save the models using `joblib`

In [2]:
def build_pipe():
    """Build an unfitted pipeline: one-hot encoding followed by a random forest.

    Every input column is one-hot encoded (dense output, first level of each
    feature dropped) and the result is fed to a ``RandomForestClassifier``
    seeded with ``random_state=42``.

    Returns:
        sklearn.pipeline.Pipeline: unfitted pipeline with steps ``'encoder'``
        and ``'model'``.
    """
    try:
        # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`;
        # the legacy `sparse` keyword was removed in 1.4.
        encoder = OneHotEncoder(sparse_output=False, drop='first')
    except TypeError:
        # Older scikit-learn releases only know the legacy keyword.
        encoder = OneHotEncoder(sparse=False, drop='first')
    model = RandomForestClassifier(random_state=42)
    pipe = Pipeline([('encoder', encoder),
                     ('model', model)])
    return pipe

## Claim Approval

In [3]:
# Training claims: the features are the 'bin' and 'drug' columns, the target
# is the 'pharmacy_claim_approved' column.
claims_df = pd.read_csv('../data/processed/dim_claims_train.csv')

claims_X, claims_y = claims_df[['bin', 'drug']], claims_df['pharmacy_claim_approved']

In [4]:
# Fit a fresh pipeline on the training claims, then persist the fitted
# pipeline (encoder + model) with joblib.
claims_final_pipe = build_pipe().fit(claims_X, claims_y)
dump(claims_final_pipe, r'./saved-models/random-forest-claim-approval.joblib')

['./saved-models/random-forest-claim-approval.joblib']

In [5]:
# Test claims, split into the same feature columns and target column that the
# claim-approval model was trained on.
claims_test_df = pd.read_csv('../data/processed/dim_claims_test.csv')

claims_X_test, claims_y_test = claims_test_df[['bin', 'drug']], claims_test_df['pharmacy_claim_approved']

In [6]:
# Score the fitted claim-approval pipeline on the test set.
# Predict once and reuse the result: every label-based metric needs the same
# predictions, so there is no reason to run the forest three times.
claims_test_pred = claims_final_pipe.predict(claims_X_test)
# Column 1 of predict_proba is the probability of the second class in
# `classes_` (presumably the "approved" label 1 -- TODO confirm).
claims_test_proba = claims_final_pipe.predict_proba(claims_X_test)[:, 1]

test_accuracy = accuracy_score(claims_y_test, claims_test_pred)
test_precision = precision_score(claims_y_test, claims_test_pred)
test_recall = recall_score(claims_y_test, claims_test_pred)
test_roc_auc = roc_auc_score(claims_y_test, claims_test_proba)

print(f'Test Accuracy = {round(test_accuracy, 2)}')
print(f'Test Precision = {round(test_precision, 2)}')
print(f'Test Recall = {round(test_recall, 2)}')
print(f'Test ROC AUC = {round(test_roc_auc, 2)}')

Test Accuracy = 0.94
Test Precision = 0.9
Test Recall = 1.0
Test ROC AUC = 0.92


## Reject Code

In [7]:
# Training data for the reject-code model.
# Missing values are replaced with 0 before 'reject_code' is cast to int
# (presumably the code is missing for approved claims -- TODO confirm).
# The cast goes through `.astype({...})` so the column really ends up integer:
# assigning via `df.loc[:, col] = ...` writes into the existing float column
# in place on pandas >= 2.0 and silently leaves the dtype as float.
reject_code_df = (
    pd.read_csv('../data/processed/dim_claims_train.csv')
    .fillna(0)
    .astype({'reject_code': int})
    # only the rejected claims should be examined when determining reject code
    .loc[lambda df: df['pharmacy_claim_approved'] == 0]
)

reject_code_X = reject_code_df[['bin', 'drug']]
reject_code_y = reject_code_df['reject_code']

In [8]:
# Fit a fresh pipeline on the rejected training claims, then persist the
# fitted pipeline (encoder + model) with joblib.
reject_code_final_pipe = build_pipe().fit(reject_code_X, reject_code_y)
dump(reject_code_final_pipe, r'./saved-models/random-forest-reject-code.joblib')

['./saved-models/random-forest-reject-code.joblib']

In [9]:
# Test data for the reject-code model.
# Missing values are replaced with 0 before 'reject_code' is cast to int.
# The cast goes through `.astype({...})` so the column really ends up integer:
# assigning via `df.loc[:, col] = ...` writes into the existing float column
# in place on pandas >= 2.0 and silently leaves the dtype as float.
reject_code_test_df = (
    pd.read_csv('../data/processed/dim_claims_test.csv')
    .fillna(0)
    .astype({'reject_code': int})
    # only the rejected claims should be examined when determining reject code
    .loc[lambda df: df['pharmacy_claim_approved'] == 0]
)

reject_code_X_test = reject_code_test_df[['bin', 'drug']]
reject_code_y_test = reject_code_test_df['reject_code']

In [10]:
# Score the fitted reject-code pipeline on the rejected test claims.
# Predict once and reuse the result: every label-based metric needs the same
# predictions, so there is no reason to run the forest three times.
reject_code_test_pred = reject_code_final_pipe.predict(reject_code_X_test)
# Full probability matrix (one column per class) for the multiclass ROC AUC.
reject_code_test_proba = reject_code_final_pipe.predict_proba(reject_code_X_test)

test_accuracy = accuracy_score(reject_code_y_test, reject_code_test_pred)
# Macro averaging: every reject code counts equally regardless of frequency.
test_precision = precision_score(reject_code_y_test, reject_code_test_pred, average='macro')
test_recall = recall_score(reject_code_y_test, reject_code_test_pred, average='macro')
test_roc_auc = roc_auc_score(reject_code_y_test, reject_code_test_proba, average='macro', multi_class='ovo')

print(f'Test Accuracy = {round(test_accuracy, 2)}')
print(f'Test Precision = {round(test_precision, 2)}')
print(f'Test Recall = {round(test_recall, 2)}')
print(f'Test ROC AUC = {round(test_roc_auc, 2)}')

Test Accuracy = 1.0
Test Precision = 1.0
Test Recall = 1.0
Test ROC AUC = 1.0


## PA Approval

In [11]:
# Training data for the PA-approval model.
# Missing values in the claims table are replaced with 0 before 'reject_code'
# is cast to int. The cast goes through `.astype({...})` so the column really
# ends up integer: assigning via `df.loc[:, col] = ...` writes into the
# existing float column in place on pandas >= 2.0 and leaves the dtype float.
claims_df = (
    pd.read_csv('../data/processed/dim_claims_train.csv')
    .fillna(0)
    .astype({'reject_code': int})
)

pa_df = pd.read_csv('../data/processed/dim_pa_train.csv')
bridge_df = pd.read_csv('../data/processed/bridge_train.csv')

# The bridge table carries both keys: join claims on 'dim_claim_id' first,
# then the PA table on 'dim_pa_id' (default inner joins).
combined_df = bridge_df.merge(claims_df, on='dim_claim_id').merge(pa_df, on='dim_pa_id')

pa_X = combined_df[['bin', 'drug', 'correct_diagnosis', 'tried_and_failed', 'contraindication']]
pa_y = combined_df['pa_approved']

In [12]:
# Fit a fresh pipeline on the combined PA training data, then persist the
# fitted pipeline (encoder + model) with joblib.
pa_final_pipe = build_pipe().fit(pa_X, pa_y)
dump(pa_final_pipe, r'./saved-models/random-forest-pa-approval.joblib')

['./saved-models/random-forest-pa-approval.joblib']

In [13]:
# Test data for the PA-approval model.
# 'reject_code' is cast from the TEST frame's own column. The previous version
# assigned the training frame's `claims_df['reject_code']` here, which
# overwrote the test reject codes with index-aligned training values.
# `.astype({...})` is used instead of `df.loc[:, col] = ...` because the
# latter writes in place on pandas >= 2.0 and leaves the column as float.
claims_test_df = (
    pd.read_csv('../data/processed/dim_claims_test.csv')
    .fillna(0)
    .astype({'reject_code': int})
)

pa_test_df = pd.read_csv('../data/processed/dim_pa_test.csv')
bridge_test_df = pd.read_csv('../data/processed/bridge_test.csv')

# The bridge table carries both keys: join claims on 'dim_claim_id' first,
# then the PA table on 'dim_pa_id' (default inner joins).
combined_test_df = bridge_test_df.merge(claims_test_df, on='dim_claim_id').merge(pa_test_df, on='dim_pa_id')

pa_X_test = combined_test_df[['bin', 'drug', 'correct_diagnosis', 'tried_and_failed', 'contraindication']]
pa_y_test = combined_test_df['pa_approved']

In [14]:
# Score the fitted PA-approval pipeline on the test set.
# Predict once and reuse the result: every label-based metric needs the same
# predictions, so there is no reason to run the forest three times.
pa_test_pred = pa_final_pipe.predict(pa_X_test)
# Column 1 of predict_proba is the probability of the second class in
# `classes_` (presumably the "approved" label 1 -- TODO confirm).
pa_test_proba = pa_final_pipe.predict_proba(pa_X_test)[:, 1]

test_accuracy = accuracy_score(pa_y_test, pa_test_pred)
test_precision = precision_score(pa_y_test, pa_test_pred)
test_recall = recall_score(pa_y_test, pa_test_pred)
test_roc_auc = roc_auc_score(pa_y_test, pa_test_proba)

print(f'Test Accuracy = {round(test_accuracy, 2)}')
print(f'Test Precision = {round(test_precision, 2)}')
print(f'Test Recall = {round(test_recall, 2)}')
print(f'Test ROC AUC = {round(test_roc_auc, 2)}')

Test Accuracy = 0.81
Test Precision = 0.83
Test Recall = 0.94
Test ROC AUC = 0.88
