In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

## How well does a Random Forest predict a claim being approved (`pharmacy_claim_approved`)?

In [2]:
# Load the processed training tables.
# Every missing value in the claims table is replaced with 0, after which
# `reject_code` is cast to int. The cast is done with `astype` on the frame
# rather than `claims_df.loc[:, 'reject_code'] = ...`: on pandas >= 2.0 a
# full-column `.loc` assignment writes into the existing column in place and
# keeps its old dtype, so the int conversion would be silently discarded.
claims_df = (
    pd.read_csv('../data/processed/dim_claims_train.csv')
    .fillna(0)
    .astype({'reject_code': int})
)

pa_df = pd.read_csv('../data/processed/dim_pa_train.csv')
bridge_df = pd.read_csv('../data/processed/bridge_train.csv')

# Join claims to their prior authorizations through the bridge table, then drop
# the join keys, the date key and the claim-level target so they are not fed to
# the PA model as features.
combined_df = (
    bridge_df
    .merge(claims_df, on='dim_claim_id')
    .merge(pa_df, on='dim_pa_id')
    .drop(columns=['dim_claim_id', 'dim_pa_id', 'dim_date_id', 'pharmacy_claim_approved'])
)

In [3]:
# Features: payer `bin` and `drug`; target: whether the pharmacy claim was approved.
claims_X = claims_df[['bin', 'drug']].values
claims_y = claims_df['pharmacy_claim_approved'].values

# Replace each categorical feature with integer codes (one encoder per column so
# the mapping can be inverted later).
bin_le = LabelEncoder()
claims_X[:, 0] = bin_le.fit_transform(claims_X[:, 0])

drug_le = LabelEncoder()
claims_X[:, 1] = drug_le.fit_transform(claims_X[:, 1])

# 5-fold shuffled cross-validation with a fixed seed for reproducibility.
cv = KFold(n_splits=5, random_state=42, shuffle=True)

accuracy = []
precision = []
recall = []

for train_index, test_index in cv.split(claims_X):
    X_train, X_test = claims_X[train_index], claims_X[test_index]
    y_train, y_test = claims_y[train_index], claims_y[test_index]

    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Predict once per fold and reuse the result for all three metrics instead
    # of re-running the forest on the test fold for each score.
    y_pred = model.predict(X_test)

    accuracy.append(accuracy_score(y_test, y_pred))
    precision.append(precision_score(y_test, y_pred))
    recall.append(recall_score(y_test, y_pred))

# Per-fold scores as arrays; the next cell reports their means.
accuracy = np.array(accuracy)
recall = np.array(recall)
precision = np.array(precision)

In [4]:
# Mean of each metric across the cross-validation folds. The estimator fitted
# above is a RandomForestClassifier, so the scores are labelled accordingly.
for metric_name, fold_scores in (('accuracy', accuracy), ('precision', precision), ('recall', recall)):
    print(f'Random Forest {metric_name} = {round(np.mean(fold_scores), 2)}')

Logistic Regression accuracy = 0.94
Logistic Regression precision = 0.9
Logistic Regression recall = 1.0


### Does 1-hot encoding improve the prediction?
- 1-hot encoding did not change the results: accuracy (0.94), precision (0.90) and recall (1.00) are identical to the label-encoded model.

In [6]:
# Same features and target as the label-encoded run, but one-hot encoded.
claims_X = claims_df[['bin', 'drug']].values
claims_y = claims_df['pharmacy_claim_approved'].values

# One-hot encode every feature, dropping the first level of each to avoid a
# redundant column. The encoder's default sparse result is densified with
# `.toarray()` rather than passing `sparse=False`: that keyword was renamed to
# `sparse_output` in scikit-learn 1.2 and removed in 1.4, whereas this form
# behaves the same on every version.
ohe = OneHotEncoder(drop='first')
claims_X = ohe.fit_transform(claims_X.tolist()).toarray()

# 5-fold shuffled cross-validation with a fixed seed for reproducibility.
cv = KFold(n_splits=5, random_state=42, shuffle=True)

accuracy = []
precision = []
recall = []

for train_index, test_index in cv.split(claims_X):
    X_train, X_test = claims_X[train_index], claims_X[test_index]
    y_train, y_test = claims_y[train_index], claims_y[test_index]

    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Predict once per fold and reuse the result for all three metrics.
    y_pred = model.predict(X_test)

    accuracy.append(accuracy_score(y_test, y_pred))
    precision.append(precision_score(y_test, y_pred))
    recall.append(recall_score(y_test, y_pred))

# Per-fold scores as arrays; the next cell reports their means.
accuracy = np.array(accuracy)
recall = np.array(recall)
precision = np.array(precision)

In [7]:
# Mean of each metric across the cross-validation folds. The estimator fitted
# above is a RandomForestClassifier, so the scores are labelled accordingly.
for metric_name, fold_scores in (('accuracy', accuracy), ('precision', precision), ('recall', recall)):
    print(f'Random Forest {metric_name} = {round(np.mean(fold_scores), 2)}')

Logistic Regression accuracy = 0.94
Logistic Regression precision = 0.9
Logistic Regression recall = 1.0


## How well does a Random Forest predict a PA being approved (`pa_approved`)?

In [8]:
# Features: every column of the merged claim/PA frame except the target.
pa_features = combined_df.drop(columns='pa_approved')
feature_names = list(pa_features.columns)

claims_X = pa_features.values
claims_y = combined_df['pa_approved'].values

# Label-encode the categorical columns. Their positions are looked up by name
# instead of being hard-coded as 0/1/2, so the right columns are encoded even if
# the merge order (and therefore the column order) changes.
bin_le = LabelEncoder()
drug_le = LabelEncoder()
reject_code_le = LabelEncoder()

for column_name, encoder in (('bin', bin_le), ('drug', drug_le), ('reject_code', reject_code_le)):
    column_pos = feature_names.index(column_name)
    claims_X[:, column_pos] = encoder.fit_transform(claims_X[:, column_pos])

# 5-fold shuffled cross-validation with a fixed seed for reproducibility.
cv = KFold(n_splits=5, random_state=42, shuffle=True)

accuracy = []
precision = []
recall = []

for train_index, test_index in cv.split(claims_X):
    X_train, X_test = claims_X[train_index], claims_X[test_index]
    y_train, y_test = claims_y[train_index], claims_y[test_index]

    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Predict once per fold and reuse the result for all three metrics.
    y_pred = model.predict(X_test)

    accuracy.append(accuracy_score(y_test, y_pred))
    precision.append(precision_score(y_test, y_pred))
    recall.append(recall_score(y_test, y_pred))

# Per-fold scores as arrays; the next cell reports their means.
accuracy = np.array(accuracy)
recall = np.array(recall)
precision = np.array(precision)

In [9]:
# Mean of each metric across the cross-validation folds. The estimator fitted
# above is a RandomForestClassifier, so the scores are labelled accordingly.
for metric_name, fold_scores in (('accuracy', accuracy), ('precision', precision), ('recall', recall)):
    print(f'Random Forest {metric_name} = {round(np.mean(fold_scores), 2)}')

Logistic Regression accuracy = 0.81
Logistic Regression precision = 0.83
Logistic Regression recall = 0.93


### Does 1-hot encoding improve the prediction?
- 1-hot encoding did not change the results: accuracy (0.81), precision (0.83) and recall (0.93) match the label-encoded model.

In [10]:
# Same features and target as the label-encoded PA run, but one-hot encoded.
claims_X = combined_df.drop(columns='pa_approved').values
claims_y = combined_df['pa_approved'].values

# One-hot encode every feature, dropping the first level of each to avoid a
# redundant column. The encoder's default sparse result is densified with
# `.toarray()` rather than passing `sparse=False`: that keyword was renamed to
# `sparse_output` in scikit-learn 1.2 and removed in 1.4, whereas this form
# behaves the same on every version.
ohe = OneHotEncoder(drop='first')
claims_X = ohe.fit_transform(claims_X.tolist()).toarray()

# 5-fold shuffled cross-validation with a fixed seed for reproducibility.
cv = KFold(n_splits=5, random_state=42, shuffle=True)

accuracy = []
precision = []
recall = []

for train_index, test_index in cv.split(claims_X):
    X_train, X_test = claims_X[train_index], claims_X[test_index]
    y_train, y_test = claims_y[train_index], claims_y[test_index]

    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Predict once per fold and reuse the result for all three metrics.
    y_pred = model.predict(X_test)

    accuracy.append(accuracy_score(y_test, y_pred))
    precision.append(precision_score(y_test, y_pred))
    recall.append(recall_score(y_test, y_pred))

# Per-fold scores as arrays; the next cell reports their means.
accuracy = np.array(accuracy)
recall = np.array(recall)
precision = np.array(precision)

In [11]:
# Mean of each metric across the cross-validation folds. The estimator fitted
# above is a RandomForestClassifier, so the scores are labelled accordingly.
for metric_name, fold_scores in (('accuracy', accuracy), ('precision', precision), ('recall', recall)):
    print(f'Random Forest {metric_name} = {round(np.mean(fold_scores), 2)}')

Logistic Regression accuracy = 0.81
Logistic Regression precision = 0.83
Logistic Regression recall = 0.93
