In [68]:
import polars as pl
import altair as alt
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

In [69]:
data_dir = Path("data")
df = pl.read_parquet(data_dir / "DSU-Dataset.parquet") 

In [70]:
df = df.with_columns((pl.col('Paid Date') - pl.col('Recieved Date')).alias('Paid - Recieved').dt.total_days(),
                     (pl.col('Paid Date') - pl.col('Service Date')).alias('Paid - Service').dt.total_days(),
                     (pl.col('Recieved Date') - pl.col('Service Date')).alias('Service - Recieved').dt.total_days())

In [71]:
df[df.columns[-3:]].describe()

statistic,Paid - Recieved,Paid - Service,Service - Recieved
str,f64,f64,f64
"""count""",253686.0,253686.0,253686.0
"""null_count""",0.0,0.0,0.0
"""mean""",7.695572,32.960447,25.264875
"""std""",8.906887,51.582527,49.554884
"""min""",0.0,1.0,0.0
"""25%""",4.0,11.0,5.0
"""50%""",7.0,17.0,10.0
"""75%""",8.0,34.0,25.0
"""max""",980.0,1033.0,994.0


Set up outlier flags for the Paid-Service and Service-Recieved date as they both have a skewed distribution.

In [72]:
ps_iqr = df['Paid - Service'].quantile(0.75) - df['Paid - Service'].quantile(0.25) 
ps_threshold = df['Paid - Service'].quantile(0.75) + iqr

In [73]:
sr_iqr = df['Service - Recieved'].quantile(0.75) - df['Service - Recieved'].quantile(0.25) 
sr_threshold = df['Service - Recieved'].quantile(0.75)

In [75]:
df = df.with_columns(
    (pl.col("Paid - Service") > ps_threshold).cast(pl.Int64).alias("p-s outlier"),
    (pl.col("Service - Recieved") > ps_threshold).cast(pl.Int64).alias("s-r outlier")
)

In [76]:
list(df.columns)

['Claim ID',
 'Service Date',
 'Recieved Date',
 'Paid Date',
 'Patient ID',
 'Member Age',
 'Gender',
 'Marital Status',
 'Ethnicity',
 'LOB',
 'Network Status',
 'Claim Category',
 'Claim Subcategory',
 'Claim Line',
 'Place of Service',
 'Provider Type',
 'Provider Specialty',
 'ICD10 Code 1',
 'ICD10 Code 2',
 'ICD10 Code 3',
 'ICD10 Code 4',
 'ICD10 Code 5',
 'ICD10 Code 6',
 'ICD10 Code 7',
 'ICD10 Code 8',
 'ICD10 Code 9',
 'ICD10 Code 10',
 'Service Type',
 'Service Code',
 'Modifiers',
 'High Cost Claim',
 'Paid - Recieved',
 'Paid - Service',
 'Service - Recieved',
 'p-s outlier',
 's-r outlier']

In [77]:
df = df.to_pandas()

In [78]:
df['Month'] = pd.to_datetime(df['Service Date']).dt.month
df['Day of Week'] = pd.to_datetime(df['Service Date']).dt.dayofweek

In [79]:
df.columns

Index(['Claim ID', 'Service Date', 'Recieved Date', 'Paid Date', 'Patient ID',
       'Member Age', 'Gender', 'Marital Status', 'Ethnicity', 'LOB',
       'Network Status', 'Claim Category', 'Claim Subcategory', 'Claim Line',
       'Place of Service', 'Provider Type', 'Provider Specialty',
       'ICD10 Code 1', 'ICD10 Code 2', 'ICD10 Code 3', 'ICD10 Code 4',
       'ICD10 Code 5', 'ICD10 Code 6', 'ICD10 Code 7', 'ICD10 Code 8',
       'ICD10 Code 9', 'ICD10 Code 10', 'Service Type', 'Service Code',
       'Modifiers', 'High Cost Claim', 'Paid - Recieved', 'Paid - Service',
       'Service - Recieved', 'p-s outlier', 's-r outlier', 'Month',
       'Day of Week'],
      dtype='object')

In [80]:
relevant_cols = [ 
 'ICD10 Code 1',
 'ICD10 Code 2',
 'ICD10 Code 3',
 'ICD10 Code 4',
 'ICD10 Code 5',
 'ICD10 Code 6',
 'ICD10 Code 7',
 'ICD10 Code 8',
 'ICD10 Code 9',
 'ICD10 Code 10',
 'Month',
 'Day of Week',
 'Network Status',
 'Service Code',
 'Claim Category',
 'High Cost Claim',
 'p-s outlier',
 's-r outlier'
 ]

In [84]:
df_pd = df[relevant_cols]

In [85]:
import polars as pl
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [87]:
# Identify categorical and numerical features
categorical_cols = ["Network Status", "Service Code", "Claim Category"]
numerical_cols = [col for col in df_pd.columns if col not in categorical_cols + ["p-s outlier", "s-r outlier"]]

# Apply One-Hot Encoding to categorical features
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoded_cats = encoder.fit_transform(df_pd[categorical_cols])

# Create new column names for one-hot encoded features
encoded_col_names = encoder.get_feature_names_out(categorical_cols)

# Convert the encoded data into a DataFrame
encoded_df = pd.DataFrame(encoded_cats, columns=encoded_col_names)

# Combine numerical and encoded categorical features
X = pd.concat([df_pd[numerical_cols], encoded_df], axis=1)

# Define target labels
y_p_s = df_pd["p-s outlier"]
y_s_r = df_pd["s-r outlier"]


In [88]:
X_train, X_test, y_p_s_train, y_p_s_test = train_test_split(X, y_p_s, test_size=0.2, random_state=42)
X_train, X_test, y_s_r_train, y_s_r_test = train_test_split(X, y_s_r, test_size=0.2, random_state=42)


In [89]:
# Train Decision Tree for "p-s outlier"
clf_p_s = DecisionTreeClassifier(random_state=42)
clf_p_s.fit(X_train, y_p_s_train)

# Train Decision Tree for "s-r outlier"
clf_s_r = DecisionTreeClassifier(random_state=42)
clf_s_r.fit(X_train, y_s_r_train)


In [90]:
# Predict on test set
y_p_s_pred = clf_p_s.predict(X_test)
y_s_r_pred = clf_s_r.predict(X_test)

# Compute accuracy scores
acc_p_s = accuracy_score(y_p_s_test, y_p_s_pred)
acc_s_r = accuracy_score(y_s_r_test, y_s_r_pred)

print(f"Accuracy for 'p-s outlier': {acc_p_s:.4f}")
print(f"Accuracy for 's-r outlier': {acc_s_r:.4f}")


Accuracy for 'p-s outlier': 0.9309
Accuracy for 's-r outlier': 0.9417


In [91]:
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc

# Compute precision, recall, and F1-score
report_p_s = classification_report(y_p_s_test, y_p_s_pred, digits=4)
report_s_r = classification_report(y_s_r_test, y_s_r_pred, digits=4)

print("Classification Report for 'p-s outlier':\n", report_p_s)
print("Classification Report for 's-r outlier':\n", report_s_r)

# Compute ROC-AUC score
roc_auc_p_s = roc_auc_score(y_p_s_test, clf_p_s.predict_proba(X_test)[:, 1])
roc_auc_s_r = roc_auc_score(y_s_r_test, clf_s_r.predict_proba(X_test)[:, 1])

print(f"ROC-AUC Score for 'p-s outlier': {roc_auc_p_s:.4f}")
print(f"ROC-AUC Score for 's-r outlier': {roc_auc_s_r:.4f}")

# Compute Precision-Recall AUC
precision_p_s, recall_p_s, _ = precision_recall_curve(y_p_s_test, clf_p_s.predict_proba(X_test)[:, 1])
pr_auc_p_s = auc(recall_p_s, precision_p_s)

precision_s_r, recall_s_r, _ = precision_recall_curve(y_s_r_test, clf_s_r.predict_proba(X_test)[:, 1])
pr_auc_s_r = auc(recall_s_r, precision_s_r)

print(f"PR-AUC Score for 'p-s outlier': {pr_auc_p_s:.4f}")
print(f"PR-AUC Score for 's-r outlier': {pr_auc_s_r:.4f}")


Classification Report for 'p-s outlier':
               precision    recall  f1-score   support

           0     0.9585    0.9632    0.9609     44724
           1     0.7163    0.6901    0.7029      6014

    accuracy                         0.9309     50738
   macro avg     0.8374    0.8266    0.8319     50738
weighted avg     0.9298    0.9309    0.9303     50738

Classification Report for 's-r outlier':
               precision    recall  f1-score   support

           0     0.9667    0.9691    0.9679     45990
           1     0.6929    0.6763    0.6845      4748

    accuracy                         0.9417     50738
   macro avg     0.8298    0.8227    0.8262     50738
weighted avg     0.9410    0.9417    0.9413     50738

ROC-AUC Score for 'p-s outlier': 0.8309
ROC-AUC Score for 's-r outlier': 0.8267
PR-AUC Score for 'p-s outlier': 0.7173
PR-AUC Score for 's-r outlier': 0.6941
