In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

# Load the claims data from 'data.csv'. The path is relative, so the file must
# be in the directory the script is run from.
try:
    # header=2: the column names are on the third row of the file (0-based
    # index 2).
    df = pd.read_csv('data.csv', header=2)
except FileNotFoundError:
    print("Error: data.csv not found. Please ensure the file is in the same directory as the script.")
    # Stop with a non-zero exit status. The bare exit() helper is injected by
    # the `site` module for interactive use only: it may be absent (e.g. under
    # `python -S`) and it reports success (status 0) even though loading failed.
    raise SystemExit(1)
else:
    print("Data loaded successfully from data.csv")

# 1. Data Cleaning and Feature Engineering
print("Starting data cleaning and preprocessing...")

# Drop the '#' column while it still has its original name: the name clean-up
# below removes every non-alphanumeric character, which would leave this
# column with an empty name.
df = df.drop(columns=['#'])

# Normalise the remaining column names: trim surrounding whitespace, turn
# spaces into underscores, then remove anything that is not a letter, digit
# or underscore.
df.columns = (
    df.columns.str.strip()
    .str.replace(' ', '_', regex=False)
    .str.replace(r'[^a-zA-Z0-9_]', '', regex=True)
)

# Explicitly convert CPT_Code to a string type to avoid TypeError during encoding.
# NOTE(review): astype(str) also stringifies missing values, so they reach the
# encoder as an ordinary text category instead of NaN - confirm this is intended.
df['CPT_Code'] = df['CPT_Code'].astype(str)

# Convert currency strings such as "$1,234.50" to floats by stripping '$' and
# ','. The pattern is a raw string: a plain '[\$,]' contains the invalid
# escape sequence '\$', which Python reports as a warning today and will
# reject in a future version. Missing values stay NaN and are imputed later.
for money_col in ('Payment_Amount', 'Balance'):
    df[money_col] = df[money_col].replace(r'[\$,]', '', regex=True).astype(float)

# Create a boolean target variable: True if the claim was denied, i.e. the
# Denial_Reason cell holds anything other than blank/whitespace/missing.
# Vectorised string ops (after astype(str)) avoid an AttributeError when a
# reason cell is not a string.
df['Is_Denied'] = df['Denial_Reason'].fillna('').astype(str).str.strip().ne('')

# 2. Feature selection and preprocessing
# Group the input columns by the kind of preparation they need: text-valued
# columns are one-hot encoded, numeric columns are only imputed.
categorical_features = ['CPT_Code', 'Insurance_Company', 'Physician_Name']
numerical_features = ['Payment_Amount', 'Balance']

# Model inputs (X) are the categorical columns followed by the numeric ones;
# the label (y) is the denial flag.
features = categorical_features + numerical_features
target = 'Is_Denied'
X = df[features]
y = df[target]

# Categorical columns: replace missing entries with the literal 'missing',
# then one-hot encode. Categories not seen during fitting are ignored at
# prediction time rather than raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric columns: fill missing entries with the column mean.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])

# Route each column group through its own transformer. Any column not named
# in either group is passed through unchanged.
preprocessor = ColumnTransformer(
    [
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features),
    ],
    remainder='passthrough',
)

# 3. Build the Machine Learning Pipeline
# Preprocessing and the classifier are chained so that the exact same
# transformations are applied at fit time and at predict time.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# 4. Train the model
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print("Training the model...")
model.fit(X_train, y_train)

# 5. Evaluate the model on the held-out rows
y_pred = model.predict(X_test)
print("\nModel Performance on Test Data:")
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
report = classification_report(y_test, y_pred)
print(report)

# 6. Save the trained model to a file
# The whole fitted pipeline (preprocessing + classifier) is serialized with
# joblib into a single file.
model_filename = 'denial_model.joblib'
joblib.dump(model, model_filename)
print(f"\nModel saved successfully as '{model_filename}'")


  df['Payment_Amount'] = df['Payment_Amount'].replace('[\$,]', '', regex=True).astype(float)
  df['Balance'] = df['Balance'].replace('[\$,]', '', regex=True).astype(float)


Data loaded successfully from data.csv
Starting data cleaning and preprocessing...
Training the model...

Model Performance on Test Data:
Accuracy: 1.00

Classification Report:
              precision    recall  f1-score   support

       False       1.00      1.00      1.00         3
        True       1.00      1.00      1.00         3

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6


Model saved successfully as 'denial_model.joblib'
