In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

# ----------------- Load Data -----------------
# The first row of the CSV supplies the column names.
df = pd.read_csv("synthetic_claims_dataset.csv", header=0)

# Discard the '#' column when the file has one; errors="ignore" makes this a
# no-op when it does not.
df = df.drop(columns=['#'], errors="ignore")

# Normalise column names in three passes: trim surrounding whitespace, turn
# spaces into underscores, then remove every character that is not a letter,
# digit or underscore.
trimmed_names = df.columns.str.strip()
underscored_names = trimmed_names.str.replace(' ', '_')
df.columns = underscored_names.str.replace('[^a-zA-Z0-9_]', '', regex=True)

print("Cleaned Columns:", df.columns.tolist())

# Convert money columns to float. Values may arrive as text such as "$1,234.50",
# so the currency symbol, thousands separators and surrounding whitespace are
# removed before parsing. Stripping only "$" would make float() raise on any
# amount of 1,000 or more that was written with a comma.
# NOTE(review): assumes "," is a thousands separator, not a decimal comma — confirm.
for col in ["Payment_Amount", "Balance"]:
    if col in df.columns:
        df[col] = (
            df[col]
            .astype(str)  # missing values become "nan", which parses back to NaN
            .str.replace("$", "", regex=False)
            .str.replace(",", "", regex=False)
            .str.strip()
            .astype(float)
        )

# Detect target column (Denial_Reason): take the first column whose name
# contains "denial" (compared case-insensitively), or None when no column does.
target_col = next(
    (name for name in df.columns if "denial" in name.lower()),
    None,
)

# Without a target column there is nothing to train on, so stop here and show
# the columns that were actually found.
if target_col is None:
    raise ValueError(f"No column found for 'Denial Reason'. Found columns: {df.columns.tolist()}")

# ----------------- Features & Target -----------------
# Every column except the target is a feature. Rows with a missing target value
# are labelled "No Denial".
X = df.drop(target_col, axis=1)
y = df[target_col].fillna("No Denial")

# Force CPT_Code to categorical by storing the codes as strings, so they are
# picked up as an object column below instead of being treated as numbers.
# Missing codes are kept as NaN: a plain astype(str) would turn them into the
# literal string "nan", which missing-value imputation cannot see and which
# would be encoded as if it were a real code.
cpt_codes = X["CPT_Code"]
X["CPT_Code"] = cpt_codes.astype(str).where(cpt_codes.notna())

# Identify categorical & numeric columns: object-dtype columns are treated as
# categorical, every other dtype as numeric.
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numeric_cols = X.select_dtypes(exclude=["object"]).columns.tolist()

print("Categorical columns:", categorical_cols)
print("Numeric columns:", numeric_cols)

# ----------------- Preprocessing -----------------
# Categorical features: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown="ignore" keeps transform from failing on categories
# that were not present when the encoder was fitted.
categorical_steps = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Numeric features: fill gaps with the column mean; no scaling is applied.
numeric_steps = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
])

# Route each group of columns through its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_steps, categorical_cols),
        ("num", numeric_steps, numeric_cols),
    ]
)

# ----------------- Model Pipeline -----------------
# 300 trees with no depth limit; the fixed seed keeps runs reproducible.
forest = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    random_state=42,
)

# Chain preprocessing and the classifier so both are fitted, applied and saved
# together as one object.
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", forest),
])

# ----------------- Train/Test Split -----------------
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# ----------------- Train Model -----------------
# Fits the preprocessing steps and the classifier on the training rows only.
model.fit(X_train, y_train)

# ----------------- Evaluate -----------------
# Score the fitted pipeline on the held-out rows.
# NOTE(review): a perfect score on held-out data is worth a second look. Check
# whether Payment_Amount / Balance are determined by the denial outcome and so
# leak the target — confirm before relying on these metrics.
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("✅ Accuracy:", accuracy)

report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

# ----------------- Save Model -----------------
# Persist the whole pipeline (preprocessing + classifier) so it can be reloaded
# and applied to raw feature frames.
model_path = "claims_model_synthetic.pkl"
joblib.dump(model, model_path)
print("✅ Model trained & saved as 'claims_model_synthetic.pkl'")


Cleaned Columns: ['CPT_Code', 'Insurance_Company', 'Physician_Name', 'Payment_Amount', 'Balance', 'Denial_Reason']
Categorical columns: ['CPT_Code', 'Insurance_Company', 'Physician_Name']
Numeric columns: ['Payment_Amount', 'Balance']
✅ Accuracy: 1.0

Classification Report:
                                   precision    recall  f1-score   support

        16 - Missing information       1.00      1.00      1.00       108
45 - Charge exceeds fee schedule       1.00      1.00      1.00       121
        96 - Non-covered service       1.00      1.00      1.00       117
                       No Denial       1.00      1.00      1.00       254

                        accuracy                           1.00       600
                       macro avg       1.00      1.00      1.00       600
                    weighted avg       1.00      1.00      1.00       600

✅ Model trained & saved as 'claims_model_synthetic.pkl'
