# In [None]:

# Step 1: Imports
import numpy as np
import pandas as pd
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Step 2: Load Dataset
# The claims table is read from a CSV file; the relative path is resolved
# against the current working directory.
DATASET_PATH = "medical_claims_dataset.csv"
df = pd.read_csv(DATASET_PATH)

# Step 3: Preprocessing
# Identifier and date columns are removed before anything is encoded.
UNUSED_COLUMNS = [
    'claim_id',
    'patient_id',
    'provider_id',
    'service_date',
    'claim_submission_date',
]
df = df.drop(columns=UNUSED_COLUMNS)

target_col = 'claim_status'

# Encode target: the column is cast to str and then mapped to integer codes.
# The fitted encoder stays available as `le_status` so the codes can be
# translated back to the original class names later.
df[target_col] = df[target_col].astype(str)
le_status = LabelEncoder()
df[target_col] = le_status.fit_transform(df[target_col])

# One fitted encoder per encoded column, keyed by column name (target first).
label_encoders = {target_col: le_status}

# Encode categorical columns: every remaining object-dtype column is cast to
# str and replaced by integer codes, using its own encoder.
object_columns = df.select_dtypes(include='object').columns
for col in object_columns:
    if col == target_col:
        continue
    le = LabelEncoder()
    as_text = df[col].astype(str)
    df[col] = le.fit_transform(as_text)
    label_encoders[col] = le

# Step 4: Split features and target
# Besides the target itself, the denial_* / is_denied columns are excluded
# from the features (presumably because they describe the claim outcome and
# would leak the target -- confirm against the dataset documentation).
X = df.drop(columns=[target_col, 'denial_reason', 'denial_category', 'is_denied'])
y = df[target_col]

# Step 5: Train/Test Split
# Stratified 80/20 split so both parts keep the class proportions of `y`.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Step 6: Apply SMOTE
# Only the training split is oversampled; the test split stays untouched.
# Columns that were label-encoded in Step 3 hold arbitrary integer codes.
# Plain SMOTE interpolates between neighbouring samples and would invent
# fractional, meaningless "categories" for them, so SMOTENC is used for those
# columns. SMOTENC needs a mix of categorical and continuous features; when
# the feature set is not mixed, plain SMOTE is kept.
categorical_idx = [
    position
    for position, column in enumerate(X_train.columns)
    if column in label_encoders
]
if 0 < len(categorical_idx) < X_train.shape[1]:
    sm = SMOTENC(categorical_features=categorical_idx, random_state=42)
else:
    sm = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

# Step 7: Train Models
# Candidate classifiers, keyed by the display name used in the results table.
# All of them share random_state=42 so repeated runs are comparable.
models = {
    "Random Forest": RandomForestClassifier(random_state=42),
    # Logistic regression and the linear SVM are sensitive to feature scale,
    # so both are wrapped in a pipeline that standardizes the features first.
    # The pipelines expose the same fit/predict/predict_proba interface.
    "Logistic Regression": make_pipeline(
        StandardScaler(),
        LogisticRegression(max_iter=1000, random_state=42),
    ),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "SVM": make_pipeline(
        StandardScaler(),
        SVC(kernel='linear', probability=True, random_state=42),
    ),
    # `use_label_encoder` is deprecated in XGBoost and reported as an unused
    # parameter by current releases; the target is already integer-encoded,
    # so the argument is simply omitted.
    "XGBoost": xgb.XGBClassifier(eval_metric='mlogloss', random_state=42),
}

# Step 8: Evaluate Models
# Each model is fitted on the resampled training data and scored on the
# untouched test split. One summary row per model is collected in `results`.

# LabelEncoder assigns the codes 0..n_classes-1 in the order of `classes_`.
# Passing them explicitly keeps `target_names` aligned even when a class is
# missing from both y_test and the predictions (otherwise the report raises).
class_labels = np.arange(len(le_status.classes_))

results = []
for name, model in models.items():
    model.fit(X_train_resampled, y_train_resampled)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    report = classification_report(
        y_test,
        y_pred,
        labels=class_labels,
        target_names=le_status.classes_,
        output_dict=True,
        # Same numbers as the default, without a warning for every class a
        # model never predicts.
        zero_division=0,
    )
    weighted = report["weighted avg"]
    results.append({
        "Model": name,
        "Accuracy": acc,
        "Precision": weighted["precision"],
        "Recall": weighted["recall"],
        "F1-Score": weighted["f1-score"],
    })

# Step 9: Show Comparison Table
results_df = pd.DataFrame(results)
try:
    display(results_df)
except NameError:
    # `display` is only injected by IPython/Jupyter; fall back to plain text
    # when the file is executed as a regular script.
    print(results_df.to_string(index=False))
