In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import shap


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the dataset
df = pd.read_csv('healthcare_dataset.csv')

In [7]:
# Feature Engineering
df['Duration of Stay'] = (pd.to_datetime(df['Discharge Date']) - pd.to_datetime(df['Date of Admission'])).dt.days
df['Duration of Stay'] = df['Duration of Stay'].apply(lambda x: df['Duration of Stay'].median() if x < 0 else x)
df['Month of Admission'] = pd.to_datetime(df['Date of Admission']).dt.month

In [12]:
# List of categorical features
categorical_features = ['Medical Condition', 'Admission Type', 'Insurance Provider', 'Hospital', 'Gender', 'Blood Type']

# Convert all string values in categorical features to lowercase
# Fill NaN with an empty string and convert to string type before applying .str.lower()
df[categorical_features] = df[categorical_features].fillna('').astype(str).apply(lambda x: x.str.lower())


In [13]:
# One-Hot Encode categorical features
df = pd.get_dummies(df, columns=categorical_features, drop_first=True)

In [14]:
# Prepare features and target variable
X = df.drop(columns=['Billing Amount', 'Discharge Date', 'Date of Admission'])
y = df['Billing Amount']

In [15]:
# Remove outliers in 'Billing Amount'
Q1 = df['Billing Amount'].quantile(0.25)
Q3 = df['Billing Amount'].quantile(0.75)
IQR = Q3 - Q1
df = df[~((df['Billing Amount'] < (Q1 - 1.5 * IQR)) | (df['Billing Amount'] > (Q3 + 1.5 * IQR)))]

In [16]:
# Prepare features and target variable again after outlier removal
X = df.drop(columns=['Billing Amount', 'Discharge Date', 'Date of Admission'])
y = df['Billing Amount']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Check for non-numeric columns and NaN values
print("Data types of features:")
print(X_train.dtypes)

print("\nChecking for NaN values in training set:")
print(X_train.isnull().sum())

Data types of features:
Name            object
Age              int64
Doctor          object
Room Number      int64
Medication      object
                 ...  
Blood Type_3      bool
Blood Type_4      bool
Blood Type_5      bool
Blood Type_6      bool
Blood Type_7      bool
Length: 39902, dtype: object

Checking for NaN values in training set:
Name            0
Age             0
Doctor          0
Room Number     0
Medication      0
               ..
Blood Type_3    0
Blood Type_4    0
Blood Type_5    0
Blood Type_6    0
Blood Type_7    0
Length: 39902, dtype: int64


In [18]:
# Define the Gradient Boosting model
gb = GradientBoostingRegressor(random_state=42)

In [19]:
# Define the hyperparameter grid for Randomized Search
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [24]:
# Print the current columns of the DataFrame
print(df.columns.tolist())

# Check if all the specified categorical columns exist
categorical_cols = ['Gender', 'Blood Type', 'Medical Condition', 'Admission Type', 'Insurance Provider', 'Hospital']
missing_cols = [col for col in categorical_cols if col not in df.columns]

if missing_cols:
    print(f"The following columns are missing from the DataFrame: {missing_cols}")
else:
    # If all columns are present, proceed to optimize data types
    df[categorical_cols] = df[categorical_cols].astype('category')
    # Optimize numeric columns
    df['Age'] = df['Age'].astype('int32')  # or 'int16' if possible


['Name', 'Age', 'Date of Admission', 'Doctor', 'Billing Amount', 'Room Number', 'Discharge Date', 'Medication', 'Test Results', 'Duration of Stay', 'Month of Admission', 'Medical Condition_1', 'Medical Condition_2', 'Medical Condition_3', 'Medical Condition_4', 'Medical Condition_5', 'Admission Type_1', 'Admission Type_2', 'Insurance Provider_1', 'Insurance Provider_2', 'Insurance Provider_3', 'Insurance Provider_4', 'Hospital_1', 'Hospital_10', 'Hospital_100', 'Hospital_1000', 'Hospital_10000', 'Hospital_10001', 'Hospital_10002', 'Hospital_10003', 'Hospital_10004', 'Hospital_10005', 'Hospital_10006', 'Hospital_10007', 'Hospital_10008', 'Hospital_10009', 'Hospital_1001', 'Hospital_10010', 'Hospital_10011', 'Hospital_10012', 'Hospital_10013', 'Hospital_10014', 'Hospital_10015', 'Hospital_10016', 'Hospital_10017', 'Hospital_10018', 'Hospital_10019', 'Hospital_1002', 'Hospital_10020', 'Hospital_10021', 'Hospital_10022', 'Hospital_10023', 'Hospital_10024', 'Hospital_10025', 'Hospital_10026

In [26]:
# Check and optimize Data Types
print(df.columns.tolist())

# Categorical columns
categorical_cols = ['Gender', 'Blood Type', 'Medical Condition', 'Admission Type', 'Insurance Provider', 'Hospital']
missing_cols = [col for col in categorical_cols if col not in df.columns]

if missing_cols:
    print(f"The following columns are missing from the DataFrame: {missing_cols}")
else:
    # Convert categorical columns to 'category'
    df[categorical_cols] = df[categorical_cols].astype('category')
    
    # Optimize numeric columns
    df['Age'] = df['Age'].astype('int32')  # or 'int16' if possible

    # Proceed with Randomized Search CV
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.model_selection import RandomizedSearchCV

    # Define your model
    gb = GradientBoostingRegressor()

    # Define parameter grid
    param_grid = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 0.9, 1.0]
    }

    # Perform Randomized Search CV
    gb_random = RandomizedSearchCV(estimator=gb, param_distributions=param_grid, n_iter=20, cv=3, random_state=42, n_jobs=-1)
    gb_random.fit(X_train, y_train)


['Name', 'Age', 'Date of Admission', 'Doctor', 'Billing Amount', 'Room Number', 'Discharge Date', 'Medication', 'Test Results', 'Duration of Stay', 'Month of Admission', 'Medical Condition_1', 'Medical Condition_2', 'Medical Condition_3', 'Medical Condition_4', 'Medical Condition_5', 'Admission Type_1', 'Admission Type_2', 'Insurance Provider_1', 'Insurance Provider_2', 'Insurance Provider_3', 'Insurance Provider_4', 'Hospital_1', 'Hospital_10', 'Hospital_100', 'Hospital_1000', 'Hospital_10000', 'Hospital_10001', 'Hospital_10002', 'Hospital_10003', 'Hospital_10004', 'Hospital_10005', 'Hospital_10006', 'Hospital_10007', 'Hospital_10008', 'Hospital_10009', 'Hospital_1001', 'Hospital_10010', 'Hospital_10011', 'Hospital_10012', 'Hospital_10013', 'Hospital_10014', 'Hospital_10015', 'Hospital_10016', 'Hospital_10017', 'Hospital_10018', 'Hospital_10019', 'Hospital_1002', 'Hospital_10020', 'Hospital_10021', 'Hospital_10022', 'Hospital_10023', 'Hospital_10024', 'Hospital_10025', 'Hospital_10026

In [33]:
df = df.drop(columns=['Discharge Date', 'Date of Admission'])

# Perform Randomized Search CV
try:
    gb_random = RandomizedSearchCV(estimator=gb, param_distributions=param_grid, n_iter=20, cv=3, random_state=42, n_jobs=-1)
    gb_random.fit(X_train, y_train)

    # Check if the fitting was successful before accessing best_params_
    if gb_random.best_params_:
        print("Best Parameters: ", gb_random.best_params_)
    else:
        print("The fitting was successful, but no best parameters were found.")
except Exception as e:
    print(f"An error occurred during fitting: {e}")


An error occurred during fitting: Unable to allocate 563. MiB for an array with shape (39894, 14800) and data type bool


In [None]:
# Train with the best parameters found
best_gb = gb_random.best_estimator_
best_gb.fit(X_train, y_train)

# Make predictions
y_pred = best_gb.predict(X_test)

# Evaluation Metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"Root Mean Square Error: {rmse}")
print(f"R² Score: {r2}")

# SHAP analysis to understand feature importance
explainer = shap.Explainer(best_gb, X_train)
shap_values = explainer(X_train)

# Plot the summary of SHAP values
shap.summary_plot(shap_values, X_train)