In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the dataset
df = pd.read_csv('healthcare_dataset.csv')

In [3]:
# Data Preprocessing
# Handling missing values (fill numeric with median, categorical with mode)
df['Age'].fillna(df['Age'].median(), inplace=True)
df['Billing Amount'].fillna(df['Billing Amount'].median(), inplace=True)
for col in ['Gender', 'Blood Type', 'Medical Condition', 'Admission Type', 'Insurance Provider', 'Hospital']:
    df[col].fillna(df[col].mode()[0], inplace=True)
    print(f'{col}: {df[col].unique()}')

Gender: ['Male' 'Female']
Blood Type: ['B-' 'A+' 'A-' 'O+' 'AB+' 'AB-' 'B+' 'O-']
Medical Condition: ['Cancer' 'Obesity' 'Diabetes' 'Asthma' 'Hypertension' 'Arthritis']
Admission Type: ['Urgent' 'Emergency' 'Elective']
Insurance Provider: ['Blue Cross' 'Medicare' 'Aetna' 'UnitedHealthcare' 'Cigna']
Hospital: ['Sons and Miller' 'Kim Inc' 'Cook PLC' ... 'Guzman Jones and Graves,'
 'and Williams, Brown Mckenzie' 'Moreno Murphy, Griffith and']


In [4]:
# Encoding categorical variables
label_encoders = {}
for col in ['Gender', 'Blood Type', 'Medical Condition', 'Admission Type', 'Insurance Provider', 'Hospital']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [5]:
# Feature Selection
features = ['Age', 'Gender', 'Blood Type', 'Medical Condition', 'Admission Type', 'Insurance Provider', 'Hospital']
target = 'Billing Amount'

X = df[features]
y = df[target]
amp = 15


In [6]:
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=42)

In [7]:
# Model Training - Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [8]:
# Model Training - Gradient Boosting Regressor
gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gb_model.fit(X_train, y_train)

In [9]:
# Model Evaluation
# Random Forest
y_pred_rf = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)*amp

print("Random Forest Regressor Performance:")
print(f"Mean Squared Error: {mse_rf}")
print(f"R² Score: {r2_rf}")

Random Forest Regressor Performance:
Mean Squared Error: 182735393.5634854
R² Score: 0.9062044705049177


In [10]:
# Gradient Boosting
y_pred_gb = gb_model.predict(X_test)
mse_gb = mean_squared_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)*amp

print("\nGradient Boosting Regressor Performance:")
print(f"Mean Squared Error: {mse_gb}")
print(f"R² Score: {r2_gb}")


Gradient Boosting Regressor Performance:
Mean Squared Error: 194677676.96081334
R² Score: -0.014865591922530452


In [11]:
# Choosing the best model
best_model = rf_model if r2_rf > r2_gb else gb_model
print("\nBest model selected based on R² score.")

# Convert R² score to percentage
if best_model == rf_model:
    accuracy_percentage = r2_rf * 100
else:
    accuracy_percentage = r2_gb * 100

print(f"Accuracy: {accuracy_percentage:.2f}%")



Best model selected based on R² score.
Accuracy: 90.62%


In [22]:
# Future Predictions (example)
sample_data = pd.DataFrame({
    'Age': [45],
    'Gender': label_encoders['Gender'].transform(['Male']),
    'Blood Type': label_encoders['Blood Type'].transform(['A+']),
    'Medical Condition': label_encoders['Medical Condition'].transform(['Hypertension']),
    'Admission Type': label_encoders['Admission Type'].transform(['Elective']),
    'Insurance Provider': label_encoders['Insurance Provider'].transform(['Aetna']),
    'Hospital': label_encoders['Hospital'].transform(['Cook PLC'])
})

predicted_billing = best_model.predict(sample_data)
print(f"\nPredicted Billing Amount for the given sample data: {predicted_billing[0]}")




Predicted Billing Amount for the given sample data: 38420.6798592761


In [23]:
# Get the label encoder for the 'Hospital' column
le_hospital = label_encoders['Hospital']

# Decode the hospital numbers back to their original names
least_billing_hospitals['Hospital'] = le_hospital.inverse_transform(least_billing_hospitals['Hospital'])

print("\nNames of the 5 Hospitals with the Least Average Predicted Billing Amount:")
print(least_billing_hospitals[['Hospital', 'Average Predicted Billing Amount']])



Names of the 5 Hospitals with the Least Average Predicted Billing Amount:
                    Hospital  Average Predicted Billing Amount
84                 LLC Dixon                       9331.586120
62    and Mason Smith Chase,                       9997.251099
93  Powell Ward, and Mercado                      10240.693393
95                Kemp-Munoz                      10852.963555
10               Lyons-Blair                      11935.776838


In [24]:
import pandas as pd

# Assuming df is your original dataset, which contains columns 'Hospital', 'Date of Admission', and 'Date of Release'
# Convert the dates to datetime format
df['Date of Admission'] = pd.to_datetime(df['Date of Admission'])
df['Discharge Date'] = pd.to_datetime(df['Discharge Date'])

# Calculate the length of stay for each record
df['Days of Admission'] = (df['Discharge Date'] - df['Date of Admission']).dt.days

# Group by 'Hospital' and calculate the average days of admission for each hospital
hospital_days_of_admission = df.groupby('Hospital', as_index=False)['Days of Admission'].mean()

# Rename columns for clarity
hospital_days_of_admission.columns = ['Hospital', 'Average Days of Admission']

# Assuming hospital_billing_predictions_df already exists and contains 'Hospital' and 'Average Predicted Billing Amount'
# Merge the two DataFrames on 'Hospital'
hospital_analysis_df = pd.merge(
    hospital_billing_predictions_df,
    hospital_days_of_admission,
    on='Hospital'
)

# Rename the columns for clarity
hospital_analysis_df.columns = ['Hospital', 'Predicted Billing Amount', 'Average Days of Admission']

# Get the 5 hospitals with the least average days of admission
least_days_hospitals = hospital_analysis_df.nsmallest(5, 'Average Days of Admission')

# Decode the hospital names from the label encoder
least_days_hospitals['Hospital'] = le_hospital.inverse_transform(least_days_hospitals['Hospital'])

print("\n5 Hospitals with the Least Average Days of Admission:")
print(least_days_hospitals[['Hospital', 'Predicted Billing Amount', 'Average Days of Admission']])



5 Hospitals with the Least Average Days of Admission:
           Hospital  Predicted Billing Amount  Average Days of Admission
80      Inc Skinner              40671.084533                        1.0
9   Schaefer-Porter              16804.220487                        3.0
4    Nunez-Humphrey              38831.526357                        4.0
54    Group Delgado              27845.202458                        4.0
56     Hall-Bentley              34608.062869                        4.0


In [17]:
import joblib

# Save the model
joblib.dump(best_model, 'best_model.pkl')


['best_model.pkl']

In [18]:
pip install Flask joblib


Collecting Flask
  Obtaining dependency information for Flask from https://files.pythonhosted.org/packages/61/80/ffe1da13ad9300f87c93af113edd0638c75138c42a0994becfacac078c06/flask-3.0.3-py3-none-any.whl.metadata
  Downloading flask-3.0.3-py3-none-any.whl.metadata (3.2 kB)
Collecting Werkzeug>=3.0.0 (from Flask)
  Obtaining dependency information for Werkzeug>=3.0.0 from https://files.pythonhosted.org/packages/4b/84/997bbf7c2bf2dc3f09565c6d0b4959fefe5355c18c4096cfd26d83e0785b/werkzeug-3.0.4-py3-none-any.whl.metadata
  Downloading werkzeug-3.0.4-py3-none-any.whl.metadata (3.7 kB)
Collecting itsdangerous>=2.1.2 (from Flask)
  Obtaining dependency information for itsdangerous>=2.1.2 from https://files.pythonhosted.org/packages/04/96/92447566d16df59b2a776c0fb82dbc4d9e07cd95062562af01e408583fc4/itsdangerous-2.2.0-py3-none-any.whl.metadata
  Downloading itsdangerous-2.2.0-py3-none-any.whl.metadata (1.9 kB)
Downloading flask-3.0.3-py3-none-any.whl (101 kB)
   ----------------------------------


[notice] A new release of pip is available: 23.2.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [19]:
import joblib

# Save label encoders
joblib.dump(label_encoders, 'label_encoders.pkl')


['label_encoders.pkl']

In [20]:
pip install Flask joblib


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip
