<a href="https://colab.research.google.com/github/KritiM7/Healthcare-Cost-Predictor/blob/main/Medical_Treatment_Cost_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
# Install the library that handles large Google Drive files
!pip install -U --no-cache-dir gdown

import os

import gdown
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import RidgeCV
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Google Drive file ID of the SPARCS data file
file_id = '1O9-FsD5MO9M2QQXs_zvRdFmen-CD7pql'
# Build the download URL from file_id so the ID is written in exactly one place
url = f'https://drive.google.com/uc?id={file_id}'

output = 'medical_data.csv'

# The file is roughly 832 MB, so only fetch it when it is not already on disk.
# This keeps "Restart & Run All" from re-downloading it on every run; delete the
# local file to force a fresh download.
if not os.path.exists(output):
    downloaded = gdown.download(url, output, quiet=False)
    if downloaded is None:
        # NOTE(review): some gdown releases report failure by returning None
        # instead of raising; fail here rather than in read_csv below.
        raise RuntimeError(
            f"Download failed for {url}; check the file ID and its sharing permissions."
        )

# Now we read the local file
df = pd.read_csv(output, low_memory=False)

print("\n✅ DATA LOADED SUCCESSFULLY!")
print(f"Dataset Size: {df.shape}")
display(df.head())



Downloading...
From (original): https://drive.google.com/uc?id=1O9-FsD5MO9M2QQXs_zvRdFmen-CD7pql
From (redirected): https://drive.google.com/uc?id=1O9-FsD5MO9M2QQXs_zvRdFmen-CD7pql&confirm=t&uuid=82baf81a-847f-47b3-96e5-e74d5d247b26
To: /content/medical_data.csv
100%|██████████| 832M/832M [00:10<00:00, 81.1MB/s]



✅ DATA LOADED SUCCESSFULLY!
Dataset Size: (2101588, 33)


Unnamed: 0,Hospital Service Area,Hospital County,Operating Certificate Number,Permanent Facility Id,Facility Name,Age Group,Zip Code - 3 digits,Gender,Race,Ethnicity,...,APR Severity of Illness Description,APR Risk of Mortality,APR Medical Surgical Description,Payment Typology 1,Payment Typology 2,Payment Typology 3,Birth Weight,Emergency Department Indicator,Total Charges,Total Costs
0,New York City,Bronx,7000006.0,1169.0,Montefiore Medical Center - Henry & Lucy Moses...,70 or Older,104,M,Other Race,Spanish/Hispanic,...,Major,Extreme,Medical,Medicare,Medicaid,,,Y,320922.43,60241.34
1,New York City,Bronx,7000006.0,1169.0,Montefiore Medical Center - Henry & Lucy Moses...,50 to 69,104,F,White,Not Span/Hispanic,...,Moderate,Minor,Medical,Private Health Insurance,,,,Y,61665.22,9180.69
2,New York City,Bronx,7000006.0,1168.0,Montefiore Medical Center-Wakefield Hospital,18 to 29,104,F,Other Race,Spanish/Hispanic,...,Minor,Minor,Surgical,Medicaid,,,,N,42705.34,11366.5
3,New York City,Bronx,7000006.0,3058.0,Montefiore Med Center - Jack D Weiler Hosp of ...,70 or Older,104,M,Other Race,Spanish/Hispanic,...,Major,Major,Medical,Medicare,Medicaid,,,Y,72700.17,12111.75
4,New York City,Bronx,7000006.0,1169.0,Montefiore Medical Center - Henry & Lucy Moses...,50 to 69,104,F,Black/African American,Not Span/Hispanic,...,Moderate,Minor,Medical,Medicare,Medicaid,,,Y,55562.51,8339.72




In [28]:
# Handling missing values

# Columns that must be populated; a row missing any one of them is discarded.
core = [
    'Permanent Facility Id',
    'Age Group',
    'APR Severity of Illness Description',
    'Total Charges',
    'Total Costs',
]

# .copy() gives an independent frame, so the column assignments made in later
# cells do not operate on a view of the unfiltered data.
df = df.dropna(axis='index', how='any', subset=core).copy()

In [29]:
num_cols = ['Permanent Facility Id', 'Total Charges']   # numerical feature columns
cat_cols = ['APR Severity of Illness Description']      # categorical feature column
target = 'Total Costs'                                  # column the model predicts

# Coerce the target and every numerical feature to float: cast to text, strip
# commas, then parse the cleaned text as a number.
for col in [target, *num_cols]:
    df[col] = (
        df[col]
        .astype(str)
        .str.replace(',', '', regex=False)
        .astype(float)
    )

X = df[num_cols + cat_cols]      # feature columns
y = df[target]                   # target column

In [30]:
# Pre-processing, model selection & model pipeline

# One-hot encode the categorical columns and standardise the numerical ones.
pre = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ('num', StandardScaler(), num_cols),
])

# Candidate alpha values; RidgeCV chooses among them using 5-fold cross-validation.
alphas = [0.1, 1, 10, 50, 100, 250]
ridge = RidgeCV(alphas=alphas, cv=5)

# Chain the pre-processing and the model so they are fitted and applied as one unit.
pipe = Pipeline(steps=[
    ('prep', pre),
    ('model', ridge),
])

In [31]:
# Train/test split

# Hold out 20% of the rows for testing and train on the remaining 80%.
# shuffle=True reorders the rows randomly before splitting, and random_state
# pins that shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# Fit the whole pipeline (pre-processing + RidgeCV) on the training portion only.
pipe.fit(X_train, y_train)



In [33]:
# Predict on the held-out features with the fitted pipeline.
y_pred = pipe.predict(X_test)

# Alpha value selected by the cross-validated model.
chosen_alpha = pipe.named_steps['model'].alpha_
print(f"Chosen α (L2)      : {chosen_alpha}")

# R² of the predictions against the true test targets.
holdout_r2 = r2_score(y_test, y_pred)
print(f"R² on hold‑out set : {holdout_r2:.3f}")

# Mean absolute error of the predictions against the true test targets.
holdout_mae = mean_absolute_error(y_test, y_pred)
print(f"MAE on hold‑out    : ${holdout_mae:,.0f}")


Chosen α (L2)      : 100.0
R² on hold‑out set : 0.708
MAE on hold‑out    : $8,043


In [None]:
# Save the fitted pipeline (pre-processing + model) to disk.
# NOTE: the file is pickle-based, so only load it back from a trusted source.

joblib.dump(value=pipe, filename="ridge_treatment_cost_model.pkl")

['ridge_treatment_cost_model.pkl']