# Importing Libraries



In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()
import sklearn

# Data Reading

In [51]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [52]:
path= '/content/drive/MyDrive/Datasets/healthcare_dataset.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Bobby JacksOn,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal
1,LesLie TErRy,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,DaNnY sMitH,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal
3,andrEw waTtS,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.78241,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,adrIENNE bEll,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal


# Data Inspection

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55500 entries, 0 to 55499
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Name                55500 non-null  object 
 1   Age                 55500 non-null  int64  
 2   Gender              55500 non-null  object 
 3   Blood Type          55500 non-null  object 
 4   Medical Condition   55500 non-null  object 
 5   Date of Admission   55500 non-null  object 
 6   Doctor              55500 non-null  object 
 7   Hospital            55500 non-null  object 
 8   Insurance Provider  55500 non-null  object 
 9   Billing Amount      55500 non-null  float64
 10  Room Number         55500 non-null  int64  
 11  Admission Type      55500 non-null  object 
 12  Discharge Date      55500 non-null  object 
 13  Medication          55500 non-null  object 
 14  Test Results        55500 non-null  object 
dtypes: float64(1), int64(2), object(12)
memory usage: 6.4

In [54]:
df.duplicated().sum()

np.int64(534)

In [32]:
df.isna().sum()

Unnamed: 0,0
Name,0
Age,0
Gender,0
Blood Type,0
Medical Condition,0
Date of Admission,0
Doctor,0
Hospital,0
Insurance Provider,0
Billing Amount,0


In [33]:
df.describe()

Unnamed: 0,Age,Billing Amount,Room Number
count,55500.0,55500.0,55500.0
mean,51.539459,25539.316097,301.134829
std,19.602454,14211.454431,115.243069
min,13.0,-2008.49214,101.0
25%,35.0,13241.224652,202.0
50%,52.0,25538.069376,302.0
75%,68.0,37820.508436,401.0
max,89.0,52764.276736,500.0


In [34]:
df.describe(include='object')

Unnamed: 0,Name,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Admission Type,Discharge Date,Medication,Test Results
count,55500,55500,55500,55500,55500,55500,55500,55500,55500,55500,55500,55500
unique,49992,2,8,6,1827,40341,39876,5,3,1856,5,3
top,DAvId muNoZ,Male,A-,Arthritis,2024-03-16,Michael Smith,LLC Smith,Cigna,Elective,2020-03-15,Lipitor,Abnormal
freq,3,27774,6969,9308,50,27,44,11249,18655,53,11140,18627


# Cleaning Data


In [35]:
df.drop_duplicates(inplace=True)

In [36]:
df.duplicated().sum()

np.int64(0)

In [37]:
drop_cols = ["Name","Doctor","Hospital","Room Number"]

df.drop(columns=drop_cols, inplace=True)


# Handling outlier



In [38]:
df = df[df["Billing Amount"] >= 0]

In [39]:
Q1 = df["Billing Amount"].quantile(0.25)
Q3 = df["Billing Amount"].quantile(0.75)
IQR = Q3 - Q1

lower = Q1 - 1.5 * IQR
upper = Q3 + 1.5 * IQR

df = df[(df["Billing Amount"] >= lower) & (df["Billing Amount"] <= upper)]

# Feature engineering

In [40]:
df["Date of Admission"] = pd.to_datetime(df["Date of Admission"])
df["Discharge Date"] = pd.to_datetime(df["Discharge Date"])

df["Length_of_Stay"] = (df["Discharge Date"] - df["Date of Admission"]).dt.days

df.drop(columns=["Date of Admission", "Discharge Date"], inplace=True)

In [41]:
target = "Billing Amount"

X = df.drop(target, axis=1)
y = df[target]

# Preprocessing

In [42]:
target = "Billing Amount"

X = df.drop(target, axis=1)
y = df[target]

In [43]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

cat_cols = X.select_dtypes(include="object").columns
num_cols = X.select_dtypes(exclude="object").columns


In [44]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OrdinalEncoder(), cat_cols)
    ]
)

# Train-Test Split

In [45]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Training Models

# LinearRegression

In [46]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

LR_model = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", LinearRegression())
])

# Train
LR_model.fit(X_train, y_train)

# Predict
y_pred = LR_model.predict(X_test)

# Evaluate
print("Linear Regression Results:")
print("MAE :", mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R2  :", r2_score(y_test, y_pred))



Linear Regression Results:
MAE : 12267.204651659433
RMSE: 14168.830722144556
R2  : -0.0001488656142523137


# KNN

In [47]:
from sklearn.neighbors import KNeighborsRegressor

knn_model = Pipeline([
    ('prep', preprocessor),
    ('knn', KNeighborsRegressor(n_neighbors=5))  # default k = 5
])

knn_model.fit(X_train, y_train)

y_pred_knn = (knn_model.predict(X_test))

print("KNN Results:")
print("MAE:", mean_absolute_error(y_test, y_pred_knn))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_knn)))
print("R2 Score:", r2_score(y_test, y_pred_knn))

KNN Results:
MAE: 12825.419817492508
RMSE: 15320.57318265222
R2 Score: -0.1693557368448888


# Improve KNN with Hyperparameter Tuning

In [48]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'knn__n_neighbors': [3, 5, 7],
    'knn__weights': ['uniform'],
    'knn__p': [2]
}

grid = GridSearchCV(knn_model, param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

grid.fit(X_train, y_train)

print("Best Params:", grid.best_params_)

best_knn = grid.best_estimator_
y_pred_best = best_knn.predict(X_test)

print("MAE:", mean_absolute_error(y_test, y_pred_best))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_best)))
print("R2:", r2_score(y_test, y_pred_best))

Best Params: {'knn__n_neighbors': 7, 'knn__p': 2, 'knn__weights': 'uniform'}
MAE: 12670.385322047665
RMSE: 14985.029444293863
R2: -0.11869532845998854


# Compare Predictions vs Actuals

In [49]:
# plt.figure(figsize=(14,6))

# # Left: Linear Regression
# plt.subplot(1, 2, 1)
# plt.scatter(y_test, y_pred, alpha=0.6, color='blue')
# plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
# plt.xlabel("Actual Charges")
# plt.ylabel("Predicted Charges")
# plt.title("Linear Regression: Actual vs Predicted")

# # Right: KNN Regressor
# plt.subplot(1, 2, 2)
# plt.scatter(y_test, y_pred_knn, alpha=0.6, color='green')
# plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2)
# plt.xlabel("Actual Charges")
# plt.ylabel("Predicted Charges")
# plt.title("KNN Regressor: Actual vs Predicted")

# plt.tight_layout()
# plt.show()