## 1. Imports & Configuration

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import (
    LabelEncoder, MinMaxScaler, StandardScaler, RobustScaler
)
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings(action="ignore")

# Show all columns and allow wide rows when frames are displayed.
pd.options.display.max_columns = None
pd.options.display.width = 1000


## 2. Load Dataset

In [2]:
# Read the raw CSV into `df`; later cells work on a copy of this frame.
DATA_PATH = "healthcare_dataset.csv"
df = pd.read_csv(DATA_PATH)

# Structural overview: dimensions, first rows, then dtypes / non-null counts.
print("Shape:", df.shape)
display(df.head(n=5))
df.info()

Shape: (55500, 15)


Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Bobby JacksOn,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal
1,LesLie TErRy,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,DaNnY sMitH,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal
3,andrEw waTtS,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.78241,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,adrIENNE bEll,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55500 entries, 0 to 55499
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Name                55500 non-null  object 
 1   Age                 55500 non-null  int64  
 2   Gender              55500 non-null  object 
 3   Blood Type          55500 non-null  object 
 4   Medical Condition   55500 non-null  object 
 5   Date of Admission   55500 non-null  object 
 6   Doctor              55500 non-null  object 
 7   Hospital            55500 non-null  object 
 8   Insurance Provider  55500 non-null  object 
 9   Billing Amount      55500 non-null  float64
 10  Room Number         55500 non-null  int64  
 11  Admission Type      55500 non-null  object 
 12  Discharge Date      55500 non-null  object 
 13  Medication          55500 non-null  object 
 14  Test Results        55500 non-null  object 
dtypes: float64(1), int64(2), object(12)
memory usage: 6.4

## 3. Data Cleaning

In [3]:
# Work on a copy so the raw frame `df` stays untouched.
df_clean = df.copy()

# Clean column names: trim, lower-case, then replace spaces with underscores
# (e.g. "Blood Type" -> "blood_type").
normalized_columns = df_clean.columns.str.strip().str.lower()
df_clean.columns = normalized_columns.str.replace(" ", "_")

# Clean names
def clean_name(name):
    """Return a person name with honorifics removed and tidy capitalisation.

    The name is lower-cased, stand-alone titles ("mr", "mrs", "ms", "dr",
    "phd", "md", each optionally followed by a period) are removed, runs of
    whitespace are collapsed, and the result is title-cased.

    Titles are matched only as whole words, so letters inside a real name are
    preserved ("Andrew" keeps its "dr", "Williams" keeps its "ms").

    Parameters
    ----------
    name : object
        Raw name value. Missing values (``None`` / ``NaN``) are returned
        unchanged; anything else is converted with ``str()``.

    Returns
    -------
    object
        The cleaned name as ``str``, or the original missing value.
    """
    import re  # local import so this cell does not depend on the import cell

    if pd.isna(name):
        return name
    titles = ["mr", "mrs", "ms", "dr", "phd", "md"]
    # \b on both sides restricts the match to whole words; \.? also consumes
    # a trailing period ("dr." / "dr").
    title_pattern = r"\b(?:" + "|".join(titles) + r")\b\.?"
    name = re.sub(title_pattern, " ", str(name).lower())
    # split()/join collapses the gaps left by removed titles.
    return " ".join(name.split()).title()

# Apply the name cleaner element-wise.
df_clean["name"] = df_clean["name"].apply(clean_name)

# Gender normalization: accepted spellings -> canonical label.
gender_map = {
    **dict.fromkeys(("Male", "M"), "Male"),
    **dict.fromkeys(("Female", "F"), "Female"),
}
# Trim + title-case first so "male", " MALE " and "m" all hit a key;
# values without a key become NaN after .map().
gender_normalized = df_clean["gender"].str.strip().str.title()
df_clean["gender"] = gender_normalized.map(gender_map)

# String standardization
text_cols = [
    "blood_type", "medical_condition", "doctor",
    "hospital", "insurance_provider", "admission_type",
    "medication", "test_results"
]

# Columns whose values are codes rather than words: title-casing would turn
# the blood type "AB+" into "Ab+", so these are upper-cased instead.
upper_cols = {"blood_type"}

for col in text_cols:
    # Remember which cells hold real values: astype(str) turns NaN into the
    # text "nan", which would otherwise survive as a fake category ("Nan")
    # and be invisible to the missing-value handling that follows.
    present = df_clean[col].notna()
    cleaned = df_clean[col].astype(str).str.strip()
    cleaned = cleaned.str.upper() if col in upper_cols else cleaned.str.title()
    # Restore NaN where the original cell was missing.
    df_clean[col] = cleaned.where(present)

# Date parsing: convert both date columns from text to datetime64.
for date_col in ("date_of_admission", "discharge_date"):
    df_clean[date_col] = pd.to_datetime(df_clean[date_col])

# Billing amount correction: negative amounts are replaced by their magnitude.
df_clean["billing_amount"] = df_clean["billing_amount"].abs()

## 4. Missing Value Handling

In [4]:
# Categorical → mode, Numerical → median
for col in df_clean.columns:
    if not df_clean[col].isnull().any():
        continue  # nothing to impute in this column

    if df_clean[col].dtype == "object":
        modes = df_clean[col].mode()
        if modes.empty:
            # Column is entirely missing: there is no mode to fill with, so
            # leave it as-is and let the count below report it.
            continue
        fill_value = modes[0]
    else:
        fill_value = df_clean[col].median()

    # Assign the result back instead of fillna(inplace=True) on df_clean[col]:
    # the in-place form acts on an intermediate object and does not update
    # df_clean under pandas Copy-on-Write.
    df_clean[col] = df_clean[col].fillna(fill_value)

print("Remaining missing values:", df_clean.isnull().sum().sum())


Remaining missing values: 0


## 5. Business Validation Rules

In [5]:
# Length of stay in whole days between admission and discharge.
stay_duration = df_clean["discharge_date"] - df_clean["date_of_admission"]
df_clean["length_of_stay"] = stay_duration.dt.days

# One boolean flag per rule.
df_clean["age_valid"] = df_clean["age"].between(0, 120)
df_clean["stay_valid"] = df_clean["length_of_stay"].ge(0)
df_clean["billing_valid"] = df_clean["billing_amount"].gt(0)

# Fraction of rules passed per row (1.0 = all three rules satisfied).
rule_cols = ["age_valid", "stay_valid", "billing_valid"]
df_clean["validation_score"] = df_clean[rule_cols].mean(axis=1)

df_clean["validation_score"].describe()


count    55500.0
mean         1.0
std          0.0
min          1.0
25%          1.0
50%          1.0
75%          1.0
max          1.0
Name: validation_score, dtype: float64

## 6. Encoding

In [6]:

# Label Encoding
le_cols = ["gender", "admission_type", "test_results"]

# Keep each fitted encoder so the integer codes written to the "<col>_enc"
# columns can be mapped back to labels later
# (label_encoders[col].classes_ / .inverse_transform).
label_encoders = {}
for col in le_cols:
    le = LabelEncoder()
    df_clean[col + "_enc"] = le.fit_transform(df_clean[col])
    label_encoders[col] = le

# One-hot Encoding
# drop_first=True drops the first level of each encoded column.
# Built from df_clean as it stands at this point, so columns added to
# df_clean afterwards are not part of df_encoded.
df_encoded = pd.get_dummies(
    df_clean,
    columns=["blood_type", "medical_condition", "insurance_provider", "medication"],
    drop_first=True
)

df_encoded.shape


(55500, 39)

## 7. Feature Scaling

In [7]:

num_cols = ["age", "billing_amount", "room_number", "length_of_stay"]

# Each scaler is fitted on the same unscaled columns and written to its own
# suffixed set of new columns, in this order.
scaler_variants = (
    ("_minmax", MinMaxScaler()),
    ("_zscore", StandardScaler()),
    ("_robust", RobustScaler()),
)
for suffix, scaler in scaler_variants:
    scaled_names = [c + suffix for c in num_cols]
    df_clean[scaled_names] = scaler.fit_transform(df_clean[num_cols])

# Summary of the standardised columns.
zscore_names = [c + "_zscore" for c in num_cols]
df_clean[zscore_names].describe()


Unnamed: 0,age_zscore,billing_amount_zscore,room_number_zscore,length_of_stay_zscore
count,55500.0,55500.0,55500.0,55500.0
mean,7.783964e-17,-1.705303e-16,2.365915e-16,-4.8649770000000004e-17
std,1.000009,1.000009,1.000009,1.000009
min,-1.966071,-1.797038,-1.736648,-1.675498
25%,-0.8437519,-0.865722,-0.8602315,-0.867139
50%,0.02349424,-0.0002245203,0.007507427,-0.05878027
75%,0.8397259,0.8642591,0.866569,0.8650582
max,1.91103,1.916057,1.725631,1.673417


## 8. Save Outputs

In [8]:
# Write both frames to CSV without the index column.
output_frames = {
    "healthcare_dataset_preprocessed.csv": df_clean,
    "healthcare_dataset_encoded.csv": df_encoded,
}
for out_path, frame in output_frames.items():
    frame.to_csv(out_path, index=False)

print("Files saved successfully.")

Files saved successfully.
