In [2]:
## Load Dataset

import pandas as pd
import numpy as np

df = pd.read_csv("/cleaned_hip_replacement_data.csv")

In [3]:

df.head()

Unnamed: 0,health_service_area,hospital_county,operating_certificate_number,facility_id,facility_name,age_group,zip_code_3_digits,gender,race,ethnicity,...,apr_severity_of_illness_code,apr_severity_of_illness_description,apr_risk_of_mortality,apr_medical_surgical_description,attending_provider_license_number,operating_provider_license_number,total_charges,total_costs,charge_to_cost_ratio,efficiency_index
0,western ny,allegany,228000,39,memorial hosp of wm f & gertrude f jones a/k/a...,50 to 69,148,f,white,not span/hispanic,...,2,moderate,minor,surgical,213053,213053,34289.25,16657.95,2.058432,1.5e-05
1,western ny,allegany,228000,39,memorial hosp of wm f & gertrude f jones a/k/a...,50 to 69,147,m,white,not span/hispanic,...,2,moderate,minor,surgical,213053,213053,30436.0,14703.62,2.069966,2.3e-05
2,western ny,allegany,228000,39,memorial hosp of wm f & gertrude f jones a/k/a...,50 to 69,147,m,white,not span/hispanic,...,1,minor,minor,surgical,213053,213053,28699.0,13903.3,2.064186,2.4e-05
3,western ny,allegany,228000,39,memorial hosp of wm f & gertrude f jones a/k/a...,50 to 69,148,m,white,not span/hispanic,...,1,minor,minor,surgical,213053,213053,18421.0,8258.43,2.230569,6.1e-05
4,western ny,allegany,228000,39,memorial hosp of wm f & gertrude f jones a/k/a...,50 to 69,148,m,white,not span/hispanic,...,2,moderate,minor,surgical,213053,213053,29040.0,13950.49,2.081647,2.4e-05


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23080 entries, 0 to 23079
Data columns (total 32 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   health_service_area                  23080 non-null  object 
 1   hospital_county                      23080 non-null  object 
 2   operating_certificate_number         23080 non-null  int64  
 3   facility_id                          23080 non-null  int64  
 4   facility_name                        23080 non-null  object 
 5   age_group                            23080 non-null  object 
 6   zip_code_3_digits                    23080 non-null  object 
 7   gender                               23080 non-null  object 
 8   race                                 23080 non-null  object 
 9   ethnicity                            23080 non-null  object 
 10  length_of_stay                       23080 non-null  int64  
 11  type_of_admission           

In [5]:
df.sum().isnull()

Unnamed: 0,0
health_service_area,False
hospital_county,False
operating_certificate_number,False
facility_id,False
facility_name,False
age_group,False
zip_code_3_digits,False
gender,False
race,False
ethnicity,False


In [6]:

df.describe()

Unnamed: 0,operating_certificate_number,facility_id,length_of_stay,discharge_year,ccs_diagnosis_code,ccs_procedure_code,apr_drg_code,apr_mdc_code,apr_severity_of_illness_code,attending_provider_license_number,operating_provider_license_number,total_charges,total_costs,charge_to_cost_ratio,efficiency_index
count,23080.0,23080.0,23080.0,23080.0,23080.0,23080.0,23080.0,23080.0,23080.0,23080.0,23080.0,23080.0,23080.0,23080.0,23080.0
mean,4304858.0,876.237348,2.269107,2016.0,204.888692,153.131109,301.0,8.0,1.579506,476525.5,443966.1,51921.713553,18029.95312,2.84194,3.5e-05
std,2412024.0,596.771273,0.866484,0.0,8.540616,2.498245,0.0,0.0,0.560098,4788595.0,4577340.0,25004.366546,5978.461075,0.811238,0.000105
min,101000.0,1.0,1.0,2016.0,1.0,58.0,301.0,8.0,1.0,92382.0,92382.0,458.0,64.45,0.66248,7e-06
25%,2701001.0,409.0,2.0,2016.0,203.0,153.0,301.0,8.0,1.0,170254.0,170239.0,30780.2675,13121.7025,2.317888,1.8e-05
50%,3950000.0,752.0,2.0,2016.0,203.0,153.0,301.0,8.0,2.0,206940.0,206940.0,47351.925,17306.585,2.76615,2.5e-05
75%,7002012.0,1447.0,3.0,2016.0,203.0,153.0,301.0,8.0,2.0,244799.0,244799.0,65836.265,22300.18,3.165175,4.3e-05
max,7004010.0,3376.0,4.0,2016.0,238.0,231.0,301.0,8.0,4.0,90272060.0,90636840.0,131375.09,38018.7,7.106284,0.015516


In [7]:
df.shape

(23080, 32)

In [8]:

## define target and problem type

target = "length_of_stay"

In [None]:
## The objective is to predict length of hospital stay, making this a regression problem

In [10]:
## Identify Feature Types

X = df.drop(columns=[target])

numerical_cols = X.select_dtypes(include=['int64','float64']).columns
categorical_cols = X.select_dtypes(include=['object','bool']).columns

print("numerical:",len(numerical_cols))
print("categorical:",len(categorical_cols))

numerical: 14
categorical: 17


In [None]:
## Features were categorized to apply appropriate transformations and scaling techniques

In [11]:
## remove irrelevant / ID cols

drop_cols = ['operating_certificate_number', 'facility_id']
df.drop(columns=drop_cols, inplace=True, errors='ignore')

In [None]:
## Identifier columns do not add predictive values and may cause data leakage.

In [12]:
## Handle multicollinearity

numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

id_cols = ['facility_id', 'operating_certificate_number']
numerical_cols = numerical_cols.drop([col for col in id_cols if col in numerical_cols])

corr = df[numerical_cols].corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape),k=1).astype(bool))

high_corr_cols = [col for col in upper.columns if any(upper[col] > 0.85)]
df.drop(columns=high_corr_cols, inplace=True)

In [None]:
## Highly correlated features were removed to reduce redundancy.

In [13]:
## Feature Importance (Selection)

from sklearn.ensemble import RandomForestRegressor

X = df.drop(columns=[target])
y = df[target]

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X.select_dtypes(include=['int64','float64']),y)

importance = pd.Series(
    rf.feature_importances_,
    index = X.select_dtypes(include=['int64','float64']).columns
).sort_values(ascending=False)

importance.head(10)

Unnamed: 0,0
efficiency_index,0.785406
total_costs,0.214207
charge_to_cost_ratio,0.00016
total_charges,0.00012
operating_provider_license_number,4.2e-05
attending_provider_license_number,4.2e-05
apr_severity_of_illness_code,1e-05
ccs_diagnosis_code,9e-06
ccs_procedure_code,3e-06
discharge_year,0.0


In [None]:
## Feature importance helped identify the most influential predictors of length of stay.

In [15]:
## Create New Features

# Age severity score
age_group_mapping = {
    '18 to 29': 1,
    '30 to 49': 2,
    '50 to 69': 3,
    '70 or older': 4
}
df['age_score'] = df['age_group'].map(age_group_mapping)

# risk score
# The 'mortality_score' column is created in a later step after one-hot encoding.
# This calculation will fail until 'mortality_score' is available, as it also depends on encoded columns.
# df['risk_score'] = (
#     df['age_score'] * 0.5 +
#     df['mortality_score'] * 0.5
# )

# A composite risk score was created to summarize patient health severity.

# Length of Stay Category

df['los_category'] = pd.cut(
    df['length_of_stay'],
    bins=[0,2,5,10,50],
    labels=['Short','Medium','Long','Very Long']
)

In [16]:
## Feature Transformation

df['log_length_of_stay'] = np.log1p(df['length_of_stay'])

In [None]:
## Log transformation reduces skewness and improves model stability.

In [17]:
## Encode Categorical Variables

df = pd.get_dummies(df, drop_first=True)

# Categorical variables were converted into numerical format using one-hot encoding.

In [18]:
## Feature Scaling

from sklearn.preprocessing import RobustScaler

num_cols = df.select_dtypes(include=['int64','float64']).columns
scaler = RobustScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

In [None]:
## RobustScaler was chosen to handle outliers common in healthcare data.

In [19]:
df['mortality_score'] = (
    df['apr_risk_of_mortality_minor'] * 1 +
    df['apr_risk_of_mortality_moderate'] * 2 +
    df['apr_risk_of_mortality_major'] * 3
)

In [20]:
df['mortality_score'].isna().sum()

np.int64(0)

In [22]:
los_map = {
    'Short': 1,
    'Medium': 2,
    'Long': 3,
    'Very Long': 4
}

df['los_category_encoded'] = (
    df.get('los_category_Short', 0) * los_map['Short'] +
    df.get('los_category_Medium', 0) * los_map['Medium'] +
    df.get('los_category_Long', 0) * los_map['Long'] +
    df.get('los_category_Very Long', 0) * los_map['Very Long']
)

In [23]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')  # median works well for ordinal
df['los_category_encoded'] = imputer.fit_transform(df[['los_category_encoded']])

In [27]:
y = df[target] # Define the target variable 'y'

# Identify columns to drop from X
cols_to_drop_from_X = [target, 'log_length_of_stay']

# Get all one-hot encoded 'los_category_' columns, but exclude 'los_category_encoded' itself from this list
one_hot_los_cols_to_drop = [col for col in df.columns if col.startswith('los_category_') and col != 'los_category_encoded']
cols_to_drop_from_X.extend(one_hot_los_cols_to_drop)

# Create X by dropping these columns
# The 'los_category_encoded' column will remain in X
X = df.drop(columns=cols_to_drop_from_X, errors='ignore')

# Rename 'los_category_encoded' to 'los_category' in X, if it exists
if 'los_category_encoded' in X.columns:
    X = X.rename(columns={'los_category_encoded': 'los_category'})

In [30]:
## Feature Reduction

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer # Import SimpleImputer

# Ensure X contains only numerical columns before scaling
X_numeric = X.select_dtypes(include=np.number)

# Impute NaN values in X_numeric
imputer = SimpleImputer(strategy='median')
X_numeric_imputed = imputer.fit_transform(X_numeric)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_numeric_imputed) # Use the imputed data here

pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# PCA reduced dimensionality while preserving 95% of variance.

In [31]:

## Final Dataset Creation

final_df = pd.concat(
    [pd.DataFrame(X_pca), y.reset_index(drop=True)],
    axis=1
)

In [32]:
final_df.to_csv("final_feature_engineered_dataset.csv", index=False)