# hospital data curation project
## phase 5: data transformation and feature engineering

create analytical features:
- derive length of stay and readmission flags
- create age groups and categorical buckets
- one-hot encode categorical variables
- engineer features for predictive modeling

In [1]:
# import required libraries
import sys
import os
from pathlib import Path

# add src directory to python path
notebook_dir = Path(os.getcwd())
src_dir = notebook_dir / 'src'
if str(src_dir) not in sys.path:
    sys.path.insert(0, str(src_dir))

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# import project modules
import config
import utils

# use imported modules
PREPROCESSED_DATA_DIR = config.PREPROCESSED_DATA_DIR
READMISSION_THRESHOLD_DAYS = config.READMISSION_THRESHOLD_DAYS
AGE_GROUPS = config.AGE_GROUPS
setup_logging = utils.setup_logging
print_section_header = utils.print_section_header
save_dataframe = utils.save_dataframe
load_dataframe = utils.load_dataframe
create_age_groups = utils.create_age_groups


## 1. load preprocessed master dataset

In [2]:
# setup logging
logger = setup_logging()

# load master dataset
print_section_header("loading master dataset")

master_df = load_dataframe(PREPROCESSED_DATA_DIR / 'master_patient_visits.csv')

print(f"master dataset loaded:")
print(f"  rows: {len(master_df)}")
print(f"  columns: {len(master_df.columns)}")
print(f"\ninitial columns: {list(master_df.columns)}")

2025-11-10 17:48:49,103 - root - INFO - loaded dataframe from d:\Github Desktop\Python\Hospital Data Curation\data\preprocessed\master_patient_visits.csv: 2481 rows, 24 columns



                             loading master dataset                             

master dataset loaded:
  rows: 2481
  columns: 24

initial columns: ['patient_id', 'name', 'dob', 'gender', 'contact_number', 'email', 'address', 'visit_id', 'admission_date', 'discharge_date', 'admission_type', 'hospital_unit_id', 'attending_physician_id', 'length_of_stay', 'diagnosis_count', 'primary_diagnosis', 'all_diagnoses', 'medication_count', 'all_medications', 'staff_id', 'name_staff', 'designation', 'unit_id', 'joining_date']


## 2. derive length of stay

In [3]:
# calculate length of stay if not already present
print_section_header("deriving length of stay")

if 'admission_date' in master_df.columns and 'discharge_date' in master_df.columns:
    # ensure dates are in datetime format
    master_df['admission_date'] = pd.to_datetime(master_df['admission_date'], errors='coerce')
    master_df['discharge_date'] = pd.to_datetime(master_df['discharge_date'], errors='coerce')
    
    # calculate los if not present
    if 'length_of_stay' not in master_df.columns:
        master_df['length_of_stay'] = (master_df['discharge_date'] - master_df['admission_date']).dt.days
    
    print("length of stay statistics:")
    print(f"  mean: {master_df['length_of_stay'].mean():.2f} days")
    print(f"  median: {master_df['length_of_stay'].median():.2f} days")
    print(f"  std: {master_df['length_of_stay'].std():.2f} days")
    print(f"  min: {master_df['length_of_stay'].min()} days")
    print(f"  max: {master_df['length_of_stay'].max()} days")
else:
    print("date columns not found for los calculation")


                            deriving length of stay                             

length of stay statistics:
  mean: 182.27 days
  median: 180.00 days
  std: 105.07 days
  min: 0 days
  max: 365 days


## 3. identify readmissions

In [4]:
# identify readmissions within threshold period
print_section_header("identifying readmissions")

if 'patient_id' in master_df.columns and 'admission_date' in master_df.columns:
    # sort by patient and admission date
    master_df = master_df.sort_values(['patient_id', 'admission_date'])
    
    # calculate days since last admission for same patient
    master_df['days_since_last_admission'] = master_df.groupby('patient_id')['admission_date'].diff().dt.days
    
    # flag readmissions (within threshold days)
    master_df['is_readmitted'] = (
        (master_df['days_since_last_admission'] <= READMISSION_THRESHOLD_DAYS) & 
        (master_df['days_since_last_admission'].notna())
    ).astype(int)
    
    readmission_count = master_df['is_readmitted'].sum()
    readmission_rate = (readmission_count / len(master_df)) * 100
    
    print(f"readmission analysis (within {READMISSION_THRESHOLD_DAYS} days):")
    print(f"  total readmissions: {readmission_count}")
    print(f"  readmission rate: {readmission_rate:.2f}%")
    print(f"  unique patients with readmissions: {master_df[master_df['is_readmitted']==1]['patient_id'].nunique()}")
else:
    print("required columns not found for readmission analysis")


                            identifying readmissions                            

readmission analysis (within 30 days):
  total readmissions: 117
  readmission rate: 4.72%
  unique patients with readmissions: 108


## 4. create age groups

In [5]:
# create age group categories
print_section_header("creating age groups")

if 'age' in master_df.columns:
    master_df['age_group'] = create_age_groups(master_df['age'], AGE_GROUPS)
    
    print("age group distribution:")
    age_dist = master_df['age_group'].value_counts().sort_index()
    for group, count in age_dist.items():
        pct = (count / len(master_df)) * 100
        print(f"  {group}: {count} ({pct:.2f}%)")
else:
    print("age column not found")


                              creating age groups                               

age column not found


## 5. create length of stay categories

In [6]:
# categorize length of stay
print_section_header("categorizing length of stay")

if 'length_of_stay' in master_df.columns:
    # create los categories
    def categorize_los(days):
        if days <= 1:
            return 'short_stay'
        elif days <= 7:
            return 'medium_stay'
        elif days <= 14:
            return 'long_stay'
        else:
            return 'extended_stay'
    
    master_df['los_category'] = master_df['length_of_stay'].apply(categorize_los)
    
    print("length of stay category distribution:")
    los_dist = master_df['los_category'].value_counts()
    for category, count in los_dist.items():
        pct = (count / len(master_df)) * 100
        print(f"  {category}: {count} ({pct:.2f}%)")


                          categorizing length of stay                           

length of stay category distribution:
  extended_stay: 2387 (96.21%)
  long_stay: 45 (1.81%)
  medium_stay: 40 (1.61%)
  short_stay: 9 (0.36%)


## 6. create high-risk patient flag

In [7]:
# identify high-risk patients based on multiple criteria
print_section_header("identifying high-risk patients")

# define high-risk criteria
risk_factors = []

# factor 1: multiple diagnoses
if 'diagnosis_count' in master_df.columns:
    high_diagnosis = master_df['diagnosis_count'] >= 3
    risk_factors.append(high_diagnosis)
    print(f"patients with 3+ diagnoses: {high_diagnosis.sum()}")

# factor 2: long length of stay
if 'length_of_stay' in master_df.columns:
    long_stay = master_df['length_of_stay'] > 10
    risk_factors.append(long_stay)
    print(f"patients with los > 10 days: {long_stay.sum()}")

# factor 3: multiple medications
if 'medication_count' in master_df.columns:
    high_meds = master_df['medication_count'] >= 5
    risk_factors.append(high_meds)
    print(f"patients with 5+ medications: {high_meds.sum()}")

# factor 4: elderly patients
if 'age' in master_df.columns:
    elderly = master_df['age'] >= 65
    risk_factors.append(elderly)
    print(f"patients aged 65+: {elderly.sum()}")

# combine risk factors (at least 2 factors present)
if len(risk_factors) > 0:
    risk_score = sum(risk_factors)
    master_df['is_high_risk'] = (risk_score >= 2).astype(int)
    
    high_risk_count = master_df['is_high_risk'].sum()
    high_risk_rate = (high_risk_count / len(master_df)) * 100
    
    print(f"\ntotal high-risk patients: {high_risk_count} ({high_risk_rate:.2f}%)")


                         identifying high-risk patients                         

patients with 3+ diagnoses: 555
patients with los > 10 days: 2413
patients with 5+ medications: 0

total high-risk patients: 544 (21.93%)


## 7. one-hot encode categorical variables

In [8]:
# one-hot encode categorical variables for modeling
print_section_header("one-hot encoding categorical variables")

# identify categorical columns to encode
categorical_columns = []

if 'gender' in master_df.columns:
    categorical_columns.append('gender')

if 'age_group' in master_df.columns:
    categorical_columns.append('age_group')

if 'los_category' in master_df.columns:
    categorical_columns.append('los_category')

if 'admission_type' in master_df.columns:
    categorical_columns.append('admission_type')

print(f"encoding columns: {categorical_columns}")

# create encoded dataframe
encoded_df = master_df.copy()

for col in categorical_columns:
    if col in encoded_df.columns:
        # create dummy variables
        dummies = pd.get_dummies(encoded_df[col], prefix=col, drop_first=False)
        encoded_df = pd.concat([encoded_df, dummies], axis=1)
        print(f"  {col}: created {len(dummies.columns)} dummy variables")

print(f"\ntotal columns after encoding: {len(encoded_df.columns)}")


                     one-hot encoding categorical variables                     

encoding columns: ['gender', 'los_category', 'admission_type']
  gender: created 4 dummy variables
  los_category: created 4 dummy variables
  admission_type: created 4 dummy variables

total columns after encoding: 40


## 8. normalize numerical features

In [9]:
# normalize numerical features for modeling
print_section_header("creating normalized features")

from sklearn.preprocessing import StandardScaler

numerical_features = []

# identify numerical columns
for col in ['age', 'length_of_stay', 'diagnosis_count', 'medication_count']:
    if col in master_df.columns:
        numerical_features.append(col)

print(f"numerical features to normalize: {numerical_features}")

if len(numerical_features) > 0:
    scaler = StandardScaler()
    
    # create normalized versions
    for col in numerical_features:
        master_df[f'{col}_normalized'] = scaler.fit_transform(master_df[[col]])
        print(f"  {col}_normalized created")


                          creating normalized features                          

numerical features to normalize: ['length_of_stay', 'diagnosis_count', 'medication_count']
  length_of_stay_normalized created
  diagnosis_count_normalized created
  medication_count_normalized created
numerical features to normalize: ['length_of_stay', 'diagnosis_count', 'medication_count']
  length_of_stay_normalized created
  diagnosis_count_normalized created
  medication_count_normalized created


## 9. create temporal features

In [10]:
# extract temporal features from dates
print_section_header("extracting temporal features")

if 'admission_date' in master_df.columns:
    master_df['admission_year'] = master_df['admission_date'].dt.year
    master_df['admission_month'] = master_df['admission_date'].dt.month
    master_df['admission_day_of_week'] = master_df['admission_date'].dt.dayofweek
    master_df['admission_quarter'] = master_df['admission_date'].dt.quarter
    
    # weekend admission flag
    master_df['is_weekend_admission'] = (master_df['admission_day_of_week'] >= 5).astype(int)
    
    print("temporal features created:")
    print("  - admission_year")
    print("  - admission_month")
    print("  - admission_day_of_week")
    print("  - admission_quarter")
    print("  - is_weekend_admission")
    
    print(f"\nweekend admissions: {master_df['is_weekend_admission'].sum()} ({master_df['is_weekend_admission'].sum()/len(master_df)*100:.2f}%)")


                          extracting temporal features                          

temporal features created:
  - admission_year
  - admission_month
  - admission_day_of_week
  - admission_quarter
  - is_weekend_admission

weekend admissions: 712 (28.70%)


## 10. save transformed datasets

In [11]:
# save transformed datasets
print_section_header("saving transformed datasets")

# save main transformed dataset
save_dataframe(master_df, PREPROCESSED_DATA_DIR / 'transformed_master_dataset.csv')
print(f"✓ transformed master dataset saved: {len(master_df)} rows, {len(master_df.columns)} columns")

# save encoded dataset (for ML)
save_dataframe(encoded_df, PREPROCESSED_DATA_DIR / 'encoded_dataset_for_ml.csv')
print(f"✓ encoded dataset saved: {len(encoded_df)} rows, {len(encoded_df.columns)} columns")

# create and save feature-ready dataset (only relevant features)
ml_features = [col for col in master_df.columns if any(x in col for x in [
    'age', 'length_of_stay', 'diagnosis_count', 'medication_count', 
    'is_readmitted', 'is_high_risk', '_normalized'
])]

# add target columns if they exist
for target_col in ['is_readmitted', 'is_high_risk', 'length_of_stay']:
    if target_col in master_df.columns and target_col not in ml_features:
        ml_features.append(target_col)

ml_ready_df = master_df[ml_features].copy()
save_dataframe(ml_ready_df, PREPROCESSED_DATA_DIR / 'ml_ready_features.csv')
print(f"✓ ml-ready features saved: {len(ml_ready_df)} rows, {len(ml_ready_df.columns)} columns")

2025-11-10 17:48:51,094 - root - INFO - saved dataframe to d:\Github Desktop\Python\Hospital Data Curation\data\preprocessed\transformed_master_dataset.csv
2025-11-10 17:48:51,128 - root - INFO - saved dataframe to d:\Github Desktop\Python\Hospital Data Curation\data\preprocessed\encoded_dataset_for_ml.csv
2025-11-10 17:48:51,144 - root - INFO - saved dataframe to d:\Github Desktop\Python\Hospital Data Curation\data\preprocessed\ml_ready_features.csv
2025-11-10 17:48:51,128 - root - INFO - saved dataframe to d:\Github Desktop\Python\Hospital Data Curation\data\preprocessed\encoded_dataset_for_ml.csv
2025-11-10 17:48:51,144 - root - INFO - saved dataframe to d:\Github Desktop\Python\Hospital Data Curation\data\preprocessed\ml_ready_features.csv



                          saving transformed datasets                           

✓ transformed master dataset saved: 2481 rows, 36 columns
✓ encoded dataset saved: 2481 rows, 40 columns
✓ ml-ready features saved: 2481 rows, 8 columns


## 11. feature engineering summary

In [12]:
# generate feature engineering summary
print_section_header("feature engineering summary")

print("engineered features:")
print("\nderived features:")
print("  ✓ length_of_stay")
print("  ✓ is_readmitted")
print("  ✓ days_since_last_admission")
print("  ✓ age_group")
print("  ✓ los_category")
print("  ✓ is_high_risk")

print("\ntemporal features:")
print("  ✓ admission_year")
print("  ✓ admission_month")
print("  ✓ admission_day_of_week")
print("  ✓ admission_quarter")
print("  ✓ is_weekend_admission")

print("\nnormalized features:")
for col in numerical_features:
    print(f"  ✓ {col}_normalized")

print("\none-hot encoded categories:")
for col in categorical_columns:
    print(f"  ✓ {col}")

print(f"\ntotal features in transformed dataset: {len(master_df.columns)}")
print(f"total features in encoded dataset: {len(encoded_df.columns)}")
print(f"ml-ready features: {len(ml_ready_df.columns)}")


                          feature engineering summary                           

engineered features:

derived features:
  ✓ length_of_stay
  ✓ is_readmitted
  ✓ days_since_last_admission
  ✓ age_group
  ✓ los_category
  ✓ is_high_risk

temporal features:
  ✓ admission_year
  ✓ admission_month
  ✓ admission_day_of_week
  ✓ admission_quarter
  ✓ is_weekend_admission

normalized features:
  ✓ length_of_stay_normalized
  ✓ diagnosis_count_normalized
  ✓ medication_count_normalized

one-hot encoded categories:
  ✓ gender
  ✓ los_category
  ✓ admission_type

total features in transformed dataset: 36
total features in encoded dataset: 40
ml-ready features: 8


## summary

data transformation completed:
- ✓ length of stay calculated
- ✓ readmission flags generated
- ✓ age groups and categories created
- ✓ high-risk patient identification
- ✓ categorical variables encoded
- ✓ numerical features normalized
- ✓ temporal features extracted
- ✓ ml-ready datasets created

next phase: data validation and quality assurance