# hospital data curation project
## phase 3: data cleaning and standardization

comprehensive data cleaning including:
- schema alignment and standardization
- missing value imputation
- duplicate removal
- outlier detection and handling
- data type corrections
- business rule validation

In [2]:
# import required libraries
import sys
import os
from pathlib import Path

# make the project's src/ directory (relative to the current working
# directory) importable from this notebook
notebook_dir = Path.cwd()
src_dir = notebook_dir.joinpath('src')
# prepend only when absent so re-running the cell does not stack duplicates
if not any(entry == str(src_dir) for entry in sys.path):
    sys.path.insert(0, str(src_dir))

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# import project modules
import config
import data_loader
import data_cleaner
import utils

# short notebook-level aliases for the project names used in the cells below

# directories taken from config
RAW_DATA_DIR, CLEANED_DATA_DIR, LOGS_DIR = (
    config.RAW_DATA_DIR,
    config.CLEANED_DATA_DIR,
    config.LOGS_DIR,
)

# loader and cleaner classes
DataLoader, DataCleaner = data_loader.DataLoader, data_cleaner.DataCleaner

# shared helpers from utils
setup_logging, print_section_header, save_dataframe = (
    utils.setup_logging,
    utils.print_section_header,
    utils.save_dataframe,
)


## 1. load raw datasets

In [3]:
# obtain the logger from the project's logging helper
logger = setup_logging()

# load the raw datasets from RAW_DATA_DIR through the project DataLoader
loader = DataLoader(data_dir=RAW_DATA_DIR)
datasets = loader.load_all_datasets()

# report which datasets are available and where cleaned files are written
print_section_header("data cleaning initialization")
loaded_names = list(datasets.keys())
print(f"datasets loaded: {loaded_names}")
print(f"output directory: {CLEANED_DATA_DIR}")

2025-11-10 17:47:19,322 - utils - INFO - successfully loaded patients.csv: 3000 rows, 7 columns
2025-11-10 17:47:19,324 - utils - INFO - loaded patients: 3000 rows
2025-11-10 17:47:19,324 - utils - INFO - loaded patients: 3000 rows
2025-11-10 17:47:19,342 - utils - INFO - successfully loaded visits.csv: 5000 rows, 7 columns
2025-11-10 17:47:19,346 - utils - INFO - loaded visits: 5000 rows
2025-11-10 17:47:19,342 - utils - INFO - successfully loaded visits.csv: 5000 rows, 7 columns
2025-11-10 17:47:19,346 - utils - INFO - loaded visits: 5000 rows
2025-11-10 17:47:19,373 - utils - INFO - successfully loaded diagnoses.csv: 8000 rows, 4 columns
2025-11-10 17:47:19,374 - utils - INFO - loaded diagnoses: 8000 rows
2025-11-10 17:47:19,373 - utils - INFO - successfully loaded diagnoses.csv: 8000 rows, 4 columns
2025-11-10 17:47:19,374 - utils - INFO - loaded diagnoses: 8000 rows
2025-11-10 17:47:19,395 - utils - INFO - successfully loaded medications.csv: 6000 rows, 6 columns
2025-11-10 17:47:


                          data cleaning initialization                          

datasets loaded: ['patients', 'visits', 'diagnoses', 'medications', 'staff', 'hospital_info']
output directory: d:\Github Desktop\Python\Hospital Data Curation\data\cleaned


## 2. clean patients dataset

In [4]:
# one DataCleaner instance is created here and reused by the cleaning cells
cleaner = DataCleaner()

# patients: report quality before and after cleaning, then save the result
if 'patients' in datasets:
    print_section_header("cleaning patients dataset")

    # hand the cleaner a copy rather than the frame stored in `datasets`
    patients_df = datasets['patients'].copy()
    print(f"initial shape: {patients_df.shape}")

    # quality snapshot of the raw frame
    print("\nbefore cleaning:")
    missing_before = patients_df.isna().sum().sum()
    print(f"  missing values: {missing_before}")
    duplicates_before = patients_df.duplicated().sum()
    print(f"  duplicate rows: {duplicates_before}")

    patients_clean = cleaner.clean_patients_data(patients_df)

    # same snapshot on the cleaned frame
    print("\nafter cleaning:")
    print(f"  final shape: {patients_clean.shape}")
    missing_after = patients_clean.isna().sum().sum()
    print(f"  missing values: {missing_after}")
    duplicates_after = patients_clean.duplicated().sum()
    print(f"  duplicate rows: {duplicates_after}")

    # write the cleaned frame to the cleaned-data directory
    patients_path = CLEANED_DATA_DIR / 'clean_patients.csv'
    save_dataframe(patients_clean, patients_path)

    # later cells read the cleaned version from `datasets`
    datasets['patients'] = patients_clean

2025-11-10 17:47:19,442 - utils - INFO - cleaning patients data...
2025-11-10 17:47:19,449 - utils - INFO - standardized gender values
2025-11-10 17:47:19,450 - utils - INFO - removed 0 duplicate patient records
2025-11-10 17:47:19,450 - utils - INFO - patients data cleaned: 3000 -> 3000 rows
2025-11-10 17:47:19,449 - utils - INFO - standardized gender values
2025-11-10 17:47:19,450 - utils - INFO - removed 0 duplicate patient records
2025-11-10 17:47:19,450 - utils - INFO - patients data cleaned: 3000 -> 3000 rows
2025-11-10 17:47:19,483 - root - INFO - saved dataframe to d:\Github Desktop\Python\Hospital Data Curation\data\cleaned\clean_patients.csv
2025-11-10 17:47:19,483 - root - INFO - saved dataframe to d:\Github Desktop\Python\Hospital Data Curation\data\cleaned\clean_patients.csv



                           cleaning patients dataset                            

initial shape: (3000, 7)

before cleaning:
  missing values: 0
  duplicate rows: 0

after cleaning:
  final shape: (3000, 7)
  missing values: 0
  duplicate rows: 0


## 3. clean visits dataset

In [5]:
# visits: report quality before and after cleaning, then save the result
if 'visits' in datasets:
    print_section_header("cleaning visits dataset")

    # hand the cleaner a copy rather than the frame stored in `datasets`
    visits_df = datasets['visits'].copy()
    print(f"initial shape: {visits_df.shape}")

    # quality snapshot of the raw frame
    print("\nbefore cleaning:")
    missing_before = visits_df.isna().sum().sum()
    print(f"  missing values: {missing_before}")
    duplicates_before = visits_df.duplicated().sum()
    print(f"  duplicate rows: {duplicates_before}")

    visits_clean = cleaner.clean_visits_data(visits_df)

    # same snapshot on the cleaned frame
    print("\nafter cleaning:")
    print(f"  final shape: {visits_clean.shape}")
    missing_after = visits_clean.isna().sum().sum()
    print(f"  missing values: {missing_after}")
    duplicates_after = visits_clean.duplicated().sum()
    print(f"  duplicate rows: {duplicates_after}")

    # length_of_stay is only summarized when the cleaned frame carries it
    if 'length_of_stay' in visits_clean.columns:
        stay = visits_clean['length_of_stay']
        print(f"\nlength of stay statistics:")
        print(f"  mean: {stay.mean():.2f} days")
        print(f"  median: {stay.median():.2f} days")
        print(f"  range: {stay.min()} - {stay.max()} days")

    # write the cleaned frame to the cleaned-data directory
    visits_path = CLEANED_DATA_DIR / 'clean_visits.csv'
    save_dataframe(visits_clean, visits_path)

    # later cells read the cleaned version from `datasets`
    datasets['visits'] = visits_clean

2025-11-10 17:47:19,504 - utils - INFO - cleaning visits data...
2025-11-10 17:47:19,525 - utils - INFO - removed 1254 records with discharge before admission
2025-11-10 17:47:19,533 - utils - INFO - removed 1265 records with invalid length of stay
2025-11-10 17:47:19,538 - utils - INFO - removed 0 duplicate visit records
2025-11-10 17:47:19,525 - utils - INFO - removed 1254 records with discharge before admission
2025-11-10 17:47:19,533 - utils - INFO - removed 1265 records with invalid length of stay
2025-11-10 17:47:19,538 - utils - INFO - removed 0 duplicate visit records
2025-11-10 17:47:19,540 - utils - INFO - visits data cleaned: 5000 -> 2481 rows
2025-11-10 17:47:19,540 - utils - INFO - visits data cleaned: 5000 -> 2481 rows
2025-11-10 17:47:19,564 - root - INFO - saved dataframe to d:\Github Desktop\Python\Hospital Data Curation\data\cleaned\clean_visits.csv
2025-11-10 17:47:19,564 - root - INFO - saved dataframe to d:\Github Desktop\Python\Hospital Data Curation\data\cleaned\


                            cleaning visits dataset                             

initial shape: (5000, 7)

before cleaning:
  missing values: 0
  duplicate rows: 0

after cleaning:
  final shape: (2481, 8)
  missing values: 0
  duplicate rows: 0

length of stay statistics:
  mean: 182.27 days
  median: 180.00 days
  range: 0 - 365 days


## 4. clean diagnoses dataset

In [6]:
# diagnoses: report quality before and after cleaning, summarize icd-10
# validation when available, then save the result
if 'diagnoses' in datasets:
    print_section_header("cleaning diagnoses dataset")

    # hand the cleaner a copy rather than the frame stored in `datasets`
    diagnoses_df = datasets['diagnoses'].copy()
    print(f"initial shape: {diagnoses_df.shape}")

    # display before cleaning
    print("\nbefore cleaning:")
    print(f"  missing values: {diagnoses_df.isnull().sum().sum()}")
    print(f"  duplicate rows: {diagnoses_df.duplicated().sum()}")

    # clean diagnoses data
    diagnoses_clean = cleaner.clean_diagnoses_data(diagnoses_df)

    # display after cleaning
    print("\nafter cleaning:")
    print(f"  final shape: {diagnoses_clean.shape}")
    print(f"  missing values: {diagnoses_clean.isnull().sum().sum()}")
    print(f"  duplicate rows: {diagnoses_clean.duplicated().sum()}")

    # the validity summary is only printed when the cleaned frame has the flag column
    if 'icd_code_valid' in diagnoses_clean.columns:
        valid_count = diagnoses_clean['icd_code_valid'].sum()
        total_count = len(diagnoses_clean)
        # guard the percentage: an empty cleaned frame would otherwise divide by zero
        valid_pct = valid_count / total_count * 100 if total_count else 0.0
        print(f"\nicd-10 code validation:")
        print(f"  valid codes: {valid_count}/{total_count} ({valid_pct:.2f}%)")

    # save cleaned data
    save_dataframe(diagnoses_clean, CLEANED_DATA_DIR / 'clean_diagnoses.csv')

    # later cells read the cleaned version from `datasets`
    datasets['diagnoses'] = diagnoses_clean

2025-11-10 17:47:19,584 - utils - INFO - cleaning diagnoses data...



                           cleaning diagnoses dataset                           

initial shape: (8000, 4)

before cleaning:
  missing values: 0
  duplicate rows: 0


2025-11-10 17:47:19,601 - utils - INFO - found 0 invalid icd-10 codes
2025-11-10 17:47:19,620 - utils - INFO - removed 2 duplicate diagnosis records
2025-11-10 17:47:19,620 - utils - INFO - diagnoses data cleaned: 8000 -> 7998 rows
2025-11-10 17:47:19,620 - utils - INFO - removed 2 duplicate diagnosis records
2025-11-10 17:47:19,620 - utils - INFO - diagnoses data cleaned: 8000 -> 7998 rows
2025-11-10 17:47:19,675 - root - INFO - saved dataframe to d:\Github Desktop\Python\Hospital Data Curation\data\cleaned\clean_diagnoses.csv
2025-11-10 17:47:19,675 - root - INFO - saved dataframe to d:\Github Desktop\Python\Hospital Data Curation\data\cleaned\clean_diagnoses.csv



after cleaning:
  final shape: (7998, 5)
  missing values: 0
  duplicate rows: 0

icd-10 code validation:
  valid codes: 7998/7998 (100.00%)


## 5. clean medications dataset

In [7]:
# medications: report quality before and after cleaning, then save the result
if 'medications' in datasets:
    print_section_header("cleaning medications dataset")

    # hand the cleaner a copy rather than the frame stored in `datasets`
    medications_df = datasets['medications'].copy()
    print(f"initial shape: {medications_df.shape}")

    # display before cleaning
    print("\nbefore cleaning:")
    print(f"  missing values: {medications_df.isnull().sum().sum()}")
    print(f"  duplicate rows: {medications_df.duplicated().sum()}")

    # clean medications data
    medications_clean = cleaner.clean_medications_data(medications_df)

    # display after cleaning
    print("\nafter cleaning:")
    print(f"  final shape: {medications_clean.shape}")
    print(f"  missing values: {medications_clean.isnull().sum().sum()}")
    print(f"  duplicate rows: {medications_clean.duplicated().sum()}")
    # guard the optional column the same way the visits and diagnoses cells do,
    # so a missing column skips the statistic instead of raising KeyError
    if 'medication_name' in medications_clean.columns:
        print(f"  unique medications: {medications_clean['medication_name'].nunique()}")

    # save cleaned data
    save_dataframe(medications_clean, CLEANED_DATA_DIR / 'clean_medications.csv')

    # later cells read the cleaned version from `datasets`
    datasets['medications'] = medications_clean

2025-11-10 17:47:19,706 - utils - INFO - cleaning medications data...
2025-11-10 17:47:19,721 - utils - INFO - removed 676 duplicate medication records
2025-11-10 17:47:19,722 - utils - INFO - medications data cleaned: 6000 -> 5324 rows
2025-11-10 17:47:19,721 - utils - INFO - removed 676 duplicate medication records
2025-11-10 17:47:19,722 - utils - INFO - medications data cleaned: 6000 -> 5324 rows
2025-11-10 17:47:19,749 - root - INFO - saved dataframe to d:\Github Desktop\Python\Hospital Data Curation\data\cleaned\clean_medications.csv
2025-11-10 17:47:19,749 - root - INFO - saved dataframe to d:\Github Desktop\Python\Hospital Data Curation\data\cleaned\clean_medications.csv



                          cleaning medications dataset                          

initial shape: (6000, 6)

before cleaning:
  missing values: 0
  duplicate rows: 0

after cleaning:
  final shape: (5324, 6)
  missing values: 0
  duplicate rows: 0
  unique medications: 5


## 6. clean staff dataset

In [8]:
# staff: report quality before and after cleaning, then save the result
if 'staff' in datasets:
    print_section_header("cleaning staff dataset")

    # hand the cleaner a copy rather than the frame stored in `datasets`
    staff_df = datasets['staff'].copy()
    print(f"initial shape: {staff_df.shape}")

    # quality snapshot of the raw frame
    print("\nbefore cleaning:")
    missing_before = staff_df.isna().sum().sum()
    print(f"  missing values: {missing_before}")
    duplicates_before = staff_df.duplicated().sum()
    print(f"  duplicate rows: {duplicates_before}")

    staff_clean = cleaner.clean_staff_data(staff_df)

    # same snapshot on the cleaned frame
    print("\nafter cleaning:")
    print(f"  final shape: {staff_clean.shape}")
    missing_after = staff_clean.isna().sum().sum()
    print(f"  missing values: {missing_after}")
    duplicates_after = staff_clean.duplicated().sum()
    print(f"  duplicate rows: {duplicates_after}")

    # write the cleaned frame to the cleaned-data directory
    staff_path = CLEANED_DATA_DIR / 'clean_staff.csv'
    save_dataframe(staff_clean, staff_path)

    # later cells read the cleaned version from `datasets`
    datasets['staff'] = staff_clean

2025-11-10 17:47:19,770 - utils - INFO - cleaning staff data...
2025-11-10 17:47:19,775 - utils - INFO - removed 0 duplicate staff records
2025-11-10 17:47:19,777 - utils - INFO - staff data cleaned: 500 -> 500 rows
2025-11-10 17:47:19,775 - utils - INFO - removed 0 duplicate staff records
2025-11-10 17:47:19,777 - utils - INFO - staff data cleaned: 500 -> 500 rows
2025-11-10 17:47:19,786 - root - INFO - saved dataframe to d:\Github Desktop\Python\Hospital Data Curation\data\cleaned\clean_staff.csv
2025-11-10 17:47:19,786 - root - INFO - saved dataframe to d:\Github Desktop\Python\Hospital Data Curation\data\cleaned\clean_staff.csv



                             cleaning staff dataset                             

initial shape: (500, 5)

before cleaning:
  missing values: 0
  duplicate rows: 0

after cleaning:
  final shape: (500, 5)
  missing values: 0
  duplicate rows: 0


## 7. clean hospital_info dataset

In [9]:
# hospital_info (optional): deduplicate inline, without the DataCleaner
if 'hospital_info' in datasets:
    print_section_header("cleaning hospital_info dataset")

    # work on a copy of the frame stored in `datasets`
    hospital_df = datasets['hospital_info'].copy()
    print(f"initial shape: {hospital_df.shape}")

    # the only cleaning step here is dropping fully identical rows;
    # missing values are not touched
    hospital_clean = hospital_df.drop_duplicates()
    print(f"final shape: {hospital_clean.shape}")

    # write the deduplicated frame to the cleaned-data directory
    hospital_path = CLEANED_DATA_DIR / 'clean_hospital_info.csv'
    save_dataframe(hospital_clean, hospital_path)

    # later cells read the cleaned version from `datasets`
    datasets['hospital_info'] = hospital_clean


                         cleaning hospital_info dataset                         

initial shape: (20, 5)
final shape: (20, 5)


2025-11-10 17:47:19,810 - root - INFO - saved dataframe to d:\Github Desktop\Python\Hospital Data Curation\data\cleaned\clean_hospital_info.csv


## 8. generate cleaning summary report

In [10]:
# print the per-dataset cleaning report collected by the shared cleaner,
# save it as csv, and print overall row totals
# NOTE(review): hospital_info is deduplicated without the DataCleaner in this
# notebook, so presumably it is not part of this report — confirm
print_section_header("cleaning summary report")

cleaning_report = cleaner.get_cleaning_report()
print(cleaning_report.to_string(index=False))

# save cleaning report; create the logs directory first because to_csv
# does not create missing parent directories
report_file = LOGS_DIR / 'cleaning_report.csv'
report_file.parent.mkdir(parents=True, exist_ok=True)
cleaning_report.to_csv(report_file, index=False)
print(f"\ncleaning report saved to: {report_file}")

# calculate overall statistics across all reported datasets
total_initial = cleaning_report['initial_rows'].sum()
total_final = cleaning_report['final_rows'].sum()
total_removed = cleaning_report['removed_rows'].sum()
# guard the percentage: a report totalling zero initial rows would otherwise
# divide by zero
removed_pct = total_removed / total_initial * 100 if total_initial else 0.0

print(f"\noverall statistics:")
print(f"  total initial rows: {total_initial}")
print(f"  total final rows: {total_final}")
print(f"  total removed rows: {total_removed} ({removed_pct:.2f}%)")


                            cleaning summary report                             

    dataset  initial_rows  final_rows  removed_rows
   patients          3000        3000             0
     visits          5000        2481          2519
  diagnoses          8000        7998             2
medications          6000        5324           676
      staff           500         500             0

cleaning report saved to: d:\Github Desktop\Python\Hospital Data Curation\logs\cleaning_report.csv

overall statistics:
  total initial rows: 22500
  total final rows: 19303
  total removed rows: 3197 (14.21%)


## 9. final data quality check

In [11]:
# print a per-dataset quality summary for everything currently in `datasets`
print_section_header("final data quality check")

bytes_per_mb = 1024 * 1024
for dataset_name, df in datasets.items():
    print(f"\n{dataset_name.upper()}:")

    # size of the frame
    row_count = len(df)
    print(f"  rows: {row_count}")
    column_count = len(df.columns)
    print(f"  columns: {column_count}")

    # remaining quality issues
    missing_total = df.isna().sum().sum()
    print(f"  missing values: {missing_total}")
    duplicate_total = df.duplicated().sum()
    print(f"  duplicate rows: {duplicate_total}")

    # deep=True makes pandas inspect object columns for their real footprint
    memory_bytes = df.memory_usage(deep=True).sum()
    print(f"  memory usage: {memory_bytes / bytes_per_mb:.2f} mb")


                            final data quality check                            


PATIENTS:
  rows: 3000
  columns: 7
  missing values: 0
  duplicate rows: 0
  memory usage: 1.48 mb

VISITS:
  rows: 2481
  columns: 8
  missing values: 0
  duplicate rows: 0
  memory usage: 0.81 mb

DIAGNOSES:
  rows: 7998
  columns: 5
  missing values: 0
  duplicate rows: 0
  memory usage: 2.11 mb

MEDICATIONS:
  rows: 5324
  columns: 6
  missing values: 0
  duplicate rows: 0
  memory usage: 2.02 mb

STAFF:
  rows: 500
  columns: 5
  missing values: 0
  duplicate rows: 0
  memory usage: 0.15 mb

HOSPITAL_INFO:
  rows: 20
  columns: 5
  missing values: 0
  duplicate rows: 0
  memory usage: 0.00 mb
  duplicate rows: 0
  memory usage: 2.02 mb

STAFF:
  rows: 500
  columns: 5
  missing values: 0
  duplicate rows: 0
  memory usage: 0.15 mb

HOSPITAL_INFO:
  rows: 20
  columns: 5
  missing values: 0
  duplicate rows: 0
  memory usage: 0.00 mb


## summary

data cleaning completed:
- ✓ patients dataset cleaned and standardized
- ✓ visits dataset cleaned with valid date ranges
- ✓ diagnoses dataset cleaned with icd-10 validation
- ✓ medications dataset cleaned and normalized
- ✓ staff dataset cleaned
- ✓ hospital_info dataset deduplicated
- ✓ all cleaned datasets saved to `data/cleaned/`

next phase: data integration and merging