# hospital data curation project
## phase 1: data ingestion and discovery

this notebook handles the initial data loading, validation, and metadata generation for hospital patient records.

In [1]:
# import required libraries
import sys
import os
from pathlib import Path

# make the project's src/ directory importable from this notebook;
# it is put at the front of sys.path, and only if it is not already listed
notebook_dir = Path(os.getcwd())
src_dir = notebook_dir.joinpath('src')
if not any(entry == str(src_dir) for entry in sys.path):
    sys.path.insert(0, str(src_dir))

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# import project modules
import config
import data_loader
import utils

# bind the project-module attributes this notebook refers to by short name
RAW_DATA_DIR, CLEANED_DATA_DIR, LOGS_DIR, DATA_FILES = (
    config.RAW_DATA_DIR,
    config.CLEANED_DATA_DIR,
    config.LOGS_DIR,
    config.DATA_FILES,
)
DataLoader = data_loader.DataLoader
setup_logging, print_section_header = utils.setup_logging, utils.print_section_header


## 1. initialize data loader

In [6]:
# obtain the logger from the project's logging helper
# NOTE(review): the captured output of later cells shows several log records
# twice — possibly a handler is attached more than once; confirm in utils
logger = setup_logging()

# create the loader, pointed at the raw data directory
loader = DataLoader(data_dir=RAW_DATA_DIR)

# banner, then where the data is read from and which datasets are expected
print_section_header("hospital data ingestion")
print(
    f"data directory: {RAW_DATA_DIR}",
    f"expected datasets: {[*DATA_FILES.keys()]}",
    sep="\n",
)


                            hospital data ingestion                             

data directory: d:\Github Desktop\Python\Hospital Data Curation\data\raw
expected datasets: ['patients', 'visits', 'diagnoses', 'medications', 'staff', 'hospital_info']


## 2. load all datasets

In [7]:
# read every hospital dataset through the loader
datasets = loader.load_all_datasets()

# summarise what was loaded: one line per dataset with its dimensions
print("\nloaded datasets:")
for dataset_name, frame in datasets.items():
    n_rows, n_cols = frame.shape
    print(f"  - {dataset_name}: {n_rows} rows, {n_cols} columns")

2025-11-10 17:30:18,385 - utils - INFO - successfully loaded patients.csv: 3000 rows, 7 columns
2025-11-10 17:30:18,392 - utils - INFO - loaded patients: 3000 rows
2025-11-10 17:30:18,392 - utils - INFO - loaded patients: 3000 rows
2025-11-10 17:30:18,418 - utils - INFO - successfully loaded visits.csv: 5000 rows, 7 columns
2025-11-10 17:30:18,420 - utils - INFO - loaded visits: 5000 rows
2025-11-10 17:30:18,418 - utils - INFO - successfully loaded visits.csv: 5000 rows, 7 columns
2025-11-10 17:30:18,420 - utils - INFO - loaded visits: 5000 rows
2025-11-10 17:30:18,454 - utils - INFO - successfully loaded diagnoses.csv: 8000 rows, 4 columns
2025-11-10 17:30:18,459 - utils - INFO - loaded diagnoses: 8000 rows
2025-11-10 17:30:18,454 - utils - INFO - successfully loaded diagnoses.csv: 8000 rows, 4 columns
2025-11-10 17:30:18,459 - utils - INFO - loaded diagnoses: 8000 rows
2025-11-10 17:30:18,479 - utils - INFO - successfully loaded medications.csv: 6000 rows, 6 columns
2025-11-10 17:30:


loaded datasets:
  - patients: 3000 rows, 7 columns
  - visits: 5000 rows, 7 columns
  - diagnoses: 8000 rows, 4 columns
  - medications: 6000 rows, 6 columns
  - staff: 500 rows, 5 columns
  - hospital_info: 20 rows, 5 columns


## 3. generate metadata report

In [8]:
# ask the loader for its metadata table covering the loaded datasets
metadata_df = loader.generate_metadata_report()

# show the full table as plain text, without the index column
print_section_header("metadata report")
print(metadata_df.to_string(index=False))

# write the same table to the logs directory as csv and report the location
metadata_file = LOGS_DIR.joinpath('metadata_report.csv')
metadata_df.to_csv(metadata_file, index=False)
print(f"\nmetadata report saved to: {metadata_file}")

2025-11-10 17:30:18,619 - utils - INFO - generated metadata report for 6 datasets



                                metadata report                                 

 dataset_name         file_name                                                                  file_path  rows  columns  memory_mb  total_missing  duplicates                                                             column_list
     patients      patients.csv      d:\Github Desktop\Python\Hospital Data Curation\data\raw\patients.csv  3000        7       1.47              0           0                        patient_id, name, dob, gender, contact_number...
       visits        visits.csv        d:\Github Desktop\Python\Hospital Data Curation\data\raw\visits.csv  5000        7       2.12              0           0 visit_id, patient_id, admission_date, discharge_date, admission_type...
    diagnoses     diagnoses.csv     d:\Github Desktop\Python\Hospital Data Curation\data\raw\diagnoses.csv  8000        4       2.05              0           0                           diagnosis_id, visit_id, icd_code, d

## 4. initial data inspection

In [9]:
# print a quick structural preview of every loaded dataset:
# shape, column names, dtypes, per-column null counts and the first rows
# NOTE(review): the preview prints raw patient rows (names, contact numbers);
# if the data is not synthetic, clear this output before sharing the notebook
for dataset_name, frame in datasets.items():
    print_section_header(f"{dataset_name} dataset preview")
    print(f"\nshape: {frame.shape}")
    print(f"\ncolumns: {[*frame.columns]}")
    print(f"\ndata types:\n{frame.dtypes}")
    print(f"\nmissing values:\n{frame.isna().sum()}")
    print(f"\nfirst 5 rows:\n{frame.head(5)}")
    # 80-character rule separating one dataset's preview from the next
    print(f"\n{'=' * 80}")


                            patients dataset preview                            


shape: (3000, 7)

columns: ['patient_id', 'name', 'dob', 'gender', 'contact_number', 'email', 'address']

data types:
patient_id        object
name              object
dob               object
gender            object
contact_number    object
email             object
address           object
dtype: object

missing values:
patient_id        0
name              0
dob               0
gender            0
contact_number    0
email             0
address           0
dtype: int64

first 5 rows:
  patient_id              name         dob gender   contact_number  \
0     P10000      Norma Fisher  1970-12-28      M  +1-728-588-5986   
1     P10001    Jorge Sullivan  2010-01-14      M     257-367-8090   
2     P10002   Elizabeth Woods  2004-06-17   Male    (628)962-6143   
3     P10003      Susan Wagner  1965-03-01  Other    (372)730-3040   
4     P10004  Peter Montgomery  2016-01-07      F  +1-058-378-3154   

   

## 5. data quality overview

In [10]:
# calculate data quality metrics: one summary row per loaded dataset
quality_metrics = []

for name, df in datasets.items():
    # scan for nulls once and reuse the count for both the absolute and
    # the percentage metric (the scan walks every cell of the frame)
    missing_cells = df.isnull().sum().sum()
    # total number of cells, i.e. rows * columns
    total_cells = df.size
    # an empty dataset has no cells; report 0.0 rather than dividing by
    # zero, which would put NaN into the saved report
    if total_cells:
        missing_percentage = round(missing_cells / total_cells * 100, 2)
    else:
        missing_percentage = 0.0

    metrics = {
        'dataset': name,
        'total_rows': len(df),
        'total_columns': len(df.columns),
        'missing_cells': missing_cells,
        'missing_percentage': missing_percentage,
        # rows that are exact duplicates of an earlier row
        'duplicate_rows': df.duplicated().sum(),
        # deep=True also counts the memory held by object (string) values
        'memory_mb': round(df.memory_usage(deep=True).sum() / 1024**2, 2)
    }
    quality_metrics.append(metrics)

# build the table in one step from the collected rows
quality_df = pd.DataFrame(quality_metrics)
print_section_header("data quality overview")
print(quality_df.to_string(index=False))

# save quality report next to the other logs
quality_file = LOGS_DIR / 'initial_quality_report.csv'
quality_df.to_csv(quality_file, index=False)
print(f"\nquality report saved to: {quality_file}")


                             data quality overview                              

      dataset  total_rows  total_columns  missing_cells  missing_percentage  duplicate_rows  memory_mb
     patients        3000              7              0                 0.0               0       1.47
       visits        5000              7              0                 0.0               0       2.12
    diagnoses        8000              4              0                 0.0               0       2.05
  medications        6000              6              0                 0.0               0       2.23
        staff         500              5              0                 0.0               0       0.15
hospital_info          20              5              0                 0.0               0       0.00

quality report saved to: d:\Github Desktop\Python\Hospital Data Curation\logs\initial_quality_report.csv


## 6. validate required columns

In [11]:
# define required columns for each dataset
# TODO(review): 'hospital_info' is loaded but has no entry here, so it is
# never checked — add its required columns once they are confirmed
required_columns = {
    'patients': ['patient_id'],
    'visits': ['visit_id', 'patient_id'],
    'diagnoses': ['visit_id', 'icd_code'],
    'medications': ['visit_id', 'medication_name'],
    'staff': ['staff_id']
}

# validate required columns; every dataset listed above gets a ✓ or ✗ line
print_section_header("required columns validation")
for dataset_name, req_cols in required_columns.items():
    if dataset_name not in datasets:
        # a dataset that was never loaded cannot satisfy its requirements;
        # report it as a failure instead of silently skipping it
        print(f"✗ {dataset_name}: dataset not loaded, expected columns {req_cols}")
        continue
    is_valid = loader.validate_required_columns(dataset_name, req_cols)
    status = "✓" if is_valid else "✗"
    print(f"{status} {dataset_name}: {req_cols}")

2025-11-10 17:30:18,938 - utils - INFO - patients has all required columns
2025-11-10 17:30:18,940 - utils - INFO - visits has all required columns
2025-11-10 17:30:18,940 - utils - INFO - visits has all required columns
2025-11-10 17:30:18,942 - utils - INFO - diagnoses has all required columns
2025-11-10 17:30:18,944 - utils - INFO - medications has all required columns
2025-11-10 17:30:18,946 - utils - INFO - staff has all required columns
2025-11-10 17:30:18,942 - utils - INFO - diagnoses has all required columns
2025-11-10 17:30:18,944 - utils - INFO - medications has all required columns
2025-11-10 17:30:18,946 - utils - INFO - staff has all required columns



                          required columns validation                           

✓ patients: ['patient_id']
✓ visits: ['visit_id', 'patient_id']
✓ diagnoses: ['visit_id', 'icd_code']
✓ medications: ['visit_id', 'medication_name']
✓ staff: ['staff_id']


## summary

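- all six datasets have been successfully loaded; required columns were validated for patients, visits, diagnoses, medications and staff (hospital_info has no required-column rule yet)
- metadata and quality reports have been generated
- data is ready for the profiling phase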