# hospital data curation project
## phase 6: data validation and quality assurance

This phase runs a comprehensive validation of the cleaned and transformed datasets using:
- business rule validation
- data quality checks
- referential integrity verification
- automated validation reporting

In [12]:
# import required libraries
import sys
import os
from pathlib import Path

# make the src/ folder under the current working directory importable
notebook_dir = Path.cwd()
src_dir = notebook_dir.joinpath('src')
src_path = str(src_dir)
if src_path not in sys.path:
    # index 0 so this folder is searched before every other sys.path entry
    sys.path.insert(0, src_path)

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# import project modules
import config
import validators
import utils

# bind the project names used throughout this notebook to short local aliases

# path constants taken from config
CLEANED_DATA_DIR, PREPROCESSED_DATA_DIR, LOGS_DIR = (
    config.CLEANED_DATA_DIR,
    config.PREPROCESSED_DATA_DIR,
    config.LOGS_DIR,
)

# validator class taken from validators
DataValidator = validators.DataValidator

# logging, display and loading helpers taken from utils
setup_logging, print_section_header, load_dataframe = (
    utils.setup_logging,
    utils.print_section_header,
    utils.load_dataframe,
)


## 1. load datasets for validation

In [13]:
# configure logging before anything is loaded
logger = setup_logging()

print_section_header("loading datasets for validation")

# cleaned tables, read in the order patients -> visits -> diagnoses
patients_df, visits_df, diagnoses_df = [
    load_dataframe(CLEANED_DATA_DIR / csv_name)
    for csv_name in ('clean_patients.csv', 'clean_visits.csv', 'clean_diagnoses.csv')
]

# transformed master table from the preprocessed-data directory
master_path = PREPROCESSED_DATA_DIR / 'transformed_master_dataset.csv'
master_df = load_dataframe(master_path)

print("datasets loaded for validation")

2025-11-10 17:51:44,488 - root - INFO - loaded dataframe from d:\Github Desktop\Python\Hospital Data Curation\data\cleaned\clean_patients.csv: 3000 rows, 7 columns
2025-11-10 17:51:44,499 - root - INFO - loaded dataframe from d:\Github Desktop\Python\Hospital Data Curation\data\cleaned\clean_visits.csv: 2481 rows, 8 columns
2025-11-10 17:51:44,499 - root - INFO - loaded dataframe from d:\Github Desktop\Python\Hospital Data Curation\data\cleaned\clean_visits.csv: 2481 rows, 8 columns


2025-11-10 17:51:44,524 - root - INFO - loaded dataframe from d:\Github Desktop\Python\Hospital Data Curation\data\cleaned\clean_diagnoses.csv: 7998 rows, 5 columns
2025-11-10 17:51:44,564 - root - INFO - loaded dataframe from d:\Github Desktop\Python\Hospital Data Curation\data\preprocessed\transformed_master_dataset.csv: 2481 rows, 36 columns
2025-11-10 17:51:44,564 - root - INFO - loaded dataframe from d:\Github Desktop\Python\Hospital Data Curation\data\preprocessed\transformed_master_dataset.csv: 2481 rows, 36 columns



                        loading datasets for validation                         

datasets loaded for validation


## 2. validate patients dataset

In [14]:
# create the validator instance; the later validation cells reuse it
validator = DataValidator()

# run the patients checks
print_section_header("validating patients dataset")
patients_results = validator.validate_patients(patients_df)

# list every check: marker and test name first, indented message underneath
print("\nvalidation results:")
for outcome in patients_results['checks']:
    if outcome['passed']:
        marker = "✓ PASS"
    else:
        marker = "✗ FAIL"
    print(f"{marker} | {outcome['test']}")
    print(f"       {outcome['message']}\n")

2025-11-10 17:51:44,588 - utils - INFO - validating patients data...
2025-11-10 17:51:44,591 - utils - INFO - patients validation completed: 2/2 checks passed
2025-11-10 17:51:44,591 - utils - INFO - patients validation completed: 2/2 checks passed



                          validating patients dataset                           


validation results:
✓ PASS | unique_patient_id
       all patient ids are unique

✓ PASS | no_missing_patient_id
       no missing values in patient_id



## 3. validate visits dataset

In [15]:
# run the visits checks with the shared validator
print_section_header("validating visits dataset")
visits_results = validator.validate_visits(visits_df)

# list every check: marker and test name first, indented message underneath
print("\nvalidation results:")
for outcome in visits_results['checks']:
    if outcome['passed']:
        marker = "✓ PASS"
    else:
        marker = "✗ FAIL"
    print(f"{marker} | {outcome['test']}")
    print(f"       {outcome['message']}\n")

2025-11-10 17:51:44,602 - utils - INFO - validating visits data...
2025-11-10 17:51:44,602 - utils - INFO - visits validation completed: 3/3 checks passed
2025-11-10 17:51:44,602 - utils - INFO - visits validation completed: 3/3 checks passed



                           validating visits dataset                            


validation results:
✓ PASS | unique_visit_id
       all visit ids are unique

✓ PASS | discharge_after_admission
       all discharge dates are after admission dates

✓ PASS | valid_length_of_stay
       all length of stay values are between 0 and 365 days



## 4. validate diagnoses dataset

In [16]:
# run the diagnoses checks with the shared validator
print_section_header("validating diagnoses dataset")
diagnoses_results = validator.validate_diagnoses(diagnoses_df)

# list every check: marker and test name first, indented message underneath
print("\nvalidation results:")
for outcome in diagnoses_results['checks']:
    if outcome['passed']:
        marker = "✓ PASS"
    else:
        marker = "✗ FAIL"
    print(f"{marker} | {outcome['test']}")
    print(f"       {outcome['message']}\n")



2025-11-10 17:51:44,648 - utils - INFO - validating diagnoses data...
2025-11-10 17:51:44,664 - utils - INFO - diagnoses validation completed: 2/2 checks passed
2025-11-10 17:51:44,664 - utils - INFO - diagnoses validation completed: 2/2 checks passed



                          validating diagnoses dataset                          


validation results:
✓ PASS | valid_icd10_format
       all icd-10 codes are valid

✓ PASS | no_missing_visit_id
       no missing visit ids



## 5. validate referential integrity

In [17]:
# run the cross-table checks between the patients and visits tables
print_section_header("validating referential integrity")
integrity_results = validator.validate_referential_integrity(patients_df, visits_df)

# list every check: marker and test name first, indented message underneath
print("\nvalidation results:")
for outcome in integrity_results['checks']:
    if outcome['passed']:
        marker = "✓ PASS"
    else:
        marker = "✗ FAIL"
    print(f"{marker} | {outcome['test']}")
    print(f"       {outcome['message']}\n")

2025-11-10 17:51:44,677 - utils - INFO - validating referential integrity...
2025-11-10 17:51:44,686 - utils - INFO - referential integrity validation completed: 1/1 checks passed
2025-11-10 17:51:44,686 - utils - INFO - referential integrity validation completed: 1/1 checks passed



                        validating referential integrity                        


validation results:
✓ PASS | visits_patient_id_exists
       all patient ids in visits exist in patients table



## 6. validate transformed features

In [18]:
# validate the derived feature columns of the transformed master dataset.
# every check appends a dict with 'test', 'passed' and 'message' keys to
# validation_checks, and only runs when the column(s) it needs are present.
# NOTE(review): a check whose column is missing leaves no entry at all in the
# results - confirm that skipping silently is intended.
print_section_header("validating transformed features")

validation_checks = []

# readmission flag: every value must be 0 or 1. a "sum >= 0" test alone cannot
# detect out-of-range values such as 2 or missing entries, so the flag is
# checked the same way as the high-risk flag below.
if 'is_readmitted' in master_df.columns:
    readmission_flag = master_df['is_readmitted']
    total_readmissions = readmission_flag.sum()
    check_passed = bool(readmission_flag.isin([0, 1]).all())
    validation_checks.append({
        'test': 'readmission_flag_valid',
        'passed': check_passed,
        'message': (
            f'found {total_readmissions} readmissions' if check_passed
            else 'readmission flag contains invalid values'
        )
    })

# age groups: every value must be one of the known category labels
if 'age_group' in master_df.columns:
    valid_groups = ['0-18', '19-35', '36-60', '60+', 'unknown']
    invalid_count = int((~master_df['age_group'].isin(valid_groups)).sum())
    check_passed = invalid_count == 0
    validation_checks.append({
        'test': 'age_group_categories_valid',
        'passed': check_passed,
        'message': 'all age groups are valid' if check_passed else f'found {invalid_count} invalid age groups'
    })

# high risk flag: every value must be 0 or 1
if 'is_high_risk' in master_df.columns:
    valid_values = bool(master_df['is_high_risk'].isin([0, 1]).all())
    validation_checks.append({
        'test': 'high_risk_flag_binary',
        'passed': valid_values,
        'message': 'high risk flag is binary' if valid_values else 'high risk flag contains invalid values'
    })

# length of stay consistency: los_category must match the band that
# length_of_stay falls in. evaluated with column-wide boolean masks instead of
# a per-row python loop; the band boundaries are unchanged.
if 'length_of_stay' in master_df.columns and 'los_category' in master_df.columns:
    los = master_df['length_of_stay']
    category = master_df['los_category']

    # (rows whose length of stay is in the band, label expected for that band)
    los_bands = [
        (los <= 1, 'short_stay'),
        ((los > 1) & (los <= 7), 'medium_stay'),
        ((los > 7) & (los <= 14), 'long_stay'),
        (los > 14, 'extended_stay'),
    ]

    # rows missing either value are not compared
    comparable = los.notna() & category.notna()

    mismatch = pd.Series(False, index=master_df.index)
    for in_band, expected_label in los_bands:
        mismatch |= in_band & (category != expected_label)

    inconsistent = int((mismatch & comparable).sum())

    check_passed = inconsistent == 0
    validation_checks.append({
        'test': 'los_category_consistency',
        'passed': check_passed,
        'message': 'los categories match length of stay' if check_passed else f'found {inconsistent} inconsistent categorizations'
    })

# display results: marker and test name first, indented message underneath
print("\nvalidation results:")
for check in validation_checks:
    status = "✓ PASS" if check['passed'] else "✗ FAIL"
    print(f"{status} | {check['test']}")
    print(f"       {check['message']}\n")


                        validating transformed features                         


validation results:
✓ PASS | readmission_flag_valid
       found 117 readmissions

✓ PASS | high_risk_flag_binary
       high risk flag is binary

✓ PASS | los_category_consistency
       los categories match length of stay



## 7. data completeness analysis

In [19]:
# measure, per master_df column, how many values are present
print_section_header("data completeness analysis")

row_total = len(master_df)

# (column name, count of non-null values) in the frame's column order
non_null_counts = [(col, master_df[col].notna().sum()) for col in master_df.columns]

# one record per column; completeness is the non-null share of all rows, in percent
completeness_metrics = [
    {
        'column': col,
        'non_null_count': filled,
        'completeness_pct': round((filled / row_total) * 100, 2),
    }
    for col, filled in non_null_counts
]

# least complete columns first
completeness_df = pd.DataFrame(completeness_metrics).sort_values('completeness_pct')

print("columns with less than 100% completeness:")
below_full = completeness_df['completeness_pct'] < 100
incomplete = completeness_df[below_full]
if incomplete.empty:
    print("all columns are 100% complete")
else:
    print(incomplete.to_string(index=False))

# write the full table (every column, not only the incomplete ones) to the logs directory
completeness_file = LOGS_DIR / 'completeness_report.csv'
completeness_df.to_csv(completeness_file, index=False)
print(f"\ncompleteness report saved to: {completeness_file}")


                           data completeness analysis                           

columns with less than 100% completeness:
                   column  non_null_count  completeness_pct
days_since_last_admission             789             31.80
          all_medications            1731             69.77
            all_diagnoses            2001             80.65
        primary_diagnosis            2001             80.65

completeness report saved to: d:\Github Desktop\Python\Hospital Data Curation\logs\completeness_report.csv


## 8. statistical validation

In [20]:
# statistical sanity checks on the master dataset.
# each section only runs when its column is present in master_df. sections
# with a validity condition print "✓" when it holds and "⚠" when it does not;
# the remaining sections are informational and always print "✓".
print_section_header("statistical validation")

print("key statistical validations:\n")

# age distribution: mean and standard deviation must both be positive
if 'age' in master_df.columns:
    age_mean = master_df['age'].mean()
    age_std = master_df['age'].std()
    age_valid = (age_mean > 0) and (age_std > 0)
    # the marker and wording follow the computed flag, so a failed check is
    # not reported as valid (a nan statistic compares False and is flagged)
    status = "✓" if age_valid else "⚠"
    verdict = "valid" if age_valid else "invalid"
    print(f"{status} age distribution {verdict}: mean={age_mean:.1f}, std={age_std:.1f}")

# length of stay distribution: mean and median must both be positive
if 'length_of_stay' in master_df.columns:
    los_mean = master_df['length_of_stay'].mean()
    los_median = master_df['length_of_stay'].median()
    los_valid = (los_mean > 0) and (los_median > 0)
    status = "✓" if los_valid else "⚠"
    verdict = "valid" if los_valid else "invalid"
    print(f"{status} length of stay distribution {verdict}: mean={los_mean:.2f}, median={los_median:.2f}")

# diagnosis and medication counts (informational, no validity condition)
if 'diagnosis_count' in master_df.columns:
    diag_mean = master_df['diagnosis_count'].mean()
    print(f"✓ average diagnoses per visit: {diag_mean:.2f}")

if 'medication_count' in master_df.columns:
    med_mean = master_df['medication_count'].mean()
    print(f"✓ average medications per visit: {med_mean:.2f}")

# readmission rate: flagged when outside the 0-50% range
if 'is_readmitted' in master_df.columns:
    readmission_rate = (master_df['is_readmitted'].sum() / len(master_df)) * 100
    rate_valid = 0 <= readmission_rate <= 50  # reasonable range
    status = "✓" if rate_valid else "⚠"
    print(f"{status} readmission rate: {readmission_rate:.2f}%")

# high-risk patient proportion (informational, no validity condition)
if 'is_high_risk' in master_df.columns:
    high_risk_rate = (master_df['is_high_risk'].sum() / len(master_df)) * 100
    print(f"✓ high-risk patient rate: {high_risk_rate:.2f}%")


                             statistical validation                             

key statistical validations:

✓ length of stay distribution valid: mean=182.27, median=180.00
✓ average diagnoses per visit: 1.63
✓ average medications per visit: 1.06
✓ readmission rate: 4.72%
✓ high-risk patient rate: 21.93%


## 9. generate comprehensive validation report

In [21]:
# ask the shared validator for its report text and show it
print_section_header("generating validation report")

report_text = validator.generate_validation_report()
print("\nvalidation report generated:")
print(report_text)

# location announced for the report file inside the logs directory
report_path = LOGS_DIR / 'validation_report.txt'
print(f"\nvalidation report saved to: {report_path}")

2025-11-10 17:51:44,847 - utils - INFO - validation report saved to d:\Github Desktop\Python\Hospital Data Curation\logs\validation_report.txt



                          generating validation report                          


validation report generated:
DATA VALIDATION REPORT


Dataset: patients
--------------------------------------------------------------------------------
✓ PASS | unique_patient_id
       all patient ids are unique

✓ PASS | no_missing_patient_id
       no missing values in patient_id


Dataset: visits
--------------------------------------------------------------------------------
✓ PASS | unique_visit_id
       all visit ids are unique

✓ PASS | discharge_after_admission
       all discharge dates are after admission dates

✓ PASS | valid_length_of_stay
       all length of stay values are between 0 and 365 days


Dataset: diagnoses
--------------------------------------------------------------------------------
✓ PASS | valid_icd10_format
       all icd-10 codes are valid

✓ PASS | no_missing_visit_id
       no missing visit ids


Dataset: referential_integrity
----------------------------------------

## 10. final data quality scorecard

In [22]:
# build a one-row-per-metric quality scorecard, print it and save it as csv
print_section_header("data quality scorecard")

scorecard = {'metric': [], 'status': [], 'details': []}


def _add_scorecard_row(metric, status, details):
    """Append one row (metric name, status label, details text) to the scorecard."""
    scorecard['metric'].append(metric)
    scorecard['status'].append(status)
    scorecard['details'].append(details)


def _pass_or_fail(flag):
    """Return the scorecard status label for a boolean outcome."""
    return '✓ pass' if flag else '✗ fail'


# uniqueness of the id column of each table (skipped when the column is absent)
for frame, id_column, metric_name in (
    (patients_df, 'patient_id', 'patient id uniqueness'),
    (visits_df, 'visit_id', 'visit id uniqueness'),
):
    if id_column in frame.columns:
        is_unique = frame[id_column].is_unique
        _add_scorecard_row(metric_name, _pass_or_fail(is_unique), f"{is_unique}")

# completeness: share of non-null cells in master_df; a warning at 90% or below
filled_cells = master_df.notna().sum().sum()
total_cells = len(master_df) * len(master_df.columns)
overall_completeness = (filled_cells / total_cells) * 100
_add_scorecard_row(
    'overall data completeness',
    '✓ pass' if overall_completeness > 90 else '⚠ warning',
    f"{overall_completeness:.2f}%",
)

# consistency: discharge must not precede admission
if all(name in visits_df.columns for name in ('discharge_date', 'admission_date')):
    # parse both columns in place; unparseable values become NaT (errors='coerce'),
    # and a NaT on either side compares False, so such rows fail the check
    for date_column in ('admission_date', 'discharge_date'):
        visits_df[date_column] = pd.to_datetime(visits_df[date_column], errors='coerce')
    date_consistency = (visits_df['discharge_date'] >= visits_df['admission_date']).all()
    _add_scorecard_row('date consistency', _pass_or_fail(date_consistency), str(date_consistency))

# validity: ages must lie in 0..120; master_df is preferred, patients_df is the fallback
if 'age' in master_df.columns:
    age_source = master_df
elif 'age' in patients_df.columns:
    age_source = patients_df
else:
    age_source = None

if age_source is not None:
    age_validity = age_source['age'].between(0, 120).all()
    _add_scorecard_row('age validity', _pass_or_fail(age_validity), str(age_validity))

# show the scorecard as a table
scorecard_df = pd.DataFrame(scorecard)
print(scorecard_df.to_string(index=False))

# save the scorecard to the logs directory
scorecard_file = LOGS_DIR / 'quality_scorecard.csv'
scorecard_df.to_csv(scorecard_file, index=False)
print(f"\nquality scorecard saved to: {scorecard_file}")



                             data quality scorecard                             

                   metric status details
    patient id uniqueness ✓ pass    True
      visit id uniqueness ✓ pass    True
overall data completeness ✓ pass  96.19%
         date consistency ✓ pass    True

quality scorecard saved to: d:\Github Desktop\Python\Hospital Data Curation\logs\quality_scorecard.csv


## summary

data validation completed:
- ✓ patients dataset validated
- ✓ visits dataset validated
- ✓ diagnoses dataset validated
- ✓ referential integrity confirmed
- ✓ transformed features validated (the `age_group` category check did not run because the master dataset has no `age_group` column)
- ✓ completeness analysis performed
- ✓ statistical validation completed (the age distribution check did not run because the master dataset has no `age` column)
- ✓ quality scorecard generated

all validation reports saved to `logs/`

next phase: predictive analytics (regression, association, classification)