# hospital data curation project
## comprehensive project summary and handover document

this notebook generates a complete summary of all data curation and analytics work performed.

In [52]:
# import required libraries
import sys
import os
from pathlib import Path

# add src directory to python path
notebook_dir = Path(os.getcwd())
src_dir = notebook_dir / 'src'
if str(src_dir) not in sys.path:
    sys.path.insert(0, str(src_dir))

import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# import project modules
import config
import utils

# use imported modules
RAW_DATA_DIR = config.RAW_DATA_DIR
CLEANED_DATA_DIR = config.CLEANED_DATA_DIR
PREPROCESSED_DATA_DIR = config.PREPROCESSED_DATA_DIR
LOGS_DIR = config.LOGS_DIR
MODELS_DIR = config.MODELS_DIR
VISUALIZATIONS_DIR = config.VISUALIZATIONS_DIR
PROFILING_DIR = config.PROFILING_DIR
SWEETVIZ_DIR = config.SWEETVIZ_DIR
setup_logging = utils.setup_logging
print_section_header = utils.print_section_header


## 1. project overview

In [53]:
# project metadata
print_section_header("hospital data curation project - summary")

project_info = {
    'project_name': 'hospital patient records cleanup and analytics',
    'domain': 'healthcare',
    'objective': 'clean, curate, and prepare hospital data for epidemiological research and predictive analytics',
    'date_completed': datetime.now().strftime('%Y-%m-%d'),
    'total_phases': 9,
    'compliance': 'hipaa-ready data handling'
}

print("project information:")
for key, value in project_info.items():
    print(f"  {key}: {value}")


                    hospital data curation project - summary                    

project information:
  project_name: hospital patient records cleanup and analytics
  domain: healthcare
  objective: clean, curate, and prepare hospital data for epidemiological research and predictive analytics
  date_completed: 2025-11-10
  total_phases: 9
  compliance: hipaa-ready data handling


## 2. datasets processed

In [54]:
# summarize all datasets
print_section_header("datasets inventory")

datasets_summary = []

# raw datasets
for data_file in RAW_DATA_DIR.glob('*.csv'):
    try:
        df = pd.read_csv(data_file)
        datasets_summary.append({
            'type': 'raw',
            'filename': data_file.name,
            'rows': len(df),
            'columns': len(df.columns),
            'size_kb': data_file.stat().st_size / 1024
        })
    except:
        pass

# cleaned datasets
for data_file in CLEANED_DATA_DIR.glob('*.csv'):
    try:
        df = pd.read_csv(data_file)
        datasets_summary.append({
            'type': 'cleaned',
            'filename': data_file.name,
            'rows': len(df),
            'columns': len(df.columns),
            'size_kb': data_file.stat().st_size / 1024
        })
    except:
        pass

# preprocessed datasets
for data_file in PREPROCESSED_DATA_DIR.glob('*.csv'):
    try:
        df = pd.read_csv(data_file)
        datasets_summary.append({
            'type': 'preprocessed',
            'filename': data_file.name,
            'rows': len(df),
            'columns': len(df.columns),
            'size_kb': data_file.stat().st_size / 1024
        })
    except:
        pass

if datasets_summary:
    summary_df = pd.DataFrame(datasets_summary)
    print("\ndataset inventory:")
    print(summary_df.to_string(index=False))
    
    print(f"\ntotal datasets: {len(summary_df)}")
    print(f"total size: {summary_df['size_kb'].sum():.2f} kb")
else:
    print("\nno datasets found - ensure data files are in place")


                               datasets inventory                               


dataset inventory:
        type                       filename  rows  columns    size_kb
         raw                  diagnoses.csv  8000        4 352.329102
         raw              hospital_info.csv    20        5   0.712891
         raw                medications.csv  6000        6 323.243164
         raw                   patients.csv  3000        7 366.284180
         raw                      staff.csv   500        5  20.965820
         raw                     visits.csv  5000        7 266.633789
     cleaned            clean_diagnoses.csv  7998        5 391.287109
     cleaned        clean_hospital_info.csv    20        5   0.712891
     cleaned          clean_medications.csv  5324        6 286.842773
     cleaned             clean_patients.csv  3000        7 375.029297
     cleaned                clean_staff.csv   500        5  20.965820
     cleaned               clean_visits.csv  2481        

## 3. data quality improvements

In [55]:
# load cleaning report if available
print_section_header("data quality improvements")

cleaning_report_file = LOGS_DIR / 'cleaning_report.csv'
if cleaning_report_file.exists():
    cleaning_df = pd.read_csv(cleaning_report_file)
    print("\ncleaning summary:")
    print(cleaning_df.to_string(index=False))
    
    total_initial = cleaning_df['initial_rows'].sum()
    total_final = cleaning_df['final_rows'].sum()
    total_removed = cleaning_df['removed_rows'].sum()
    
    print(f"\noverall statistics:")
    print(f"  total initial rows: {total_initial}")
    print(f"  total final rows: {total_final}")
    print(f"  total removed rows: {total_removed} ({total_removed/total_initial*100:.2f}%)")
    print(f"  data retention rate: {total_final/total_initial*100:.2f}%")
else:
    print("\ncleaning report not found - run notebook 03_data_cleaning.ipynb")


                           data quality improvements                            


cleaning summary:
    dataset  initial_rows  final_rows  removed_rows
   patients          3000        3000             0
     visits          5000        2481          2519
  diagnoses          8000        7998             2
medications          6000        5324           676
      staff           500         500             0

overall statistics:
  total initial rows: 22500
  total final rows: 19303
  total removed rows: 3197 (14.21%)
  data retention rate: 85.79%


## 4. validation results

In [56]:
# load validation scorecard
print_section_header("data validation results")

scorecard_file = LOGS_DIR / 'quality_scorecard.csv'
if scorecard_file.exists():
    scorecard_df = pd.read_csv(scorecard_file)
    print("\nquality scorecard:")
    print(scorecard_df.to_string(index=False))
    
    pass_count = scorecard_df['status'].str.contains('pass').sum()
    total_checks = len(scorecard_df)
    
    print(f"\nvalidation pass rate: {pass_count}/{total_checks} ({pass_count/total_checks*100:.1f}%)")
else:
    print("\nquality scorecard not found - run notebook 06_data_validation.ipynb")

# check for validation report
validation_file = LOGS_DIR / 'validation_report.txt'
if validation_file.exists():
    print(f"\ndetailed validation report available at: {validation_file}")


                            data validation results                             


quality scorecard:
                   metric status details
    patient id uniqueness ✓ pass    True
      visit id uniqueness ✓ pass    True
overall data completeness ✓ pass  96.19%
         date consistency ✓ pass    True

validation pass rate: 4/4 (100.0%)

detailed validation report available at: d:\Github Desktop\Python\Hospital Data Curation\logs\validation_report.txt


## 5. machine learning models

In [57]:
# summarize trained models
print_section_header("machine learning models")

# regression models
regression_comparison_file = MODELS_DIR / 'regression_model_comparison.csv'
if regression_comparison_file.exists():
    print("\nregression analysis (length of stay prediction):")
    regression_df = pd.read_csv(regression_comparison_file)
    print(regression_df.to_string(index=False))
    
    best_model = regression_df.loc[regression_df['test_rmse'].idxmin(), 'model']
    best_rmse = regression_df['test_rmse'].min()
    print(f"\nbest model: {best_model} (rmse: {best_rmse:.3f} days)")

# classification models
classification_comparison_file = MODELS_DIR / 'classification_model_comparison.csv'
if classification_comparison_file.exists():
    print("\n\nclassification analysis (readmission prediction):")
    classification_df = pd.read_csv(classification_comparison_file)
    print(classification_df.to_string(index=False))
    
    best_model = classification_df.loc[classification_df['f1_score'].idxmax(), 'model']
    best_f1 = classification_df['f1_score'].max()
    print(f"\nbest model: {best_model} (f1-score: {best_f1:.3f})")

# count model files
model_files = list(MODELS_DIR.glob('*.pkl'))
print(f"\n\ntrained model files: {len(model_files)}")
for model_file in model_files:
    print(f"  - {model_file.name}")


                            machine learning models                             


regression analysis (length of stay prediction):
            model  train_rmse  test_rmse  train_mae  test_mae  train_r2   test_r2
linear regression  104.530297 105.974927  90.292414 92.027624  0.005783 -0.001454
 ridge regression  104.530298 105.974981  90.292471 92.027865  0.005783 -0.001456
 lasso regression  104.530545 105.960289  90.297439 92.024668  0.005778 -0.001178

best model: lasso regression (rmse: 105.960 days)


classification analysis (readmission prediction):
            model  accuracy  precision  recall  f1_score  auc_roc
    decision tree  0.953722        0.0     0.0       0.0 0.605944
    random forest  0.953722        0.0     0.0       0.0 0.602825
gradient boosting  0.943662        0.0     0.0       0.0 0.492845

best model: decision tree (f1-score: 0.000)


trained model files: 8
  - decision_tree_readmission.pkl
  - gradient_boosting_readmission.pkl
  - lasso_regression_los.pkl
 

## 6. association rules

In [58]:
# summarize association rules
print_section_header("association rule mining results")

rule_files = {
    'diagnosis_association_rules.csv': 'diagnosis co-occurrence patterns',
    'medication_association_rules.csv': 'medication prescription patterns',
    'combined_association_rules.csv': 'diagnosis → medication associations',
    'readmission_prediction_rules.csv': 'readmission risk rules'
}

for filename, description in rule_files.items():
    rule_file = MODELS_DIR / filename
    if rule_file.exists():
        rules_df = pd.read_csv(rule_file)
        print(f"\n{description}:")
        print(f"  total rules: {len(rules_df)}")
        if len(rules_df) > 0:
            print(f"  avg confidence: {rules_df['confidence'].mean():.3f}")
            print(f"  avg lift: {rules_df['lift'].mean():.3f}")
            print(f"  top rule confidence: {rules_df['confidence'].max():.3f}")


                        association rule mining results                         


diagnosis co-occurrence patterns:
  total rules: 0

medication prescription patterns:
  total rules: 20
  avg confidence: 0.216
  avg lift: 0.706
  top rule confidence: 0.235

diagnosis → medication associations:
  total rules: 0

readmission risk rules:
  total rules: 0


## 7. reports and visualizations

In [59]:
# inventory reports and visualizations
print_section_header("generated reports and visualizations")

# profiling reports
profiling_reports = list(PROFILING_DIR.glob('*.html'))
print(f"\nydata profiling reports: {len(profiling_reports)}")
for report in profiling_reports:
    print(f"  - {report.name}")

# sweetviz reports
sweetviz_reports = list(SWEETVIZ_DIR.glob('*.html'))
print(f"\nsweetviz reports: {len(sweetviz_reports)}")
for report in sweetviz_reports:
    print(f"  - {report.name}")

# visualizations
viz_files = list(VISUALIZATIONS_DIR.glob('*.png'))
print(f"\nvisualizations: {len(viz_files)}")
for viz_file in viz_files:
    print(f"  - {viz_file.name}")

# log files
log_files = list(LOGS_DIR.glob('*.csv')) + list(LOGS_DIR.glob('*.txt'))
print(f"\nlog and report files: {len(log_files)}")
for log_file in log_files:
    print(f"  - {log_file.name}")


                      generated reports and visualizations                      


ydata profiling reports: 6
  - diagnoses_profile_20251110_174348.html
  - hospital_info_profile_20251110_174348.html
  - medications_profile_20251110_174348.html
  - patients_profile_20251110_174348.html
  - staff_profile_20251110_174348.html
  - visits_profile_20251110_174348.html

sweetviz reports: 6
  - diagnoses_sweetviz_20251110_174348.html
  - hospital_info_sweetviz_20251110_174348.html
  - medications_sweetviz_20251110_174348.html
  - patients_sweetviz_20251110_174348.html
  - staff_sweetviz_20251110_174348.html
  - visits_sweetviz_20251110_174348.html

visualizations: 8
  - association_lift_distribution.png
  - classification_confusion_matrices.png
  - classification_feature_importance.png
  - classification_roc_curves.png
  - regression_actual_vs_predicted.png
  - regression_model_comparison.png
  - regression_residuals.png
  - top_medications_frequency.png

log and report files: 7
  - cleaning

## 8. key performance indicators

In [60]:
# calculate key metrics from master dataset
print_section_header("key performance indicators")

master_file = PREPROCESSED_DATA_DIR / 'transformed_master_dataset.csv'
if master_file.exists():
    master_df = pd.read_csv(master_file)
    
    kpis = {}
    
    # length of stay
    if 'length_of_stay' in master_df.columns:
        kpis['average_length_of_stay'] = f"{master_df['length_of_stay'].mean():.2f} days"
        kpis['median_length_of_stay'] = f"{master_df['length_of_stay'].median():.2f} days"
    
    # readmission rate
    if 'is_readmitted' in master_df.columns:
        readmission_rate = (master_df['is_readmitted'].sum() / len(master_df)) * 100
        kpis['readmission_rate'] = f"{readmission_rate:.2f}%"
    
    # high-risk patients
    if 'is_high_risk' in master_df.columns:
        high_risk_rate = (master_df['is_high_risk'].sum() / len(master_df)) * 100
        kpis['high_risk_patient_rate'] = f"{high_risk_rate:.2f}%"
    
    # unique patients and visits
    if 'patient_id' in master_df.columns:
        kpis['unique_patients'] = master_df['patient_id'].nunique()
    if 'visit_id' in master_df.columns:
        kpis['total_visits'] = master_df['visit_id'].nunique()
    
    # average diagnoses and medications
    if 'diagnosis_count' in master_df.columns:
        kpis['avg_diagnoses_per_visit'] = f"{master_df['diagnosis_count'].mean():.2f}"
    if 'medication_count' in master_df.columns:
        kpis['avg_medications_per_visit'] = f"{master_df['medication_count'].mean():.2f}"
    
    print("\nkey performance indicators:")
    for metric, value in kpis.items():
        print(f"  {metric}: {value}")
else:
    print("\nmaster dataset not found")


                           key performance indicators                           


key performance indicators:
  average_length_of_stay: 182.27 days
  median_length_of_stay: 180.00 days
  readmission_rate: 4.72%
  high_risk_patient_rate: 21.93%
  unique_patients: 1692
  total_visits: 2481
  avg_diagnoses_per_visit: 1.63
  avg_medications_per_visit: 1.06


## 9. project deliverables checklist

In [61]:
# create deliverables checklist
print_section_header("project deliverables checklist")

deliverables = [
    ('cleaned datasets', CLEANED_DATA_DIR, '*.csv'),
    ('integrated datasets', PREPROCESSED_DATA_DIR, '*.csv'),
    ('profiling reports', PROFILING_DIR, '*.html'),
    ('sweetviz reports', SWEETVIZ_DIR, '*.html'),
    ('validation reports', LOGS_DIR, ['validation_report.txt', 'quality_scorecard.csv']),
    ('data dictionary', LOGS_DIR, ['data_dictionary.csv', 'data_dictionary.xlsx']),
    ('ml models', MODELS_DIR, '*.pkl'),
    ('association rules', MODELS_DIR, '*_rules.csv'),
    ('visualizations', VISUALIZATIONS_DIR, '*.png'),
    ('documentation', Path('.'), 'README.md')
]

print("\ndeliverables status:")
for deliverable_name, directory, pattern in deliverables:
    if isinstance(pattern, list):
        files = [directory / p for p in pattern]
        exists = all(f.exists() for f in files)
        count = len([f for f in files if f.exists()])
    else:
        files = list(directory.glob(pattern))
        exists = len(files) > 0
        count = len(files)
    
    status = "✓" if exists else "✗"
    print(f"  {status} {deliverable_name}: {count} file(s)")


                         project deliverables checklist                         


deliverables status:
  ✓ cleaned datasets: 6 file(s)
  ✓ integrated datasets: 7 file(s)
  ✓ profiling reports: 6 file(s)
  ✓ sweetviz reports: 6 file(s)
  ✓ validation reports: 2 file(s)
  ✓ data dictionary: 2 file(s)
  ✓ ml models: 8 file(s)
  ✓ association rules: 4 file(s)
  ✓ visualizations: 8 file(s)
  ✓ documentation: 1 file(s)

                         project deliverables checklist                         


deliverables status:
  ✓ cleaned datasets: 6 file(s)
  ✓ integrated datasets: 7 file(s)
  ✓ profiling reports: 6 file(s)
  ✓ sweetviz reports: 6 file(s)
  ✓ validation reports: 2 file(s)
  ✓ data dictionary: 2 file(s)
  ✓ ml models: 8 file(s)
  ✓ association rules: 4 file(s)
  ✓ visualizations: 8 file(s)
  ✓ documentation: 1 file(s)


## summary

comprehensive project summary completed:
- ✓ all datasets inventoried
- ✓ quality improvements documented
- ✓ validation results summarized
- ✓ ml models catalogued
- ✓ reports and visualizations listed
- ✓ kpis calculated
- ✓ deliverables checklist verified
- ✓ handover document generated

**project successfully completed and documented!**