# Data Cleaning Notebook

This notebook demonstrates the data cleaning process for healthcare datasets.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Directory layout: raw inputs and cleaned outputs both live under ../data
data_dir = Path('../data')
raw_dir = data_dir.joinpath('raw')
cleaned_dir = data_dir.joinpath('cleaned')

# Echo both locations (one per line) so a wrong working directory is easy to spot
print(
    f"Raw data directory: {raw_dir}",
    f"Cleaned data directory: {cleaned_dir}",
    sep="\n",
)


## Load Raw Data


In [None]:
# Build the working DataFrame `df`:
#   * read the MIMIC demo admissions CSV when it is present under raw_dir;
#   * otherwise generate a seeded 1000-row synthetic sample;
#   * on any error, report it and fall back to an empty DataFrame.
try:
    mimic_file = raw_dir / 'mimic_demo' / 'admissions.csv'
    if not mimic_file.exists():
        # No demo file on disk -- create sample data for demonstration.
        # The seed is fixed and the columns are drawn in a fixed order so
        # the generated values are reproducible from run to run.
        np.random.seed(42)
        df = pd.DataFrame({
            'patient_id': range(1000),
            'age': np.random.normal(loc=65, scale=15, size=1000),
            'gender': np.random.choice(['M', 'F'], size=1000),
            'admission_type': np.random.choice(
                ['URGENT', 'ELECTIVE', 'EMERGENCY'], size=1000
            ),
            'length_of_stay': np.random.exponential(scale=5, size=1000),
            'mortality': np.random.choice([0, 1], size=1000, p=[0.8, 0.2]),
        })
        print(f"Created sample data: {df.shape}")
    else:
        # Demo file found -- load it as-is.
        df = pd.read_csv(mimic_file)
        print(f"Loaded MIMIC data: {df.shape}")
except Exception as exc:
    # Best-effort cell: print the problem and leave an empty frame behind
    # instead of raising.
    print(f"Error loading data: {exc}")
    df = pd.DataFrame()
