In [4]:
# Data Analysis for Sepsis Prediction Dataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Configuration
pd.set_option("display.max_columns", None)
pd.set_option("display.width", 1000)


# Load the data
def load_data(path="../data/raw/Dataset.csv"):
    print("Loading data...")
    df = pd.read_csv(path)
    print(f"Loaded dataset with {df.shape[0]:,} rows and {df.shape[1]} columns")
    return df


df = load_data()

# Basic Dataset Information
print("\nBasic Dataset Information:")
print("-" * 50)
print("\nDataset Shape:", df.shape)
print("\nColumns:", df.columns.tolist())


# Memory Usage
def memory_usage(df):
    memory = df.memory_usage(deep=True).sum()
    for unit in ["B", "KB", "MB", "GB"]:
        if memory < 1024:
            return f"{memory:.2f} {unit}"
        memory /= 1024


print(f"\nMemory Usage: {memory_usage(df)}")

# Analyze Unit1 and Unit2
print("\nUnit1 Analysis:")
print("-" * 50)
unit1_counts = df["Unit1"].value_counts()
print("\nTop 10 most common Unit1 values:")
print(unit1_counts.head(10))
print(f"\nTotal unique Unit1 values: {len(unit1_counts)}")

print("\nUnit2 Analysis:")
print("-" * 50)
unit2_counts = df["Unit2"].value_counts()
print("\nTop 10 most common Unit2 values:")
print(unit2_counts.head(10))
print(f"\nTotal unique Unit2 values: {len(unit2_counts)}")

# Analyze values per patient
print("\nPatient Analysis:")
print("-" * 50)
patients = df["Patient_ID"].unique()
print(f"Total number of unique patients: {len(patients):,}")

# Calculate average readings per patient
readings_per_patient = df.groupby("Patient_ID").size()
print(f"\nReadings per patient:")
print(f"Mean: {readings_per_patient.mean():.2f}")
print(f"Median: {readings_per_patient.median():.2f}")
print(f"Min: {readings_per_patient.min()}")
print(f"Max: {readings_per_patient.max()}")


# Analyze Unit assignments per patient
def analyze_unit_assignments():
    patient_units = df.groupby("Patient_ID").agg(
        {"Unit1": "nunique", "Unit2": "nunique"}
    )

    print("\nUnit assignments per patient:")
    print("\nUnit1:")
    print(patient_units["Unit1"].value_counts().head())
    print("\nUnit2:")
    print(patient_units["Unit2"].value_counts().head())

    # Patients with multiple units
    multi_unit1 = patient_units[patient_units["Unit1"] > 1]
    multi_unit2 = patient_units[patient_units["Unit2"] > 1]

    print(f"\nPatients with multiple Unit1 assignments: {len(multi_unit1)}")
    print(f"Patients with multiple Unit2 assignments: {len(multi_unit2)}")

    return multi_unit1, multi_unit2


multi_unit1, multi_unit2 = analyze_unit_assignments()


# Examine a few patients with multiple unit assignments
def examine_patient_units(patient_id):
    patient_data = df[df["Patient_ID"] == patient_id]
    return patient_data[["Patient_ID", "Hour", "Unit1", "Unit2"]].sort_values("Hour")


if len(multi_unit1) > 0:
    print("\nExample of patient with multiple Unit1 assignments:")
    sample_patient = multi_unit1.index[0]
    print(examine_patient_units(sample_patient))


# Check for patterns in Unit assignments
def check_unit_patterns():
    # Check if Unit2 is dependent on Unit1
    unit_combinations = df.groupby(["Unit1", "Unit2"]).size().reset_index(name="count")
    unit_combinations = unit_combinations.sort_values("count", ascending=False)

    print("\nMost common Unit1-Unit2 combinations:")
    print(unit_combinations.head(10))

    # Check for temporal patterns
    df["Hour_bin"] = pd.cut(df["Hour"], bins=24, labels=range(24))
    unit_time = df.groupby("Hour_bin")["Unit1"].value_counts().unstack()

    return unit_combinations, unit_time


unit_combinations, unit_time = check_unit_patterns()


# Missing Values Analysis
def analyze_missing_values(df):
    missing = df.isnull().sum()
    missing_pct = (missing / len(df)) * 100
    missing_stats = pd.DataFrame(
        {"Missing Values": missing, "Percentage": missing_pct}
    ).sort_values("Percentage", ascending=False)

    print("\nMissing Values Analysis:")
    print("-" * 50)
    print("\nColumns with missing values:")
    print(missing_stats[missing_stats["Missing Values"] > 0])

    return missing_stats


missing_stats = analyze_missing_values(df)


# Data Quality Checks
def data_quality_checks(df):
    print("\nData Quality Checks:")
    print("-" * 50)

    # Check for obvious errors in vital signs
    vitals = {
        "HR": (0, 300),  # Heart rate in bpm
        "O2Sat": (0, 100),  # Oxygen saturation percentage
        "Temp": (25, 45),  # Temperature in Celsius
        "SBP": (0, 300),  # Systolic blood pressure
        "DBP": (0, 200),  # Diastolic blood pressure
        "Resp": (0, 100),  # Respiratory rate
    }

    for vital, (min_val, max_val) in vitals.items():
        if vital in df.columns:
            outliers = df[(df[vital] < min_val) | (df[vital] > max_val)][vital]
            if len(outliers) > 0:
                print(f"\n{vital} outliers:")
                print(f"Values outside [{min_val}, {max_val}]: {len(outliers)}")
                print(f"Example values: {outliers.head().tolist()}")


data_quality_checks(df)


# Save summary to file
def save_analysis_summary(filename="data_analysis_summary.txt"):
    with open(filename, "w") as f:
        f.write("Data Analysis Summary\n")
        f.write("=" * 50 + "\n\n")

        # Dataset basics
        f.write(f"Dataset Shape: {df.shape}\n")
        f.write(f"Memory Usage: {memory_usage(df)}\n")
        f.write(f"Number of Patients: {len(patients):,}\n\n")

        # Unit analysis
        f.write("Unit Analysis\n")
        f.write("-" * 20 + "\n")
        f.write(f"Unique Unit1 values: {len(unit1_counts)}\n")
        f.write(f"Unique Unit2 values: {len(unit2_counts)}\n")
        f.write(f"Patients with multiple Unit1: {len(multi_unit1)}\n")
        f.write(f"Patients with multiple Unit2: {len(multi_unit2)}\n\n")

        # Missing values
        f.write("Missing Values Summary\n")
        f.write("-" * 20 + "\n")
        f.write(missing_stats[missing_stats["Missing Values"] > 0].to_string())


save_analysis_summary()

print("\nAnalysis complete. Summary saved to 'data_analysis_summary.txt'")


Loading data...
Loaded dataset with 1,552,210 rows and 44 columns

Basic Dataset Information:
--------------------------------------------------

Dataset Shape: (1552210, 44)

Columns: ['Unnamed: 0', 'Hour', 'HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'EtCO2', 'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN', 'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct', 'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium', 'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC', 'Fibrinogen', 'Platelets', 'Age', 'Gender', 'Unit1', 'Unit2', 'HospAdmTime', 'ICULOS', 'SepsisLabel', 'Patient_ID']

Memory Usage: 521.07 MB

Unit1 Analysis:
--------------------------------------------------

Top 10 most common Unit1 values:
Unit1
0.0    473349
1.0    466901
Name: count, dtype: int64

Total unique Unit1 values: 2

Unit2 Analysis:
--------------------------------------------------

Top 10 most common Unit2 values:
Unit2
1.0    473349
0.0    466901
N

  unit_time = df.groupby("Hour_bin")["Unit1"].value_counts().unstack()
