# Missing Value Handling Strategy - TEDS-D
Analyze and handle missing values appropriately for statistical analysis and ML

## Load Data and Analyze Missing Patterns
Examine the extent and patterns of missing data

In [None]:
import pandas as pd

# Load the cleaned TEDS-D 2023 dataset from the processed-data folder.
# The file is read exactly once; `df` is the frame every later cell works on.
df = pd.read_csv("1_datasets/processed/teds_d_2023_cleaned.csv")


## Calculate Missing Value Statistics
Compute the count and percentage of missing values for each column

In [15]:
# Count nulls once and reuse the result: each isnull()/notnull() call scans
# the whole frame, so repeating it per statistic and per column is wasted work.
null_counts = df.isnull().sum()
total_rows = len(df)

# One row per column of `df` (indexed by column name), ordered so the most
# incomplete columns come first.
missing_stats = pd.DataFrame(
    {
        "column": df.columns,
        "missing_count": null_counts,
        "missing_percent": (null_counts / total_rows * 100).round(2),
        # Every cell is either null or not, so this equals df.notnull().sum().
        "non_missing_count": total_rows - null_counts,
    }
).sort_values("missing_percent", ascending=False)

# Printed spot check for a handful of key columns; names that are not present
# in the frame are skipped instead of raising KeyError.
for col in [
    "patient_id",
    "service_type_admit",
    "primary_substance_admit",
    "age_group",
    "sex",
    "discharge_reason",
    "length_of_stay",
]:
    if col in df.columns:
        missing = null_counts[col]
        pct = missing / total_rows * 100
        print(f"{col:35} {missing:>8} ({pct:>5.2f}%)")

# Last expression of the cell, so the notebook renders the full table.
missing_stats

patient_id                                 0 ( 0.00%)
service_type_admit                         0 ( 0.00%)
primary_substance_admit               204027 (13.84%)
age_group                                  0 ( 0.00%)
sex                                      909 ( 0.06%)
discharge_reason                           0 ( 0.00%)
length_of_stay                        951494 (64.55%)


Unnamed: 0,column,missing_count,missing_percent,non_missing_count
arrests_discharge,arrests_discharge,1410497,95.69,63528
arrests_admit,arrests_admit,1387731,94.15,86294
tertiary_substance_discharge,tertiary_substance_discharge,1248165,84.68,225860
tertiary_substance_admit,tertiary_substance_admit,1209131,82.03,264894
pregnant,pregnant,993700,67.41,480325
...,...,...,...,...
short_stay,short_stay,0,0.00,1474025
long_stay,long_stay,0,0.00,1474025
employment_improved,employment_improved,0,0.00,1474025
housing_improved,housing_improved,0,0.00,1474025


## Identify Critical Variables
Define which variables are essential (cannot be missing for analysis)

In [24]:
# Columns that must be non-null for a row to be kept in the analysis dataset;
# this list is used as the `subset` of the dropna() call that builds it.
critical_vars = ["patient_id", "discharge_reason"]

## Create Analysis-Ready Dataset (Minimal Removal)
Remove only rows missing critical variables for analysis

In [25]:
# Drop only the rows that lack a value in one of the critical variables;
# nulls in every other column are kept as they are.
df_analysis = df.dropna(subset=critical_vars)

original_rows = len(df)
rows_removed = original_rows - len(df_analysis)
# Guard the percentage against an empty input frame, which would otherwise
# raise ZeroDivisionError.
removal_percent = (
    round(rows_removed / original_rows * 100, 2) if original_rows else 0.0
)

# Before/after row counts, kept in a dict so they can be printed or logged.
removal_summary = {
    "original_rows": original_rows,
    "rows_after_removal": len(df_analysis),
    "rows_removed": rows_removed,
    "percent_removed": removal_percent,
}
# Persist the filtered frame without the pandas index column.
df_analysis.to_csv("1_datasets/processed/teds_d_analysis_ready.csv", index=False)

In [26]:
# Show the row counts before and after dropping rows with missing critical
# variables, plus the number and percentage of rows removed.
print(removal_summary)

{'original_rows': 1474025, 'rows_after_removal': 1474025, 'rows_removed': 0, 'percent_removed': 0.0}


## Machine Learning Preparation (For Later Phase)
For ML models, we'll need imputation rather than deletion

In [27]:
# Impute on a separate copy so that `df` itself is left unmodified.
df_ml = df.copy()

# Numeric features: nulls are replaced by the column median.
numeric_cols = [
    "years_using",
    "number_of_substances_admit",
    "number_of_substances_discharge",
]
for col in numeric_cols:
    if col not in df_ml.columns:
        # Listed column is absent from this frame; nothing to impute.
        continue
    column_median = df_ml[col].median()
    df_ml[col] = df_ml[col].fillna(column_median)

# Columns treated as categorical here: nulls are replaced by the most
# frequent value (the first one when several values tie).
categorical_cols = [
    "wait_time_days",
    "prior_treatments",
    "employment_admit",
    "employment_discharge",
    "education_level",
    "living_arrangement_admit",
    "living_arrangement_discharge",
    "income_source",
    "length_of_stay",
    "discharge_reason",
]
for col in categorical_cols:
    if col not in df_ml.columns:
        continue
    mode_val = df_ml[col].mode()
    if mode_val.empty:
        # No non-null value exists to take a mode from; leave the column as is.
        continue
    df_ml[col] = df_ml[col].fillna(mode_val.iloc[0])

# Flag columns are recognised either by an "is_"/"has_" prefix or by
# appearing in this explicit set of names.
named_binary_cols = {
    "completed_treatment",
    "dropped_out",
    "terminated",
    "transferred",
    "short_stay",
    "long_stay",
    "employment_improved",
    "housing_improved",
    "arrests_reduced",
}
binary_cols = [
    col
    for col in df_ml.columns
    if col.startswith(("is_", "has_")) or col in named_binary_cols
]

# Every name in binary_cols was taken from df_ml.columns, so no membership
# check is needed before filling; a null flag is filled with 0.
for col in binary_cols:
    df_ml[col] = df_ml[col].fillna(0)

# Catch-all pass: impute every column that still contains nulls after the
# targeted numeric / categorical / binary passes.
remaining_missing = df_ml.isnull().sum()
cols_with_missing = remaining_missing[remaining_missing > 0]

if len(cols_with_missing) > 0:
    print(cols_with_missing)

    for col in cols_with_missing.index:
        column = df_ml[col]
        # Use the dtype predicates instead of comparing against the strings
        # "object"/"category": pandas string dtypes do not compare equal to
        # "object", and sending a text column to median() raises TypeError.
        is_text_like = (
            pd.api.types.is_object_dtype(column)
            or isinstance(column.dtype, pd.CategoricalDtype)
            or pd.api.types.is_string_dtype(column)
        )
        if is_text_like:
            # Most frequent value, or a placeholder when the column has no
            # non-null value at all.
            mode_val = column.mode()
            fill_val = mode_val.iloc[0] if len(mode_val) > 0 else "Unknown"
            # A categorical column rejects values outside its categories, so
            # register the placeholder before filling with it.
            if (
                isinstance(column.dtype, pd.CategoricalDtype)
                and fill_val not in column.cat.categories
            ):
                column = column.cat.add_categories([fill_val])
        else:
            # Median, or 0 when the column is entirely null (median is NaN).
            median_val = column.median()
            fill_val = median_val if pd.notna(median_val) else 0
        df_ml[col] = column.fillna(fill_val)

# Total number of null cells left in the frame; printed in the next cell.
final_missing = df_ml.isnull().sum().sum()

# Persist the imputed frame without the pandas index column.
df_ml.to_csv("1_datasets/processed/teds_d_ml_ready.csv", index=False)

sex                                   909
race                                70421
ethnicity                           76662
marital_status                     347693
arrests_admit                     1387731
arrests_discharge                 1410497
referral_source                    167818
primary_substance_admit            204027
secondary_substance_admit          769527
tertiary_substance_admit          1209131
primary_substance_discharge        358410
secondary_substance_discharge      885301
tertiary_substance_discharge      1248165
route_primary                      217171
frequency_primary_admit            221899
frequency_primary_discharge        464979
age_first_use_primary              227755
medication_assisted_therapy        177091
dsm_diagnosis                      274448
health_insurance                   855659
payment_source                     832872
self_help_attendance_admit         306565
self_help_attendance_discharge     296589
injection_drug_use                

In [28]:
# Total count of null cells remaining in df_ml after all imputation passes.
print(final_missing)

0
