## Load Data and Analyze Missing Patterns
Examine extent and patterns of missing data

In [9]:
import pandas as pd

df = pd.read_csv("1_datasets/processed/teds_a_2023_cleaned.csv")

  df = pd.read_csv("1_datasets/processed/teds_a_2023_cleaned.csv")


## Calculate Missing Value Statistics
Get percentage of missing values for each column

In [4]:
missing_stats = pd.DataFrame(
    {
        "column": df.columns,
        "missing_count": df.isnull().sum(),
        "missing_percent": (df.isnull().sum() / len(df) * 100).round(2),
        "non_missing_count": df.notnull().sum(),
    }
).sort_values("missing_percent", ascending=False)
missing_stats

Unnamed: 0,column,missing_count,missing_percent,non_missing_count
recent_arrests,recent_arrests,1528810,94.03,97023
tertiary_substance,tertiary_substance,1390965,85.55,234868
pregnant,pregnant,1131055,69.57,494778
income_source,income_source,962960,59.23,662873
secondary_substance,secondary_substance,957190,58.87,668643
health_insurance,health_insurance,878480,54.03,747353
wait_time_days,wait_time_days,872723,53.68,753110
payment_source,payment_source,867697,53.37,758136
marital_status,marital_status,474164,29.16,1151669
self_help_attendance,self_help_attendance,443359,27.27,1182474


## Identify Critical Variables
Define which variables are essential (cannot be missing for analysis) and
remove only rows missing critical variables for analysis

In [8]:
critical_vars = ["patient_id", "service_type", "primary_substance", "age_group", "sex"]

df_analysis = df.dropna(subset=critical_vars)

rows_removed = len(df) - len(df_analysis)
removal_percent = round((rows_removed / len(df) * 100), 2)

removal_summary = {
    "original_rows": len(df),
    "rows_after_removal": len(df_analysis),
    "rows_removed": rows_removed,
    "percent_removed": removal_percent,
}
df_analysis.to_csv("1_datasets/processed/teds_analysis_ready.csv", index=False)

## Machine Learning Preparation (For Later Phase)
For ML models, we'll need imputation rather than deletion

In [7]:
df_ml = df.copy()

numeric_cols = ["years_using", "number_of_substances"]
for col in numeric_cols:
    if col in df_ml.columns:
        df_ml[col] = df_ml[col].fillna(df_ml[col].median())

categorical_cols = [
    "wait_time_days",
    "prior_treatments",
    "employment_status",
    "education_level",
    "living_arrangement",
    "income_source",
]
for col in categorical_cols:
    if col in df_ml.columns:
        df_ml[col] = df_ml[col].fillna(df_ml[col].mode()[0])

binary_cols = [
    col for col in df_ml.columns if col.startswith("is_") or col.startswith("has_")
]
for col in binary_cols:
    if col in df_ml.columns:
        df_ml[col] = df_ml[col].fillna(0)

df_ml.to_csv("1_datasets/processed/teds_ml_ready.csv", index=False)