In [None]:
import re

import numpy as np
import pandas as pd

# Load the raw extract. Missing entries are counted below via the "?" placeholder.
raw_data = pd.read_csv("../data/raw/diabetic_data.csv")

# Build the "?" mask once, then report it per column as a count and as a
# percentage of all rows.
is_placeholder = raw_data == "?"
missing_value_count = is_placeholder.sum()
missing_value_percentage = (missing_value_count / len(raw_data)) * 100

missing_value_percentage

# Remove three columns outright (presumably the ones with the highest
# missing share in the table above -- confirm against the percentages).
sparse_cols = ["weight", "payer_code", "medical_specialty"]
raw_data = raw_data.drop(columns=sparse_cols)

raw_data.head()

# Every medication column present in the raw extract.
DRUG_COLS = [
    "metformin",
    "repaglinide",
    "nateglinide",
    "chlorpropamide",
    "glimepiride",
    "acetohexamide",
    "glipizide",
    "glyburide",
    "tolbutamide",
    "pioglitazone",
    "rosiglitazone",
    "acarbose",
    "miglitol",
    "troglitazone",
    "tolazamide",
    "examide",
    "citoglipton",
    "insulin",
    "glyburide-metformin",
    "glipizide-metformin",
    "glimepiride-pioglitazone",
    "metformin-rosiglitazone",
    "metformin-pioglitazone",
]

# One row per drug, one column per observed value; values a drug never
# takes are filled with 0 so the table can be cast to integers.
value_counts_by_drug = raw_data[DRUG_COLS].apply(pd.Series.value_counts)
drug_summary = value_counts_by_drug.T.fillna(0).astype(int)

# Convert each drug's counts into percentages of that drug's row total.
row_totals = drug_summary.sum(axis=1)
drug_percentages = drug_summary.div(row_totals, axis=0) * 100

drug_percentages

# Medication columns removed from the working frame (presumably the ones
# with almost no variation in the percentage table -- confirm before reuse).
drop_drugs = [
    "nateglinide", "chlorpropamide", "acetohexamide", "tolbutamide",
    "acarbose", "miglitol", "troglitazone", "tolazamide",
    "examide", "citoglipton",
    # Fixed-dose combination products
    "glyburide-metformin", "glipizide-metformin",
    "glimepiride-pioglitazone", "metformin-rosiglitazone",
    "metformin-pioglitazone",
]

# drop() returns a new frame by default, so rebinding is all that is needed.
raw_data = raw_data.drop(columns=drop_drugs)
raw_data.head()

# Report whether each identifier column holds only distinct values, then
# remove both identifiers from the working frame.
id_cols = ["patient_nbr", "encounter_id"]

patient_nbr_is_unique = raw_data["patient_nbr"].is_unique
encounter_id_is_unique = raw_data["encounter_id"].is_unique

print(f"Patient number is unique: {patient_nbr_is_unique}")
print(f"Encounter id is unique: {encounter_id_is_unique}")

raw_data = raw_data.drop(columns=id_cols)

raw_data.head()

# Categorical columns expanded into 0/1 indicator columns.
OHE_COLS = [
    "race", "gender",
    "admission_type_id", "discharge_disposition_id", "admission_source_id",
    "max_glu_serum", "A1Cresult",
]

# Every observed level gets its own indicator (no reference level is
# dropped), stored as int rather than bool.
raw_data = pd.get_dummies(
    data=raw_data,
    columns=OHE_COLS,
    drop_first=False,
    dtype=int,
)

raw_data.head()

# Map each ten-year bracket label "[lo-hi)" to its midpoint:
# "[0-10)" -> 5, "[10-20)" -> 15, ..., "[90-100)" -> 95.
age_map = {
    f"[{lower}-{lower + 10})": lower + 5
    for lower in range(0, 100, 10)
}

# A label missing from age_map becomes NaN, which makes the int cast raise.
age_midpoints = raw_data["age"].map(age_map)
raw_data["age"] = age_midpoints.astype(int)
raw_data.head()

# 

# Medication columns kept in the frame.
DRUG_KEEP = [
    "metformin",
    "repaglinide",
    "glimepiride",
    "glipizide",
    "glyburide",
    "pioglitazone",
    "rosiglitazone",
    "insulin",
]

# Integer code per medication status: No=0, Down=1, Steady=2, Up=3.
drug_map = {
    status: rank
    for rank, status in enumerate(("No", "Down", "Steady", "Up"))
}

for col in DRUG_KEEP:
    # A status outside drug_map becomes NaN, which makes the int cast raise.
    encoded = raw_data[col].map(drug_map)
    raw_data[col] = encoded.astype(int)

raw_data.head()

# "change" and "diabetesMed" share one 0/1 encoding: "No" -> 0, while both
# "Ch" and "Yes" -> 1.
binary_map = {"No": 0, "Ch": 1, "Yes": 1}

for flag_col in ("change", "diabetesMed"):
    flag_values = raw_data[flag_col].map(binary_map)
    raw_data[flag_col] = flag_values.astype(int)

raw_data.head()

# ICD-9 diagnosis codes are bucketed into coarse clinical groups for
# interpretability, for example:
# - Cardiovascular Disease
# - Kidney Disease
# - Respiratory Disease
#
# First normalise the three diagnosis columns: the "?" placeholder becomes
# the explicit token "UNK", and every value is cast to str.
for c in ("diag_1", "diag_2", "diag_3"):
    without_placeholder = raw_data[c].replace("?", "UNK")
    raw_data[c] = without_placeholder.astype(str)

# Leading category digits of a numeric ICD-9 code: one to three digits,
# stopping at the decimal point ("250.13" -> "250", "8" -> "8").
_ICD9_LEADING_DIGITS = re.compile(r"\d{1,3}")

# Tokens (compared upper-cased) that stand for a missing diagnosis.
_ICD9_UNKNOWN_TOKENS = frozenset({"UNK", "UNKNOWN", "NA", "NONE"})

# (inclusive upper bound, label) per ICD-9 chapter, in ascending order.
# The first entry whose bound is >= the category number wins.
_ICD9_CHAPTERS = (
    (139, "Infectious & Parasitic (001–139)"),
    (239, "Neoplasms (140–239)"),
    (279, "Endocrine/Metabolic (240–279)"),
    (289, "Blood (280–289)"),
    (319, "Mental (290–319)"),
    (389, "Nervous/Sense (320–389)"),
    (459, "Circulatory (390–459)"),
    (519, "Respiratory (460–519)"),
    (579, "Digestive (520–579)"),
    (629, "Genitourinary (580–629)"),
    (679, "Pregnancy (630–679)"),
    (709, "Skin (680–709)"),
    (739, "Musculoskeletal (710–739)"),
    (759, "Congenital (740–759)"),
    (779, "Perinatal (760–779)"),
    (799, "Symptoms/Ill-defined (780–799)"),
    (999, "Injury/Poisoning (800–999)"),
)


def bucket_icd9(code: str) -> str:
    """Return a coarse clinical group for an ICD-9 diagnosis code.

    The code is stripped of surrounding whitespace, then classified:

    - empty values and the tokens UNK / UNKNOWN / NA / NONE (any case)
      map to "Unknown";
    - codes starting with V or E map to the supplementary / external-cause
      groups;
    - numeric codes are grouped by the category number before the decimal
      point (up to three leading digits), so "250.13" -> 250 and
      "8" -> 8. Category 250 is reported separately as "Diabetes (250)".

    Args:
        code: ICD-9 code as text. Non-string values are coerced with
            ``str()`` first, so ``None`` and NaN end up as "Unknown".

    Returns:
        The group label; "Unknown" when no category number can be read,
        and "Other/Unmapped" for a numeric category outside 1-999.
    """
    # Strip before any comparison so padded tokens (" UNK ") and
    # whitespace-only strings are handled instead of indexing into "".
    code = str(code).strip()
    if not code or code.upper() in _ICD9_UNKNOWN_TOKENS:
        return "Unknown"

    # V and E codes (non-numeric ICD-9 chapters)
    if code[0] in ("V", "v"):
        return "Supplementary Factors (V)"
    if code[0] in ("E", "e"):
        return "External Causes (E)"

    # Read the category from the digits *before* the decimal point. Removing
    # the "." first would turn "70.5" into category 705, and requiring three
    # digits would reject valid short categories such as "8" or "38".
    m = _ICD9_LEADING_DIGITS.match(code)
    if not m:
        return "Unknown"

    three = int(m.group(0))

    # Diabetes is split out of the Endocrine/Metabolic chapter.
    if three == 250:
        return "Diabetes (250)"

    if three >= 1:
        for upper_bound, label in _ICD9_CHAPTERS:
            if three <= upper_bound:
                return label

    return "Other/Unmapped"

# 3) Give each diagnosis column a "<name>_grp" companion holding the coarse
#    group returned by bucket_icd9, collecting the new column names as we go.
DIAG_GRP_COLS = []
for c in ["diag_1", "diag_2", "diag_3"]:
    grp_col = c + "_grp"
    raw_data[grp_col] = raw_data[c].apply(bucket_icd9)
    DIAG_GRP_COLS.append(grp_col)

# The raw diag_1..diag_3 columns are left in place; drop them if they will
# not be used directly.

# 4) One-hot encode the grouped columns: one int indicator per observed
#    group, with no reference level dropped.
raw_data = pd.get_dummies(
    data=raw_data,
    columns=DIAG_GRP_COLS,
    drop_first=False,
    dtype=int,
)

raw_data.head()



NameError: name 'pd' is not defined