In [2]:
import numpy as np
import pandas as pd
import re

In [3]:
raw_data = pd.read_csv("../data/raw/diabetic_data.csv")

In [4]:
missing_value_count = (raw_data == "?").sum()

missing_value_percentage = ((raw_data == "?").sum() / len(raw_data)) * 100

missing_value_percentage

encounter_id                 0.000000
patient_nbr                  0.000000
race                         2.233555
gender                       0.000000
age                          0.000000
weight                      96.858479
admission_type_id            0.000000
discharge_disposition_id     0.000000
admission_source_id          0.000000
time_in_hospital             0.000000
payer_code                  39.557416
medical_specialty           49.082208
num_lab_procedures           0.000000
num_procedures               0.000000
num_medications              0.000000
number_outpatient            0.000000
number_emergency             0.000000
number_inpatient             0.000000
diag_1                       0.020636
diag_2                       0.351787
diag_3                       1.398306
number_diagnoses             0.000000
max_glu_serum                0.000000
A1Cresult                    0.000000
metformin                    0.000000
repaglinide                  0.000000
nateglinide 

### Drop columns that exceed the 20% acceptable threshold for missing values.

In [5]:
raw_data = raw_data.drop(columns=["weight", "payer_code", "medical_specialty"], inplace=False)

raw_data.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),6,25,1,1,41,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),1,1,7,3,59,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),1,1,7,2,11,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),1,1,7,2,44,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),1,1,7,1,51,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [6]:
DRUG_COLS = [
    "metformin","repaglinide","nateglinide","chlorpropamide","glimepiride",
    "acetohexamide","glipizide","glyburide","tolbutamide","pioglitazone",
    "rosiglitazone","acarbose","miglitol","troglitazone","tolazamide",
    "examide","citoglipton","insulin","glyburide-metformin","glipizide-metformin",
    "glimepiride-pioglitazone","metformin-rosiglitazone","metformin-pioglitazone"
]

drug_summary = raw_data[DRUG_COLS].apply(pd.Series.value_counts).T.fillna(0).astype(int)
drug_percentages = drug_summary.div(drug_summary.sum(axis=1), axis=0) * 100
  
drug_percentages

Unnamed: 0,Down,No,Steady,Up
metformin,0.565022,80.358862,18.027632,1.048484
repaglinide,0.044219,98.487707,1.359983,0.108091
nateglinide,0.010809,99.3092,0.656408,0.023584
chlorpropamide,0.000983,99.915492,0.077629,0.005896
glimepiride,0.190633,94.899082,4.588959,0.321325
acetohexamide,0.0,99.999017,0.000983,0.0
glipizide,0.550282,87.534147,11.158933,0.756638
glyburide,0.554213,89.534815,9.113063,0.797909
tolbutamide,0.0,99.977399,0.022601,0.0
pioglitazone,0.115952,92.799167,6.854942,0.229939


### Drop columns that are near constant (>= 99%).

In [7]:
drop_drugs = [
    "nateglinide",
    "chlorpropamide",
    "acetohexamide",
    "tolbutamide",
    "acarbose",
    "miglitol",
    "troglitazone",
    "tolazamide",
    "examide",
    "citoglipton",
    "glyburide-metformin",
    "glipizide-metformin",
    "glimepiride-pioglitazone",
    "metformin-rosiglitazone",
    "metformin-pioglitazone"
]

raw_data = raw_data.drop(columns=drop_drugs, inplace=False)
raw_data.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,...,repaglinide,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),6,25,1,1,41,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),1,1,7,3,59,...,No,No,No,No,No,No,Up,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),1,1,7,2,11,...,No,No,Steady,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),1,1,7,2,44,...,No,No,No,No,No,No,Up,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),1,1,7,1,51,...,No,No,Steady,No,No,No,Steady,Ch,Yes,NO


In [8]:
patient_nbr_is_unique = raw_data["patient_nbr"].is_unique

encounter_id_is_unique = raw_data["encounter_id"].is_unique

print(f"Patient number is unique: {patient_nbr_is_unique}")
print(f"Encounter id is unique: {encounter_id_is_unique}")

Patient number is unique: False
Encounter id is unique: True


In [9]:
raw_data = raw_data.drop(columns=["patient_nbr", "encounter_id"], inplace=False)

raw_data.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,repaglinide,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,change,diabetesMed,readmitted
0,Caucasian,Female,[0-10),6,25,1,1,41,0,1,...,No,No,No,No,No,No,No,No,No,NO
1,Caucasian,Female,[10-20),1,1,7,3,59,0,18,...,No,No,No,No,No,No,Up,Ch,Yes,>30
2,AfricanAmerican,Female,[20-30),1,1,7,2,11,5,13,...,No,No,Steady,No,No,No,No,No,Yes,NO
3,Caucasian,Male,[30-40),1,1,7,2,44,1,16,...,No,No,No,No,No,No,Up,Ch,Yes,NO
4,Caucasian,Male,[40-50),1,1,7,1,51,0,8,...,No,No,Steady,No,No,No,Steady,Ch,Yes,NO


In [10]:
OHE_COLS = [
    "race",
    "gender",
    "admission_type_id",
    "discharge_disposition_id",
    "admission_source_id",
    "max_glu_serum",
    "A1Cresult"
]

raw_data = pd.get_dummies(raw_data, columns=OHE_COLS, drop_first=False, dtype=int)

raw_data.head()

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,...,admission_source_id_17,admission_source_id_20,admission_source_id_22,admission_source_id_25,max_glu_serum_>200,max_glu_serum_>300,max_glu_serum_Norm,A1Cresult_>7,A1Cresult_>8,A1Cresult_Norm
0,[0-10),1,41,0,1,0,0,0,250.83,?,...,0,0,0,0,0,0,0,0,0,0
1,[10-20),3,59,0,18,0,0,0,276.0,250.01,...,0,0,0,0,0,0,0,0,0,0
2,[20-30),2,11,5,13,2,0,1,648.0,250,...,0,0,0,0,0,0,0,0,0,0
3,[30-40),2,44,1,16,0,0,0,8.0,250.43,...,0,0,0,0,0,0,0,0,0,0
4,[40-50),1,51,0,8,0,0,0,197.0,157,...,0,0,0,0,0,0,0,0,0,0


In [11]:
age_map = {
    "[0-10)":0, "[10-20)":1, "[20-30)":2, "[30-40)":3,
    "[40-50)":4, "[50-60)":5, "[60-70)":6, "[70-80)":7,
    "[80-90)":8, "[90-100)":9
}

raw_data["age"] = raw_data["age"].map(age_map).astype(int)
raw_data.head()

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,...,admission_source_id_17,admission_source_id_20,admission_source_id_22,admission_source_id_25,max_glu_serum_>200,max_glu_serum_>300,max_glu_serum_Norm,A1Cresult_>7,A1Cresult_>8,A1Cresult_Norm
0,0,1,41,0,1,0,0,0,250.83,?,...,0,0,0,0,0,0,0,0,0,0
1,1,3,59,0,18,0,0,0,276.0,250.01,...,0,0,0,0,0,0,0,0,0,0
2,2,2,11,5,13,2,0,1,648.0,250,...,0,0,0,0,0,0,0,0,0,0
3,3,2,44,1,16,0,0,0,8.0,250.43,...,0,0,0,0,0,0,0,0,0,0
4,4,1,51,0,8,0,0,0,197.0,157,...,0,0,0,0,0,0,0,0,0,0


In [12]:
DRUG_COLS = [
    "metformin","repaglinide","glimepiride","glipizide","glyburide","pioglitazone","rosiglitazone","insulin"
]

for col in DRUG_COLS:
    unique_values_drugs = raw_data[col].unique()
    print(f"{col}: {unique_values_drugs}")

metformin: ['No' 'Steady' 'Up' 'Down']
repaglinide: ['No' 'Up' 'Steady' 'Down']
glimepiride: ['No' 'Steady' 'Down' 'Up']
glipizide: ['No' 'Steady' 'Up' 'Down']
glyburide: ['No' 'Steady' 'Up' 'Down']
pioglitazone: ['No' 'Steady' 'Up' 'Down']
rosiglitazone: ['No' 'Steady' 'Up' 'Down']
insulin: ['No' 'Up' 'Steady' 'Down']


In [13]:
drug_map = {"No":0, "Down":1, "Steady":2, "Up":3}

for col in DRUG_COLS:
    raw_data[col] = raw_data[col].map(drug_map).astype(int)

raw_data.head()

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,...,admission_source_id_17,admission_source_id_20,admission_source_id_22,admission_source_id_25,max_glu_serum_>200,max_glu_serum_>300,max_glu_serum_Norm,A1Cresult_>7,A1Cresult_>8,A1Cresult_Norm
0,0,1,41,0,1,0,0,0,250.83,?,...,0,0,0,0,0,0,0,0,0,0
1,1,3,59,0,18,0,0,0,276.0,250.01,...,0,0,0,0,0,0,0,0,0,0
2,2,2,11,5,13,2,0,1,648.0,250,...,0,0,0,0,0,0,0,0,0,0
3,3,2,44,1,16,0,0,0,8.0,250.43,...,0,0,0,0,0,0,0,0,0,0
4,4,1,51,0,8,0,0,0,197.0,157,...,0,0,0,0,0,0,0,0,0,0


In [14]:
unique_values_change = raw_data["change"].unique()
print(f"Change: {unique_values_change}")

Change: ['No' 'Ch']


In [15]:
unique_values_diabetesMed = raw_data["diabetesMed"].unique()
print(f"DiabetesMed: {unique_values_diabetesMed}")

DiabetesMed: ['No' 'Yes']


In [16]:
binary_map = {"No":0, "Ch":1, "Yes":1}
raw_data["change"] = raw_data["change"].map(binary_map).astype(int)
raw_data["diabetesMed"] = raw_data["diabetesMed"].map(binary_map).astype(int)

raw_data.head()

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,...,admission_source_id_17,admission_source_id_20,admission_source_id_22,admission_source_id_25,max_glu_serum_>200,max_glu_serum_>300,max_glu_serum_Norm,A1Cresult_>7,A1Cresult_>8,A1Cresult_Norm
0,0,1,41,0,1,0,0,0,250.83,?,...,0,0,0,0,0,0,0,0,0,0
1,1,3,59,0,18,0,0,0,276.0,250.01,...,0,0,0,0,0,0,0,0,0,0
2,2,2,11,5,13,2,0,1,648.0,250,...,0,0,0,0,0,0,0,0,0,0
3,3,2,44,1,16,0,0,0,8.0,250.43,...,0,0,0,0,0,0,0,0,0,0
4,4,1,51,0,8,0,0,0,197.0,157,...,0,0,0,0,0,0,0,0,0,0


In [17]:
unique_values_change = raw_data["readmitted"].unique()
print(f"Readmitted: {unique_values_change}")

Readmitted: ['NO' '>30' '<30']


In [18]:
readmitted_binary_map = {"NO":0, ">30":0, "<30":1}
raw_data["readmitted"] = raw_data["readmitted"].map(readmitted_binary_map).astype(int)

raw_data.head()

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,...,admission_source_id_17,admission_source_id_20,admission_source_id_22,admission_source_id_25,max_glu_serum_>200,max_glu_serum_>300,max_glu_serum_Norm,A1Cresult_>7,A1Cresult_>8,A1Cresult_Norm
0,0,1,41,0,1,0,0,0,250.83,?,...,0,0,0,0,0,0,0,0,0,0
1,1,3,59,0,18,0,0,0,276.0,250.01,...,0,0,0,0,0,0,0,0,0,0
2,2,2,11,5,13,2,0,1,648.0,250,...,0,0,0,0,0,0,0,0,0,0
3,3,2,44,1,16,0,0,0,8.0,250.43,...,0,0,0,0,0,0,0,0,0,0
4,4,1,51,0,8,0,0,0,197.0,157,...,0,0,0,0,0,0,0,0,0,0


In [34]:
unique_values_diag1 = raw_data["diag_1"].unique()
print(f"Diagnosis 1: {unique_values_diag1}")

Diagnosis 1: ['250.83' '276' '648' '8' '197' '414' '428' '398' '434' '250.7' '157'
 '518' '999' '410' '682' '402' '737' '572' 'V57' '189' '786' '427' '996'
 '277' '584' '462' '473' '411' '174' '486' '998' '511' '432' '626' '295'
 '196' '250.6' '618' '182' '845' '423' '808' '250.4' '722' '403' '250.11'
 '784' '707' '440' '151' '715' '997' '198' '564' '812' '38' '590' '556'
 '578' '250.32' '433' 'V58' '569' '185' '536' '255' '250.13' '599' '558'
 '574' '491' '560' '244' '250.03' '577' '730' '188' '824' '250.8' '332'
 '562' '291' '296' '510' '401' '263' '438' '70' '250.02' '493' '642' '625'
 '571' '738' '593' '250.42' '807' '456' '446' '575' '250.41' '820' '515'
 '780' '250.22' '995' '235' '250.82' '721' '787' '162' '724' '282' '514'
 'V55' '281' '250.33' '530' '466' '435' '250.12' 'V53' '789' '566' '822'
 '191' '557' '733' '455' '711' '482' '202' '280' '553' '225' '154' '441'
 '250.81' '349' '?' '962' '592' '507' '386' '156' '200' '728' '348' '459'
 '426' '388' '607' '337' '82' '531' '59

In [20]:
raw_data = raw_data.drop(columns=["diag_2", "diag_3"], inplace=False)

In [21]:
chapters = pd.read_csv("../data/external/icd9_chapters.csv")

In [32]:
def safe_int(x):
    try:
        return int(str(x).replace("E", "").replace("V", ""))
    except ValueError:
        return None

chapters["start_num"] = chapters["start"].apply(safe_int)
chapters["end_num"]   = chapters["end"].apply(safe_int)

def bucket_icd9(code: str) -> str:
    # treat "?" as unknown
    if pd.isna(code) or str(code).strip() == "?":
        return "Unknown"

    code = str(code).strip()

    # V / E chapters
    if code[0].upper() == "V":
        return "Supplementary Factors (V)"
    if code[0].upper() == "E":
        return "External Causes (E)"

    m = re.match(r"^(\d{3})", code.replace(".", ""))
    if not m:
        return "Unknown"

    three = int(m.group(1))

    for _, row in chapters.iterrows():
        if row["start_num"] is not None and row["end_num"] is not None:
            if row["start_num"] <= three <= row["end_num"]:
                return row["name"]

    return "Other/Unmapped"

c = "diag_1"

raw_data[c] = raw_data[c].astype(str)
raw_data[c + "_grp"] = raw_data[c].apply(bucket_icd9)

In [33]:
raw_data.head()

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,number_diagnoses,...,admission_source_id_20,admission_source_id_22,admission_source_id_25,max_glu_serum_>200,max_glu_serum_>300,max_glu_serum_Norm,A1Cresult_>7,A1Cresult_>8,A1Cresult_Norm,diag_1_grp
0,0,1,41,0,1,0,0,0,250.83,1,...,0,0,0,0,0,0,0,0,0,Endocrine/Metabolic/Immunity
1,1,3,59,0,18,0,0,0,276.0,9,...,0,0,0,0,0,0,0,0,0,Endocrine/Metabolic/Immunity
2,2,2,11,5,13,2,0,1,648.0,6,...,0,0,0,0,0,0,0,0,0,Pregnancy/Childbirth/Puerperium
3,3,2,44,1,16,0,0,0,8.0,7,...,0,0,0,0,0,0,0,0,0,Unknown
4,4,1,51,0,8,0,0,0,197.0,5,...,0,0,0,0,0,0,0,0,0,Neoplasms


In [36]:
raw_data = raw_data.drop(columns=["diag_1"], inplace=False)

In [39]:
raw_data = pd.get_dummies(
    raw_data,
    columns=["diag_1_grp"],
    prefix="diag1",
    drop_first=False,
    dtype=int
)

raw_data.head()

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,metformin,...,max_glu_serum_Norm,A1Cresult_>7,A1Cresult_>8,A1Cresult_Norm,diag1_0,diag1_1,diag1_2,diag1_18,diag1_19,diag1_20
0,0,1,41,0,1,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
1,1,3,59,0,18,0,0,0,9,0,...,0,0,0,0,1,0,0,0,0,0
2,2,2,11,5,13,2,0,1,6,0,...,0,0,0,0,1,0,0,0,0,0
3,3,2,44,1,16,0,0,0,7,0,...,0,0,0,0,1,0,0,0,0,0
4,4,1,51,0,8,0,0,0,5,0,...,0,0,0,0,0,0,1,0,0,0


In [40]:
# See non-numeric columns
non_numeric = raw_data.select_dtypes(exclude=["number"]).columns
print("Non-numeric columns:", non_numeric.tolist())

Non-numeric columns: []


In [35]:
raw_data.to_csv("testing.csv", index=False)