In [1]:
import pandas as pd
from best_library.cleaning_data import load_data, split_data, clean_data
from best_library.features import encode_features
from best_library.model import train_model, predict
from best_library.evaluation import compute_roc_auc

In [2]:
# ## 2. Load and inspect data
#
# Read the sample CSV through the project loader, then report its shape and
# preview the first rows. The path is relative, so it resolves against the
# kernel's working directory.

file_path = "../sample_diabetes_mellitus_data.csv"
df = load_data(file_path)

print("Raw data shape:", df.shape)
df.head(n=5)


Raw data shape: (10000, 53)


Unnamed: 0.1,Unnamed: 0,encounter_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,...,ventilated_apache,wbc_apache,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus
0,0,214826,118,68.0,22.732803,0,Caucasian,M,180.3,Floor,...,0,14.1,0,0,0,0,0,0,0,1
1,1,246060,81,77.0,27.421875,0,Caucasian,F,160.0,Floor,...,1,12.7,0,0,0,0,0,0,0,1
2,2,276985,118,25.0,31.952749,0,Caucasian,F,172.7,Emergency Department,...,0,,0,0,0,0,0,0,0,0
3,3,262220,118,81.0,22.635548,1,Caucasian,F,165.1,Operating Room,...,1,8.0,0,0,0,0,0,0,0,0
4,4,201746,33,19.0,,0,Caucasian,M,188.0,,...,0,,0,0,0,0,0,0,0,0


In [3]:
# ## 3. Split the data into train and test
#
# split_data returns a (train, test) pair of frames; report both shapes as a
# quick sanity check on the split.
# NOTE(review): no seed is passed here - confirm split_data is deterministic.

train_df, test_df = split_data(df)

print(
    f"Train size: {train_df.shape}",
    f"Test size: {test_df.shape}",
    sep="\n",
)


Train size: (8000, 53)
Test size: (2000, 53)


In [4]:
# ## 4. Clean data
#
# Each split is cleaned on its own. The saved output of this cell shows
# pandas' SettingWithCopyWarning, raised by a `fillna` assignment inside
# clean_data. Hand clean_data explicit copies so it never operates on a frame
# that pandas tracks as a slice of `df`, and so nothing it assigns can reach
# the raw frame.
# NOTE(review): if the warning persists, the chained assignment happens on a
# filtered frame inside clean_data and has to be fixed there.
# NOTE(review): the warning line suggests mean imputation; cleaning the splits
# separately would then impute test rows with test-set means - confirm that
# is intended rather than reusing the train-set means.

train_df = clean_data(train_df.copy())
test_df = clean_data(test_df.copy())

print("After cleaning:")
print(f"Train size: {train_df.shape}")
print(f"Test size: {test_df.shape}")


After cleaning:
Train size: (7494, 53)
Test size: (1874, 53)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df[col].mean())


In [5]:
# ## 5. Encode features
#
# Encode each split, then force the test frame onto the train frame's column
# schema. The splits are encoded independently, so a category that is absent
# from one split would otherwise give the two frames different dummy columns;
# the feature list is later derived from train_df.columns only, which would
# make column selection on test_df fail.

train_df = encode_features(train_df)
test_df = encode_features(test_df)

# Columns missing from test are added as all-False indicators; columns that
# exist only in test (categories never seen in training) are dropped because
# the model cannot use them. When both schemas already match this is a no-op.
test_df = test_df.reindex(columns=train_df.columns, fill_value=False)

train_df.head()


Unnamed: 0.1,Unnamed: 0,encounter_id,hospital_id,age,bmi,elective_surgery,gender,height,hospital_admit_source,icu_admit_source,...,leukemia,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus,gender_binary,ethnicity_Asian,ethnicity_Caucasian,ethnicity_Hispanic,ethnicity_Native American,ethnicity_Other/Unknown
9254,9254,185898,137,88.0,28.713767,0,F,157.48,Emergency Department,Accident & Emergency,...,1,0,0,1,0,False,True,False,False,False
1561,1561,179718,83,69.0,,0,F,152.4,Floor,Floor,...,0,0,0,1,0,False,True,False,False,False
1670,1670,204365,118,61.0,,0,M,188.0,,Accident & Emergency,...,0,0,0,0,1,False,True,False,False,False
6087,6087,167361,118,36.0,33.861606,1,F,165.1,Operating Room,Operating Room / Recovery,...,0,0,0,0,0,False,False,False,False,False
6669,6669,213013,83,49.0,22.693954,1,F,162.6,Operating Room,Operating Room / Recovery,...,0,0,0,0,0,False,True,False,False,False


In [6]:
# ## 6. Define feature set and target
#
# The model uses a fixed list of columns plus whichever ethnicity indicator
# columns are present in the encoded training frame.

target = "diabetes_mellitus"

# Ethnicity indicator columns are discovered by prefix rather than hard-coded.
ethnicity_cols = [name for name in train_df.columns if name.startswith("ethnicity_")]

features = [
    "age",
    "height",
    "weight",
    "aids",
    "cirrhosis",
    "hepatic_failure",
    "immunosuppression",
    "leukemia",
    "lymphoma",
    "solid_tumor_with_metastasis",
    "gender_binary",
    *ethnicity_cols,
]

print("Features used for modeling:")
print(features)


Features used for modeling:
['age', 'height', 'weight', 'aids', 'cirrhosis', 'hepatic_failure', 'immunosuppression', 'leukemia', 'lymphoma', 'solid_tumor_with_metastasis', 'gender_binary', 'ethnicity_Asian', 'ethnicity_Caucasian', 'ethnicity_Hispanic', 'ethnicity_Native American', 'ethnicity_Other/Unknown']


In [7]:
# ## 7. Train model
#
# Fit on the training split only, using the selected feature columns and the
# target column.
# NOTE(review): 'log' presumably selects logistic regression - confirm against
# best_library.model.train_model.

model = train_model(
    train_df[features],
    train_df[target],
    model_type="log",
)


In [8]:
# ## 8. Predict probabilities for train and test sets
#
# predict returns both frames; the columns read below and in the evaluation
# cell show that each comes back with a "predictions" column.

train_df, test_df = predict(model, train_df, test_df, features)

# Preview the true label next to the predicted score for the first rows.
train_df.loc[:, ["diabetes_mellitus", "predictions"]].head()


Unnamed: 0,diabetes_mellitus,predictions
9254,1,0.7
1561,1,0.711111
1670,0,0.033333
6087,0,0.166667
6669,0,0.066667


In [9]:
# ## 9. Evaluate model using ROC-AUC
#
# Score the predicted values against the true labels on each split.
# NOTE(review): the saved output reports train 0.999 vs test 0.610, and the
# previewed predictions sit on a coarse grid (multiples of 1/90). Both are
# unusual for a logistic regression - confirm which estimator was actually
# trained and whether the saved outputs are stale before relying on these
# numbers.

train_auc = compute_roc_auc(train_df[target], train_df["predictions"])
test_auc = compute_roc_auc(test_df[target], test_df["predictions"])

print(
    f"Train ROC-AUC: {train_auc:.3f}",
    f"Test ROC-AUC:  {test_auc:.3f}",
    sep="\n",
)


Train ROC-AUC: 0.999
Test ROC-AUC:  0.610
