In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv("patient_data.csv").drop(['patient_id'], axis=1)

In [2]:
data.describe()

Unnamed: 0,age,bmi,medication_count,days_hospitalized,readmitted,last_lab_glucose,urban,albumin_globulin_ratio,alanine_aminotransferase
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,53.5397,26.98658,3.0088,4.9747,0.2618,100.29563,0.7003,0.505534,26.985214
std,20.757324,4.900447,1.740639,2.228981,0.439636,19.908399,0.458149,0.289413,4.900837
min,18.0,9.6,0.0,0.0,0.0,23.0,0.0,0.0,9.6081
25%,36.0,23.6,2.0,3.0,0.0,87.0,0.0,0.255625,23.63065
50%,53.0,26.9,3.0,5.0,0.0,100.1,1.0,0.5037,26.946
75%,72.0,30.3,4.0,6.0,1.0,113.5,1.0,0.7551,30.2975
max,89.0,43.6,13.0,20.0,1.0,178.4,1.0,1.0,43.5486


In [3]:
non_numeric_cols = data.select_dtypes(exclude=np.number) #check items that are not numeric and need processing

In [4]:
non_numeric_cols.describe()

#We can notice here that we have a significant amount of NaN values for exercise_frequency and education_level
#20.12% of the dataset has exercise_frequency as NaN and 10.4% has education_level as NaN

Unnamed: 0,sex,smoker,diagnosis_code,exercise_frequency,diet_quality,income_bracket,education_level,chronic_obstructive_pulmonary_disease
count,10000,10000,10000,7988,10000,10000,8960,10000
unique,2,2,5,3,3,3,3,4
top,Female,No,D5,Moderate,Average,Middle,Secondary,C
freq,5072,7039,2040,3068,4000,4042,3984,2524


In [5]:
non_numeric_cols.nunique(dropna=False)

sex                                      2
smoker                                   2
diagnosis_code                           5
exercise_frequency                       4
diet_quality                             3
income_bracket                           3
education_level                          4
chronic_obstructive_pulmonary_disease    4
dtype: int64

In [6]:
for col in non_numeric_cols.columns:
    print(f"\nColumn: {col}")
    print(non_numeric_cols[col].unique())

# With this information we can notice that some attributes are binary, like sex or smoker, so they are straight foward to deal with.
# However, we notice that some data like exercise_frequency and education_level have NaN elements, so we have to use a model that already
# deals with that like XgBoost or with methods like data imputation or deletion.


Column: sex
<StringArray>
['Male', 'Female']
Length: 2, dtype: str

Column: smoker
<StringArray>
['No', 'Yes']
Length: 2, dtype: str

Column: diagnosis_code
<StringArray>
['D1', 'D4', 'D3', 'D2', 'D5']
Length: 5, dtype: str

Column: exercise_frequency
<StringArray>
[nan, 'High', 'Low', 'Moderate']
Length: 4, dtype: str

Column: diet_quality
<StringArray>
['Average', 'Good', 'Poor']
Length: 3, dtype: str

Column: income_bracket
<StringArray>
['Middle', 'High', 'Low']
Length: 3, dtype: str

Column: education_level
<StringArray>
['Secondary', 'Primary', 'Tertiary', nan]
Length: 4, dtype: str

Column: chronic_obstructive_pulmonary_disease
<StringArray>
['B', 'C', 'D', 'A']
Length: 4, dtype: str


In [7]:
def encode_dataset(df):
    """
    Encode categorical variables with proper ordinal relationships.
    Preserves NaN for XGBoost to handle.
    """
    data_encoded = df.copy()
    
    binary_mappings = {
        'sex': {'Female': 0, 'Male': 1},
        'smoker': {'No': 0, 'Yes': 1}
    }
    
    for col, mapping in binary_mappings.items():
        data_encoded[col] = data_encoded[col].map(mapping)
    
    ordinal_mappings = {
        'exercise_frequency': {'Low': 0, 'Moderate': 1, 'High': 2},
        'diet_quality': {'Poor': 0, 'Average': 1, 'Good': 2},
        'income_bracket': {'Low': 0, 'Middle': 1, 'High': 2},
        'education_level': {'Primary': 0, 'Secondary': 1, 'Tertiary': 2}
    }
    
    for col, mapping in ordinal_mappings.items():
        data_encoded[col] = data_encoded[col].map(mapping).astype('float')
    
    diagnosis_mapping = {'D1': 1, 'D2': 2, 'D3': 3, 'D4': 4, 'D5': 5}
    data_encoded['diagnosis_code'] = data_encoded['diagnosis_code'].map(diagnosis_mapping).astype('float')
    
    target_mapping = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
    data_encoded['chronic_obstructive_pulmonary_disease'] = \
        data_encoded['chronic_obstructive_pulmonary_disease'].map(target_mapping)
    
    return data_encoded

In [8]:
data.head()

Unnamed: 0,age,sex,bmi,smoker,diagnosis_code,medication_count,days_hospitalized,readmitted,last_lab_glucose,exercise_frequency,diet_quality,income_bracket,education_level,urban,albumin_globulin_ratio,chronic_obstructive_pulmonary_disease,alanine_aminotransferase
0,69,Male,25.3,No,D1,1,9,1,100.8,,Average,Middle,Secondary,0,0.8934,B,25.4423
1,32,Male,27.0,Yes,D4,1,7,0,106.3,High,Good,High,Primary,1,0.7104,C,27.0529
2,89,Female,30.6,No,D1,2,9,0,138.4,,Good,Middle,Tertiary,1,0.1078,B,30.4405
3,78,Male,17.8,Yes,D1,4,5,0,81.8,Low,Average,Low,Secondary,1,0.3754,C,17.5797
4,38,Female,37.7,No,D3,5,6,0,115.2,Low,Poor,Low,Primary,1,0.0994,D,37.7834


In [9]:
df_model = encode_dataset(data)

In [10]:
df_model.to_csv('treated_data.csv', index=False)

In [11]:
import numpy as np

def compute_defaults(df_model):
    EXCLUDE_COLS = [
        "chronic_obstructive_pulmonary_disease",
        "alanine_aminotransferase",
    ]

    df_features = df_model.drop(columns=EXCLUDE_COLS)

    categorical_cols = [
        "sex",
        "smoker",
        "diagnosis_code",
        "exercise_frequency",
        "diet_quality",
        "income_bracket",
        "education_level",
        "readmitted",
        "urban",
    ]

    continuous_cols = [
        "age",
        "bmi",
        "medication_count",
        "days_hospitalized",
        "last_lab_glucose",
        "albumin_globulin_ratio",
    ]

    defaults = {}

    # Mode for categorical
    for col in categorical_cols:
        value = df_features[col].mode(dropna=True)[0]
        defaults[col] = int(value) if isinstance(value, np.integer) else value

    # Median for continuous
    for col in continuous_cols:
        value = df_features[col].median()
        defaults[col] = float(value)

    return defaults


In [12]:
compute_defaults(df_model)

{'sex': 0,
 'smoker': 0,
 'diagnosis_code': np.float64(5.0),
 'exercise_frequency': np.float64(1.0),
 'diet_quality': np.float64(1.0),
 'income_bracket': np.float64(1.0),
 'education_level': np.float64(1.0),
 'readmitted': 0,
 'urban': 1,
 'age': 53.0,
 'bmi': 26.9,
 'medication_count': 3.0,
 'days_hospitalized': 5.0,
 'last_lab_glucose': 100.1,
 'albumin_globulin_ratio': 0.5037}