In [1472]:
import numpy as np
import pandas as pd
import os

# Data Exploration

In [1473]:
df = pd.read_csv("Imported data/Assignment DataSet.csv")
df.head(10)

Unnamed: 0.1,Unnamed: 0,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,...,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,metformin-rosiglitazone,change,diabetesMed,readmitted
0,0,41186889,Caucasian,Female,[80-90),1,6,7,13,69,...,No,No,Steady,Steady,No,No,No,No,Yes,NO
1,1,83591073,Caucasian,Female,[80-90),6,1,17,5,34,...,No,No,Steady,No,No,No,No,No,No,NO
2,2,68337009,Other,Male,[60-70),1,2,1,1,37,...,No,No,Steady,Steady,No,No,No,No,Yes,NO
3,3,73175076,Caucasian,Male,[40-50),1,1,7,4,56,...,No,No,Steady,Up,No,No,No,Ch,Yes,YES
4,4,31718349,Caucasian,Female,[70-80),1,1,7,1,38,...,No,No,Steady,No,No,No,No,No,No,NO
5,5,20909367,Hispanic,Female,[80-90),2,6,1,2,61,...,No,No,No,Down,No,No,No,Ch,Yes,NO
6,6,86870088,AfricanAmerican,Female,[50-60),1,1,7,3,26,...,No,No,No,Down,No,No,No,Ch,Yes,YES
7,7,52043301,Caucasian,Male,[60-70),3,1,1,6,66,...,No,No,No,No,No,No,No,No,No,NO
8,8,58716846,AfricanAmerican,Female,[60-70),1,13,7,9,74,...,No,No,No,Steady,No,No,No,No,Yes,NO
9,9,102729258,Caucasian,Male,[40-50),6,1,7,12,68,...,No,No,Steady,Up,No,No,No,Ch,Yes,NO


In [1474]:
print('Number of columns:', len(df.columns)) # -1 to exclude the target column
print('Number of samples:', len(df))

Number of columns: 38
Number of samples: 5000


In [1475]:
features = df.columns[2:-1]
df_features = df[features]
patient_class = df.columns[-1]
df_class = df[patient_class]

In [1476]:
print('Features: ', features)
print('Number of features:', len(features))

Features:  Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'diag_4', 'number_diagnoses', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'troglitazone', 'tolazamide',
       'examide', 'citoglipton', 'insulin', 'glyburide-metformin',
       'glipizide-metformin', 'metformin-rosiglitazone', 'change',
       'diabetesMed'],
      dtype='object')
Number of features: 35


In [1477]:
# Statistical summary of the dataset of both numerical and categorical columns
df_features.describe(include='all')

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,metformin-rosiglitazone,change,diabetesMed
count,5000,5000,5000,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,...,5000,5000,5000,5000,5000,5000,5000,5000,5000,5000
unique,6,2,10,,,,,,,,...,1,2,1,2,4,4,2,1,2,2
top,Caucasian,Female,[70-80),,,,,,,,...,No,No,No,Steady,No,No,No,No,No,Yes
freq,3759,2701,1241,,,,,,,,...,5000,4999,5000,3457,2329,4967,4999,5000,2692,3860
mean,,,,2.0322,3.7004,5.704,4.4304,43.1952,1.3604,15.9046,...,,,,,,,,,,
std,,,,1.45643,5.265148,3.951393,3.009612,19.761316,1.711227,8.117406,...,,,,,,,,,,
min,,,,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,,,,,,,,,,
25%,,,,1.0,1.0,1.0,2.0,31.0,0.0,10.0,...,,,,,,,,,,
50%,,,,1.0,1.0,7.0,4.0,44.0,1.0,15.0,...,,,,,,,,,,
75%,,,,3.0,3.25,7.0,6.0,57.0,2.0,20.0,...,,,,,,,,,,


## Data quality

### Repeated instances

First, we analyze if there are repeated patients in the data.

In [1478]:
repeated_instances = df.duplicated()
repeated_instances = repeated_instances[repeated_instances == True]
print('Number of repeated instances:', len(repeated_instances))

Number of repeated instances: 0


In [1479]:
repeated_patients = df['patient_nbr'].duplicated()
repeated_patients = repeated_patients[repeated_patients == True]
repeated_patients
print('Number of repeated patients:', len(repeated_patients))

Number of repeated patients: 163


There is no duplicated data, but there are duplicated patients (personal data duplicated). To avoid bias, we keep one of the repeated entries.

In [1480]:
# Keep only one instance of each repeated id
df = df.drop_duplicates(subset='patient_nbr')
print('Number of unique patients:', len(df))

Number of unique patients: 4837


### Missing values

Now, we are interested in studying the data quality of each feature. First we look into the features with missing values:

In [1481]:
nan_values = df.isna().sum()
print('Number of missing values in each column:', nan_values)

Number of missing values in each column: Unnamed: 0                  0
patient_nbr                 0
race                        0
gender                      0
age                         0
admission_type_id           0
discharge_disposition_id    0
admission_source_id         0
time_in_hospital            0
num_lab_procedures          0
num_procedures              0
num_medications             0
number_outpatient           0
number_emergency            0
number_inpatient            0
diag_1                      0
diag_2                      0
diag_3                      0
diag_4                      0
number_diagnoses            0
chlorpropamide              0
glimepiride                 0
acetohexamide               0
glyburide                   0
tolbutamide                 0
pioglitazone                0
rosiglitazone               0
troglitazone                0
tolazamide                  0
examide                     0
citoglipton                 0
insulin                     0

There is no Nan values. However, by observing the data, we can note how the missing values are represented as '?':

In [1482]:
# Count the number of missing values in each column
missing_values = df[df == '?'].count()
missing_values = missing_values[missing_values > 0]/len(df)*100
missing_values = missing_values.sort_values(ascending=False)
missing_values

tolbutamide    79.801530
race            2.274137
dtype: float64

The features 'race' and 'tolbutamide' are the only ones with missing values. For the 'tolbutamide' feature, since there is an almost $80\%$ of missing values, we will not consider this feature.

In [1483]:
df = df.drop('tolbutamide', axis=1)
df

Unnamed: 0.1,Unnamed: 0,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,...,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,metformin-rosiglitazone,change,diabetesMed,readmitted
0,0,41186889,Caucasian,Female,[80-90),1,6,7,13,69,...,No,No,Steady,Steady,No,No,No,No,Yes,NO
1,1,83591073,Caucasian,Female,[80-90),6,1,17,5,34,...,No,No,Steady,No,No,No,No,No,No,NO
2,2,68337009,Other,Male,[60-70),1,2,1,1,37,...,No,No,Steady,Steady,No,No,No,No,Yes,NO
3,3,73175076,Caucasian,Male,[40-50),1,1,7,4,56,...,No,No,Steady,Up,No,No,No,Ch,Yes,YES
4,4,31718349,Caucasian,Female,[70-80),1,1,7,1,38,...,No,No,Steady,No,No,No,No,No,No,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4995,6316443,Caucasian,Female,[80-90),1,1,7,3,47,...,No,No,Steady,No,No,No,No,Ch,Yes,YES
4996,4996,96364728,Caucasian,Male,[70-80),2,1,1,8,52,...,No,No,Steady,No,No,No,No,Ch,Yes,YES
4997,4997,101621745,Caucasian,Male,[50-60),1,1,2,1,17,...,No,No,Steady,Steady,No,No,No,Ch,Yes,NO
4998,4998,21969333,Caucasian,Male,[70-80),1,1,7,2,42,...,No,No,No,No,No,No,No,No,Yes,YES


For the 'race' feature, we can treat this problem in different ways. First, we can consider deleting those instances, but it can be reflected in a decreasing of data quantity. Another option is assigning the most frequent race to the instances with no race in the data or even providing a random value according to the data frequency distribution from the possible races previously represented. These possibilities may decrease the data quality.

In [1484]:
# Count the occurrences of each value in the 'race' column
n_rv = df['race'].value_counts()

# Compute each value distribution
n_distribution = n_rv.drop('?', axis=0)
n_distribution = n_distribution/(len(df)-n_rv['?'])
print(n_distribution.sum())

print('Distribution of values without considering ? values: ', n_distribution)


# Print the number of values in each category
print('Number of values:', n_rv)


1.0
Distribution of values without considering ? values:  race
Caucasian          0.770891
AfricanAmerican    0.187011
Hispanic           0.019040
Other              0.014809
Asian              0.008250
Name: count, dtype: float64
Number of values: race
Caucasian          3644
AfricanAmerican     884
?                   110
Hispanic             90
Other                70
Asian                39
Name: count, dtype: int64


In [1485]:
# Replace missing values ('?') in the 'race' column with the most frequent race
prob = n_distribution.to_numpy()
df.loc[df['race'] == '?', 'race'] = np.random.choice(n_distribution.index, size=n_rv['?'], p=prob)
print(df['race'].value_counts())

race
Caucasian          3724
AfricanAmerican     909
Hispanic             92
Other                72
Asian                40
Name: count, dtype: int64


### Uninformative features

Then, we look for features which do not give any information.

In [1486]:
dict_vc = {}
for column in df.columns:
    dict_vc[column] = df[column].nunique()
dict_vc

{'Unnamed: 0': 4837,
 'patient_nbr': 4837,
 'race': 5,
 'gender': 2,
 'age': 10,
 'admission_type_id': 7,
 'discharge_disposition_id': 20,
 'admission_source_id': 11,
 'time_in_hospital': 14,
 'num_lab_procedures': 99,
 'num_procedures': 7,
 'num_medications': 62,
 'number_outpatient': 21,
 'number_emergency': 13,
 'number_inpatient': 14,
 'diag_1': 370,
 'diag_2': 354,
 'diag_3': 373,
 'diag_4': 4837,
 'number_diagnoses': 12,
 'chlorpropamide': 3,
 'glimepiride': 4,
 'acetohexamide': 1,
 'glyburide': 4,
 'pioglitazone': 4,
 'rosiglitazone': 4,
 'troglitazone': 1,
 'tolazamide': 2,
 'examide': 1,
 'citoglipton': 2,
 'insulin': 4,
 'glyburide-metformin': 4,
 'glipizide-metformin': 1,
 'metformin-rosiglitazone': 1,
 'change': 2,
 'diabetesMed': 2,
 'readmitted': 2}

In [1487]:
uninformative_columns = [column for column in df.columns if df[column].nunique() == 1]
uninformative_columns

['acetohexamide',
 'troglitazone',
 'examide',
 'glipizide-metformin',
 'metformin-rosiglitazone']

This features does not provide additional information. Therefore, we discard them.

In [1488]:
df = df.drop(uninformative_columns, axis=1)

## Data manipulation

### Categorical values

In this section we convert the categorical values to integer identifiers.

In [1489]:
#Initialize the dictionary
category_mapping = {}

# Identify categorical columns
categorical_columns = df.select_dtypes(include='object').columns
categorical_columns

Index(['race', 'gender', 'age', 'chlorpropamide', 'glimepiride', 'glyburide',
       'pioglitazone', 'rosiglitazone', 'tolazamide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

##### Race

We have seen previously that the 'race' feature has multiple values. However, Hispanic and Asian represent less than a 4% of the data. Then, to simplify the number of possible values, we hvae considered to include these races inside the 'Others' option.

In [1490]:
df.loc[df['race'] == 'Hispanic', 'race'] = 'Other'
df.loc[df['race'] == 'Asian', 'race'] = 'Other'
df['race'].value_counts()

race
Caucasian          3724
AfricanAmerican     909
Other               204
Name: count, dtype: int64

#### Simplified cathegories

According to IDS_mapping.csv we can simplify the categorical values of 'admission_type_id', 'discharge_disposition_id' and 'admission_source_id' as follows:

In [1491]:
'''Admission_type_id'''

admission_type_unique = df['admission_type_id'].unique()
new_index = [0,0,1,2,3,2,0,3]

admision_id_mapping = {0: 'Urgent', 1: 'Elective', 2: 'Newborn', 3: 'Unknown'}

df['admission_type_id'] = df['admission_type_id'].astype(object)

# Simplify the categorical values of the 'admission_type_id' column to 4 values: Urgent, Elective, Newborn, and Unknown
for i in admission_type_unique:
    df.loc[df['admission_type_id'] == i, 'admission_type_id'] = admision_id_mapping[new_index[i-1]]


# Update the category mapping
category_mapping['admission_type_id'] = {i: admision_id_mapping[new_index[i-1]] for i in admission_type_unique}


In [1492]:
# Replace the unknown values in the 'admission_type_id' columnwith a random value from the distribution of the column
df['admission_type_id'].value_counts()/len(df)

admission_type_id
Urgent      0.709324
Elective    0.187099
Newborn     0.056233
Unknown     0.047343
Name: count, dtype: float64

In [1493]:
# Since the unknown values are less than 5% of the total values, we can replace them with a random value from the distribution of the column
n_rv = df['admission_type_id'].value_counts()
n_rv_k = n_rv.drop('Unknown')
prob = n_rv_k.to_numpy()/n_rv_k.sum()
df.loc[df['admission_type_id'] == 'Unknown', 'admission_type_id'] = np.random.choice(n_rv_k.index, size=n_rv['Unknown'], p=prob)
df['admission_type_id'].value_counts()

admission_type_id
Urgent      3602
Elective     952
Newborn      283
Name: count, dtype: int64

In [1494]:
'''Discharge_disposition_id'''

# Simplify the categorical values of the 'discharge_disposition_id' column to 4 values: Home, Hospital, Hospice, and Other
discharge_disposition_unique = df['discharge_disposition_id'].unique()
new_index = [0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 3, 3, 2, 2, 1, 1, 1, 3, 2, 2, 2, 1, 1, 1, 3, 3, 1, 1, 1, 1]

# Update the category mapping
discharge_mapping = {0: 'Home', 1: 'Hospital', 2: 'Hospice', 3: 'Unknown'}

df['discharge_disposition_id'] = df['discharge_disposition_id'].astype(object)

for i in discharge_disposition_unique:
    df.loc[df['discharge_disposition_id']==i, 'discharge_disposition_id'] = discharge_mapping[new_index[i-1]]

# Store new index category mapping
category_mapping['discharge_disposition_id'] = {i: discharge_mapping[new_index[i-1]] for i in discharge_disposition_unique}


In [1495]:
df['discharge_disposition_id'].value_counts()/len(df)

discharge_disposition_id
Home        0.726277
Hospital    0.202812
Unknown     0.061195
Hospice     0.009717
Name: count, dtype: float64

In [1496]:
n_rv = df['discharge_disposition_id'].value_counts()
n_rv_k = n_rv.drop('Unknown')
prob = n_rv_k.to_numpy()/n_rv_k.sum()
df.loc[df['discharge_disposition_id'] == 'Unknown', 'discharge_disposition_id'] = np.random.choice(n_rv_k.index, size=n_rv['Unknown'], p=prob)
df['discharge_disposition_id'].value_counts()

discharge_disposition_id
Home        3741
Hospital    1048
Hospice       48
Name: count, dtype: int64

In [1497]:
'''Admission_source_id'''

# Simplify the categorical values of the 'admission_source_id' column to 4 values: Planned (Referral and Transfer), Emergency and Other

admision_source_unique = df['admission_source_id'].unique()
admision_source_unique

array([ 7, 17,  1,  5,  2,  6,  9,  4, 20,  3, 11])

In [1498]:
new_index = [1, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0]
admission_source_mapping = {0: 'Planned', 1: 'Emergency', 2: 'Unknown'}

df['admission_source_id'] = df['admission_source_id'].astype(object)

for i, code in enumerate(admision_source_unique):
    df.loc[df['admission_source_id'] == code, 'admission_source_id'] = admission_source_mapping[new_index[i]]

# Store new index category mapping
category_mapping['admission_source_id'] = {code: admission_source_mapping[new_index[i]] for i, code in enumerate(admision_source_unique)}

In [1499]:
df['admission_source_id'].value_counts()/len(df)

admission_source_id
Emergency    0.572669
Planned      0.365516
Unknown      0.061815
Name: count, dtype: float64

In [1500]:
n_rv = df['admission_source_id'].value_counts()
n_rv_k = n_rv.drop('Unknown')
prob = n_rv_k.to_numpy()/n_rv_k.sum()
df.loc[df['admission_source_id'] == 'Unknown', 'admission_source_id'] = np.random.choice(n_rv_k.index, size=n_rv['Unknown'], p=prob)
df['admission_source_id'].value_counts()

admission_source_id
Emergency    2959
Planned      1878
Name: count, dtype: int64

In [1501]:
'''Diagnosis columns'''

# First we delete the diag_4 column since the values are not consistents with the pdf database and the icd9 codes.
df = df.drop('diag_4', axis=1)

# Diagnostic mappinng
diag_mapping = {
    'Circulatory': '[390-459], [785]',
    'Respiratory': '[460-519], [786]',
    'Digestive': '[520-579], [787]',
    'Diabetes': '[250.xx]',
    'Injury': '[800-999]',
    'Musculoskeletal': '[710-739]',
    'Genitourinary': '[580-629], [788]',
    'Neoplasms': '[140-239]',
    'Other': 'Other'
}

def map_diagnosis(diag_code):

    """This function maps the diagnosis codes to the corresponding category."""

    if (diag_code >= 390 and diag_code <= 459) or diag_code == 785:
        return 'Circulatory'
    elif (diag_code >= 460 and diag_code <= 519) or diag_code == 786:
        return 'Respiratory'
    elif (diag_code >= 520 and diag_code <= 579) or diag_code == 787:
        return 'Digestive'
    elif diag_code < 251 and diag_code >= 250:
        return 'Diabetes'
    elif diag_code >= 800 and diag_code <= 999:
        return 'Injury'
    elif diag_code >= 710 and diag_code <= 739:
        return 'Musculoskeletal'
    elif (diag_code >= 580 and diag_code <= 629) or diag_code == 788:
        return 'Genitourinary'
    elif diag_code >= 140 and diag_code <= 239:
        return 'Neoplasms'
    else:
        return 'Other'
    

for i in range (1,4):
    column = f'diag_{i}'
    diag_unique = df[column].unique()
    df[column] = df[column].astype(object)
    for diag_code in diag_unique:
        df.loc[df[column] == diag_code, column] = map_diagnosis(diag_code)


In [1502]:
category_mapping
mapping_csv = pd.DataFrame.from_dict(category_mapping)
mapping_csv.sort_index()

Unnamed: 0,admission_type_id,discharge_disposition_id,admission_source_id
1,Urgent,Home,Planned
2,Urgent,Hospital,Planned
3,Elective,Hospital,Planned
4,,Hospital,Planned
5,Unknown,Hospital,Planned
6,Newborn,Home,Planned
7,Urgent,Hospital,Emergency
8,Unknown,Home,
9,,Hospital,Planned
10,,Hospital,


In [1503]:
if not os.path.exists('Mappings'):
    os.makedirs('Mappings')

# Store the category mapping in a CSV file
for column in category_mapping:
    path = f'Mappings/{column}_mapping.csv'
    maps_df = mapping_csv[column].dropna()
    maps_df.to_csv(path, index=True, header=False)

path = 'Mappings/diagnosis_mapping.csv'
diag_map = pd.DataFrame.from_dict(diag_mapping, columns=['ICD9 Codes'], orient='index')
diag_map.to_csv(path, index=True, header=True)

Export the modified/simplified data

In [1504]:
if not os.path.exists('Preprocessed data'):
    os.makedirs('Preprocessed data')

columns_feat_class = df.columns[2:]
df = df[columns_feat_class]

# Store the preprocessed dataset in a CSV file
df.to_csv('Preprocessed data/preprocessed_data.csv', index=True)

In [1505]:
df

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,glyburide,pioglitazone,rosiglitazone,tolazamide,citoglipton,insulin,glyburide-metformin,change,diabetesMed,readmitted
0,Caucasian,Female,[80-90),Urgent,Home,Emergency,13,69,4,38,...,No,No,No,No,Steady,Steady,No,No,Yes,NO
1,Caucasian,Female,[80-90),Newborn,Home,Planned,5,34,0,4,...,No,No,No,No,Steady,No,No,No,No,NO
2,Other,Male,[60-70),Urgent,Hospital,Planned,1,37,1,13,...,No,No,No,No,Steady,Steady,No,No,Yes,NO
3,Caucasian,Male,[40-50),Urgent,Home,Emergency,4,56,2,22,...,No,No,No,No,Steady,Up,No,Ch,Yes,YES
4,Caucasian,Female,[70-80),Urgent,Home,Emergency,1,38,2,1,...,No,No,No,No,Steady,No,No,No,No,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,Caucasian,Female,[80-90),Urgent,Home,Emergency,3,47,3,8,...,Down,No,No,No,Steady,No,No,Ch,Yes,YES
4996,Caucasian,Male,[70-80),Urgent,Home,Planned,8,52,0,17,...,Up,No,No,No,Steady,No,No,Ch,Yes,YES
4997,Caucasian,Male,[50-60),Urgent,Home,Planned,1,17,0,11,...,Steady,No,No,No,Steady,Steady,No,Ch,Yes,NO
4998,Caucasian,Male,[70-80),Urgent,Home,Emergency,2,42,6,12,...,No,Steady,No,No,No,No,No,No,Yes,YES


## Numerical values

Now, we are going to manipulate the date to have only numerical values, then, we will encode the categorical values. This manipulation of the data will be very useful for SVM, Logistic Regression, and k-means which benefit from normalization.

In [1506]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler

### Encoded df

In [1507]:
# Initialize the dictionary
encode_mapping = {}

# Identify categorical columns
categorical_columns = df.select_dtypes(include='object').columns
categorical_columns

Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'diag_1', 'diag_2',
       'diag_3', 'chlorpropamide', 'glimepiride', 'glyburide', 'pioglitazone',
       'rosiglitazone', 'tolazamide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

#### Binomial values

In [1508]:
for column in categorical_columns:
    if df[column].nunique() == 2:
        encode_mapping[column] = {code: cat for code, cat in enumerate(df[column].astype('category').cat.categories)}
        df[column] = df[column].astype('category').cat.codes
        categorical_columns = categorical_columns.drop(column)


encode_mapping

{'gender': {0: 'Female', 1: 'Male'},
 'admission_source_id': {0: 'Emergency', 1: 'Planned'},
 'tolazamide': {0: 'No', 1: 'Steady'},
 'citoglipton': {0: 'No', 1: 'Steady'},
 'change': {0: 'Ch', 1: 'No'},
 'diabetesMed': {0: 'No', 1: 'Yes'},
 'readmitted': {0: 'NO', 1: 'YES'}}

#### Age

In [1509]:
df['age'] = df['age'].astype('category')

# Apply categorical mapping
encode_mapping['age'] = {code/10: cat for code, cat in enumerate(df['age'].cat.categories)}
df['age'] = df['age'].cat.codes/10 # Normalize the age values

# Specific modifications
df['change'] = df['change'].replace({'No': 1, 'Ch': 1})
encode_mapping['change'] = {'No' : 0, 'Ch': 1}

df['admission_source_id'] = df['admission_source_id'].replace({'Planned': 0, 'Emergency': 1})
encode_mapping['admission_source_id'] = {'Planned': 0, 'Emergency': 1}

categorical_columns = categorical_columns.drop('age')

encode_mapping

{'gender': {0: 'Female', 1: 'Male'},
 'admission_source_id': {'Planned': 0, 'Emergency': 1},
 'tolazamide': {0: 'No', 1: 'Steady'},
 'citoglipton': {0: 'No', 1: 'Steady'},
 'change': {'No': 0, 'Ch': 1},
 'diabetesMed': {0: 'No', 1: 'Yes'},
 'readmitted': {0: 'NO', 1: 'YES'},
 'age': {0.0: '[0-10)',
  0.1: '[10-20)',
  0.2: '[20-30)',
  0.3: '[30-40)',
  0.4: '[40-50)',
  0.5: '[50-60)',
  0.6: '[60-70)',
  0.7: '[70-80)',
  0.8: '[80-90)',
  0.9: '[90-100)'}}

#### Race

In [1510]:
# Since the 'age' column is ordinal, and the distance between the categories is not clear, we will use one-hot encoding for this column
df = pd.get_dummies(df, columns=['race'], dtype=int)
categorical_columns = categorical_columns.drop('race')
df

Unnamed: 0,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,...,tolazamide,citoglipton,insulin,glyburide-metformin,change,diabetesMed,readmitted,race_AfricanAmerican,race_Caucasian,race_Other
0,0,0.8,Urgent,Home,0,13,69,4,38,0,...,0,1,Steady,No,1,1,0,0,1,0
1,0,0.8,Newborn,Home,1,5,34,0,4,0,...,0,1,No,No,1,0,0,0,1,0
2,1,0.6,Urgent,Hospital,1,1,37,1,13,3,...,0,1,Steady,No,1,1,0,0,0,1
3,1,0.4,Urgent,Home,0,4,56,2,22,0,...,0,1,Up,No,0,1,1,0,1,0
4,0,0.7,Urgent,Home,0,1,38,2,1,0,...,0,1,No,No,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,0.8,Urgent,Home,0,3,47,3,8,0,...,0,1,No,No,0,1,1,0,1,0
4996,1,0.7,Urgent,Home,1,8,52,0,17,2,...,0,1,No,No,0,1,1,0,1,0
4997,1,0.5,Urgent,Home,1,1,17,0,11,0,...,0,1,Steady,No,0,1,0,0,1,0
4998,1,0.7,Urgent,Home,0,2,42,6,12,0,...,0,0,No,No,1,1,1,0,1,0


#### Medications

In [1511]:
medications_order = ['No', 'Down', 'Steady', 'Up']
medication_columns = ['chlorpropamide', 'glimepiride', 'glyburide', 'pioglitazone', 'rosiglitazone', 'insulin', 'glyburide-metformin']

encoder = OrdinalEncoder(categories=[medications_order])
scaler = MinMaxScaler()

# Apply ordinal encoding to the medication columns
for column in medication_columns:
    data = df[column]
    encoded_data = encoder.fit_transform(data.values.reshape(-1, 1))
    scaled_data = scaler.fit_transform(encoded_data)
    df[column] = scaled_data
    categorical_columns = categorical_columns.drop(column)

In [1512]:
categorical_columns

Index(['admission_type_id', 'discharge_disposition_id', 'diag_1', 'diag_2',
       'diag_3'],
      dtype='object')

#### Other

In [1513]:
admission_type_order = ['Newborn', 'Elective', 'Urgent']

encoder = OrdinalEncoder(categories=[admission_type_order])

encoded_data = encoder.fit_transform(df['admission_type_id'].values.reshape(-1, 1))
scaled_data = scaler.fit_transform(encoded_data)
df['admission_type_id'] = scaled_data

In [1514]:
discharge_order = ['Home', 'Hospital', 'Hospice']

encoder = OrdinalEncoder(categories=[discharge_order])

encoded_data = encoder.fit_transform(df['discharge_disposition_id'].values.reshape(-1, 1))
scaled_data = scaler.fit_transform(encoded_data)
df['discharge_disposition_id'] = scaled_data

In [1515]:
diag_order = ['Other', 'Injury', 'Respiratory', 'Digestive', 'Musculoskeletal', 'Genitourinary', 'Neoplasms', 'Circulatory', 'Diabetes']
diag_columns = ['diag_1', 'diag_2', 'diag_3']

encoder = OrdinalEncoder(categories=[diag_order])

for column in diag_columns:
    encoded_data = encoder.fit_transform(df[column].values.reshape(-1, 1))
    scaled_data = scaler.fit_transform(encoded_data)
    df[column] = scaled_data

In [1516]:
# Now normalize the remaining numerical columns
num_columns = ['time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses']

for column in num_columns:
    data = df[column]
    scaled_data = scaler.fit_transform(data.values.reshape(-1, 1))
    df[column] = scaled_data

df

Unnamed: 0,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,...,tolazamide,citoglipton,insulin,glyburide-metformin,change,diabetesMed,readmitted,race_AfricanAmerican,race_Caucasian,race_Other
0,0,0.8,1.0,0.0,0,0.923077,0.635514,0.666667,0.578125,0.000000,...,0,1,0.666667,0.0,1,1,0,0,1,0
1,0,0.8,0.0,0.0,1,0.307692,0.308411,0.000000,0.046875,0.000000,...,0,1,0.000000,0.0,1,0,0,0,1,0
2,1,0.6,1.0,0.5,1,0.000000,0.336449,0.166667,0.187500,0.076923,...,0,1,0.666667,0.0,1,1,0,0,0,1
3,1,0.4,1.0,0.0,0,0.230769,0.514019,0.333333,0.328125,0.000000,...,0,1,1.000000,0.0,0,1,1,0,1,0
4,0,0.7,1.0,0.0,0,0.000000,0.345794,0.333333,0.000000,0.000000,...,0,1,0.000000,0.0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,0.8,1.0,0.0,0,0.153846,0.429907,0.500000,0.109375,0.000000,...,0,1,0.000000,0.0,0,1,1,0,1,0
4996,1,0.7,1.0,0.0,1,0.538462,0.476636,0.000000,0.250000,0.051282,...,0,1,0.000000,0.0,0,1,1,0,1,0
4997,1,0.5,1.0,0.0,1,0.000000,0.149533,0.000000,0.156250,0.000000,...,0,1,0.666667,0.0,0,1,0,0,1,0
4998,1,0.7,1.0,0.0,0,0.076923,0.383178,1.000000,0.171875,0.000000,...,0,0,0.000000,0.0,1,1,1,0,1,0


In [1517]:
# Export normalized data

if not os.path.exists('Preprocessed data'):
    os.makedirs('Preprocessed data')

# Store the preprocessed dataset in a CSV file
df.to_csv('Preprocessed data/normalized_data.csv', index=True)

# Model Checking

In [160]:
dClass = df[df.columns[-1]]
dFeatures = df[df.columns[2:-1]]

print('Number of Features columns:', len(dFeatures.columns)) # -1 to exclude the target column
print('Number of samples:', len(dFeatures))

Number of Features columns: 30
Number of samples: 5000


In [163]:
# Convert the DataFrame to a NumPy array
dFeatures = dFeatures.to_numpy()
dClass = dClass.to_numpy()

In [161]:
# Classification
from sklearn.neighbors import KNeighborsClassifier

In [164]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=10, shuffle=True) # Define the split - 10 splits

score = []

# Handling of division of data is automaticly done by this function

for train, test in kf.split(dFeatures, dClass): # Split the data n_splits times

    dClass_training = dClass[train]
    dClass_test = dClass[test]
    dFeat_training = dFeatures[train, :]
    dFeat_test = dFeatures[test, :]
    model = KNeighborsClassifier(n_neighbors=3) # Creates an empty model and define the parameters
    model.fit(dFeat_training, dClass_training) # Train the model (dFeat = X, dClass = Y), actually is not trainning it is just storing the data
    tRes = model.score(dFeat_test, dClass_test) # Temporary result
    score.append(tRes)

print('Average accuracy: ', np.mean(score)) # Test the model
print('Standard deviation: ', np.std(score)) # Test the model
print('Number of splits: ', len(score)) # Test the model

Average accuracy:  0.5204
Standard deviation:  0.026043041297052847
Number of splits:  10
