In [130]:
import numpy as np
import pandas as pd

# Data Exploration

In [131]:
df = pd.read_csv("Assignment DataSet.csv")
df.head(10)

Unnamed: 0.1,Unnamed: 0,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,...,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,metformin-rosiglitazone,change,diabetesMed,readmitted
0,0,41186889,Caucasian,Female,[80-90),1,6,7,13,69,...,No,No,Steady,Steady,No,No,No,No,Yes,NO
1,1,83591073,Caucasian,Female,[80-90),6,1,17,5,34,...,No,No,Steady,No,No,No,No,No,No,NO
2,2,68337009,Other,Male,[60-70),1,2,1,1,37,...,No,No,Steady,Steady,No,No,No,No,Yes,NO
3,3,73175076,Caucasian,Male,[40-50),1,1,7,4,56,...,No,No,Steady,Up,No,No,No,Ch,Yes,YES
4,4,31718349,Caucasian,Female,[70-80),1,1,7,1,38,...,No,No,Steady,No,No,No,No,No,No,NO
5,5,20909367,Hispanic,Female,[80-90),2,6,1,2,61,...,No,No,No,Down,No,No,No,Ch,Yes,NO
6,6,86870088,AfricanAmerican,Female,[50-60),1,1,7,3,26,...,No,No,No,Down,No,No,No,Ch,Yes,YES
7,7,52043301,Caucasian,Male,[60-70),3,1,1,6,66,...,No,No,No,No,No,No,No,No,No,NO
8,8,58716846,AfricanAmerican,Female,[60-70),1,13,7,9,74,...,No,No,No,Steady,No,No,No,No,Yes,NO
9,9,102729258,Caucasian,Male,[40-50),6,1,7,12,68,...,No,No,Steady,Up,No,No,No,Ch,Yes,NO


In [132]:
print('Number of columns:', len(df.columns)) # -1 to exclude the target column
print('Number of samples:', len(df))

Number of columns: 38
Number of samples: 5000


In [133]:
features = df.columns[2:-1]
df_features = df[features]
patient_class = df.columns[-1]
df_class = df[patient_class]

In [134]:
print('Features: ', features)
print('Number of features:', len(features))

Features:  Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'diag_4', 'number_diagnoses', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'troglitazone', 'tolazamide',
       'examide', 'citoglipton', 'insulin', 'glyburide-metformin',
       'glipizide-metformin', 'metformin-rosiglitazone', 'change',
       'diabetesMed'],
      dtype='object')
Number of features: 35


In [135]:
# Statistical summary of the dataset of both numerical and categorical columns
df_features.describe(include='all')

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,metformin-rosiglitazone,change,diabetesMed
count,5000,5000,5000,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,...,5000,5000,5000,5000,5000,5000,5000,5000,5000,5000
unique,6,2,10,,,,,,,,...,1,2,1,2,4,4,2,1,2,2
top,Caucasian,Female,[70-80),,,,,,,,...,No,No,No,Steady,No,No,No,No,No,Yes
freq,3759,2701,1241,,,,,,,,...,5000,4999,5000,3457,2329,4967,4999,5000,2692,3860
mean,,,,2.0322,3.7004,5.704,4.4304,43.1952,1.3604,15.9046,...,,,,,,,,,,
std,,,,1.45643,5.265148,3.951393,3.009612,19.761316,1.711227,8.117406,...,,,,,,,,,,
min,,,,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,,,,,,,,,,
25%,,,,1.0,1.0,1.0,2.0,31.0,0.0,10.0,...,,,,,,,,,,
50%,,,,1.0,1.0,7.0,4.0,44.0,1.0,15.0,...,,,,,,,,,,
75%,,,,3.0,3.25,7.0,6.0,57.0,2.0,20.0,...,,,,,,,,,,


## Data quality

### Missing values

Now, we are interested in studying the data quality of each feature. First we look into the features with missing values:

In [136]:
nan_values = df.isna().sum()
print('Number of missing values in each column:')

Number of missing values in each column:


There is no Nan values. However, by observing the data, we can note how the missing values are represented as '?':

In [137]:
# Count the number of missing values in each column
missing_values = df[df == '?'].count()
missing_values = missing_values[missing_values > 0]/len(df)*100
missing_values = missing_values.sort_values(ascending=False)
missing_values

tolbutamide    79.86
race            2.24
dtype: float64

The features 'race' and 'tolbutamide' are the only ones with missing values. For the 'tolbutamide' feature, since there is an almost $80\%$ of missing values, we will not consider this feature.

In [138]:
df = df.drop('tolbutamide', axis=1)
df

Unnamed: 0.1,Unnamed: 0,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,...,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,metformin-rosiglitazone,change,diabetesMed,readmitted
0,0,41186889,Caucasian,Female,[80-90),1,6,7,13,69,...,No,No,Steady,Steady,No,No,No,No,Yes,NO
1,1,83591073,Caucasian,Female,[80-90),6,1,17,5,34,...,No,No,Steady,No,No,No,No,No,No,NO
2,2,68337009,Other,Male,[60-70),1,2,1,1,37,...,No,No,Steady,Steady,No,No,No,No,Yes,NO
3,3,73175076,Caucasian,Male,[40-50),1,1,7,4,56,...,No,No,Steady,Up,No,No,No,Ch,Yes,YES
4,4,31718349,Caucasian,Female,[70-80),1,1,7,1,38,...,No,No,Steady,No,No,No,No,No,No,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4995,6316443,Caucasian,Female,[80-90),1,1,7,3,47,...,No,No,Steady,No,No,No,No,Ch,Yes,YES
4996,4996,96364728,Caucasian,Male,[70-80),2,1,1,8,52,...,No,No,Steady,No,No,No,No,Ch,Yes,YES
4997,4997,101621745,Caucasian,Male,[50-60),1,1,2,1,17,...,No,No,Steady,Steady,No,No,No,Ch,Yes,NO
4998,4998,21969333,Caucasian,Male,[70-80),1,1,7,2,42,...,No,No,No,No,No,No,No,No,Yes,YES


For the 'race' feature, we can treat this problem in different ways. First, we can consider deleting those instances, but it can be reflected in a decreasing of data quantity. Another option is assigning the most frequent race to the instances with no race in the data or even providing a random value from the possible races previously represented. These possibilities may decrease the data quality.

In [139]:
# Count the occurrences of each value in the 'race' column
n_rv = df['race'].value_counts()

# Print the number of values in each category
print('Number of values:', n_rv)


Number of values: race
Caucasian          3759
AfricanAmerican     923
?                   112
Hispanic             95
Other                72
Asian                39
Name: count, dtype: int64


In [140]:
# Replace missing values ('?') in the 'race' column with the most frequent race
df.loc[df['race'] == '?', 'race'] = n_rv.index[0]

### Uninformative features

Then, we look for features which do not give any information.

In [141]:
dict_vc = {}
for column in df.columns:
    dict_vc[column] = df[column].nunique()
dict_vc

{'Unnamed: 0': 5000,
 'patient_nbr': 4837,
 'race': 5,
 'gender': 2,
 'age': 10,
 'admission_type_id': 7,
 'discharge_disposition_id': 20,
 'admission_source_id': 11,
 'time_in_hospital': 14,
 'num_lab_procedures': 99,
 'num_procedures': 7,
 'num_medications': 62,
 'number_outpatient': 22,
 'number_emergency': 13,
 'number_inpatient': 15,
 'diag_1': 371,
 'diag_2': 357,
 'diag_3': 377,
 'diag_4': 5000,
 'number_diagnoses': 12,
 'chlorpropamide': 3,
 'glimepiride': 4,
 'acetohexamide': 1,
 'glyburide': 4,
 'pioglitazone': 4,
 'rosiglitazone': 4,
 'troglitazone': 1,
 'tolazamide': 2,
 'examide': 1,
 'citoglipton': 2,
 'insulin': 4,
 'glyburide-metformin': 4,
 'glipizide-metformin': 2,
 'metformin-rosiglitazone': 1,
 'change': 2,
 'diabetesMed': 2,
 'readmitted': 2}

In [142]:
uninformative_columns = [column for column in df.columns if df[column].nunique() == 1]
uninformative_columns

['acetohexamide', 'troglitazone', 'examide', 'metformin-rosiglitazone']

This features does not provide additional information. Therefore, we discard them.

In [143]:
df = df.drop(uninformative_columns, axis=1)

### Outliers

Not for now

## Data manipulation

### Categorical values

In this section we convert the categorical values to integer identifiers.

In [144]:
# Identify categorical columns
categorical_columns = df.select_dtypes(include='object').columns
categorical_columns


Index(['race', 'gender', 'age', 'chlorpropamide', 'glimepiride', 'glyburide',
       'pioglitazone', 'rosiglitazone', 'tolazamide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin', 'change', 'diabetesMed',
       'readmitted'],
      dtype='object')

In [None]:
# Convert the categorical columns to the 'category' data type
df[categorical_columns] = df[categorical_columns].astype('category')

# Create a mapping of the original categorical values to the new integer identifiers
category_mapping = {
    col: {code: cat for code, cat in enumerate(df[col].astype('category').cat.categories)}
    for col in categorical_columns
}

reverse_category_mapping = {
    col: {cat: code for code, cat in enumerate(df[col].astype('category').cat.categories)}
    for col in categorical_columns
}

# Assign integer identifiers to the categorical columns
df[categorical_columns] = df[categorical_columns].apply(lambda x: x.cat.codes)

category_mapping

{'race': {0: 'AfricanAmerican',
  1: 'Asian',
  2: 'Caucasian',
  3: 'Hispanic',
  4: 'Other'},
 'gender': {0: 'Female', 1: 'Male'},
 'age': {0: '[0-10)',
  1: '[10-20)',
  2: '[20-30)',
  3: '[30-40)',
  4: '[40-50)',
  5: '[50-60)',
  6: '[60-70)',
  7: '[70-80)',
  8: '[80-90)',
  9: '[90-100)'},
 'chlorpropamide': {0: 'No', 1: 'Steady', 2: 'Up'},
 'glimepiride': {0: 'Down', 1: 'No', 2: 'Steady', 3: 'Up'},
 'glyburide': {0: 'Down', 1: 'No', 2: 'Steady', 3: 'Up'},
 'pioglitazone': {0: 'Down', 1: 'No', 2: 'Steady', 3: 'Up'},
 'rosiglitazone': {0: 'Down', 1: 'No', 2: 'Steady', 3: 'Up'},
 'tolazamide': {0: 'No', 1: 'Steady'},
 'citoglipton': {0: 'No', 1: 'Steady'},
 'insulin': {0: 'Down', 1: 'No', 2: 'Steady', 3: 'Up'},
 'glyburide-metformin': {0: 'Down', 1: 'No', 2: 'Steady', 3: 'Up'},
 'glipizide-metformin': {0: 'No', 1: 'Steady'},
 'change': {0: 'Ch', 1: 'No'},
 'diabetesMed': {0: 'No', 1: 'Yes'},
 'readmitted': {0: 'NO', 1: 'YES'}}

# Model Checking

In [160]:
dClass = df[df.columns[-1]]
dFeatures = df[df.columns[2:-1]]

print('Number of Features columns:', len(dFeatures.columns)) # -1 to exclude the target column
print('Number of samples:', len(dFeatures))

Number of Features columns: 30
Number of samples: 5000


In [163]:
# Convert the DataFrame to a NumPy array
dFeatures = dFeatures.to_numpy()
dClass = dClass.to_numpy()

In [161]:
# Classification
from sklearn.neighbors import KNeighborsClassifier

In [164]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=10, shuffle=True) # Define the split - 10 splits

score = []

# Handling of division of data is automaticly done by this function

for train, test in kf.split(dFeatures, dClass): # Split the data n_splits times

    dClass_training = dClass[train]
    dClass_test = dClass[test]
    dFeat_training = dFeatures[train, :]
    dFeat_test = dFeatures[test, :]
    model = KNeighborsClassifier(n_neighbors=3) # Creates an empty model and define the parameters
    model.fit(dFeat_training, dClass_training) # Train the model (dFeat = X, dClass = Y), actually is not trainning it is just storing the data
    tRes = model.score(dFeat_test, dClass_test) # Temporary result
    score.append(tRes)

print('Average accuracy: ', np.mean(score)) # Test the model
print('Standard deviation: ', np.std(score)) # Test the model
print('Number of splits: ', len(score)) # Test the model

Average accuracy:  0.5204
Standard deviation:  0.026043041297052847
Number of splits:  10


# Comments

* Reescale values?
* Outliers?
* Missing values? 'Race' treatment?
* Apply different methods and metrics to get the best one.