In [120]:
# Hospital Readmission Risk Predictor - Techsophy Coding Test

In [None]:
#DATA LOADING AND CLEANING

In [130]:
#Importing Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [132]:
#Reading the dataset and printing the size of it

data = pd.read_csv("test_diabetic_data.csv")

data.shape

(101766, 50)

In [134]:
#since there are many missing values, i am dropping the columns that have ONLY missing values

data.drop(columns=['encounter_id', 'patient_nbr', 'weight', 'payer_code', 'medical_specialty'], inplace=True)


In [136]:
#print its first 5 rows

data.head(5)

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,Caucasian,Female,[0-10),6,25,1,1,41,0,1,...,No,No,No,No,No,No,No,No,No,NO
1,Caucasian,Female,[10-20),1,1,7,3,59,0,18,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,AfricanAmerican,Female,[20-30),1,1,7,2,11,5,13,...,No,No,No,No,No,No,No,No,Yes,NO
3,Caucasian,Male,[30-40),1,1,7,2,44,1,16,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,Caucasian,Male,[40-50),1,1,7,1,51,0,8,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [138]:
#Data Preprocessing

# checking for missing values in the dataset and printing how many missing values the column has

question_marks = (data == '?').sum()

for column, count in question_marks.items():
    if count > 0:
        print(f"Column '{column}' has {count} '?' entries.")



Column 'race' has 2273 '?' entries.
Column 'diag_1' has 21 '?' entries.
Column 'diag_2' has 358 '?' entries.
Column 'diag_3' has 1423 '?' entries.


In [32]:
#print its first 5 rows

data.head(5)

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,Caucasian,Female,[0-10),6,25,1,1,41,0,1,...,No,No,No,No,No,No,No,No,No,NO
1,Caucasian,Female,[10-20),1,1,7,3,59,0,18,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,AfricanAmerican,Female,[20-30),1,1,7,2,11,5,13,...,No,No,No,No,No,No,No,No,Yes,NO
3,Caucasian,Male,[30-40),1,1,7,2,44,1,16,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,Caucasian,Male,[40-50),1,1,7,1,51,0,8,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [42]:
# fill the '?' entries with values 

data.replace('?', pd.NA, inplace=True)
for column in data.columns:
    if data[column].isna().any():
        data[column].fillna(data[column].mode()[0], inplace=True)

In [48]:
# Now, i will convert target variable: '<30' = 1 (readmitted), others = 0

data['readmitted'] = data['readmitted'].apply(lambda x: 1 if x == '<30' else 0)

In [52]:
# Encode categorical variables

for column in data.select_dtypes(include='object').columns:
    data[column] = data[column].astype('category').cat.codes

In [None]:
#FEATURE ENGINEERING

In [54]:
X = data.drop(columns=['readmitted'])
y = data['readmitted']

In [None]:
#MODEL TRAINING AND EVALUATION

In [100]:
# Train-Test Split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)


In [102]:
# Training the decision tree model

model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)


In [104]:
# Evaluate Model

predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Model Accuracy:", round(accuracy * 100, 2), "%")

Model Accuracy: 79.85 %


In [None]:
#PREDICTION AND RISK SCORING

In [118]:
# Sample patient

example_input = X_train.iloc[0].copy()

example_input['age'] = 90 
example_input['time_in_hospital'] = 5
example_input['num_lab_procedures'] = 5
example_input['num_medications'] = 12
example_input['number_inpatient'] = 1
example_input['number_emergency'] = 2
example_input['number_diagnoses'] = 2

new_data = pd.DataFrame([example_input])
prediction = model.predict(new_data)

if prediction[0] == 1:
    print("Prediction: Patient is likely to be readmitted within 30 days.")
else:
    print("Prediction: Patient is NOT likely to be readmitted within 30 days.")


Prediction: Patient is NOT likely to be readmitted within 30 days.
