In [1]:
pip install pillow

Note: you may need to restart the kernel to use updated packages.


Target Variable
- "The outcome we are looking at is whether the patient gets readmitted to the hospital within 30 days or not. The variable actually has < 30, > 30 and No Readmission categories."
- encoding source: "To reduce our problem to a binary classification, we combined the readmission after 30 days and no readmission into a single category"

In [3]:
#Loading libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler 

In [4]:
# Load the raw encounters CSV into `df`; the path is relative to the notebook's working directory.
df = pd.read_csv("data/diabetic_data.csv")

In [5]:
# Transpose the first 10 rows so every column of the wide frame is displayed as a row.
df.head(10).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
encounter_id,2278392,149190,64410,500364,16680,35754,55842,63768,12522,15738
patient_nbr,8222157,55629189,86047875,82442376,42519267,82637451,84259809,114882984,48330783,63555939
race,Caucasian,Caucasian,AfricanAmerican,Caucasian,Caucasian,Caucasian,Caucasian,Caucasian,Caucasian,Caucasian
gender,Female,Female,Female,Male,Male,Male,Male,Male,Female,Female
age,[0-10),[10-20),[20-30),[30-40),[40-50),[50-60),[60-70),[70-80),[80-90),[90-100)
weight,?,?,?,?,?,?,?,?,?,?
admission_type_id,6,1,1,1,1,2,3,1,2,3
discharge_disposition_id,25,1,1,1,1,1,1,1,1,3
admission_source_id,1,7,7,7,7,2,2,7,4,4
time_in_hospital,1,3,2,2,1,3,4,5,13,12


In [6]:
# Missing values in this dataset are written as the literal string '?'.
# Report, for every object-typed column, how many cells hold that placeholder.
object_columns = [name for name in df.columns if df[name].dtype == object]
for name in object_columns:
    placeholder_count = df[name].eq('?').sum()
    print(name, placeholder_count)

race 2273
gender 0
age 0
weight 98569
payer_code 40256
medical_specialty 49949
diag_1 21
diag_2 358
diag_3 1423
max_glu_serum 0
A1Cresult 0
metformin 0
repaglinide 0
nateglinide 0
chlorpropamide 0
glimepiride 0
acetohexamide 0
glipizide 0
glyburide 0
tolbutamide 0
pioglitazone 0
rosiglitazone 0
acarbose 0
miglitol 0
troglitazone 0
tolazamide 0
examide 0
citoglipton 0
insulin 0
glyburide-metformin 0
glipizide-metformin 0
glimepiride-pioglitazone 0
metformin-rosiglitazone 0
metformin-pioglitazone 0
change 0
diabetesMed 0
readmitted 0


In [7]:
# Turn the '?' placeholder into a real NaN, then show how many object-typed columns the frame has.
df = df.replace({"?": np.nan})
df.select_dtypes('O').shape[1]

37

In [8]:
# Count encounters whose gender is recorded as 'Unknown/Invalid'.
unknown_gender_mask = df['gender'] == 'Unknown/Invalid'
print('gender', unknown_gender_mask.sum())


gender 3


In [9]:
# Frequency of each raw readmission label before it is collapsed to a binary target.
df['readmitted'].value_counts()

NO     54864
>30    35545
<30    11357
Name: readmitted, dtype: int64

In [10]:
# Binary target: only a readmission within 30 days counts as positive;
# '>30' and 'NO' are both folded into the negative class.
readmission_codes = {'>30': 0, '<30': 1, 'NO': 0}
df['readmitted'] = df['readmitted'].replace(readmission_codes)

Source - "Therefore, we collapsed these diagnosis codes into 9 disease categories in an almost similar fashion to that done in the original publication using this dataset. These 9 categories include Circulatory, Respiratory, Digestive, Diabetes, Injury, Musculoskeletal, Genitourinary, Neoplasms, and Others. Although we did this for primary, secondary and additional diagnoses, we eventually decided to use only the primary diagnosis in our model. Doing this in python was slightly cumbersome because, well, we are mapping the disease codes to certain category names. Below code should demonstrate this easily."

Implementation difference: 

    The circulatory system
    The lymphatic system
    The respiratory system
    The integumentary system
    The endocrine system
    The gastrointestinal (digestive) system
    The urinary (excretory) system
    The musculoskeletal system
    The nervous system
    The reproductive system
    The immune system


In [11]:
# Class balance of the binary target, expressed as a percentage of all encounters.
# (The original cell also computed value_counts()/len(df) and discarded the result.)
counts = df['readmitted'].value_counts()
percentages = counts * 100 / len(df)

fig, ax = plt.subplots()
percentages.plot(kind='pie', ax=ax, autopct='%1.1f%%')
ax.set_title("Share of encounters by readmitted label")
ax.set_ylabel("")  # the Series name adds nothing next to the title
plt.show()

NameError: name 'plt' is not defined

In [None]:
# Drop the three columns with the highest '?' counts in the missing-value report.
# Reassigning (instead of inplace=True) with errors='ignore' keeps the cell safe to re-run:
# a second execution no longer raises KeyError for columns that are already gone.
df = df.drop(columns=['weight', 'payer_code', 'medical_specialty'], errors='ignore')

In [None]:
# Percentage share of each race value, followed by the raw counts as a bar chart.
print("Proportion of Race")
race_share = df["race"].value_counts(normalize=True) * 100
print(race_share)

sns.countplot(data=df, x="race")
plt.xticks(rotation=90)
plt.title("Number of Race values")
plt.show()



In [None]:
# Encounter counts per race, split by the binary readmission target.
race_target_ax = sns.countplot(data=df, x="race", hue="readmitted")
race_target_ax.set_title("Readmitted - Race")
plt.show()

In [None]:
# Bar height is the mean of the 0/1 target per race, i.e. the observed readmission rate.
sns.catplot(data=df, x="race", y="readmitted", kind="bar", height=6)
plt.title("Readmitted Probability")
plt.show()

In [None]:
# Relative frequency of each gender value, then the raw counts as a bar chart.
print("Proportions of Gender Value")
gender_share = df["gender"].value_counts(normalize=True)
print(gender_share)

gender_ax = sns.countplot(data=df, x="gender")
gender_ax.set_title("Distribution of Number of Gender")
plt.show()

In [None]:
# Remove the encounters whose gender is recorded as "Unknown/Invalid", then re-plot.
invalid_gender_rows = df.index[df["gender"] == "Unknown/Invalid"]
df = df.drop(index=invalid_gender_rows)

gender_ax = sns.countplot(data=df, x="gender")
gender_ax.set_title("Distribution of Number of Gender After Dropping")
plt.show()

gender_target_ax = sns.countplot(data=df, x="gender", hue="readmitted")
gender_target_ax.set_title("Gender - Readmitted")
plt.show()

In [None]:
# Encounter counts per age bracket; labels are rotated so the bracket strings do not overlap.
sns.countplot(data=df, x="age")
plt.xticks(rotation=90)
plt.show()

In [None]:
# Replace every 10-year bracket label ("[0-10)" ... "[90-100)") with the bracket's numeric midpoint.
age_midpoints = {f"[{lower}-{lower + 10})": lower + 5 for lower in range(0, 100, 10)}
df["age"] = df["age"].replace(age_midpoints)

sns.countplot(data=df, x="age")
plt.show()



In [None]:
# Raw admission_type_id codes: frequency table and bar chart.
print("Distribution of ID's")
admission_type_counts = df["admission_type_id"].value_counts()
print(admission_type_counts)

admission_type_ax = sns.countplot(data=df, x="admission_type_id")
admission_type_ax.set_title("Distribution of Admission IDs")
plt.show()



In [None]:
# Collapse the numeric admission type codes into named groups; codes 5, 6 and 8 become NaN
# so they can be imputed later together with the other missing values.
mapped = {
    **dict.fromkeys((1.0, 2.0), "Emergency"),
    3.0: "Elective",
    4.0: "New Born",
    **dict.fromkeys((5.0, 6.0), np.nan),
    7.0: "Trauma Center",
    8.0: np.nan,
}

df["admission_type_id"] = df["admission_type_id"].replace(mapped)

print("-Distribution of ID's-")
print(df["admission_type_id"].value_counts())

grouped_admission_ax = sns.countplot(data=df, x="admission_type_id")
grouped_admission_ax.set_title("-Distribution of Admission IDs-")
plt.show()

In [None]:
# Group the discharge disposition codes into "Discharged to Home", "Other" and NaN.
home_ids = (1, 6, 8, 13, 19)
unmapped_ids = (18, 25, 26)
other_ids = (2, 3, 4, 5, 7, 9, 10, 11, 12, 14, 15, 16,
             17, 20, 21, 22, 23, 24, 27, 28, 29, 30)

mapped_discharge = {
    **dict.fromkeys(home_ids, "Discharged to Home"),
    **dict.fromkeys(unmapped_ids, np.nan),
    **dict.fromkeys(other_ids, "Other"),
}

df["discharge_disposition_id"] = df["discharge_disposition_id"].replace(mapped_discharge)

In [None]:
# Frequency table of the grouped discharge dispositions, then the same counts
# plotted once overall and once split by the readmission target.
print("Proportions of ID's")
print(df["discharge_disposition_id"].value_counts())

for hue_column in (None, "readmitted"):
    sns.countplot(data=df, x="discharge_disposition_id", hue=hue_column)
    plt.show()

In [None]:
# Group the admission source codes into "Referral", "Other", "Emergency" and NaN.
mapped_adm = {
    **dict.fromkeys((1, 2, 3), "Referral"),
    **dict.fromkeys((4, 5, 6, 10, 22, 25, 9, 8, 14, 13, 11), "Other"),
    **dict.fromkeys((15, 17, 20, 21), np.nan),
    7: "Emergency",
}
df["admission_source_id"] = df["admission_source_id"].replace(mapped_adm)
print(df["admission_source_id"].value_counts())

sns.countplot(data=df, x="admission_source_id")
plt.show()

source_target_ax = sns.countplot(data=df, x="admission_source_id", hue="readmitted")
source_target_ax.set_title("Admission Source - Readmitted")
plt.show()

In [None]:
# Length-of-stay frequencies; bars are ordered from the most to the least common value.
stay_counts = df["time_in_hospital"].value_counts()
print(stay_counts)

sns.countplot(data=df, x="time_in_hospital", order=stay_counts.index)
plt.show()

In [None]:
# Fill the remaining gaps in these categorical columns with each column's most frequent value.
mode_imputed_columns = (
    'race',
    'admission_type_id',
    'discharge_disposition_id',
    'admission_source_id',
)
for column in mode_imputed_columns:
    most_common = df[column].mode()[0]
    df[column] = df[column].fillna(most_common)

In [None]:
# Quick look at the first rows after the mapping and imputation steps.
df.head()

In [None]:
# Separate the frame by dtype: object columns are label-encoded later, numeric ones are used as-is.
cat_data = df.select_dtypes(include='O')
num_data = df.select_dtypes(include=np.number)

cat_data

In [None]:
from sklearn.preprocessing import LabelEncoder

# Work on an explicit copy so the column assignments below never write into a view of df.
cat_data = cat_data.copy()

# One fitted encoder per column, kept so integer codes can be mapped back to labels later.
label_encoders = {}
for col in cat_data.columns:
    LE = LabelEncoder()
    # Cast to str first: columns that still contain NaN next to strings (the diag_* columns had
    # their '?' placeholders turned into NaN and are not imputed in the visible cells) are
    # mixed-type, which LabelEncoder rejects. After the cast NaN becomes its own "nan" class,
    # while purely string columns get exactly the same codes as before.
    cat_data[col] = LE.fit_transform(cat_data[col].astype(str))
    label_encoders[col] = LE

In [None]:
# Put the numeric and the label-encoded categorical columns back side by side,
# then inspect which integer codes the 'glipizide' column ended up with.
data = pd.concat((num_data, cat_data), axis="columns")
data['glipizide'].unique()

In [None]:
# Remove the encounter and patient identifier columns before modelling.
# Reassigning with errors='ignore' keeps the cell idempotent: re-running it no longer
# raises KeyError once the columns have been dropped.
data = data.drop(columns=['encounter_id', 'patient_nbr'], errors='ignore')
data.head()

In [None]:
# Persist the processed frame. index=False keeps the default integer row index out of the file;
# otherwise it is written as an unnamed first column that shows up as an extra
# "Unnamed: 0" feature when the CSV is read back.
data.to_csv('data/processed_diabetes_data.csv', index=False)

In [None]:
# Feature matrix and binary target.
X = data.drop('readmitted', axis=1)
y = data['readmitted']

# Hold out 20% for testing (the original test_size=0.8 left only 20% for training, unlike the
# 80/20 splits used by the modelling cells). Stratify on the target so both parts keep the
# same class ratio, since the positive class is the minority.
train, test = train_test_split(data, test_size=0.2, random_state=42, stratify=y)
print('Train:', train.shape[0])
print('Test:', test.shape[0])

In [None]:
# Recursive feature elimination driven by a random forest.
# random_state makes the forest, and therefore the selected features, reproducible.
rf = RandomForestClassifier(n_estimators=51, random_state=42)

# The original passed an undefined name `model` to RFE; the estimator built above is `rf`.
# The requested feature count is capped at the number of available columns, because asking
# for more features than exist performs no selection at all.
n_features_to_select = min(100, X.shape[1])
rfe = RFE(rf, n_features_to_select=n_features_to_select)
rfe.fit(X, y)
selected_features = X.columns[rfe.support_]
print('Selected Features:', len(selected_features))
print(selected_features)

In [None]:
# Display the column labels kept by the RFE step.
selected_features

In [None]:
import lightgbm as lgb
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from pyspark.ml.feature import VectorAssembler  # NOTE(review): not used in this cell - confirm it is still needed

X = data.drop('readmitted',axis=1)
y = data['readmitted']

# Split BEFORE scaling so the scaler's mean/variance are learned from the training rows only;
# fitting it on all rows leaks test-set statistics into training. With the same random_state
# and stratify=y this produces the same row partition as splitting the scaled array did.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Define and fit a StandardScaler on the training rows, then apply it everywhere
scaler = StandardScaler()
scaler.fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Scaled copy of the full matrix, kept because a later cell still refers to `scaled_X`.
scaled_X = scaler.transform(X)

# Define LightGBM parameters
params = {
    'objective': 'binary',
    'verbosity': -1,
    'is_unbalance': True,
    'learning_rate': 0.01
}

# Create and fit a LightGBM model
light = lgb.LGBMClassifier(**params)
light.fit(X_train, y_train)

# Feature selection using SelectFromModel: keep features whose importance is at least the median
threshold = 'median'
feature_selector = SelectFromModel(light, threshold=threshold)
# `model` is now bound only to the fitted selector (it used to be reused for the scaler as well).
model = feature_selector.fit(X_train, y_train)

# Get the selected features
X_train_selected = model.transform(X_train)
selected_features_mask = feature_selector.get_support()

# Get feature importances
feature_importance = light.feature_importances_
feature_names = X.columns
# Create a DataFrame for feature importances ('feature_index' holds the column names)
importance = pd.DataFrame({'feature_index': feature_names, 'feature_importance': feature_importance})

# Print the importance DataFrame
print(importance)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import accuracy_score, roc_auc_score
params = {
    'objective': 'binary',
    'verbosity': -1,
    'is_unbalance': True,
    'learning_rate': 0.01
}

# Create and fit a LightGBM model with all attributes
model_all_attributes = lgb.LGBMClassifier(**params)
model_all_attributes.fit(X_train, y_train)

# Predict on the test set
y_pred_all_attributes = model_all_attributes.predict(X_test)

# Evaluate the model with all attributes
accuracy_all_attributes = accuracy_score(y_test, y_pred_all_attributes)
roc_auc_all_attributes = roc_auc_score(y_test, model_all_attributes.predict_proba(X_test)[:, 1])
precision_all_attributes = precision_score(y_test, y_pred_all_attributes)
recall_all_attributes = recall_score(y_test, y_pred_all_attributes)
f1_all_attributes = f1_score(y_test, y_pred_all_attributes)

# Compute confusion matrix for the model with all attributes
confusion_matrix_all_attributes = confusion_matrix(y_test, y_pred_all_attributes)

print("Model with all attributes:")
print("Accuracy:", accuracy_all_attributes)
print("ROC AUC:", roc_auc_all_attributes)
print("Precision:", precision_all_attributes)
print("Recall:", recall_all_attributes)
print("F1 Score:", f1_all_attributes)
print("Confusion Matrix:\n", confusion_matrix_all_attributes)

# Create and fit a LightGBM model with the selected attributes.
# prefit=True tells the selector that `model_all_attributes` is already trained, so transform()
# can be called right away; without it (and without fit()) the selector is unfitted and
# transform() fails.
feature_selector = SelectFromModel(model_all_attributes, threshold='median', prefit=True)
X_train_selected = feature_selector.transform(X_train)
X_test_selected = feature_selector.transform(X_test)

model_selected_attributes = lgb.LGBMClassifier(**params)
model_selected_attributes.fit(X_train_selected, y_train)

# Predict on the test set
y_pred_selected_attributes = model_selected_attributes.predict(X_test_selected)

# Evaluate the model with selected attributes
accuracy_selected_attributes = accuracy_score(y_test, y_pred_selected_attributes)
roc_auc_selected_attributes = roc_auc_score(y_test, model_selected_attributes.predict_proba(X_test_selected)[:, 1])
precision_selected_attributes = precision_score(y_test, y_pred_selected_attributes)
recall_selected_attributes = recall_score(y_test, y_pred_selected_attributes)
f1_selected_attributes = f1_score(y_test, y_pred_selected_attributes)

# Compute confusion matrix for the model with selected attributes
confusion_matrix_selected_attributes = confusion_matrix(y_test, y_pred_selected_attributes)

print("\nModel with selected attributes:")
print("Accuracy:", accuracy_selected_attributes)
print("ROC AUC:", roc_auc_selected_attributes)
print("Precision:", precision_selected_attributes)
print("Recall:", recall_selected_attributes)
print("F1 Score:", f1_selected_attributes)
cm = confusion_matrix_selected_attributes
print("Confusion Matrix:\n", cm)


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.metrics import accuracy_score, roc_auc_score


# Train on the RFE-selected columns only. The original built X from `selected_features` but
# then split `scaled_X` (every column), so the "selected attributes" model never saw the
# reduced feature set.
X = data[selected_features]
y = data['readmitted']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardise with statistics learned from the training rows only, to avoid test-set leakage.
selected_scaler = StandardScaler().fit(X_train)
X_train = selected_scaler.transform(X_train)
X_test = selected_scaler.transform(X_test)


params = {
    'objective': 'binary',
    'verbosity': -1,
    'is_unbalance': True,
    'learning_rate': 0.01
}

model_selected_attributes = lgb.LGBMClassifier(**params)
model_selected_attributes.fit(X_train, y_train)

# Predict on the test set
y_pred_selected_attributes = model_selected_attributes.predict(X_test)

# Evaluate the model with selected attributes
accuracy_selected_attributes = accuracy_score(y_test, y_pred_selected_attributes)
roc_auc_selected_attributes = roc_auc_score(y_test, model_selected_attributes.predict_proba(X_test)[:, 1])
precision_selected_attributes = precision_score(y_test, y_pred_selected_attributes)
recall_selected_attributes = recall_score(y_test, y_pred_selected_attributes)
f1_selected_attributes = f1_score(y_test, y_pred_selected_attributes)

# Compute confusion matrix for the model with selected attributes
confusion_matrix_selected_attributes = confusion_matrix(y_test, y_pred_selected_attributes)

print("\nModel with selected attributes:")
print("Accuracy:", accuracy_selected_attributes)
print("ROC AUC:", roc_auc_selected_attributes)
print("Precision:", precision_selected_attributes)
print("Recall:", recall_selected_attributes)
print("F1 Score:", f1_selected_attributes)
cm = confusion_matrix_selected_attributes
print("Confusion Matrix:\n", cm)
