In [1]:
import pandas as pd
import numpy as np

In [1]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Collecting certifi>=2020.12.5 (from ucimlrepo)
  Downloading certifi-2025.4.26-py3-none-any.whl.metadata (2.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Downloading certifi-2025.4.26-py3-none-any.whl (159 kB)
Installing collected packages: certifi, ucimlrepo
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [ucimlrepo]
[1A[2KSuccessfully installed certifi-2025.4.26 ucimlrepo-0.0.7
Note: you may need to restart the kernel to use updated packages.


In [2]:
from ucimlrepo import fetch_ucirepo

# Download UCI dataset 296 ("Diabetes 130-US Hospitals for Years 1999-2008").
diabetes_130_us_hospitals_for_years_1999_2008 = fetch_ucirepo(id=296)

# Feature and target tables (pandas DataFrames).
X, y = (
    diabetes_130_us_hospitals_for_years_1999_2008.data.features,
    diabetes_130_us_hospitals_for_years_1999_2008.data.targets,
)

# Dataset-level metadata, then the per-variable information.
print(diabetes_130_us_hospitals_for_years_1999_2008.metadata)
print(diabetes_130_us_hospitals_for_years_1999_2008.variables)


{'uci_id': 296, 'name': 'Diabetes 130-US Hospitals for Years 1999-2008', 'repository_url': 'https://archive.ics.uci.edu/dataset/296/diabetes+130-us+hospitals+for+years+1999-2008', 'data_url': 'https://archive.ics.uci.edu/static/public/296/data.csv', 'abstract': 'The dataset represents ten years (1999-2008) of clinical care at 130 US hospitals and integrated delivery networks. Each row concerns hospital records of patients diagnosed with diabetes, who underwent laboratory, medications, and stayed up to 14 days. The goal is to determine the early readmission of the patient within 30 days of discharge.\nThe problem is important for the following reasons. Despite high-quality evidence showing improved clinical outcomes for diabetic patients who receive various preventive and therapeutic interventions, many patients do not receive them. This can be partially attributed to arbitrary diabetes management in hospital environments, which fail to attend to glycemic control. Failure to provide pro

  df = pd.read_csv(data_url)


In [3]:
# Build one working frame: a copy of the features with the target attached
# as the 'readmitted' column (X itself is left untouched).
df = X.assign(readmitted=y)

In [4]:
# Preview the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,Caucasian,Female,[0-10),,6,25,1,1,,Pediatrics-Endocrinology,...,No,No,No,No,No,No,No,No,No,NO
1,Caucasian,Female,[10-20),,1,1,7,3,,,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,AfricanAmerican,Female,[20-30),,1,1,7,2,,,...,No,No,No,No,No,No,No,No,Yes,NO
3,Caucasian,Male,[30-40),,1,1,7,2,,,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,Caucasian,Male,[40-50),,1,1,7,1,,,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [10]:
# List every column name in the combined frame (features plus 'readmitted').
# NOTE(review): this cell's execution count is out of sequence with its neighbours;
# re-run the notebook top to bottom before relying on the displayed output.
df.columns

Index(['race', 'gender', 'age', 'weight', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'payer_code', 'medical_specialty', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [5]:
# (rows, columns) of the combined frame.
df.shape

(101766, 48)

In [6]:
# Summary statistics for every column (numeric and non-numeric alike),
# transposed so that each original column becomes one row of the summary.
df.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
race,99493.0,5.0,Caucasian,76099.0,,,,,,,
gender,101766.0,3.0,Female,54708.0,,,,,,,
age,101766.0,10.0,[70-80),26068.0,,,,,,,
weight,3197.0,9.0,[75-100),1336.0,,,,,,,
admission_type_id,101766.0,,,,2.024006,1.445403,1.0,1.0,1.0,3.0,8.0
discharge_disposition_id,101766.0,,,,3.715642,5.280166,1.0,1.0,1.0,4.0,28.0
admission_source_id,101766.0,,,,5.754437,4.064081,1.0,1.0,7.0,7.0,25.0
time_in_hospital,101766.0,,,,4.395987,2.985108,1.0,2.0,4.0,6.0,14.0
payer_code,61510.0,17.0,MC,32439.0,,,,,,,
medical_specialty,51817.0,72.0,InternalMedicine,14635.0,,,,,,,


In [7]:
# Columns excluded from the cleaned frame.
# 'weight' is almost entirely missing (3,197 non-null values out of 101,766 rows).
# NOTE(review): the four medication columns are presumably (near-)constant — confirm.
cols_to_drop = ['weight', 'examide', 'citoglipton',
                'metformin-pioglitazone', 'metformin-rosiglitazone']
df_cleaned = df.drop(cols_to_drop, axis='columns')

In [8]:
# Remove rows whose gender is recorded as 'Unknown/Invalid'.
# The explicit .copy() makes df_cleaned an independent frame, so the column
# assignments in the following cells do not raise SettingWithCopyWarning.
df_cleaned = df_cleaned.loc[df_cleaned['gender'] != 'Unknown/Invalid'].copy()

In [9]:
# Normalise the race column: any literal '?' placeholder becomes a missing value.
# NOTE(review): race already has missing entries before this step, so the loader
# may have converted '?' already — confirm whether this replacement still changes anything.
df_cleaned['race'] = df_cleaned['race'].replace(to_replace='?', value=pd.NA)

In [10]:
# For both lab-result columns, treat the 'None' and '?' placeholders as missing values.
for lab_col in ('A1Cresult', 'max_glu_serum'):
    df_cleaned[lab_col] = df_cleaned[lab_col].replace(['None', '?'], pd.NA)

In [11]:
# Binary target: 1 when 'readmitted' is '<30' (early readmission within 30 days),
# 0 for every other value (e.g. '>30' or 'NO').
# Vectorized comparison instead of a row-wise .apply; int64 matches the dtype
# the original lambda produced.
df_cleaned['readmitted_binary'] = (df_cleaned['readmitted'] == '<30').astype('int64')

In [12]:
# Display the cleaned frame's structure: row index range, column names,
# non-null counts and dtypes, to check the effect of the cleaning steps above.
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 101763 entries, 0 to 101765
Data columns (total 44 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   race                      99492 non-null   object
 1   gender                    101763 non-null  object
 2   age                       101763 non-null  object
 3   admission_type_id         101763 non-null  int64 
 4   discharge_disposition_id  101763 non-null  int64 
 5   admission_source_id       101763 non-null  int64 
 6   time_in_hospital          101763 non-null  int64 
 7   payer_code                61508 non-null   object
 8   medical_specialty         51816 non-null   object
 9   num_lab_procedures        101763 non-null  int64 
 10  num_procedures            101763 non-null  int64 
 11  num_medications           101763 non-null  int64 
 12  number_outpatient         101763 non-null  int64 
 13  number_emergency          101763 non-null  int64 
 14  number_in