# Data Transformation and Exploration


This assignment asks you to predict readmission for diabetic patients admitted to hospital, i.e., what is the likelihood that a patient will need to come back to hospital given the data about this visit. 
This is based on a database of 101766 hospital visits with a set of 50 features covering patient information, treatment and prescribed medication during the hospital stay.
The database covers patients admitted to 130 hospitals in the US over a period of 10 years to 2008. The original data came from this source:

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
df_train = pd.read_csv("Dataset/diabetic_data_training.csv")
df_test = pd.read_csv('Dataset/diabetic_data_test.csv')

print(df_train.head(5))
print(df_test.head(5))

   encounter_id  patient_nbr             race  gender      age weight  \
0        149190     55629189        Caucasian  Female  [10-20)      ?   
1         64410     86047875  AfricanAmerican  Female  [20-30)      ?   
2        500364     82442376        Caucasian    Male  [30-40)      ?   
3         16680     42519267        Caucasian    Male  [40-50)      ?   
4         35754     82637451        Caucasian    Male  [50-60)      ?   

   admission_type_id  discharge_disposition_id  admission_source_id  \
0                  1                         1                    7   
1                  1                         1                    7   
2                  1                         1                    7   
3                  1                         1                    7   
4                  2                         1                    2   

   time_in_hospital  ... citoglipton insulin  glyburide-metformin  \
0                 3  ...          No      Up                   No

In [9]:
# 1. Basic Summary Statistics
print("Summary Statistics:")
df_train.describe(include='all')

Summary Statistics:


Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
count,91589.0,91589.0,91589,91589,91589,91589,91589.0,91589.0,91589.0,91589.0,...,91589,91589,91589,91589,91589,91589,91589,91589,91589,91589
unique,,,6,3,10,10,,,,,...,1,4,4,2,1,2,2,2,2,3
top,,,Caucasian,Female,[70-80),?,,,,,...,No,No,No,No,No,No,No,No,Yes,NO
freq,,,68478,49217,23452,88754,,,,,...,91589,42539,90963,91576,91589,91588,91588,49235,70555,49355
mean,165202600.0,54310820.0,,,,,2.022514,3.718503,5.759349,4.39555,...,,,,,,,,,,
std,102640200.0,38712890.0,,,,,1.444416,5.284318,4.063074,2.987944,...,,,,,,,,,,
min,12522.0,135.0,,,,,1.0,1.0,1.0,1.0,...,,,,,,,,,,
25%,84964560.0,23402440.0,,,,,1.0,1.0,1.0,2.0,...,,,,,,,,,,
50%,152389700.0,45458600.0,,,,,1.0,1.0,7.0,4.0,...,,,,,,,,,,
75%,230272000.0,87543880.0,,,,,3.0,4.0,7.0,6.0,...,,,,,,,,,,


In [10]:
# 2. Checking for Missing Values
print("\nMissing Values:")
df_train.isnull().sum()


Missing Values:


encounter_id                    0
patient_nbr                     0
race                            0
gender                          0
age                             0
weight                          0
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                      0
medical_specialty               0
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                          0
diag_2                          0
diag_3                          0
number_diagnoses                0
max_glu_serum               86793
A1Cresult                   76282
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide 

In [11]:
# 3. Correlation Heatmap
plt.figure(figsize=(12, 8))
correlation_matrix = df_train.corr()
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()


ValueError: could not convert string to float: 'Caucasian'

<Figure size 1200x800 with 0 Axes>

In [None]:
# 4. Distribution of Numerical Features
numerical_features = df_train.select_dtypes(include=['int64', 'float64']).columns

for col in numerical_features:
    plt.figure(figsize=(8, 4))
    sns.histplot(df_train[col], kde=True, bins=30)
    plt.title(f"Distribution of {col}")
    plt.xlabel(col)
    plt.ylabel("Frequency")
    plt.show()

In [None]:
# 5. Count Plots for Categorical Features
categorical_features = df_train.select_dtypes(include=['object']).columns

for col in categorical_features:
    plt.figure(figsize=(8, 4))
    sns.countplot(data=df_train, x=col, order=df_train[col].value_counts().index)
    plt.title(f"Count Plot of {col}")
    plt.xlabel(col)
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    plt.show()


In [None]:
# 7. Correlation with Target Variable 

if 'readmitted' in df_train.columns:
    target_correlation = df_train.corr()['readmitted'].sort_values(ascending=False)
    print("\nCorrelation with Target Variable:")
    print(target_correlation)
    
# PLOT A HEATMAP??