# Exploratory Data Analysis



## Setup



In [18]:
import numpy as np
import pandas as pd
import altair as alt
from altair import expr, datum
# pip install numpy pandas altair plotly
import pandas as pd
import plotly.graph_objects as go
# Load the dataset used by the analysis below; the relative path is resolved
# against the current working directory.
df = pd.read_csv("diabetic_data.csv")

## Exploratory Visual Analysis for Research Questions

### Sankey diagram

In [None]:
import pandas as pd
import plotly.graph_objects as go

# --- Sankey configuration (swap in your own column names) ---
# The three columns become the three node layers of the diagram.
A, B, C = (
    "admission_type_id",         # first layer
    "discharge_disposition_id",  # second layer
    "readmitted",                # third layer
)

# pct:  per-column share of rows (1%) below which a value counts as rare.
# ELSE: label that rare values are grouped under.
pct, ELSE = 0.01, "Other"

def bucket_small(df, col, pct=0.01, else_label="Other"):
    """Return ``df[col]`` as strings with rare values collapsed into one label.

    Args:
        df: DataFrame holding the column.
        col: Name of the column to bucket.
        pct: Minimum share of rows (0-1) a value needs to keep its own
            label; values whose share is strictly below it are replaced.
        else_label: Replacement label for the rare values.

    Returns:
        A Series aligned with ``df`` in which missing entries read
        "Unknown" and every value rarer than ``pct`` reads ``else_label``.
        "Unknown" itself is collapsed too when it is rarer than ``pct``.
    """
    raw = df[col]
    # Detect missing entries on the *raw* column: casting to str first can
    # turn NaN/None into the literal text "nan"/"None", after which a
    # fillna() has nothing left to fill.
    s = raw.astype(str).mask(raw.isna(), "Unknown")
    counts = s.value_counts()
    # Share of rows per value; an empty column simply yields no rare values.
    small_vals = counts[counts / len(s) < pct].index
    return s.mask(s.isin(small_vals), else_label)

# 1) Work on a copy of df in which each of the three Sankey columns has its
#    rare values (share below pct) replaced by the ELSE label.
df2 = df.assign(**{
    name: bucket_small(df, name, pct=pct, else_label=ELSE)
    for name in (A, B, C)
})

# 2) Count rows per (source, target) pair for each hop: A→B and B→C
def _count_flows(frame, source_col, target_col):
    """Return one row per (source, target) pair with its row count in "value"."""
    pair_sizes = frame.groupby([source_col, target_col]).size()
    return pair_sizes.rename("value").reset_index()


ab = _count_flows(df2, A, B)
bc = _count_flows(df2, B, C)

# 3) One sorted list of distinct values per layer. The layers stay separate
#    lists, so a value that shows up in two columns gets one node per layer
#    rather than a single shared node.
A_vals = sorted(set(ab[A]))
B_vals = sorted(set(ab[B]) | set(bc[B]))  # middle layer appears in both hops
C_vals = sorted(set(bc[C]))

# Node labels in layer order: A first, then B, then C.
labels_pretty = [*A_vals, *B_vals, *C_vals]

# Each layer occupies a contiguous run of the combined A + B + C node list,
# so a layer's first node index is the total size of the layers before it.
a_off = 0
b_off = a_off + len(A_vals)
c_off = b_off + len(B_vals)

# Layer value -> position in the combined node list.
a_idx = {label: node for node, label in enumerate(A_vals, start=a_off)}
b_idx = {label: node for node, label in enumerate(B_vals, start=b_off)}
c_idx = {label: node for node, label in enumerate(C_vals, start=c_off)}

# Link arrays: the A→B flows come first, followed by the B→C flows, so the
# three lists stay aligned position by position.
src = ab[A].map(a_idx).tolist()
src.extend(bc[B].map(b_idx).tolist())

tgt = ab[B].map(b_idx).tolist()
tgt.extend(bc[C].map(c_idx).tolist())

val = ab["value"].tolist()
val.extend(bc["value"].tolist())

# One x coordinate per node, in node order: A at 0, B at 0.5, C at 1.
xpos = [
    x
    for x, layer in ((0, A_vals), (0.5, B_vals), (1, C_vals))
    for _ in layer
]

# Node appearance plus the per-node x coordinates; y is left unset.
# NOTE(review): plotly may only honor manual node positions when both x and y
# are supplied (and non-zero) — confirm the x pinning actually takes effect.
sankey_nodes = dict(label=labels_pretty, pad=15, thickness=14, x=xpos, y=None)

# Parallel arrays: link i goes from node source[i] to node target[i].
sankey_links = dict(source=src, target=tgt, value=val)

fig = go.Figure(data=[go.Sankey(node=sankey_nodes, link=sankey_links)])
fig.update_layout(
    title="Admissions → Discharge → Outcome (rare <1% grouped as Other)",
)
fig.show()
