# Diabetes Dataset Preprocessing & Visualization Assignment

Prepared automatically.

This notebook performs exploratory data analysis (EDA), imputes missing values, scales the numeric features, visualizes the results, and saves the scaled data as CSV files.

## Part 0: Load dataset
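Make sure `diabetes_data.csv` is available at the location given by `DATA_PATH` in the cell below; if your copy lives elsewhere (for example, in the same folder as this notebook), update `DATA_PATH` accordingly.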

In [None]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# Candidate locations for the dataset, in priority order: the original
# absolute location first, then a copy in the current working directory.
_DATA_CANDIDATES = ['/mnt/data/diabetes_data.csv', 'diabetes_data.csv']
# Pick the first candidate that exists. If none exists, keep the first path so
# that read_csv raises a FileNotFoundError naming the expected location.
DATA_PATH = next((p for p in _DATA_CANDIDATES if os.path.exists(p)), _DATA_CANDIDATES[0])
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Part 1: Data Understanding
# Build the summary objects up front, then report: shape, dtypes,
# descriptive statistics (one row per column), and missing-value counts.
summary_table = df.describe(include='all').T
missing_counts = df.isna().sum()

print('Shape:', df.shape)
print('\nData types:\n', df.dtypes)
display(summary_table)
print('\nMissing values per column:')
print(missing_counts)

In [None]:
# Part 2: Missing Value Imputation decisions
def choose_imputation(series, unique_ratio_max=0.05, skew_max=0.7,
                      outlier_frac_max=0.01, iqr_k=1.5):
    """Pick an imputation strategy for one numeric column.

    Parameters
    ----------
    series : pandas.Series
        Column to inspect; missing values are ignored for the statistics.
    unique_ratio_max : float
        If (distinct values / observed values) is below this, the column is
        treated as categorical-like and the mode is used.
    skew_max : float
        Absolute skewness above which the median is preferred over the mean.
    outlier_frac_max : float
        Fraction of IQR-rule outliers above which the median is preferred.
    iqr_k : float
        Multiplier of the IQR that defines the outlier fences.

    Returns
    -------
    (method, justification) : tuple of str
        ``method`` is one of 'mode', 'median' or 'mean'.
    """
    observed = series.dropna()
    n_obs = len(observed)
    if n_obs == 0:
        # Nothing observed: all statistics default to 0, which selects the
        # 'mode' branch below (for any positive unique_ratio_max).
        skewness, outlier_frac, unique_ratio = 0, 0, 0
    else:
        skewness = observed.skew()
        q1 = observed.quantile(0.25)
        q3 = observed.quantile(0.75)
        fence = iqr_k * (q3 - q1)
        is_outlier = (observed < q1 - fence) | (observed > q3 + fence)
        outlier_frac = is_outlier.mean()
        unique_ratio = observed.nunique() / n_obs

    if unique_ratio < unique_ratio_max:
        return 'mode', 'Low unique ratio — treat as categorical-like, use mode.'

    stats = f'Skewness={skewness:.2f}, outlier_frac={outlier_frac:.3f}'
    if abs(skewness) > skew_max or outlier_frac > outlier_frac_max:
        return 'median', f'{stats} — median is robust.'
    return 'mean', f'{stats} — mean is appropriate.'


num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
impute_method = {}
justifications = {}
for col in num_cols:
    # Only columns that actually contain missing values need a decision.
    if df[col].isna().sum() == 0:
        continue
    impute_method[col], justifications[col] = choose_imputation(df[col])

print('Imputation decisions:')
for c in impute_method:
    print('-', c, impute_method[c], '->', justifications[c])


In [None]:
# Part 3: Apply imputation and plot before/after
OUTPUT_DIR = 'Diabetes_outputs'
os.makedirs(OUTPUT_DIR, exist_ok=True)


def _fill_value(series, method):
    """Return the statistic used to fill missing values of `series`.

    `method` must be 'mean', 'median' or 'mode'. For 'mode' the first mode is
    used, or NaN when the column has no observed values at all.
    """
    if method == 'mean':
        return series.mean()
    if method == 'median':
        return series.median()
    if method == 'mode':
        modes = series.mode()
        return modes.iloc[0] if len(modes) > 0 else np.nan
    raise ValueError(f'Unknown imputation method: {method!r}')


def _save_hist(values, title, filename):
    """Save a 30-bin histogram of the non-missing `values` into OUTPUT_DIR."""
    fig, ax = plt.subplots()
    ax.hist(values.dropna(), bins=30)
    ax.set_title(title)
    fig.savefig(os.path.join(OUTPUT_DIR, filename), bbox_inches='tight')
    plt.close(fig)  # avoid accumulating open figures across the loop


def _save_box_pair(before, after, filename):
    """Save side-by-side boxplots of a column before and after imputation."""
    fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(8, 4))
    ax_before.boxplot(before.dropna())
    ax_before.set_title('Before')
    ax_after.boxplot(after.dropna())
    ax_after.set_title('After')
    fig.savefig(os.path.join(OUTPUT_DIR, filename), bbox_inches='tight')
    plt.close(fig)


df_before = df.copy()
df_imputed = df.copy()
for col, method in impute_method.items():
    # Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form operates on a temporary and does not modify the
    # frame under pandas Copy-on-Write (and warns on pandas 2.x).
    df_imputed[col] = df_imputed[col].fillna(_fill_value(df_imputed[col], method))

    # Plot before and after
    _save_hist(df_before[col], f"{col} — Before Imputation", f"{col}_before_imputation.png")
    _save_hist(df_imputed[col], f"{col} — After Imputation ({method})", f"{col}_after_imputation.png")
    _save_box_pair(df_before[col], df_imputed[col], f"{col}_box_before_after.png")

# Sanity check: report any column that still has gaps (e.g. a column with no
# observed values, for which no fill statistic can be computed).
still_missing = df_imputed[list(impute_method)].isna().sum()
still_missing = still_missing[still_missing > 0]
if not still_missing.empty:
    print('Warning: missing values remain after imputation in:', list(still_missing.index))
print('Imputation applied and plots saved to', OUTPUT_DIR)


In [None]:
# Part 4 & 5: Scaling (Standard, MinMax, Robust) and plotting comparisons
num_cols_for_scaling = list(df_imputed.select_dtypes(include=[np.number]).columns)
df_numeric = df_imputed[num_cols_for_scaling].copy()

std, mm, rb = StandardScaler(), MinMaxScaler(), RobustScaler()


def _scale_to_frame(scaler):
    """Fit `scaler` on df_numeric and return the result as a DataFrame with the same columns."""
    return pd.DataFrame(scaler.fit_transform(df_numeric), columns=df_numeric.columns)


standard_scaled = _scale_to_frame(std)
minmax_scaled = _scale_to_frame(mm)
robust_scaled = _scale_to_frame(rb)

# One row per scaler: (scaled frame, CSV file name, plot title label, plot file tag).
scaled_outputs = [
    (standard_scaled, 'diabetes_standard_scaled.csv', 'StandardScaler', 'standard'),
    (minmax_scaled, 'diabetes_minmax_scaled.csv', 'MinMaxScaler', 'minmax'),
    (robust_scaled, 'diabetes_robust_scaled.csv', 'RobustScaler', 'robust'),
]

# Save CSVs
os.makedirs(OUTPUT_DIR, exist_ok=True)
for frame, csv_name, _label, _tag in scaled_outputs:
    frame.to_csv(os.path.join(OUTPUT_DIR, csv_name), index=False)
print('Saved scaled CSVs to', OUTPUT_DIR)

# Plot comparisons for each column: original distribution on the left,
# scaled distribution on the right, one figure per (column, scaler) pair.
for col in df_numeric.columns:
    original_values = df_numeric[col].dropna()
    for frame, _csv_name, label, tag in scaled_outputs:
        fig, (ax_left, ax_right) = plt.subplots(1, 2, figsize=(10, 4))
        ax_left.hist(original_values, bins=30)
        ax_left.set_title(f"{col} — Original")
        ax_right.hist(frame[col].dropna(), bins=30)
        ax_right.set_title(f"{col} — {label}")
        fig.savefig(os.path.join(OUTPUT_DIR, f"{col}_{tag}_scaler.png"), bbox_inches='tight')
        plt.close(fig)
print('Scaling plots saved to', OUTPUT_DIR)


## Outputs
- Scaled CSVs and all plots are saved in the `Diabetes_outputs` folder.
- To reproduce the results, open this notebook and run the cells in order from top to bottom (later cells depend on variables created by earlier ones).

### Files generated when this notebook was created:
- Notebook: `/mnt/data/Diabetes_Preprocessing_Assignment_full.ipynb`
- Outputs folder: `/mnt/data/Diabetes_outputs` (contains PNGs and CSVs)