# Data Exploration and Analysis

This notebook explores the diabetes dataset, derives binary and multiclass target labels, and saves both versions for downstream classification tasks.

In [None]:
import warnings
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# Session-wide setup for the notebook.
# NOTE(review): 'ignore' with no category silences every warning for the rest of
# the session, including deprecation notices from pandas/seaborn - confirm this is intended.
warnings.filterwarnings('ignore')
# Apply the 'seaborn-v0_8' matplotlib style sheet to all figures created below.
plt.style.use('seaborn-v0_8')
# Simple confirmation that the import/setup cell ran to completion.
print('Libraries loaded!')

## Load Dataset

In [None]:
# Load the diabetes CSV. When the file is absent, fall back to a seeded synthetic
# frame with the same nine columns so the remaining cells can still run.
try:
    df = pd.read_csv('../data/raw/diabetes.csv')
except FileNotFoundError:
    # Only a missing file triggers the fallback. Parse errors, permission
    # problems and interrupts propagate instead of being silently replaced
    # by random data.
    print('WARNING: ../data/raw/diabetes.csv not found - using synthetic data instead.')
    np.random.seed(42)  # fixed seed keeps the synthetic frame reproducible
    n = 768
    df = pd.DataFrame({
        'Pregnancies': np.random.randint(0, 18, n),
        'Glucose': np.random.normal(120, 30, n).clip(0, 200),
        'BloodPressure': np.random.normal(70, 20, n).clip(0, 130),
        'SkinThickness': np.random.normal(20, 15, n).clip(0, 100),
        'Insulin': np.random.normal(80, 100, n).clip(0, 850),
        'BMI': np.random.normal(32, 8, n).clip(0, 70),
        'DiabetesPedigreeFunction': np.random.uniform(0.08, 2.5, n),
        'Age': np.random.randint(21, 82, n),
        'Outcome': np.random.choice([0, 1], n, p=[0.65, 0.35])
    })
df.head()

In [None]:
# Print the frame's structure: column names, dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

## Visualizations

In [None]:
# Outcome class balance: absolute counts (bar, left) next to relative shares (pie, right).
# The counts are computed once and shared by both panels.
outcome_counts = df['Outcome'].value_counts()
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
outcome_counts.plot.bar(ax=ax[0])
ax[0].set_title('Outcome Distribution')
outcome_counts.plot.pie(ax=ax[1], autopct='%1.1f%%')
ax[1].set_title('Outcome Proportion')
plt.tight_layout()
plt.show()

In [None]:
# Correlation matrix
# Anchor the colour scale to the full correlation range [-1, 1] and centre it on
# zero so the neutral colour of the diverging 'coolwarm' map means "no correlation".
# Without these anchors the scale is fitted to the observed min/max, which makes
# near-zero correlations look strongly negative.
plt.figure(figsize=(10, 8))
sns.heatmap(
    df.corr(),
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
)
plt.title('Feature Correlations')
plt.show()

## Create Multiclass Labels

In [None]:
# Work on a copy so the 'Stage' column added below does not end up in the binary-label frame `df`.
df_multi = df.copy()
def get_stage(row, negative_cutoff=100, positive_cutoff=140):
    """Map one record to a stage label in {0, 1, 2, 3}.
    The label combines the binary 'Outcome' flag with a glucose cut-off:
      Outcome == 0 and Glucose <  negative_cutoff -> 0
      Outcome == 0 and Glucose >= negative_cutoff -> 1
      Outcome != 0 and Glucose <  positive_cutoff -> 2
      Outcome != 0 and Glucose >= positive_cutoff -> 3
    Args:
        row: Mapping-like record (e.g. a DataFrame row) with 'Outcome' and
            'Glucose' entries.
        negative_cutoff: Glucose threshold applied when Outcome is 0.
            Defaults to 100, the value previously hard-coded.
        positive_cutoff: Glucose threshold applied otherwise.
            Defaults to 140, the value previously hard-coded.
    Returns:
        int: The stage label.
    """
    # A glucose value that is not below the cut-off (including NaN, for which
    # the comparison is False) falls into the higher stage of its group.
    if row['Outcome'] == 0:
        return 0 if row['Glucose'] < negative_cutoff else 1
    return 2 if row['Glucose'] < positive_cutoff else 3
# Label every row with its stage, report how many rows fall in each stage,
# and finish with a preview of the labelled frame.
df_multi['Stage'] = df_multi.apply(get_stage, axis='columns')
stage_counts = df_multi['Stage'].value_counts()
print(stage_counts)
df_multi.head()

## Save Processed Data

In [None]:
# Persist the binary-label frame and the multiclass-label frame as CSV files.
# to_csv does not create missing parent directories, so make sure the output
# directory exists first (a fresh checkout may not contain it).
processed_dir = Path('../data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(processed_dir / 'diabetes_binary.csv', index=False)
df_multi.to_csv(processed_dir / 'diabetes_multiclass.csv', index=False)
print('Data saved!')