# Pima Indians Diabetes Dataset - Exploratory Data Analysis

## Comprehensive EDA: Univariate, Bivariate, and Multivariate Analysis

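This notebook explores the Pima Indians Diabetes dataset in three stages: univariate analysis (the distribution of each feature), bivariate analysis (each feature against `Outcome`), and multivariate analysis (correlation heatmap, PCA, a 3D scatter plot, and interaction features). It closes with IQR-based outlier detection and a summary of findings.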


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 8)
%matplotlib inline


In [None]:
# Column names (the CSV is read without a header row, so they are supplied here)
columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]

# Load the dataset
# The first nine lines of the file are skipped and no line is treated as a header.
# NOTE(review): presumably those nine lines are a descriptive preamble rather
# than data rows -- confirm against the CSV.
df = pd.read_csv(
    '../../data/pima-indians-diabetes.csv',
    skiprows=9,
    header=None,
)
df.columns = columns

print(f"Dataset Shape: {df.shape}")
df.head()


## 1. UNIVARIATE ANALYSIS

### 1.1 Distribution Analysis


In [None]:
# Univariate Analysis: Distribution of each feature
numeric_features = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age',
]

# One histogram panel per feature on a 2x4 grid, flattened for simple iteration
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.ravel()

for panel, feature in zip(axes, numeric_features):
    values = df[feature]
    panel.hist(values, bins=30, edgecolor='black', alpha=0.7)
    panel.set(
        title=f'Distribution of {feature}',
        xlabel=feature,
        ylabel='Frequency',
    )
    # Dashed reference lines: mean in red, median in green
    panel.axvline(values.mean(), color='red', linestyle='--', label='Mean')
    panel.axvline(values.median(), color='green', linestyle='--', label='Median')
    panel.legend()

plt.tight_layout()
plt.show()


In [None]:
# Box plots for univariate analysis
# One vertical box plot per feature on a flattened 2x4 grid.
fig, grid = plt.subplots(2, 4, figsize=(20, 10))
axes = grid.ravel()

for panel, feature in zip(axes, numeric_features):
    panel.boxplot(df[feature], vert=True)
    panel.set(title=f'Box Plot of {feature}', ylabel=feature)

plt.tight_layout()
plt.show()


In [None]:
# Violin plots for better distribution visualization
# Same 2x4 layout as the histograms and box plots, one panel per feature.
fig, grid = plt.subplots(2, 4, figsize=(20, 10))
axes = grid.ravel()

for panel, feature in zip(axes, numeric_features):
    sns.violinplot(y=df[feature], ax=panel)
    panel.set_title(f'Violin Plot of {feature}')
    panel.set_ylabel(feature)

plt.tight_layout()
plt.show()


In [None]:
# Summary statistics for univariate analysis
banner = "=" * 60
print(banner)
print("UNIVARIATE ANALYSIS SUMMARY")
print(banner)

# Statistic label -> function applied to one feature column. The insertion
# order here is the column order of the resulting table.
summary_functions = {
    'Mean': lambda series: series.mean(),
    'Median': lambda series: series.median(),
    'Std': lambda series: series.std(),
    'Skewness': lambda series: series.skew(),
    'Kurtosis': lambda series: series.kurtosis(),
    'Min': lambda series: series.min(),
    'Max': lambda series: series.max(),
    'Q1': lambda series: series.quantile(0.25),
    'Q3': lambda series: series.quantile(0.75),
}

# One record (row) per feature, with 'Feature' as the leading column
univariate_stats = pd.DataFrame([
    {
        'Feature': feature,
        **{label: compute(df[feature]) for label, compute in summary_functions.items()},
    }
    for feature in numeric_features
])

print(univariate_stats)


## 2. BIVARIATE ANALYSIS

### 2.1 Feature vs Outcome Analysis


In [None]:
# Bivariate Analysis: Feature distributions by Outcome
# One grouped box plot per feature (boxes split by Outcome) on a 2x4 grid.
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
axes = axes.ravel()

for idx, feature in enumerate(numeric_features):
    df.boxplot(column=feature, by='Outcome', ax=axes[idx])
    # Override the title/labels pandas assigns to the panel
    axes[idx].set_title(f'{feature} by Outcome')
    axes[idx].set_xlabel('Outcome (0=No Diabetes, 1=Diabetes)')
    axes[idx].set_ylabel(feature)

# DataFrame.boxplot(by=...) writes a figure-level "Boxplot grouped by Outcome"
# suptitle on every call; it overlaps the top row of panel titles, so clear it.
fig.suptitle('')
plt.tight_layout()
plt.show()


In [None]:
# Density plots by outcome
# For each feature, overlay the kernel density estimate of each outcome group.
fig, grid = plt.subplots(2, 4, figsize=(20, 10))
axes = grid.ravel()

# (Outcome value, legend label), drawn in this order
outcome_groups = [(0, 'No Diabetes'), (1, 'Diabetes')]

for panel, feature in zip(axes, numeric_features):
    for outcome_value, group_label in outcome_groups:
        group_values = df.loc[df['Outcome'] == outcome_value, feature]
        group_values.plot(kind='density', ax=panel, label=group_label, alpha=0.7)
    panel.set_title(f'{feature} Distribution by Outcome')
    panel.set_xlabel(feature)
    panel.legend()

plt.tight_layout()
plt.show()


In [None]:
# Scatter plots: Key features vs Outcome
key_features = ['Glucose', 'BMI', 'Age', 'BloodPressure']

# 2x2 grid, one panel per key feature; Outcome takes only the values on the
# two y ticks set below.
fig, grid = plt.subplots(2, 2, figsize=(14, 12))
axes = grid.ravel()

for panel, feature in zip(axes, key_features):
    panel.scatter(df[feature], df['Outcome'], alpha=0.5)
    panel.set(
        xlabel=feature,
        ylabel='Outcome',
        title=f'{feature} vs Outcome',
        yticks=[0, 1],
    )

plt.tight_layout()
plt.show()


In [None]:
# Correlation with Outcome
banner = "=" * 60
print(banner)
print("BIVARIATE ANALYSIS: CORRELATION WITH OUTCOME")
print(banner)

# Correlation of every feature (and Outcome itself) with Outcome,
# ordered from most positive to most negative.
outcome_column = df[numeric_features + ['Outcome']].corr()['Outcome']
correlations = outcome_column.sort_values(ascending=False)
print(correlations)

# Visualize correlations (Outcome's self-correlation is dropped from the chart)
plt.figure(figsize=(10, 6))
feature_correlations = correlations.drop('Outcome')
feature_correlations.plot(kind='barh', color='steelblue')
plt.title('Feature Correlations with Outcome')
plt.xlabel('Correlation Coefficient')
plt.tight_layout()
plt.show()


In [None]:
# Scatter matrix for key features
from pandas.plotting import scatter_matrix

key_features_subset = ['Glucose', 'BMI', 'Age', 'BloodPressure', 'Outcome']

# Pairwise scatter plots off the diagonal, histograms on the diagonal
subset_frame = df[key_features_subset]
scatter_matrix(
    subset_frame,
    figsize=(16, 16),
    alpha=0.6,
    diagonal='hist',
)
plt.suptitle('Scatter Matrix of Key Features', y=1.02, fontsize=16)
plt.tight_layout()
plt.show()


In [None]:
# Pairwise correlations heatmap
# Correlation matrix over all features plus Outcome, annotated to two decimals
# with the diverging colormap centred on zero.
heatmap_columns = numeric_features + ['Outcome']

plt.figure(figsize=(12, 10))
correlation_matrix = df[heatmap_columns].corr()
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Feature Correlation Heatmap')
plt.tight_layout()
plt.show()


In [None]:
# PCA for dimensionality reduction and visualization
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Prepare data (handle zeros as missing values)
# Only measurement columns are treated this way: a reading of 0 for glucose,
# blood pressure, skin thickness, insulin or BMI is not a plausible value and
# is taken to mean "not recorded". Pregnancies is excluded because a count of
# 0 is legitimate; replacing it would turn real observations into the median.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df_clean = df[numeric_features].replace(
    {column: 0 for column in zero_as_missing}, np.nan
)
# Impute each column's missing entries with that column's median
df_clean = df_clean.fillna(df_clean.median())

# Standardize features (PCA is scale-sensitive)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_clean)

# Apply PCA, keeping the first two components for a 2D view
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Visualize PCA results, one point per row, coloured by Outcome
plt.figure(figsize=(12, 8))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=df['Outcome'],
                     cmap='viridis', alpha=0.6, edgecolors='black')
plt.colorbar(scatter, label='Outcome')
plt.xlabel(f'First Principal Component (Explained Variance: {pca.explained_variance_ratio_[0]:.2%})')
plt.ylabel(f'Second Principal Component (Explained Variance: {pca.explained_variance_ratio_[1]:.2%})')
plt.title('PCA Visualization of Pima Indians Diabetes Dataset')
# A single scatter call yields one artist, so passing two bare labels to
# legend() cannot label both classes. Build one handle per distinct Outcome
# value instead; handles come back in ascending value order (0, then 1).
outcome_handles, _ = scatter.legend_elements()
plt.legend(outcome_handles, ['No Diabetes', 'Diabetes'], title='Outcome')
plt.tight_layout()
plt.show()

print(f"Total Explained Variance: {pca.explained_variance_ratio_.sum():.2%}")
print(f"PCA Components:\n{pca.components_}")


In [None]:
# 3D Scatter plot with top 3 features
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure(figsize=(14, 10))
ax = fig.add_subplot(111, projection='3d')

# Use top correlated features: one per axis, in x, y, z order
top_features = ['Glucose', 'BMI', 'Age']
x, y, z = (df[name] for name in top_features)
colors = df['Outcome']

scatter = ax.scatter(
    x, y, z,
    c=colors,
    cmap='viridis',
    alpha=0.6,
    edgecolors='black',
)

x_name, y_name, z_name = top_features
ax.set_xlabel(x_name)
ax.set_ylabel(y_name)
ax.set_zlabel(z_name)
ax.set_title('3D Scatter Plot: Glucose vs BMI vs Age (Colored by Outcome)')
plt.colorbar(scatter, label='Outcome')
plt.show()


In [None]:
# Multivariate analysis: Interaction effects
# Each interaction feature is the element-wise product of two existing columns.
# New column name -> (left factor, right factor); columns are added to df in
# this order.
interaction_pairs = {
    'Glucose_BMI': ('Glucose', 'BMI'),
    'Age_Glucose': ('Age', 'Glucose'),
    'BMI_Age': ('BMI', 'Age'),
}

# Create interaction features
for new_column, (left, right) in interaction_pairs.items():
    df[new_column] = df[left] * df[right]

# Correlation of interaction features with outcome
interaction_features = list(interaction_pairs)
outcome_correlations = df[interaction_features + ['Outcome']].corr()['Outcome']
interaction_corr = outcome_correlations.drop('Outcome')

banner = "=" * 60
print(banner)
print("INTERACTION FEATURES CORRELATION WITH OUTCOME")
print(banner)
print(interaction_corr.sort_values(ascending=False))

# Bar chart of the same correlations, in feature-definition order
plt.figure(figsize=(10, 6))
interaction_corr.plot(kind='barh', color='coral')
plt.title('Interaction Features Correlation with Outcome')
plt.xlabel('Correlation Coefficient')
plt.tight_layout()
plt.show()


### 3.2 Outlier Detection


In [None]:
# Outlier detection using IQR method
banner = "=" * 60
print(banner)
print("OUTLIER DETECTION (IQR Method)")
print(banner)

# A value is flagged when it lies more than 1.5 * IQR below the first quartile
# or above the third quartile of its own feature.
outliers_summary = []
for feature in numeric_features:
    values = df[feature]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    lower_bound = first_quartile - 1.5 * spread
    upper_bound = third_quartile + 1.5 * spread

    # Boolean mask over rows; counting True entries gives the outlier count
    is_outlier = (values < lower_bound) | (values > upper_bound)
    outlier_count = int(is_outlier.sum())

    outliers_summary.append({
        'Feature': feature,
        'Lower Bound': lower_bound,
        'Upper Bound': upper_bound,
        'Outlier Count': outlier_count,
        'Outlier Percentage': outlier_count / len(df) * 100,
    })

outliers_df = pd.DataFrame(outliers_summary)
print(outliers_df)


## 4. EDA SUMMARY


In [None]:
# Print the written summary of findings. The text is fixed prose, not computed
# from the data, so it must be revised by hand if the analysis above changes.
banner = "=" * 60

summary_lines = [
    banner,
    "EXPLORATORY DATA ANALYSIS SUMMARY",
    banner,
    "\n1. UNIVARIATE ANALYSIS:",
    "   - Most features show right-skewed distributions",
    "   - Presence of zeros in several features (potential missing data)",
    "   - Wide ranges in features like Insulin and SkinThickness",
    "\n2. BIVARIATE ANALYSIS:",
    "   - Strong positive correlation: Glucose, BMI, Age with Outcome",
    "   - Clear separation between diabetic and non-diabetic groups for key features",
    "   - Glucose shows the strongest association with diabetes outcome",
    "\n3. MULTIVARIATE ANALYSIS:",
    "   - PCA reveals some clustering by outcome",
    "   - Interaction features may improve model performance",
    "   - Multiple features contribute to diabetes prediction",
    "\n4. KEY FINDINGS:",
    "   - Glucose is the most important predictor",
    "   - BMI and Age are significant factors",
    "   - Data quality issues need addressing (zero values)",
    "   - Some outliers present but may be valid clinical values",
    banner,
]

for line in summary_lines:
    print(line)
