# Diabetes Prediction Analysis
**Author:** TNT  
**Version:** 1.2  
**Dataset:** Pima Indian Diabetes Dataset  
**Description:** Interactive analysis and visualization of diabetes prediction models


## 1. Setup and Data Loading

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import json
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.model_selection import learning_curve
import warnings
# NOTE(review): this suppresses every warning category for the rest of the
# session, including deprecation notices — confirm that is intended.
warnings.filterwarnings(action='ignore')

# Plot appearance. The style sheet is applied before the seaborn palette;
# the order is kept as in the original (presumably so the style sheet does
# not reset the palette — verify before reordering).
plt.style.use('seaborn-v0_8')
sns.set_palette(palette="husl")

print("Libraries imported successfully!")

In [None]:
# Read the Pima diabetes CSV; the path is relative to the working directory
# the notebook is run from.
data = pd.read_csv('../data/pima-diabetes.csv')

# Quick orientation: dimensions, column names, then a preview of the rows.
column_names = data.columns.tolist()
print(f"Dataset shape: {data.shape}")
print(f"\nColumns: {column_names}")
print("\nFirst 5 rows:")
data.head(5)

## 2. Exploratory Data Analysis

In [None]:
# Basic statistics
print("Dataset Info:")
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called on its own; wrapping it in print() adds a stray "None" line.
data.info()

print("\nBasic Statistics:")
# Left as the cell's last expression so the notebook renders it as a table.
data.describe()

In [None]:
# Null counts per column
print("Missing Values:")
print(data.isna().sum())

# Zero counts for selected measurement columns
# (presumably zeros stand in for missing measurements here — confirm against
# the dataset documentation).
print("\nZero Values Analysis:")
zero_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
total_rows = len(data)

# Only report on columns that actually exist in the loaded frame.
present_cols = [name for name in zero_cols if name in data.columns]
for column in present_cols:
    n_zero = data[column].eq(0).sum()
    pct_zero = n_zero / total_rows * 100
    print(f"{column}: {n_zero} zero values ({pct_zero:.2f}%)")

In [None]:
# Target variable distribution: a count plot and a pie chart side by side.
fig, (ax_count, ax_pie) = plt.subplots(1, 2, figsize=(12, 5))

# Count plot
sns.countplot(data=data, x='Class', ax=ax_count)
ax_count.set_title('Distribution of Diabetes Cases')
ax_count.set_xlabel('Class (0: No Diabetes, 1: Diabetes)')

# Pie chart
# value_counts() orders by frequency, not by class value, so a fixed label
# list can end up attached to the wrong slice (or have the wrong length when
# only one class is present). Sort by class value and look each label up
# from the index instead; unknown class values fall back to their own text.
class_names = {0: 'No Diabetes', 1: 'Diabetes'}
class_counts = data['Class'].value_counts().sort_index()
pie_labels = [class_names.get(value, str(value)) for value in class_counts.index]
ax_pie.pie(class_counts.values, labels=pie_labels, autopct='%1.1f%%')
ax_pie.set_title('Diabetes Prevalence')

fig.tight_layout()
plt.show()

# The mean of the 'Class' column is reported as the prevalence percentage.
print(f"Diabetes prevalence: {data['Class'].mean()*100:.2f}%")

In [None]:
# Feature distributions
# Select the predictors by name rather than by position, so the target is
# excluded wherever the 'Class' column sits in the frame.
features = data.columns.drop('Class')

# Size the grid from the number of features instead of hard-coding 3x3, so
# extra features do not index past the grid. Three columns, 4 inches of
# height per row (eight features give the original 15x12 figure).
n_cols = 3
n_rows = max(1, -(-len(features) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)
flat_axes = axes.ravel()

for ax, feature in zip(flat_axes, features):
    # Histogram with KDE, split by class
    sns.histplot(data=data, x=feature, hue='Class', kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature}')

# Hide grid cells that have no feature to show (e.g. the ninth cell when
# there are eight features) instead of leaving empty axes in the figure.
for ax in flat_axes[len(features):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Correlation analysis
correlation_matrix = data.corr()

# Mask the upper triangle (diagonal included) so each pair is drawn once.
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, mask=mask, annot=True, cmap='coolwarm', center=0,
            square=True, linewidths=0.5)
plt.title('Feature Correlation Heatmap')
plt.show()

# Show strongest correlations with target (absolute value, descending)
target_corr = correlation_matrix['Class'].abs().sort_values(ascending=False)
print("\nStrongest correlations with diabetes:")
# Remove the target's self-correlation by label; a positional slice would
# drop whichever entry happened to sort first.
print(target_corr.drop('Class'))

## 3. Summary and Insights

In [None]:
# Generate summary insights
print("DIABETES PREDICTION ANALYSIS SUMMARY")
print("="*50)

# Headline numbers; the feature count excludes one column (the target).
print("\nDataset Overview:")
print(f"- Total samples: {len(data)}")
print(f"- Features: {len(data.columns)-1}")
print(f"- Diabetes prevalence: {data['Class'].mean()*100:.1f}%")

print("\nKey Risk Factors (based on correlation):")
target_corr = data.corr()['Class'].abs().sort_values(ascending=False)
# Drop the target's self-correlation by label before taking the top three;
# a positional slice would rely on 'Class' always sorting first.
top_factors = target_corr.drop('Class').head(3)
for rank, (feature, corr) in enumerate(top_factors.items(), start=1):
    print(f"{rank}. {feature}: {corr:.3f}")

# NOTE(review): the recommendations below are fixed text; they are not
# derived from the correlations computed above.
print("\nRecommendations:")
print("- Focus on glucose monitoring for early detection")
print("- BMI management is crucial for diabetes prevention")
print("- Age-specific screening protocols should be implemented")
print("- Family history (pedigree) should be considered in risk assessment")