# 🩺 Diabetes Prediction

**Project**: Binary Classification - Medical Diagnosis  
**Level**: Intermediate  
**Dataset**: Pima Indians Diabetes Dataset (this notebook generates a synthetic stand-in with the same columns)  

## 📋 Project Overview

This project predicts diabetes likelihood using machine learning on medical diagnostic data. We'll learn:

- Advanced data preprocessing
- Handling missing values in medical data
- Feature engineering for healthcare
- Class imbalance techniques
- Medical ML ethics and interpretability

Let's build a medical diagnostic model! 🏥

## 1. Import Libraries

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve

# Advanced ML
import xgboost as xgb
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Utilities
import warnings

# Silence library warnings so the cell output stays readable.
# NOTE: this is a blanket filter -- it hides every warning category,
# including deprecation notices, for the rest of the session.
warnings.filterwarnings('ignore')

# Set style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'.
# Fall back to the legacy name so older installations do not raise on an
# unknown style.
plot_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(plot_style)
sns.set_palette("husl")

print("✅ All libraries imported successfully!")
print("🩺 Ready for medical data analysis!")

## 2. Data Loading and Exploration

In [None]:
# Build the working DataFrame.
# With the real dataset on disk this cell would simply be:
#     df = pd.read_csv('diabetes.csv')
# To keep the notebook self-contained, a synthetic table with the same
# columns is generated instead.

# Fixed seed so every run yields the same synthetic patients
np.random.seed(42)
n_samples = 768

# Normally distributed measurements: column -> (mean, std, clip_low, clip_high).
# They are sampled in this order, right after 'Pregnancies'.
gaussian_specs = {
    'Glucose': (120, 30, 0, 200),
    'BloodPressure': (70, 15, 0, 150),
    'SkinThickness': (20, 10, 0, 100),
    'Insulin': (80, 50, 0, 300),
    'BMI': (32, 8, 15, 60),
}

# Column order (and therefore the order of random draws) matters for
# reproducibility, so the dict is filled sequentially.
data = {'Pregnancies': np.random.poisson(3, n_samples)}
for column, (mean, std, low, high) in gaussian_specs.items():
    data[column] = np.clip(np.random.normal(mean, std, n_samples), low, high)
data['DiabetesPedigreeFunction'] = np.random.gamma(2, 0.2, n_samples)
data['Age'] = np.clip(np.random.normal(33, 12, n_samples), 18, 80)

df = pd.DataFrame(data)

# Target variable: a linear score over four features (despite its name,
# `diabetes_prob` holds log-odds) is squashed through a sigmoid, then each
# patient is labelled by comparing a uniform draw against that probability.
diabetes_prob = (
    df['Glucose'] * 0.01
    + df['BMI'] * 0.02
    + df['Age'] * 0.01
    + df['DiabetesPedigreeFunction'] * 0.5
    - 3
)
outcome_probability = 1 / (1 + np.exp(-diabetes_prob))
uniform_draws = np.random.random(n_samples)
df['Outcome'] = (uniform_draws < outcome_probability).astype(int)

# Simulate missing measurements: 5% of the rows of each listed column are
# overwritten with 0, the placeholder this dataset uses for "not recorded".
missing_features = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
n_missing = int(0.05 * len(df))
for feature in missing_features:
    blank_rows = np.random.choice(df.index, size=n_missing, replace=False)
    df.loc[blank_rows, feature] = 0

print("🩺 Diabetes dataset loaded!")
print(f"Dataset shape: {df.shape}")
print(f"Features: {list(df.columns[:-1])}")
print(f"Target: Outcome (0=No Diabetes, 1=Diabetes)")

In [None]:
# Preview the leading records to sanity-check column names and value ranges.
# The bare expression on the last line is what the notebook renders as a table.
print("🔍 First 5 rows:")
df.head(n=5)

In [None]:
# Dataset information: size, class counts, zero-encoded missing values and
# the descriptive statistics table.
print("📊 Dataset Information:")
print(f"Total patients: {len(df)}")
print(f"Features: {df.shape[1] - 1}")
print(f"Diabetes cases: {df['Outcome'].sum()} ({df['Outcome'].mean():.1%})")
print(f"Non-diabetes cases: {(df['Outcome'] == 0).sum()} ({(df['Outcome'] == 0).mean():.1%})")

# Only these measurements use 0 as a "not recorded" placeholder. A zero in
# e.g. 'Pregnancies' is a legitimate value and must not be counted as missing.
zero_means_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

print("\n❌ Missing Values (represented as 0):")
for col in zero_means_missing:
    zero_count = (df[col] == 0).sum()
    if zero_count > 0:
        print(f"• {col}: {zero_count} ({zero_count/len(df):.1%})")

print("\n📈 Statistical Summary:")
# Last expression of the cell, so the notebook renders the summary table
df.describe()

## 3. Exploratory Data Analysis

In [None]:
# Class distribution: pie chart and bar chart of the Outcome column,
# followed by a textual class-balance summary.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

class_labels = ['No Diabetes', 'Diabetes']
class_colors = ['#4ECDC4', '#FF6B6B']

# value_counts() orders by frequency, not by class label. Pin the order to
# [0, 1] so the hard-coded labels/colours always line up with the right
# class, and a class with no rows shows up as 0 instead of raising KeyError.
outcome_counts = df['Outcome'].value_counts().reindex([0, 1], fill_value=0)

# Pie chart
axes[0].pie(outcome_counts.values, labels=class_labels,
           autopct='%1.1f%%', startangle=90, colors=class_colors)
axes[0].set_title('🩺 Diabetes Distribution', fontweight='bold', fontsize=14)

# Bar chart
bars = axes[1].bar(class_labels, outcome_counts.values, color=class_colors)
axes[1].set_title('📊 Patient Count by Diagnosis', fontweight='bold', fontsize=14)
axes[1].set_ylabel('Number of Patients')

# Add value labels slightly above each bar
for bar in bars:
    height = bar.get_height()
    axes[1].text(bar.get_x() + bar.get_width()/2., height + 5,
                f'{int(height)}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

print(f"📊 Class Balance:")
print(f"• No Diabetes: {outcome_counts[0]} patients ({outcome_counts[0]/len(df):.1%})")
print(f"• Diabetes: {outcome_counts[1]} patients ({outcome_counts[1]/len(df):.1%})")
print(f"• Class Imbalance Ratio: {outcome_counts[0]/outcome_counts[1]:.1f}:1")

In [None]:
# Feature distributions by diabetes outcome: one density histogram per
# feature, with the two outcome groups overlaid.
fig, axes = plt.subplots(2, 4, figsize=(20, 12))
fig.suptitle('🔍 Medical Feature Distributions by Diabetes Status', fontsize=16, fontweight='bold')

features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 
           'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

# Columns in which 0 is a placeholder for a missing measurement. For every
# other column (notably 'Pregnancies') zero is a real value and stays in
# the plot.
zero_means_missing = {'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI'}

# axes.flat walks the 2x4 grid row by row, matching the feature order
for ax, feature in zip(axes.flat, features):
    # Drop placeholder zeros only where they encode "missing"
    subset = df[df[feature] > 0] if feature in zero_means_missing else df
    no_diabetes = subset.loc[subset['Outcome'] == 0, feature]
    diabetes = subset.loc[subset['Outcome'] == 1, feature]

    ax.hist(no_diabetes, alpha=0.7, label='No Diabetes',
            bins=20, color='#4ECDC4', density=True)
    ax.hist(diabetes, alpha=0.7, label='Diabetes',
            bins=20, color='#FF6B6B', density=True)

    ax.set_title(f'{feature}', fontweight='bold')
    ax.set_xlabel(feature)
    ax.set_ylabel('Density')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Correlation matrix: heatmap of pairwise feature correlations plus a ranked
# list of each feature's correlation with the Outcome label.
plt.figure(figsize=(12, 10))

# Treat placeholder zeros as missing, but ONLY in the measurement columns
# that use 0 that way. Replacing zeros across the whole frame would also
# blank out Outcome == 0 (leaving Outcome constant, so every correlation
# with it becomes NaN) and legitimate zero pregnancies.
zero_means_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df_clean = df.copy()
df_clean[zero_means_missing] = df_clean[zero_means_missing].replace(0, np.nan)
correlation_matrix = df_clean.corr()

# Create heatmap; the mask hides the diagonal and the redundant upper triangle
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(correlation_matrix, mask=mask, annot=True, cmap='RdYlBu_r', center=0,
            square=True, linewidths=0.5, fmt='.2f')
plt.title('🔥 Medical Feature Correlations', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

print("🔍 Key Correlations with Diabetes:")
# Remove the label's self-correlation by name rather than by position, then
# rank the remaining features by absolute correlation strength.
diabetes_corr = (
    correlation_matrix['Outcome']
    .drop('Outcome')
    .sort_values(key=abs, ascending=False)
)
for feature, corr in diabetes_corr.items():
    if not np.isnan(corr):
        print(f"• {feature}: {corr:.3f}")
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Data Preprocessing and Feature Engineering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a copy for preprocessing\n",
    "data = df.copy()\n",
    "\n",
    "print(\"🔧 Starting advanced data preprocessing...\")\n",
    "print(f\"Original dataset shape: {data.shape}\")\n",
    "\n",
    "# 1. Handle missing values (zeros represent missing in medical data)\n",
    "print(\"\\n❌ Handling missing values...\")\n",
    "missing_features = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']\n",
    "\n",
    "for feature in missing_features:\n",
    "    # Replace 0 with NaN\n",
    "    data[feature] = data[feature].replace(0, np.nan)\n",
    "    \n",
    "    # Impute with median grouped by outcome\n",
    "    median_values = data.groupby('Outcome')[feature].median()\n",
    "    data[feature] = data.groupby('Outcome')[feature].transform(\n",
    "        lambda x: x.fillna(x.median())\n",
    "    )\n",
    "    \n",
    "    missing_count = data[feature].isnull().sum()\n",
    "    print(f\"✅ {feature}: {missing_count} missing values remaining\")\n",
    "\n",
    "print(\"\\n🛠️ Creating new features...\")\n",
    "\n",
    "# 2. Feature engineering\n",
    "# BMI categories\n",
    "data['BMI_Category'] = pd.cut(data['BMI'], \n",
    "                             bins=[0, 18.5, 25, 30, float('inf')],\n",
    "                             labels=['Underweight', 'Normal', 'Overweight', 'Obese'])\n",
    "\n",
    "# Age groups\n",
    "data['Age_Group'] = pd.cut(data['Age'],\n",
    "                          bins=[0, 30, 40, 50, float('inf')],\n",
    "                          labels=['Young', 'Adult', 'Middle_age', 'Senior'])\n",
    "\n",
    "# Risk score (simple combination)\n",
    "data['Risk_Score'] = (\n",
    "    (data['Glucose'] > 125).astype(int) * 3 +\n",
    "    (data['BMI'] > 30).astype(int) * 2 +\n",
    "    (data['Age'] > 45).astype(int) * 1 +\n",
    "    (data['BloodPressure'] > 80).astype(int) * 1\n",
    ")\n",
    "\n",
    "print(f\"✅ Created BMI categories\")\n",
    "print(f\"✅ Created age groups\")\n",
    "print(f\"✅ Created risk score\")\n",
    "\n",
    "print(f\"\\n📊 Final dataset shape: {data.shape}\")\n",
    "print(f\"📊 New features added: {data.shape[1] - df.shape[1]}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Model Building and Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prepare features for modeling\n",
    "print(\"🔧 Preparing features for machine learning...\")\n",
    "\n",
    "# Select numerical features\n",
    "numerical_features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', \n",
    "                     'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Risk_Score']\n",
    "\n",
    "X = data[numerical_features]\n",
    "y = data['Outcome']\n",
    "\n",
    "print(f\"📊 Feature matrix shape: {X.shape}\")\n",
    "print(f\"📊 Target variable shape: {y.shape}\")\n",
    "print(f\"📊 Features used: {list(X.columns)}\")\n",
    "\n",
    "# Split the data\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X, y, test_size=0.2, random_state=42, stratify=y\n",
    ")\n",
    "\n",
    "print(f\"\\n✂️ Data split completed:\")\n",
    "print(f\"Training set: {X_train.shape[0]} samples\")\n",
    "print(f\"Testing set: {X_test.shape[0]} samples\")\n",
    "print(f\"Training diabetes rate: {y_train.mean():.1%}\")\n",
    "print(f\"Testing diabetes rate: {y_test.mean():.1%}\")\n",
    "\n",
    "# Scale the features\n",
    "scaler = StandardScaler()\n",
    "X_train_scaled = scaler.fit_transform(X_train)\n",
    "X_test_scaled = scaler.transform(X_test)\n",
    "\n",
    "print(\"\\n⚖️ Feature scaling completed!\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}