# Pima Indians Diabetes Dataset - Machine Learning Analysis

## Comprehensive ML Analysis using Appropriate Algorithms

This notebook trains and compares six classifiers for diabetes prediction (the binary `Outcome` column), evaluating each on a held-out test set and with 5-fold cross-validation:
- Logistic Regression
- Random Forest
- Support Vector Machine (SVM)
- Gradient Boosting (scikit-learn `GradientBoostingClassifier`)
- K-Nearest Neighbors (KNN)
- Neural Network


In [None]:
# Import necessary libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                            f1_score, roc_auc_score, confusion_matrix, 
                            classification_report, roc_curve, auc)
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
warnings.filterwarnings('ignore')

# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
%matplotlib inline


In [None]:
# Load and prepare the dataset
# Column names are attached by hand because the file is read with header=None.
columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]

# The first nine lines of the file are skipped.
# NOTE(review): presumably a descriptive preamble in the CSV; confirm against the data file.
df = pd.read_csv(
    '../../data/pima-indians-diabetes.csv',
    header=None,
    skiprows=9,
)
df.columns = columns

# Quick sanity output: overall shape and how many rows fall in each class.
outcome_counts = df['Outcome'].value_counts()
print(f"Dataset Shape: {df.shape}")
print(f"\nClass Distribution:\n{outcome_counts}")
df.head()


## 1. Data Preprocessing


In [None]:
# Handle missing values (zeros in this dataset)
# For these features a value of zero is treated as "not recorded".
features_to_fix = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Mark the placeholder zeros as NaN first. Computing the median while the zeros
# are still present would pull it towards zero, so the zeros must not take part
# in the statistic used to replace them.
df_processed = df.copy()
df_processed[features_to_fix] = df_processed[features_to_fix].replace(0, np.nan)

# Separate features and target
# Note: X and df_processed keep NaN for the missing entries; the imputed data
# lives in X_train / X_test below.
X = df_processed.drop('Outcome', axis=1)
y = df_processed['Outcome']

# Split the data (stratified so both partitions keep the class proportions)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Impute AFTER splitting, using medians learned from the training rows only.
# Deriving the medians from the full dataset would leak test-set information
# into the training features.
train_medians = X_train[features_to_fix].median()  # NaN entries are skipped
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Every marked-missing value in the fixed features must now be filled.
assert not X_train[features_to_fix].isna().any().any()
assert not X_test[features_to_fix].isna().any().any()

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")
print(f"\nTraining set class distribution:\n{y_train.value_counts()}")
print(f"\nTest set class distribution:\n{y_test.value_counts()}")


In [None]:
# Feature scaling (for algorithms that need it)
# The scaler learns its mean/variance from the training features only and the
# same fitted transform is then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Unscaled NumPy copies, used by the tree-based models
X_train_original, X_test_original = X_train.values, X_test.values


## 2. Model Training and Evaluation

### 2.1 Logistic Regression


In [None]:
# Logistic Regression
# Trained on the standardised features and evaluated on the held-out test set.
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train_scaled, y_train)

# Column 1 of predict_proba is the probability of the class at index 1
# (presumably Outcome == 1).
lr_pred_proba = lr.predict_proba(X_test_scaled)[:, 1]
lr_pred = lr.predict(X_test_scaled)

# Label-based metrics use the hard predictions; ROC-AUC uses the probabilities.
lr_accuracy, lr_precision, lr_recall, lr_f1 = [
    score_fn(y_test, lr_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
]
lr_roc_auc = roc_auc_score(y_test, lr_pred_proba)

print("Logistic Regression Results:")
print(
    f"Accuracy: {lr_accuracy:.4f}",
    f"Precision: {lr_precision:.4f}",
    f"Recall: {lr_recall:.4f}",
    f"F1-Score: {lr_f1:.4f}",
    f"ROC-AUC: {lr_roc_auc:.4f}",
    sep="\n",
)
print(f"\nConfusion Matrix:\n{confusion_matrix(y_test, lr_pred)}")
print(f"\nClassification Report:\n{classification_report(y_test, lr_pred)}")


### 2.2 Random Forest


In [None]:
# Random Forest
# Tree ensembles are fitted on the unscaled feature arrays.
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf.fit(X_train_original, y_train)

# Column 1 of predict_proba is the probability of the class at index 1
# (presumably Outcome == 1).
rf_pred_proba = rf.predict_proba(X_test_original)[:, 1]
rf_pred = rf.predict(X_test_original)

# Label-based metrics use the hard predictions; ROC-AUC uses the probabilities.
rf_accuracy, rf_precision, rf_recall, rf_f1 = [
    score_fn(y_test, rf_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
]
rf_roc_auc = roc_auc_score(y_test, rf_pred_proba)

print("Random Forest Results:")
print(
    f"Accuracy: {rf_accuracy:.4f}",
    f"Precision: {rf_precision:.4f}",
    f"Recall: {rf_recall:.4f}",
    f"F1-Score: {rf_f1:.4f}",
    f"ROC-AUC: {rf_roc_auc:.4f}",
    sep="\n",
)
print(f"\nConfusion Matrix:\n{confusion_matrix(y_test, rf_pred)}")
print(f"\nClassification Report:\n{classification_report(y_test, rf_pred)}")

# Feature importance: pair each column name with the forest's importance
# score, then order from most to least important.
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf.feature_importances_,
})
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

print(f"\nFeature Importance:\n{feature_importance}")


### 2.3 Support Vector Machine (SVM)


In [None]:
# Support Vector Machine
# RBF-kernel SVM on the standardised features; probability=True enables
# predict_proba, which is needed for the ROC-AUC below.
svm = SVC(kernel='rbf', probability=True, random_state=42).fit(X_train_scaled, y_train)

# Column 1 of predict_proba is the probability of the class at index 1
# (presumably Outcome == 1).
svm_pred_proba = svm.predict_proba(X_test_scaled)[:, 1]
svm_pred = svm.predict(X_test_scaled)

# Label-based metrics use the hard predictions; ROC-AUC uses the probabilities.
svm_accuracy, svm_precision, svm_recall, svm_f1 = [
    score_fn(y_test, svm_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
]
svm_roc_auc = roc_auc_score(y_test, svm_pred_proba)

print("SVM Results:")
print("\n".join([
    f"Accuracy: {svm_accuracy:.4f}",
    f"Precision: {svm_precision:.4f}",
    f"Recall: {svm_recall:.4f}",
    f"F1-Score: {svm_f1:.4f}",
    f"ROC-AUC: {svm_roc_auc:.4f}",
]))
print(f"\nConfusion Matrix:\n{confusion_matrix(y_test, svm_pred)}")
print(f"\nClassification Report:\n{classification_report(y_test, svm_pred)}")


### 2.4 Gradient Boosting


In [None]:
# Gradient Boosting
# scikit-learn's GradientBoostingClassifier, fitted on the unscaled feature arrays.
gb = GradientBoostingClassifier(n_estimators=100, max_depth=5, random_state=42)
gb.fit(X_train_original, y_train)

# Column 1 of predict_proba is the probability of the class at index 1
# (presumably Outcome == 1).
gb_pred_proba = gb.predict_proba(X_test_original)[:, 1]
gb_pred = gb.predict(X_test_original)

# Label-based metrics use the hard predictions; ROC-AUC uses the probabilities.
gb_accuracy, gb_precision, gb_recall, gb_f1 = [
    score_fn(y_test, gb_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
]
gb_roc_auc = roc_auc_score(y_test, gb_pred_proba)

print("Gradient Boosting Results:")
print(
    f"Accuracy: {gb_accuracy:.4f}",
    f"Precision: {gb_precision:.4f}",
    f"Recall: {gb_recall:.4f}",
    f"F1-Score: {gb_f1:.4f}",
    f"ROC-AUC: {gb_roc_auc:.4f}",
    sep="\n",
)
print(f"\nConfusion Matrix:\n{confusion_matrix(y_test, gb_pred)}")
print(f"\nClassification Report:\n{classification_report(y_test, gb_pred)}")


### 2.5 K-Nearest Neighbors (KNN)


In [None]:
# K-Nearest Neighbors
# Distance-based model, so it is fitted on the standardised features.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

# Column 1 of predict_proba is the probability of the class at index 1
# (presumably Outcome == 1).
knn_pred_proba = knn.predict_proba(X_test_scaled)[:, 1]
knn_pred = knn.predict(X_test_scaled)

# Label-based metrics use the hard predictions; ROC-AUC uses the probabilities.
knn_accuracy, knn_precision, knn_recall, knn_f1 = [
    score_fn(y_test, knn_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
]
knn_roc_auc = roc_auc_score(y_test, knn_pred_proba)

print("KNN Results:")
print("\n".join([
    f"Accuracy: {knn_accuracy:.4f}",
    f"Precision: {knn_precision:.4f}",
    f"Recall: {knn_recall:.4f}",
    f"F1-Score: {knn_f1:.4f}",
    f"ROC-AUC: {knn_roc_auc:.4f}",
]))
print(f"\nConfusion Matrix:\n{confusion_matrix(y_test, knn_pred)}")
print(f"\nClassification Report:\n{classification_report(y_test, knn_pred)}")


### 2.6 Neural Network


In [None]:
# Neural Network
# Multi-layer perceptron with two hidden layers (100 and 50 units), fitted on
# the standardised features.
nn = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)
nn.fit(X_train_scaled, y_train)

# Column 1 of predict_proba is the probability of the class at index 1
# (presumably Outcome == 1).
nn_pred_proba = nn.predict_proba(X_test_scaled)[:, 1]
nn_pred = nn.predict(X_test_scaled)

# Label-based metrics use the hard predictions; ROC-AUC uses the probabilities.
nn_accuracy, nn_precision, nn_recall, nn_f1 = [
    score_fn(y_test, nn_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
]
nn_roc_auc = roc_auc_score(y_test, nn_pred_proba)

print("Neural Network Results:")
print(
    f"Accuracy: {nn_accuracy:.4f}",
    f"Precision: {nn_precision:.4f}",
    f"Recall: {nn_recall:.4f}",
    f"F1-Score: {nn_f1:.4f}",
    f"ROC-AUC: {nn_roc_auc:.4f}",
    sep="\n",
)
print(f"\nConfusion Matrix:\n{confusion_matrix(y_test, nn_pred)}")
print(f"\nClassification Report:\n{classification_report(y_test, nn_pred)}")


## 3. Model Comparison


In [None]:
# Compare all models
# One row per model, holding the test-set metrics computed in the cells above.
results = pd.DataFrame(
    [
        ('Logistic Regression', lr_accuracy, lr_precision, lr_recall, lr_f1, lr_roc_auc),
        ('Random Forest', rf_accuracy, rf_precision, rf_recall, rf_f1, rf_roc_auc),
        ('SVM', svm_accuracy, svm_precision, svm_recall, svm_f1, svm_roc_auc),
        ('Gradient Boosting', gb_accuracy, gb_precision, gb_recall, gb_f1, gb_roc_auc),
        ('KNN', knn_accuracy, knn_precision, knn_recall, knn_f1, knn_roc_auc),
        ('Neural Network', nn_accuracy, nn_precision, nn_recall, nn_f1, nn_roc_auc),
    ],
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC'],
)

banner = "=" * 80
print(banner)
print("MODEL COMPARISON")
print(banner)
print(results.to_string(index=False))

# Visualize model comparison: a 2x3 grid, flattened so panels can be indexed 0..5
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 10))
axes = axes.ravel()

# The first five panels each show one metric as a vertical bar chart.
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']
for panel, metric in zip(axes, metrics):
    panel.bar(results['Model'], results[metric], color='steelblue')
    panel.set_title(f'{metric} Comparison')
    panel.set_ylabel(metric)
    panel.tick_params(axis='x', rotation=45)
    panel.set_ylim([0, 1])

# Overall comparison: the sixth panel repeats ROC-AUC as a horizontal bar chart.
overall_panel = axes[5]
overall_panel.barh(results['Model'], results['ROC-AUC'], color='coral')
overall_panel.set_title('ROC-AUC Score Comparison')
overall_panel.set_xlabel('ROC-AUC Score')

fig.tight_layout()
plt.show()


## 4. ROC Curves


In [None]:
# Plot ROC curves for all models
# Each entry maps a display name to (test-set probabilities, test-set ROC-AUC).
models = {
    'Logistic Regression': (lr_pred_proba, lr_roc_auc),
    'Random Forest': (rf_pred_proba, rf_roc_auc),
    'SVM': (svm_pred_proba, svm_roc_auc),
    'Gradient Boosting': (gb_pred_proba, gb_roc_auc),
    'KNN': (knn_pred_proba, knn_roc_auc),
    'Neural Network': (nn_pred_proba, nn_roc_auc)
}

roc_fig, roc_ax = plt.subplots(figsize=(12, 8))

# One curve per model; the thresholds returned by roc_curve are not needed.
for model_name, (scores, area) in models.items():
    fpr, tpr, _ = roc_curve(y_test, scores)
    roc_ax.plot(fpr, tpr, label=f'{model_name} (AUC = {area:.4f})', linewidth=2)

# Diagonal reference line for a classifier with no discriminative power.
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curves Comparison')
roc_ax.legend(loc="lower right")
roc_ax.grid(alpha=0.3)
roc_fig.tight_layout()
plt.show()


## 5. Cross-Validation


In [None]:
# Cross-validation for all models
# 5-fold stratified CV on the training partition, scored by ROC-AUC.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cv_results = {}
models_cv = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10),
    'SVM': SVC(kernel='rbf', probability=True, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42, max_depth=5),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)
}

# Models that need standardised inputs. Their scaler is placed inside a
# pipeline so it is re-fitted on the training portion of every fold. Feeding
# the pre-scaled X_train_scaled instead would let each validation fold
# influence the scaling statistics and bias the CV score optimistically.
needs_scaling = {'Logistic Regression', 'SVM', 'KNN', 'Neural Network'}

for model_name, model in models_cv.items():
    if model_name in needs_scaling:
        estimator = make_pipeline(StandardScaler(), model)
    else:
        estimator = model

    # Every model starts from the same unscaled training array.
    cv_scores = cross_val_score(estimator, X_train_original, y_train, cv=cv, scoring='roc_auc')
    cv_results[model_name] = cv_scores
    # The "+/-" figure is two standard deviations of the per-fold scores.
    print(f"{model_name}: Mean CV ROC-AUC = {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

# Visualize CV results: one box per model, built from its five fold scores
cv_df = pd.DataFrame(cv_results)
plt.figure(figsize=(12, 6))
cv_df.boxplot()
plt.title('Cross-Validation ROC-AUC Scores')
plt.ylabel('ROC-AUC Score')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


## 6. Summary and Conclusions


In [None]:
# Find best model
# NOTE: the winner is picked by test-set ROC-AUC, so its reported score is an
# optimistic estimate; the cross-validated score is printed alongside it.
best_model_idx = results['ROC-AUC'].idxmax()
best_model = results.loc[best_model_idx, 'Model']
best_roc_auc = results.loc[best_model_idx, 'ROC-AUC']

# Derive the findings from the computed results instead of hard-coding them,
# so the summary cannot contradict the numbers shown above.
ranked_models = results.sort_values('ROC-AUC', ascending=False)['Model'].tolist()
top_features = feature_importance['Feature'].head(3).tolist()  # already sorted by importance
best_cv_scores = cv_results[best_model]  # cv_results uses the same model names as `results`

print("=" * 80)
print("MACHINE LEARNING ANALYSIS SUMMARY")
print("=" * 80)
print(f"\nBest Model: {best_model}")
print(f"Best ROC-AUC Score: {best_roc_auc:.4f}")
print(f"\nBest Model Metrics:")
print(results.loc[best_model_idx])

print("\n\nKey Findings:")
print(f"1. {len(results)} algorithms were tested for diabetes prediction")
print(f"2. Highest test ROC-AUC: {', '.join(ranked_models[:2])}")
print(f"3. Random Forest feature importance ranks {', '.join(top_features)} highest")
print("4. Class imbalance should be considered for future improvements")
print(f"5. Cross-validated ROC-AUC for {best_model}: "
      f"{best_cv_scores.mean():.4f} (std {best_cv_scores.std():.4f})")
print("=" * 80)
