In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
# Silence every warning category for the rest of the session.
# NOTE: this also hides any warnings emitted by the model fits further down.
warnings.simplefilter('ignore')

In [None]:
# read the diabetes CSV into a DataFrame, report its shape, and preview the first rows
df = pd.read_csv(filepath_or_buffer='../data/input/diabetes.csv')
print(f"Dataset shape: {df.shape}")
df.head(n=5)

In [None]:
# the 'Outcome' column is the target; every remaining column is a feature
y = df['Outcome']
X = df.drop(columns='Outcome')

# quick sanity check: number of feature columns and how many rows fall in each class
print(f"Features: {X.shape[1]}")
print(f"Class distribution:\n{y.value_counts()}")

## 1. Simple Train/Test Split

In [None]:
# hold out 20% of the rows as a test set (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# fit logistic regression on the training rows, then predict the held-out rows
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = lr.predict(X_test)

print(f"Train/Test Split Accuracy: {accuracy_score(y_test, y_pred):.4f}")

## 2. K-Fold Cross-Validation

More robust than a single split: the model is trained and evaluated once per fold, so every sample is used for testing exactly once.

In [None]:
# 5-fold cross validation with a plain (non-stratified) K-Fold splitter.
# Passing an integer (cv=5) together with a classifier makes scikit-learn
# fall back to StratifiedKFold, so KFold is requested explicitly here to
# match this section's heading. Shuffling with a fixed seed keeps the folds
# reproducible and independent of the row order in the CSV.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(lr, X, y, cv=kf, scoring='accuracy')

# one accuracy per fold, then the mean with a +/- band of two standard deviations
print("5-Fold CV Scores:", cv_scores.round(4))
print(f"Mean: {cv_scores.mean():.4f} (+/- {cv_scores.std()*2:.4f})")

In [None]:
# Bar chart of per-fold accuracy with the mean drawn as a dashed reference line.
# The fold numbers and the title are derived from len(cv_scores), so the chart
# stays correct if the number of folds used to compute cv_scores changes.
n_folds = len(cv_scores)
fold_ids = list(range(1, n_folds + 1))
mean_score = cv_scores.mean()

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(fold_ids, cv_scores, color='steelblue', alpha=0.7)
ax.axhline(y=mean_score, color='red', linestyle='--', label=f'Mean: {mean_score:.3f}')
ax.set_xticks(fold_ids)  # one integer tick per fold
ax.set_xlabel('Fold')
ax.set_ylabel('Accuracy')
ax.set_title(f'{n_folds}-Fold Cross-Validation Scores')
ax.legend()
# fixed y-range; bars outside 0.6-0.9 would be clipped
ax.set_ylim(0.6, 0.9)
fig.tight_layout()
plt.show()

## 3. Stratified K-Fold

Preserves the class distribution in each fold, which is important for imbalanced datasets.

In [None]:
# stratified k-fold splitter: 5 shuffled folds with a fixed seed
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# one accuracy value per fold, using the splitter above
stratified_scores = cross_val_score(estimator=lr, X=X, y=y, scoring='accuracy', cv=skf)

# mean is reported with a +/- band of two standard deviations
print("Stratified 5-Fold Scores:", stratified_scores.round(4))
print(f"Mean: {stratified_scores.mean():.4f} (+/- {stratified_scores.std()*2:.4f})")

## 4. Model Comparison with Cross-Validation

In [None]:
# candidate classifiers, keyed by the label used in the printout
models = {}
models['Logistic Regression'] = LogisticRegression(max_iter=1000)
models['Random Forest'] = RandomForestClassifier(n_estimators=100, random_state=42)
models['KNN (k=5)'] = KNeighborsClassifier(n_neighbors=5)

# score every model with the same cv=5 accuracy evaluation and keep
# the per-fold scores together with their mean and standard deviation
results = {}
for name, model in models.items():
    scores = cross_val_score(estimator=model, X=X, y=y, scoring='accuracy', cv=5)
    results[name] = dict(mean=scores.mean(), std=scores.std(), scores=scores)
    print(f"{name}: {scores.mean():.4f} (+/- {scores.std()*2:.4f})")

In [None]:
# bar chart of mean CV accuracy per model, with one standard deviation as error bars
fig, ax = plt.subplots(figsize=(10, 5))

# pull labels and summary statistics out of `results`, preserving insertion order
names = list(results)
means = [stats['mean'] for stats in results.values()]
stds = [stats['std'] for stats in results.values()]

x = np.arange(len(names))
bars = ax.bar(
    x,
    means,
    yerr=stds,
    capsize=5,
    color=['steelblue', 'coral', 'seagreen'],
    alpha=0.7,
)

ax.set_xticks(x)
ax.set_xticklabels(names)
ax.set(ylabel='Accuracy', title='Model Comparison (5-Fold CV)', ylim=(0.6, 0.9))

# write each mean just above the top of its bar, centred horizontally
for bar, mean in zip(bars, means):
    label_x = bar.get_x() + bar.get_width()/2
    label_y = bar.get_height() + 0.02
    ax.text(label_x, label_y, f'{mean:.3f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

## Key Takeaways

1. **Single train/test split** can be misleading - results depend on the random split
2. **K-fold cross-validation** gives more reliable performance estimates
3. **Stratified K-fold** is essential for imbalanced datasets
4. **Always compare models** using the same validation strategy
5. Report both **mean and standard deviation** of CV scores

https://www.linkedin.com/feed/update/urn:li:activity:7211639385396551680/

In [None]:
ùêûùêØùê¢ùêùùêûùêßùê≠ùê•ùê≤