In [1]:
import time

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [2]:
# Download MNIST from OpenML as plain arrays (as_frame=False) and cast the
# labels to small unsigned integers.
print("Loading MNIST dataset...")
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X = mnist["data"]
y = mnist["target"].astype(np.uint8)

# First 50,000 rows are used for training; the remaining 20,000 for testing.
X_train, y_train = X[:50000], y[:50000]
X_test, y_test = X[50000:], y[50000:]

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")


Loading MNIST dataset...
Training set: (50000, 784)
Test set: (20000, 784)


In [3]:
# KNN compares raw distances between samples, so map pixel intensities
# from the 0-255 range onto 0-1 before fitting anything.
X_train_scaled, X_test_scaled = X_train / 255.0, X_test / 255.0

print("Data scaled to 0-1 range")
print(
    f"Pixel value range: [{X_train_scaled.min():.1f}, {X_train_scaled.max():.1f}]"
)

Data scaled to 0-1 range
Pixel value range: [0.0, 1.0]


In [4]:
print("Training Baseline KNN Classifier...")

# Reference model: KNeighborsClassifier with its default settings.
knn_baseline = KNeighborsClassifier()

# Only the fit() call is timed.
start_time = time.time()
knn_baseline.fit(X_train_scaled, y_train)
baseline_time = time.time() - start_time

# Score the baseline on the held-out test split.
baseline_score = knn_baseline.score(X_test_scaled, y_test)

print(f"Baseline KNN Accuracy: {baseline_score:.4f}")
print(f"Training time: {baseline_time:.2f} seconds")

Training Baseline KNN Classifier...
Baseline KNN Accuracy: 0.9691
Training time: 0.45 seconds


In [5]:
# Candidate hyperparameters: 3 neighbour counts x 2 weightings x 2 metrics
# (12 combinations in total).
param_grid = [
    dict(
        n_neighbors=[3, 4, 5],
        weights=['uniform', 'distance'],
        metric=['euclidean', 'manhattan'],
    )
]

# Unfitted estimator that the search clones for every candidate.
knn = KNeighborsClassifier()

# Exhaustive search scored by accuracy with 3-fold cross-validation;
# n_jobs=-1 uses all available CPU cores.
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)

In [None]:
# Run the full search on the scaled training data and record how long the
# whole fit() call takes in wall-clock seconds.
print("Starting grid search (this may take a while)...")

start_time = time.time()
grid_search.fit(X_train_scaled, y_train)
grid_time = time.time() - start_time
print(f"Grid search completed in {grid_time:.2f} seconds")

Starting grid search (this may take a while)...
Fitting 3 folds for each of 12 candidates, totalling 36 fits


In [None]:
# Headline result: the winning combination and its mean CV accuracy.
print("Grid Search Results:")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

# Full table: one line per evaluated parameter combination.
print("\nAll parameter combinations:")
cv_results = grid_search.cv_results_
for params, mean_score in zip(cv_results["params"], cv_results["mean_test_score"]):
    print(f"  {mean_score:.4f} for {params}")

In [None]:
# The search keeps the best-scoring estimator; take it as the final model
# and show its complete parameter set.
best_knn = grid_search.best_estimator_
print(
    f"Final model parameters: {best_knn.get_params()}"
)

In [None]:
# Predict the held-out test split with the tuned model and score it.
y_pred = best_knn.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

# Report whether the 97% accuracy target was met.
print(
    "🎉 SUCCESS: Achieved target accuracy of 97% or higher!"
    if test_accuracy >= 0.97
    else "⚠️  Target not reached. Consider expanding hyperparameter search."
)

# Per-class precision / recall / F1 breakdown.
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Compare the tuned KNN against three other classifiers on the same split.
#
# `results` maps classifier name -> {'accuracy': test accuracy,
#                                    'time': training time in seconds}
# and is consumed by the plotting and summary cells further down.
classifiers = {
    'KNN (Tuned)': best_knn,
    'SGD': SGDClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM Linear': SVC(kernel='linear', random_state=42)
}

print("Training and comparing classifiers...")
results = {}

for name, clf in classifiers.items():
    if name == 'KNN (Tuned)':
        # Already fitted and evaluated: reuse the test accuracy computed
        # earlier and the time GridSearchCV spent refitting the winning
        # model on the full training set, instead of recording ~0 seconds.
        accuracy = test_accuracy
        train_time = grid_search.refit_time_
    else:
        # Every model is trained on the scaled features so the comparison
        # uses identical inputs; SGD and SVM in particular are sensitive to
        # feature scale.
        start_time = time.time()
        clf.fit(X_train_scaled, y_train)
        # Stop the clock before scoring so 'time' is training time only.
        train_time = time.time() - start_time
        accuracy = clf.score(X_test_scaled, y_test)

    results[name] = {'accuracy': accuracy, 'time': train_time}

    print(f"{name:15} | Accuracy: {accuracy:.4f} | Time: {train_time:.2f}s")

In [None]:
# Two side-by-side bar charts: test accuracy (left) and recorded time (right).
names = list(results.keys())
accuracies = [results[name]['accuracy'] for name in names]
times = [results[name]['time'] for name in names]
# Bars that meet the 97% target are drawn green, the rest blue.
colors = ['green' if acc >= 0.97 else 'blue' for acc in accuracies]

fig, (ax_acc, ax_time) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: accuracy per classifier with the target drawn as a dashed line.
bars = ax_acc.bar(names, accuracies, color=colors, alpha=0.7)
ax_acc.axhline(y=0.97, color='red', linestyle='--', label='Target (97%)')
ax_acc.set_ylabel('Accuracy')
ax_acc.set_title('Classifier Accuracy Comparison')
plt.setp(ax_acc.get_xticklabels(), rotation=45)
ax_acc.legend()

# Write each accuracy just above its bar.
for bar, accuracy in zip(bars, accuracies):
    label_x = bar.get_x() + bar.get_width()/2.
    label_y = bar.get_height() + 0.01
    ax_acc.text(label_x, label_y, f'{accuracy:.3f}', ha='center', va='bottom')

# Right panel: time per classifier.
ax_time.bar(names, times, color='orange', alpha=0.7)
ax_time.set_ylabel('Training Time (seconds)')
ax_time.set_title('Training Time Comparison')
plt.setp(ax_time.get_xticklabels(), rotation=45)

fig.tight_layout()
plt.show()

In [None]:
# Confusion matrix of the tuned KNN's test predictions (rows: true label,
# columns: predicted label).
cm = confusion_matrix(y_test, y_pred)

# Render the matrix as a heat map.
plt.figure(figsize=(10, 8))
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.colorbar()
plt.title('Confusion Matrix - KNN Classifier')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')

# One tick per digit class, 0 through 9.
tick_marks = np.arange(10)
plt.xticks(tick_marks, range(10))
plt.yticks(tick_marks, range(10))

# Print the count in every cell; switch to white text on dark cells
# (count above half of the largest entry) so it stays legible.
thresh = cm.max() / 2.
for i, j in np.ndindex(10, 10):
    text_color = "white" if cm[i, j] > thresh else "black"
    plt.text(j, i, f"{cm[i, j]:d}",
             horizontalalignment="center",
             color=text_color)

plt.tight_layout()
plt.show()

In [None]:
# Positions in the test split where the prediction disagrees with the label.
misclassified_indices = np.where(y_pred != y_test)[0]

if len(misclassified_indices) == 0:
    print("No misclassifications found!")
else:
    print(f"Number of misclassified examples: {len(misclassified_indices)}")

    # List (and draw) at most the first five mistakes.
    print("\nFirst 5 misclassified examples:")
    for idx in misclassified_indices[:5]:
        print(f"True: {y_test[idx]}, Predicted: {y_pred[idx]}")

        # Show the offending digit as a 28x28 image.
        plt.figure(figsize=(2, 2))
        plt.imshow(X_test[idx].reshape(28, 28), cmap='binary')
        plt.title(f'True: {y_test[idx]}, Pred: {y_pred[idx]}')
        plt.axis('off')
        plt.show()

In [None]:
# Final recap: target vs. achieved accuracy, the winning hyperparameters and
# how the tuned KNN ranks against the other classifiers in `results`.
print("="*60)
print("SUMMARY AND KEY FINDINGS")
print("="*60)

print("🎯 Target Accuracy: 97%")
print(f"✅ KNN Achieved Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")

if test_accuracy >= 0.97:
    print("🎉 GOAL ACHIEVED: KNN classifier exceeded 97% accuracy!")
else:
    print("⚠️  GOAL NOT REACHED: Consider trying:")
    print("   - Larger hyperparameter grid (n_neighbors: [1, 3, 5, 7, 9])")
    print("   - Data augmentation")
    print("   - Feature engineering")

print(f"\nBest Hyperparameters: {grid_search.best_params_}")

# Best non-KNN model, printed as "name (accuracy)" rather than a raw tuple.
# The guard avoids max() raising ValueError when no competitor was evaluated.
competitor_accuracy = {
    name: res['accuracy']
    for name, res in results.items()
    if name != 'KNN (Tuned)'
}
if competitor_accuracy:
    best_competitor = max(competitor_accuracy, key=competitor_accuracy.get)
    print(
        "Most accurate competing classifier: "
        f"{best_competitor} ({competitor_accuracy[best_competitor]:.4f})"
    )
else:
    print("Most accurate competing classifier: n/a (no other classifiers evaluated)")

# Performance comparison, best accuracy first.
print("\nPerformance Ranking:")
sorted_results = sorted(results.items(), key=lambda x: x[1]['accuracy'], reverse=True)
for i, (name, res) in enumerate(sorted_results, 1):
    print(f"{i}. {name:15}: {res['accuracy']:.4f}")