In [None]:
import os
import warnings

import cv2
import joblib
import numpy as np
from google.colab import drive
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.preprocessing.image import ImageDataGenerator
warnings.filterwarnings('ignore')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Mount Google Drive so that data_dir below resolves on a fresh Colab runtime.
# `drive` is imported in the first cell, and that cell's recorded output is the
# message drive.mount() prints, but the call itself was missing from the code.
drive.mount('/content/drive')

# Seed NumPy's global RNG; augment_images draws from it via np.random.choice.
np.random.seed(42)
data_dir = '/content/drive/My Drive/data/train'  # Update this path

def load_images(data_dir, image_size=(64, 64)):
    """Load a folder-per-class image dataset as flattened grayscale vectors.

    Every sub-directory of ``data_dir`` is treated as one class and its name is
    used as the label. Each file inside is read with OpenCV in grayscale,
    resized to ``image_size`` and flattened to 1-D. Files OpenCV cannot decode
    (``cv2.imread`` returns ``None``) and non-directory entries directly under
    ``data_dir`` are skipped silently.

    Directory listings are sorted so that the row order of the result is
    deterministic; ``os.listdir`` alone returns entries in arbitrary order,
    which would make a seeded train/test split differ between runs.

    Args:
        data_dir: Root folder containing one sub-folder per class.
        image_size: Target size passed straight to ``cv2.resize``, i.e.
            ``(width, height)``.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: ``X`` holds one flattened image per
        row and ``y`` the matching class-folder names. Both are empty when no
        readable image is found.
    """
    X, y = [], []
    for class_name in sorted(os.listdir(data_dir)):
        class_path = os.path.join(data_dir, class_name)
        if not os.path.isdir(class_path):
            continue  # stray file next to the class folders
        for img_name in sorted(os.listdir(class_path)):
            img_path = os.path.join(class_path, img_name)
            img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)  # Grayscale for KNN
            if img is None:
                continue  # not an image / unreadable
            X.append(cv2.resize(img, image_size).flatten())  # Flatten for KNN
            y.append(class_name)
    return np.array(X), np.array(y)

X, y = load_images(data_dir)

# load_images skips unreadable files without complaint, so a wrong folder layout
# yields empty arrays; fail here with a clear message instead of further down.
if len(X) == 0:
    raise ValueError(f"No readable images found under {data_dir!r}; expected one sub-folder per class.")

In [None]:
def augment_images(X, y, image_size=(64, 64), augmentation_factor=0.2):
    """Append randomly transformed copies of a sample of the images.

    ``int(len(X) * augmentation_factor)`` rows are drawn with replacement using
    NumPy's global RNG, reshaped back into single-channel images, passed
    through a Keras ``ImageDataGenerator`` (rotation_range=10, horizontal flip,
    zoom_range=0.1, width/height shift 0.1) and stacked underneath the
    original rows.

    Args:
        X: 2-D array of flattened grayscale images, one image per row.
        y: 1-D array of labels aligned with ``X``.
        image_size: ``(width, height)`` of every image, the same convention
            ``load_images`` hands to ``cv2.resize``. Rows are therefore
            restored to ``height`` rows by ``width`` columns.
        augmentation_factor: Number of augmented rows to add, as a fraction of
            ``len(X)``. Must be non-negative.

    Returns:
        Tuple ``(X_combined, y_combined)``: the original rows followed by the
        augmented rows. When the requested number of augmented rows rounds
        down to zero, copies of the inputs are returned unchanged.

    Raises:
        ValueError: If ``augmentation_factor`` is negative, ``X`` and ``y``
            differ in length, ``X`` does not have ``width * height`` columns,
            or the combined arrays end up with different lengths.
    """
    batch_size = 32

    X = np.asarray(X)
    y = np.asarray(y)
    if augmentation_factor < 0:
        raise ValueError(f"augmentation_factor must be non-negative, got {augmentation_factor}")
    if len(X) != len(y):
        raise ValueError(f"Inconsistent sample counts: X={len(X)}, y={len(y)}")

    n_samples = int(len(X) * augmentation_factor)
    if n_samples == 0:
        # Nothing to generate; np.vstack on an empty batch list would raise.
        return X.copy(), y.copy()

    width, height = image_size
    if X.ndim != 2 or X.shape[1] != width * height:
        raise ValueError(
            f"X must have shape (n_samples, {width * height}) for image_size={image_size}, got {X.shape}"
        )

    indices = np.random.choice(len(X), n_samples, replace=True)
    # cv2.resize(img, (width, height)) produces an array of shape (height, width),
    # so that is the layout each flattened row has to be restored to.
    X_subset = X[indices].reshape(-1, height, width, 1)
    y_subset = y[indices]

    datagen = ImageDataGenerator(
        rotation_range=10,
        horizontal_flip=True,
        zoom_range=0.1,
        width_shift_range=0.1,
        height_shift_range=0.1
    )

    X_aug = []
    y_aug = []
    # shuffle=False keeps every augmented image aligned with its label.
    it = datagen.flow(X_subset, y_subset, batch_size=batch_size, shuffle=False)
    n_batches = (n_samples + batch_size - 1) // batch_size  # ceil: cover every sample once
    for _ in range(n_batches):
        X_batch, y_batch = next(it)
        X_aug.append(X_batch.reshape(-1, width * height))
        y_aug.append(y_batch)

    X_aug = np.vstack(X_aug)[:n_samples]  # Trim to exact number
    y_aug = np.hstack(y_aug)[:n_samples]  # Trim to exact number

    X_combined = np.vstack([X, X_aug])
    y_combined = np.hstack([y, y_aug])

    if len(X_combined) != len(y_combined):
        raise ValueError(f"Inconsistent sample counts: X={len(X_combined)}, y={len(y_combined)}")

    return X_combined, y_combined

In [None]:
# 4. Encode labels, split, then augment the training partition only
# Fit the encoder on the folder-name labels returned by load_images. `le` is
# needed for the classification report and is saved with the model later on.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split the ORIGINAL images first. Augmenting before the split would let
# transformed copies of a test image end up in the training set (data leakage)
# and inflate the test metrics. stratify keeps the class ratio in both parts.
X_train_orig, X_test, y_train_orig, y_test = train_test_split(
    X, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded
)

if len(X_test) < len(le.classes_):
    raise ValueError("Test set too small for reliable evaluation. Reduce test_size or increase dataset.")

# augment_images returns the rows it was given followed by the augmented rows.
X_train, y_train = augment_images(X_train_orig, y_train_orig)

# Verify sample counts after split
if len(X_train) != len(y_train) or len(X_test) != len(y_test):
    raise ValueError(f"Inconsistent sample counts after split: X_train={len(X_train)}, y_train={len(y_train)}, X_test={len(X_test)}, y_test={len(y_test)}")

# 5. Apply SMOTE to training data
# Only the training partition is resampled; the test set keeps its real class balance.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Verify SMOTE output
if len(X_train_smote) != len(y_train_smote):
    raise ValueError(f"Inconsistent sample counts after SMOTE: X_train_smote={len(X_train_smote)}, y_train_smote={len(y_train_smote)}")
if X_train_smote.shape[1] != X_train.shape[1]:
    raise ValueError(f"Feature count mismatch after SMOTE: expected {X_train.shape[1]}, got {X_train_smote.shape[1]}")

# 6. Scale features (critical for KNN)
# The scaler is fitted on training data only and then applied to the test set.
# X_train_smote and X_test hold the SCALED values from here on; X_train and
# y_train stay unscaled and un-resampled.
scaler = StandardScaler()
X_train_smote = scaler.fit_transform(X_train_smote)
X_test = scaler.transform(X_test)

# Verify scaler output
if X_train_smote.shape[1] != X_test.shape[1]:
    raise ValueError(f"Feature count mismatch after scaling: X_train_smote={X_train_smote.shape[1]}, X_test={X_test.shape[1]}")

# 7. Train and evaluate KNN with hyperparameter tuning
# NOTE: the winning configuration is picked by its score on X_test, so the test
# F1 reported for the "best" model is an optimistic estimate.
knn_configs = [
    {'n_neighbors': 3, 'weights': 'uniform', 'metric': 'euclidean'},
    {'n_neighbors': 5, 'weights': 'distance', 'metric': 'euclidean'},
    {'n_neighbors': 7, 'weights': 'uniform', 'metric': 'manhattan'}
]

results = []
best_knn = None
# Start below any attainable F1 so that a model is always selected. Starting at
# 0 with a strict '>' would leave best_knn as None if every config scored 0.
best_f1 = float('-inf')

print("KNN Image Classification Performance:")
for config in knn_configs:
    knn = KNeighborsClassifier(**config)
    knn.fit(X_train_smote, y_train_smote)
    y_pred = knn.predict(X_test)

    # Calculate metrics with zero_division handling
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    results.append({
        'config': config,
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1_score': f1
    })

    if f1 > best_f1:
        best_f1 = f1
        best_knn = knn

    print(f"\nConfig: {config}")
    print(f"Accuracy: {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("Classification Report:")
    # Pass the classes the model was trained on explicitly, so the report still
    # lines up with target_names when a class is absent from y_test and y_pred.
    print(classification_report(
        y_test,
        y_pred,
        labels=knn.classes_,
        target_names=le.classes_,
        zero_division=0
    ))

# 8. Cross-validation for best model
if best_knn is None:
    raise RuntimeError("No KNN model was selected; check knn_configs and the evaluation loop.")

# Cross-validate on the pre-SMOTE, unscaled training data and redo SMOTE and
# scaling inside every fold. Scoring X_train_smote directly would put synthetic
# neighbours of validation samples, and scaling statistics computed on them,
# into each training fold and inflate the scores. The imblearn Pipeline applies
# the sampler when fitting only.
# Caveat: augmented copies of one image can still fall into different folds.
cv_pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(**best_knn.get_params())),
])
cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='f1_weighted')
print(f"\nBest KNN Model (Config: {best_knn.get_params()}):")
print(f"Cross-Validation F1 Scores: {cv_scores}")
print(f"Average CV F1 Score: {cv_scores.mean():.4f} (±{cv_scores.std():.4f})")

# 9. Summary of Results
print("\nSummary of KNN Models:")
for result in results:
    print(f"Config: {result['config']} - F1: {result['f1_score']:.4f}")
print(f"\nBest KNN Model: Config {best_knn.get_params()} - F1: {best_f1:.4f}")

KNN Image Classification Performance:

Config: {'n_neighbors': 3, 'weights': 'uniform', 'metric': 'euclidean'}
Accuracy: 0.6600
Precision: 0.6646
Recall: 0.6600
F1 Score: 0.6434
Classification Report:
              precision    recall  f1-score   support

      benign       0.65      0.84      0.73       531
   malignant       0.68      0.43      0.53       419

    accuracy                           0.66       950
   macro avg       0.67      0.64      0.63       950
weighted avg       0.66      0.66      0.64       950


Config: {'n_neighbors': 5, 'weights': 'distance', 'metric': 'euclidean'}
Accuracy: 0.6705
Precision: 0.6778
Recall: 0.6705
F1 Score: 0.6538
Classification Report:
              precision    recall  f1-score   support

      benign       0.66      0.85      0.74       531
   malignant       0.70      0.44      0.54       419

    accuracy                           0.67       950
   macro avg       0.68      0.65      0.64       950
weighted avg       0.68      0.67   

In [None]:
# Persist the three objects needed to classify new images later: the fitted
# KNN model, the scaler it was trained with, and the label encoder.
# joblib is imported in the notebook's first cell together with the other imports.
# NOTE: joblib files are pickles; only load them back from a trusted location.
model_path = '/content/drive/My Drive/best_knn_model.pkl'
scaler_path = '/content/drive/My Drive/scaler.pkl'
le_path = '/content/drive/My Drive/label_encoder.pkl'

artifacts = (
    ("Best KNN model", best_knn, model_path),
    ("Scaler", scaler, scaler_path),
    ("Label encoder", le, le_path),
)
for description, artifact, path in artifacts:
    joblib.dump(artifact, path)
    print(f"{description} saved to {path}")

Best KNN model saved to /content/drive/My Drive/best_knn_model.pkl
Scaler saved to /content/drive/My Drive/scaler.pkl
Label encoder saved to /content/drive/My Drive/label_encoder.pkl
