# In [1]:


import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.decomposition import PCA
import tensorflow as tf
import tensorflow_datasets as tfds
import cv2
from tqdm.notebook import tqdm



def load_and_preprocess_data(target_size=(100, 100)):
    """
    Build a flat, normalized feature matrix from the 'cats_vs_dogs' dataset.

    The first 10% of the 'train' split is fetched through TensorFlow
    Datasets. Every image is resized to ``target_size``, converted to
    grayscale when it has three channels, flattened to one row, and finally
    scaled by 1/255.

    Args:
        target_size (tuple): The (width, height) handed to ``cv2.resize``.

    Returns:
        tuple: ``(X, y, class_names)`` where X holds one flattened image per
               row, y holds the labels and class_names comes from the dataset
               metadata. If the dataset cannot be loaded, the error is
               printed and ``(None, None, None)`` is returned instead.
    """
    print("Step 1: Loading and preprocessing the Cats vs Dogs dataset...")

    # Only a slice of the training split is used to keep the demo quick.
    try:
        dataset, info = tfds.load(
            'cats_vs_dogs',
            split='train[:10%]',
            with_info=True,
            as_supervised=True,
        )
    except Exception as e:
        print(f"Error loading dataset: {e}. Please ensure you are running this in Colab with an active internet connection.")
        return None, None, None

    # Human-readable class names as recorded in the dataset info.
    class_names = info.features['label'].names

    feature_rows = []
    targets = []

    for image, label in tqdm(dataset, desc="Preprocessing images"):
        # Tensor -> NumPy, then bring every picture to the same size.
        pixels = cv2.resize(image.numpy(), target_size, interpolation=cv2.INTER_AREA)

        # Three-channel images are collapsed to a single gray channel to cut
        # the feature count; anything else is used as it is.
        if pixels.ndim == 3 and pixels.shape[2] == 3:
            pixels = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)

        # One image becomes one 1D row of the feature matrix.
        feature_rows.append(pixels.flatten())
        targets.append(label.numpy())

    # Stack the rows and scale pixel values into [0, 1].
    X = np.array(feature_rows) / 255.0
    y = np.array(targets)

    print(f"Preprocessing complete. Loaded {len(X)} samples.")
    print(f"Original feature vector size per image: {X.shape[1]}")
    return X, y, class_names



def apply_pca(X, n_components=150, random_state=42):
    """
    Applies Principal Component Analysis (PCA) to reduce the dimensionality
    of the feature matrix.

    The PCA is fitted on ``X`` and ``X`` is projected in the same call
    (``fit_transform``). Components are whitened, and the randomized SVD
    solver is used.

    Args:
        X (np.array): The high-dimensional feature matrix (one sample per row).
        n_components (int): The number of components to keep after PCA.
        random_state (int or None): Seed for the randomized SVD solver.
            Defaults to 42 so that repeated runs give the same projection,
            matching the seeded split and classifier in ``train_svm``.
            Pass ``None`` to get the unseeded behaviour.

    Returns:
        tuple: A tuple with the reduced feature matrix (X_pca) and the
               fitted PCA model.
    """
    print(f"\nStep 2: Applying PCA with {n_components} components...")
    # The randomized solver is stochastic; without a seed the components
    # (and therefore every downstream metric) change from run to run.
    pca = PCA(
        n_components=n_components,
        svd_solver='randomized',
        whiten=True,
        random_state=random_state,
    )
    X_pca = pca.fit_transform(X)
    print(f"Dimensionality reduced from {X.shape[1]} to {X_pca.shape[1]}.")
    # Sum of the per-component ratios = share of total variance retained.
    print(f"Explained variance ratio: {np.sum(pca.explained_variance_ratio_):.2f}")
    return X_pca, pca


# ==============================================================================
# Step 3: Training the SVM Classifier
# We split the data and train the Support Vector Classifier (SVC).
# ==============================================================================

def train_svm(X_pca, y, *, test_size=0.2, kernel='rbf', C=10, gamma='scale',
              random_state=42):
    """
    Splits the data and trains an SVM classifier.

    The split is stratified on ``y`` so both partitions keep the class
    balance of the full set. All keyword arguments default to the values
    that were previously hard-coded, so ``train_svm(X_pca, y)`` behaves as
    before.

    Args:
        X_pca (np.array): The feature matrix after PCA.
        y (np.array): The labels.
        test_size (float): Fraction of the samples held out for testing.
        kernel (str): Kernel passed to ``SVC``.
        C (float): Regularization parameter passed to ``SVC``.
        gamma (str or float): Kernel coefficient passed to ``SVC``.
        random_state (int or None): Seed used for both the train/test split
            and the classifier.

    Returns:
        tuple: ``(svm_classifier, X_test, y_test)`` - the fitted model and
               the held-out features and labels.
    """
    print("\nStep 3: Splitting data and training the SVM model...")
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X_pca, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Initialize the SVM classifier; by default a radial basis function
    # (RBF) kernel with C=10 and gamma='scale'.
    svm_classifier = SVC(kernel=kernel, C=C, gamma=gamma, random_state=random_state)

    # Train the model on the PCA-reduced training data
    svm_classifier.fit(X_train, y_train)
    print("SVM model training complete.")

    return svm_classifier, X_test, y_test




def evaluate_model(model, X_test, y_test, class_names):
    """
    Evaluates the model and prints performance metrics.

    Prints a classification report and the confusion matrix, then shows the
    confusion matrix as a heat map.

    Args:
        model (SVC): The trained SVM model.
        X_test (np.array): The test features.
        y_test (np.array): The true test labels.
        class_names (list): List of class names. Assumed to be indexed by the
            integer label (label ``i`` <-> ``class_names[i]``), which is also
            what the plot's tick positions rely on.
    """
    print("\nStep 4: Evaluating the model...")
    y_pred = model.predict(X_test)

    # Pin the label set so the report rows and the matrix axes always match
    # class_names, even if some class is absent from y_test and y_pred.
    # Without this, a missing class shrinks the matrix and misaligns the
    # tick labels (and makes target_names disagree with the report).
    label_ids = np.arange(len(class_names))

    # Print a detailed classification report
    print("\nClassification Report:")
    print(classification_report(
        y_test, y_pred, labels=label_ids, target_names=class_names
    ))

    # Print the confusion matrix
    print("Confusion Matrix:")
    cm = confusion_matrix(y_test, y_pred, labels=label_ids)
    print(cm)

    # Heat map of the same matrix: rows are true labels, columns predictions.
    plt.figure(figsize=(6, 5))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.colorbar()
    plt.xticks(label_ids, class_names, rotation=45)
    plt.yticks(label_ids, class_names)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()




# Script entry point: runs the full pipeline
# (load -> PCA -> train SVM -> evaluate) when executed directly.
if __name__ == "__main__":
    # Define the target size for all images
    IMAGE_SIZE = (100, 100)

    # Load and preprocess the data
    X, y, class_names = load_and_preprocess_data(target_size=IMAGE_SIZE)

    # load_and_preprocess_data returns None values when the dataset could
    # not be loaded; in that case the rest of the pipeline is skipped.
    if X is not None:

        # Reduce the flattened pixel vectors to 150 principal components.
        # NOTE(review): PCA is fitted on ALL samples here, and the train/test
        # split only happens afterwards inside train_svm. The test samples
        # therefore influence the projection (data leakage), so the reported
        # scores may be optimistic. Splitting first and fitting PCA on the
        # training part only would avoid this.
        X_pca, pca_model = apply_pca(X, n_components=150)

        # Train the SVM model
        svm_model, X_test_pca, y_test = train_svm(X_pca, y)

        # Evaluate the trained model
        evaluate_model(svm_model, X_test_pca, y_test, class_names)


        print("\nSVM classification process completed successfully.")



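# Cell output: ModuleNotFoundError: No module named 'tensorflow_datasets'
# (the package is missing from the environment; install it with `pip install tensorflow-datasets`)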